captcha.py
No OneTemporary
Actions

Size

13 KB

Referenced Files

None

Subscribers

None

captcha.py
View Options

	#!/usr/bin/python3
	#
	# Script to generate distorted text images for a captcha system.
	#
	# Copyright (C) 2005 Neil Harris
	#
	# This program is free software; you can redistribute it and/or modify
	# it under the terms of the GNU General Public License as published by
	# the Free Software Foundation; either version 2 of the License, or
	# (at your option) any later version.
	#
	# This program is distributed in the hope that it will be useful,
	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	# GNU General Public License for more details.
	#
	# You should have received a copy of the GNU General Public License along
	# with this program; if not, write to the Free Software Foundation, Inc.,
	# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
	# http://www.gnu.org/copyleft/gpl.html
	#
	# Further tweaks by Brion Vibber <brion@pobox.com>:
	# 2006-01-26: Add command-line options for the various parameters
	# 2007-02-19: Add --dirs param for hash subdirectory splits
	# Tweaks by Greg Sabino Mullane <greg@turnstep.com>:
	# 2008-01-06: Add regex check to skip words containing other than a-z

	from optparse import OptionParser
	import hashlib
	import json
	import math
	import multiprocessing
	import os
	import random
	import re
	import sys

	try:
	from PIL import Image, ImageDraw, ImageEnhance, ImageFont, ImageOps
	except ImportError:
	sys.exit(
	"This script requires the Python Imaging Library - http://www.pythonware.com/products/pil/"
	)

	# Matches any character outside a-z; used to reject words that are not
	# purely lowercase ASCII letters.
	nonalpha = re.compile(r"[^a-z]")

	# Letter combinations that are hard to read (e.g. "il" beside each other,
	# or "rn" which looks like "m"). The last alternative also rejects any word
	# containing "i" or "l" at all.
	# NOTE: the alternation bars must NOT be escaped. In Python's regex syntax
	# "\|" matches a literal "|" character, which made the whole pattern one
	# concatenation that could never match an a-z word.
	confusedletters = re.compile(r"[ijtlr][ijtl]|r[nompqr]|[il]")

	# Feature flag: True when the installed Pillow exposes ImageFont.getbbox().
	# gen_captcha() uses it to choose between font.getbbox() and the older
	# font.getsize() when measuring text.
	#
	# Pillow 9.2 added getbbox to replace getsize, and getsize() was removed in Pillow 10
	# https://pillow.readthedocs.io/en/stable/releasenotes/10.0.0.html#font-size-and-offset-methods
	# We don't have a requirements.txt, and therefore don't declare any specific
	# supported or minimum version, so the capability is probed at import time.
	# NOTE(review): the probe inspects ImageFont.ImageFont, while gen_captcha()
	# measures with the object returned by ImageFont.truetype() - presumably both
	# classes gained getbbox in the same release; confirm.
	IMAGEFONT_HAS_GETBBOX = hasattr(ImageFont.ImageFont, "getbbox")


	def wobbly_copy(src, wob, col, scale, ang):
	    """Return a copy of *src* with a sinusoidal X-axis wobble applied.

	    The image is rotated by ``ang`` plus a random jitter of up to 30
	    degrees either way, every scan line inside the rotated content's
	    bounding box is shifted sideways (sine wave plus random noise, both
	    scaled by ``wob``), and the result is rotated back and sharpened.

	    ``scale`` controls the random frequency of the sine wave. ``col`` is
	    accepted but not used. If the rotated image has no non-zero pixels,
	    *src* itself is returned unchanged.
	    """
	    width, height = src.size
	    frequency = random.uniform(4 * scale, 5 * scale)
	    phase = random.uniform(0, math.pi * 2)
	    # vary the angle, but not too much
	    angle = ang + random.uniform(-30, 30)
	    # destination starts out as a black rectangle
	    canvas = Image.new("RGB", src.size, 0)
	    rotated = src.rotate(angle, Image.BILINEAR)

	    # Cheap bounding-box test so only rows that hold content are processed
	    bounds = rotated.getbbox()
	    if bounds is None:
	        return src
	    top, bottom = bounds[1], bounds[3]

	    for row in range(top, bottom + 1):
	        # Copy one scan line across, displaced horizontally
	        shift = int(math.sin(phase + (row * frequency / height)) * wob)
	        shift += int(random.uniform(-wob * 0.5, wob * 0.5))
	        scan_line = rotated.crop((0, row, width, row + 1))
	        canvas.paste(scan_line, (shift, row))

	    restored = canvas.rotate(-angle, Image.BILINEAR)
	    # sharpen to try to stop blurring from building up
	    return ImageEnhance.Sharpness(restored).enhance(2)


	def gen_captcha(text, fontname, fontsize, file_name):
	    """Render *text* as a distorted captcha image and save it to *file_name*.

	    The text is drawn character by character in white on a black square
	    canvas, with random vertical jitter and negative kerning, then ten
	    random arcs are drawn over it. The canvas is cropped to its content
	    plus a small border, inverted to black-on-white, and saved in the
	    format implied by the file name.

	    ``fontname`` and ``fontsize`` are passed to ``ImageFont.truetype``.
	    """
	    # white text on a black background
	    background, foreground = 0x0, 0xFFFFFF
	    font = ImageFont.truetype(fontname, fontsize)

	    def measure(chars):
	        """Return the (width, height) pair the font reports for *chars*."""
	        if IMAGEFONT_HAS_GETBBOX:
	            return font.getbbox(chars)[2:]
	        return font.getsize(chars)

	    text_w, text_h = measure(text)

	    # square canvas significantly larger than the text
	    edge = max(text_w, text_h) + 2 * min(text_w, text_h)
	    canvas = Image.new("RGB", (edge, edge), background)
	    pen = ImageDraw.Draw(canvas)
	    width, height = canvas.size
	    half_w = width / 2
	    half_h = height / 2
	    half_text_w = text_w / 2

	    # Draw the characters one at a time.
	    # Using between 5-6 pixels of negative kerning seemed
	    # enough to confuse tesseract but still be very readable
	    offset = 0
	    for char in text:
	        position = (
	            half_w - half_text_w + offset,
	            half_h - text_h / 2 + random.uniform(-3, 7),
	        )
	        pen.text(position, char, font=font, fill=foreground)
	        offset += measure(char)[0]
	        offset -= random.uniform(5, 6)

	    # Scribble ten random arcs across the text area
	    arc_base_y = half_h - text_h + 30
	    for step in range(10):
	        x0 = int(
	            offset * (step / 2 - 1) / 5
	            + half_w
	            - half_text_w
	            + random.uniform(0, 10)
	        )
	        y0 = int(arc_base_y + random.uniform(-10, 15))
	        x1 = int(offset * step / 7 + half_w - half_text_w + random.uniform(-5, 5))
	        y1 = int(arc_base_y + random.uniform(-10, 30))

	        # the bounding box handed to arc() is ordered low corner first
	        left, right = min(x0, x1), max(x0, x1)
	        upper, lower = min(y0, y1), max(y0, y1)

	        start_angle = int(random.uniform(-30, 30))
	        end_angle = int(random.uniform(160, 300))
	        pen.arc((left, upper, right, lower), start_angle, end_angle, fill=foreground)

	    # crop to the bounding box of the nonzero pixels, plus a bit of a border
	    box_left, box_top, box_right, box_bottom = canvas.getbbox()
	    border = min(text_w, text_h) / 4
	    cropped = canvas.crop(
	        (box_left - border, box_top - border, box_right + border, box_bottom + border)
	    )

	    # turn into black on white and save, format determined from filename
	    ImageOps.invert(cropped).save(file_name)


	def gen_subdir(basedir, md5hash, levels):
	    """Build a nested subdirectory path from the first *levels* characters
	    of *md5hash* and make sure it exists under *basedir*.

	    Returns the relative subdirectory path (e.g. ``"a/b"`` for levels=2),
	    or ``None`` when *levels* is zero or negative. Raises ``IndexError``
	    if *md5hash* has fewer than *levels* characters.
	    """
	    if levels <= 0:
	        return None

	    # Index explicitly (rather than slicing) so that a hash shorter than
	    # *levels* still raises IndexError instead of being silently truncated.
	    subdir = os.path.join(*(md5hash[i] for i in range(levels)))

	    # exist_ok avoids the check-then-create race between worker processes
	    # that hash into the same directory at the same time; makedirs also
	    # creates any missing parent in one call.
	    os.makedirs(os.path.join(basedir, subdir), exist_ok=True)
	    return subdir


	def try_pick_word(words, badwordlist, verbose, nwords, min_length, max_length):
	    """Make one attempt at producing a captcha word.

	    With a word list, *nwords* random entries are concatenated (at least
	    one entry is always used). With ``words=None``, a string of random
	    lowercase letters is built whose length lies between *min_length* and
	    *max_length* (a non-positive *max_length* means 10 in this mode).

	    Returns the candidate, or ``None`` if it is too short, too long,
	    contains anything other than a-z, contains a confusing letter
	    combination, or contains an entry of *badwordlist*. When *verbose* is
	    set, the candidate and the reason for any rejection are printed.
	    """

	    def reject(template, *values):
	        """Report why the candidate was dropped (if verbose); return None."""
	        if verbose:
	            print(template % values)
	        return None

	    if words is None:
	        # Random-letter mode
	        if max_length <= 0:
	            max_length = 10
	        length = random.randint(min_length, max_length)
	        word = "".join(chr(97 + random.randint(0, 25)) for _ in range(length))
	    else:
	        # Word-list mode: first word, then nwords - 1 more glued on
	        word = words[random.randrange(len(words))]
	        for _ in range(nwords - 1):
	            word += words[random.randrange(len(words))]

	    if verbose:
	        print("word is %s" % word)

	    if len(word) < min_length:
	        return reject(
	            "skipping word pair '%s' because it has fewer than %d characters",
	            word,
	            min_length,
	        )

	    if max_length > 0 and len(word) > max_length:
	        return reject(
	            "skipping word pair '%s' because it has more than %d characters",
	            word,
	            max_length,
	        )

	    if nonalpha.search(word):
	        return reject(
	            "skipping word pair '%s' because it contains non-alphabetic characters",
	            word,
	        )

	    if confusedletters.search(word):
	        return reject(
	            "skipping word pair '%s' because it contains confusing letters beside each other",
	            word,
	        )

	    # first bad word that occurs anywhere inside the candidate, if any
	    offender = next((bad for bad in badwordlist if bad in word), None)
	    if offender is not None:
	        return reject(
	            "skipping word pair '%s' because it contains word '%s'",
	            word,
	            offender,
	        )

	    return word


	def pick_word(words, badwordlist, verbose, nwords, min_length, max_length):
	    """Return the first acceptable word produced by try_pick_word().

	    All arguments are forwarded unchanged to try_pick_word(). If no valid
	    combination turns up in 1000 tries, give up and terminate through
	    sys.exit() with an error message.
	    """
	    attempts_left = 1000
	    while attempts_left > 0:
	        candidate = try_pick_word(
	            words, badwordlist, verbose, nwords, min_length, max_length
	        )
	        if candidate:
	            return candidate
	        attempts_left -= 1
	    sys.exit("Unable to find valid word combinations")


	def read_wordlist(filename):
	    """Read a one-word-per-line file into a list of lowercase words.

	    Each line is stripped of surrounding whitespace and lowercased. Blank
	    lines are skipped: an empty entry in a bad-word list would be a
	    substring of every candidate and so reject every word.

	    Returns an empty list if *filename* is not an existing regular file.
	    """
	    if not os.path.isfile(filename):
	        return []
	    # "with" guarantees the handle is closed even if reading fails
	    with open(filename) as f:
	        cleaned = (line.strip().lower() for line in f)
	        return [entry for entry in cleaned if entry]


	def run_in_thread(object):
	    """Worker entry point: generate one batch of captcha images.

	    *object* is a sequence whose first seven items are, in order:
	    ``count`` (images to make), ``words`` (word list or None),
	    ``badwordlist``, ``opts`` (parsed command-line options), ``font``
	    (font file path), ``fontsize`` and ``jsonmap`` (mapping that receives
	    ``filename -> word`` when ``opts.jsonmap`` is set). The parameter
	    name shadows the builtin ``object``; it is kept for compatibility.

	    For each image a word is picked, a random 32-bit salt is drawn, and
	    the image is written under ``opts.output`` as
	    ``image_<salt>_<hash>.png`` (inside hash-derived subdirectories when
	    ``opts.dirs`` is set).

	    Raises RuntimeError if no valid word can be found.
	    """
	    count, words, badwordlist, opts, font, fontsize, jsonmap = object[:7]

	    for _ in range(count):
	        try:
	            word = pick_word(
	                words,
	                badwordlist,
	                opts.verbose,
	                opts.number_words,
	                opts.min_length,
	                opts.max_length,
	            )
	        except SystemExit as exc:
	            # pick_word() gives up via sys.exit(). Inside a
	            # multiprocessing.Pool worker, SystemExit terminates the
	            # worker process without ever reporting a result, which
	            # leaves the parent waiting forever. Convert it into an
	            # ordinary exception that the pool can send back.
	            raise RuntimeError(str(exc.code)) from None

	        salt = "%08x" % random.randrange(2**32)
	        # 64 bits of hash is plenty for this purpose
	        md5hash = hashlib.md5(
	            (opts.key + salt + word + opts.key + salt).encode("utf-8")
	        ).hexdigest()[:16]
	        filename = "image_%s_%s.png" % (salt, md5hash)
	        if opts.dirs:
	            subdir = gen_subdir(opts.output, md5hash, opts.dirs)
	            filename = os.path.join(subdir, filename)
	        if opts.verbose:
	            print(filename)
	        if opts.jsonmap:
	            jsonmap[filename] = word

	        gen_captcha(word, font, fontsize, os.path.join(opts.output, filename))


	def main():
	    """Parse the command line and generate the requested captcha images.

	    This grabs random words from the dictionary 'words' (one
	    word per line) and generates a captcha image for each one,
	    with a keyed salted hash of the correct answer in the filename.

	    To check a reply, hash it in the same way with the same salt and
	    secret key, then compare with the hash value given.

	    Exits through sys.exit() with a message when a required option is
	    missing or invalid, or when there is nothing to generate. With
	    --jsonmap, the filename -> word mapping is written to "map.json" in
	    the current directory.
	    """
	    script_dir = os.path.dirname(os.path.realpath(__file__))
	    parser = OptionParser()
	    parser.add_option(
	        "--wordlist",
	        help="A list of words (required)",
	        metavar="WORDS.txt",
	    )
	    parser.add_option(
	        "--random",
	        help="Use random characters instead of a wordlist",
	        action="store_true",
	    )
	    parser.add_option(
	        "--key",
	        help="The passphrase set as $wgCaptchaSecret (required)",
	        metavar="KEY",
	    )
	    parser.add_option(
	        "--output",
	        help="The directory to put the images in - $wgCaptchaDirectory (required)",
	        metavar="DIR",
	    )
	    parser.add_option(
	        "--font",
	        help="The font to use (required)",
	        metavar="FONT.ttf",
	    )
	    parser.add_option(
	        "--font-size",
	        help="The font size (default 40)",
	        metavar="N",
	        type="int",
	        default=40,
	    )
	    parser.add_option(
	        "--count",
	        help="The maximum number of images to make (default 20)",
	        metavar="N",
	        type="int",
	        default=20,
	    )
	    parser.add_option(
	        "--badwordlist",
	        help="A list of words that should not be used",
	        metavar="FILE",
	        default=os.path.join(script_dir, "badwordlist"),
	    )
	    parser.add_option(
	        "--fill",
	        help="Fill the output directory to contain N files, overrides count, cannot be used with --dirs",
	        metavar="N",
	        type="int",
	    )
	    parser.add_option(
	        "--dirs",
	        help="Put the images into subdirectories N levels deep - $wgCaptchaDirectoryLevels",
	        metavar="N",
	        type="int",
	    )
	    parser.add_option(
	        "--verbose",
	        "-v",
	        help="Show debugging information",
	        action="store_true",
	    )
	    parser.add_option(
	        "--number-words",
	        help="Number of words from the wordlist which make a captcha challenge (default 2)",
	        type="int",
	        default=2,
	    )
	    parser.add_option(
	        "--min-length",
	        help="Minimum length for a captcha challenge",
	        type="int",
	        default=1,
	    )
	    parser.add_option(
	        "--max-length",
	        help="Maximum length for a captcha challenge",
	        type="int",
	        default=-1,
	    )
	    parser.add_option(
	        "--threads",
	        help="Maximum number of threads to be used to generate captchas",
	        type="int",
	        default=1,
	    )
	    parser.add_option(
	        "--jsonmap",
	        help="Outputs \"filename\": \"word\" mapping for test/debug purposes",
	        action="store_true",
	    )

	    opts, _args = parser.parse_args()

	    # --- validate the required options -------------------------------
	    if opts.wordlist:
	        wordlist = opts.wordlist
	    elif opts.random:
	        wordlist = None
	    else:
	        sys.exit("Need to specify a wordlist")
	    if not opts.key:
	        sys.exit("Need to specify a key")
	    if not opts.output:
	        sys.exit("Need to specify an output directory")
	    if not (opts.font and os.path.exists(opts.font)):
	        sys.exit("Need to specify the location of a font")

	    output = opts.output
	    font = opts.font
	    badwordlist = read_wordlist(opts.badwordlist)
	    count = opts.count
	    fontsize = opts.font_size
	    threads = opts.threads

	    # --fill overrides --count: only top the directory up to N entries
	    if opts.fill:
	        count = max(0, opts.fill - len(os.listdir(output)))

	    words = None
	    if wordlist:
	        # Keep only 4-5 letter words that do not start with "f" and whose
	        # first two / last two letters are not a doubled letter.
	        words = [
	            x
	            for x in read_wordlist(wordlist)
	            if len(x) in (4, 5) and x[0] != "f" and x[0] != x[1] and x[-1] != x[-2]
	        ]

	    if count <= 0:
	        sys.exit("No need to generate CAPTCHA images.")

	    if words is not None and not words:
	        # Without this, every worker would fail on the empty list.
	        sys.exit("No usable words found in the wordlist")

	    if threads < 1:
	        sys.exit("Need to specify at least one thread")

	    # Never start more workers than there are images, and spread the
	    # remainder over the first workers so exactly `count` images are made
	    # (a plain count // threads would drop the remainder).
	    threads = min(threads, count)
	    base, extra = divmod(count, threads)
	    chunk_sizes = [base + 1] * extra + [base] * (threads - extra)
	    chunks = chunk_sizes[0]  # size of the largest chunk

	    print(
	        "Generating %s CAPTCHA images separated in %s image(s) per chunk run by %s threads..."
	        % (count, chunks, threads)
	    )

	    with multiprocessing.Manager() as manager:
	        # shared between the workers so each can record filename -> word
	        jsonmap = manager.dict()
	        data = [
	            [size, words, badwordlist, opts, font, fontsize, jsonmap]
	            for size in chunk_sizes
	        ]
	        with multiprocessing.Pool(threads) as pool:
	            # map() blocks until every chunk is done and re-raises any
	            # exception raised in a worker; map_async().wait() would
	            # discard such errors silently.
	            pool.map(run_in_thread, data)
	        # take a plain-dict snapshot before the manager shuts down
	        mapping = jsonmap.copy()

	    if opts.jsonmap:
	        with open("map.json", "w") as outfile:
	            json.dump(mapping, outfile, indent=4)


	if __name__ == "__main__":
	    main()

File Metadata

Mime Type: text/x-script.python
Expires: Sat, May 16, 14:51 (1 d, 3 h)
Storage Engine: local-disk
Storage Format: Raw Data
Storage Handle: 90/d8/cc4dbb2d993b80590a055560e0f6
Default Alt Text: captcha.py (13 KB)

captcha.pyNo OneTemporaryActions

captcha.pyView Options

File Metadata

Event Timeline

captcha.py
No OneTemporary
Actions

captcha.py
View Options