mediawiki-extensions-Confir.../captcha.py
Tim Starling 6f286e52db Add a gradient to FancyCaptcha
This defeats naive thresholding, giving Tesseract break rate of 0 out of
1000, even if a sensible threshold value is hand-chosen. Reduced the
text value and noise to make room for the gradient, but kept an SNR of
1.3, as before, which provides good legibility.

Obviously the gradient can be removed with custom preprocessing -- the
point of these changes is to raise the bar from "unconfigured Tesseract"
to "some small amount of developer effort".

Change-Id: I30ebc904ca59bf29a2aa812f881a077a13493e68
2014-09-26 10:41:26 +10:00

271 lines
9.4 KiB
Python

#!/usr/bin/python
#
# Script to generate distorted text images for a captcha system.
#
# Copyright (C) 2005 Neil Harris
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
# http://www.gnu.org/copyleft/gpl.html
#
# Further tweaks by Brion Vibber <brion@pobox.com>:
# 2006-01-26: Add command-line options for the various parameters
# 2007-02-19: Add --dirs param for hash subdirectory splits
# Tweaks by Greg Sabino Mullane <greg@turnstep.com>:
# 2008-01-06: Add regex check to skip words containing other than a-z
import random
import math
import hashlib
from optparse import OptionParser
import os
import sys
import re
try:
import Image
import ImageFont
import ImageDraw
import ImageEnhance
import ImageOps
import ImageMath
except:
sys.exit("This script requires the Python Imaging Library - http://www.pythonware.com/products/pil/")
# Matches any character outside lowercase a-z. try_pick_word() uses it to
# reject candidate words that contain such characters.
nonalpha = re.compile('[^a-z]') # regex to test for suitability of words
# Does X-axis wobbly copy, sandwiched between two rotates
def wobbly_copy(src, wob, col, scale, ang):
    """Return a horizontally "wobbled" copy of the image *src*.

    src   -- source image
    wob   -- amplitude of the per-line horizontal shift, in pixels
    col   -- not used by this function
    scale -- multiplier for the frequency of the sine wave
    ang   -- nominal rotation angle; a random offset in [-10, 10] is added

    The image is rotated, each scan line inside the rotated image's
    bounding box is pasted onto a black canvas with a sine-plus-jitter
    horizontal offset, and the canvas is rotated back and sharpened.
    If the rotated image has no bounding box, *src* is returned as is.
    """
    width, height = src.size

    # Random parameters are drawn in a fixed order: frequency, phase, angle.
    freq = random.uniform(4*scale, 5*scale)
    phase = random.uniform(0, math.pi*2)
    angle = ang + random.uniform(-10, 10)  # vary, but not too much

    canvas = Image.new('RGB', src.size, 0)  # a black rectangle
    rotated = src.rotate(angle, Image.BILINEAR)

    # Do a cheap bounding-box op here to try to limit work below
    bounds = rotated.getbbox()
    if bounds is None:
        return src
    top = bounds[1]
    bottom = bounds[3]

    # and only do lines with content on
    for row in range(top, bottom + 1):
        # Drop a scan line in, displaced by the sine wave plus some jitter
        wave = int(math.sin(phase + (row*freq/height)) * wob)
        jitter = int(random.uniform(-wob*0.5, wob*0.5))
        scan_line = rotated.crop((0, row, width, row + 1))
        canvas.paste(scan_line, (wave + jitter, row))

    # Undo the rotation, then sharpen to try to stop blurring from
    # building up
    canvas = canvas.rotate(-angle, Image.BILINEAR)
    return ImageEnhance.Sharpness(canvas).enhance(2)
def gen_captcha(text, fontname, fontsize, file_name):
    """Generate a captcha image of *text* and save it to *file_name*.

    text      -- the string to render
    fontname  -- font file passed to ImageFont.truetype
    fontsize  -- font size passed to ImageFont.truetype
    file_name -- output path; the image format is determined from it

    The text is drawn white on black, distorted by repeated calls to
    wobbly_copy(), cropped to its bounding box plus a border, overlaid
    with blocky random noise and a gradient that increases with x, and
    finally inverted to give dark text on a light background.

    All size and coordinate arithmetic uses floor division (//) so the
    values stay integers whether or not true division is in effect.
    """
    # white text on a black background
    bgcolor = 0x0
    fgcolor = 0xffffff
    # create a font object
    font = ImageFont.truetype(fontname, fontsize)
    # determine dimensions of the text
    dim = font.getsize(text)
    # create a new image significantly larger than the text
    edge = max(dim[0], dim[1]) + 2*min(dim[0], dim[1])
    im = Image.new('RGB', (edge, edge), bgcolor)
    d = ImageDraw.Draw(im)
    x, y = im.size
    # add the text to the image, centred
    d.text((x//2 - dim[0]//2, y//2 - dim[1]//2), text, font=font, fill=fgcolor)
    k = 2
    wob = 0.09*dim[1]
    rot = 45
    # Apply lots of small stirring operations, rather than a few large ones
    # in order to get some uniformity of treatment, whilst
    # maintaining randomness
    for i in range(k):
        im = wobbly_copy(im, wob, bgcolor, i*2+3, rot+0)
        im = wobbly_copy(im, wob, bgcolor, i*2+1, rot+45)
        im = wobbly_copy(im, wob, bgcolor, i*2+2, rot+90)
        rot += 30
    # now get the bounding box of the nonzero parts of the image
    bbox = im.getbbox()
    bord = min(dim[0], dim[1])//4  # a bit of a border
    im = im.crop((bbox[0]-bord, bbox[1]-bord, bbox[2]+bord, bbox[3]+bord))
    # Create noise at 1/nblock of the image resolution
    nblock = 4
    nsize = (im.size[0] // nblock, im.size[1] // nblock)
    noise = Image.new('L', nsize, bgcolor)
    data = noise.load()
    for col in range(nsize[0]):
        # The gradient depends only on the column, so compute it once per
        # column; it rises from 0 towards 70 across the width.
        gradient = 70 * col // nsize[0]
        for row in range(nsize[1]):
            r = random.randint(0, 65)
            data[col, row] = r + gradient
    # Turn speckles into blobs
    noise = noise.resize(im.size, Image.BILINEAR)
    # Add to the image
    im = ImageMath.eval('convert(convert(a, "L") / 3 + b, "RGB")', a=im, b=noise)
    # and turn into black on white
    im = ImageOps.invert(im)
    # save the image, in format determined from filename
    im.save(file_name)
def gen_subdir(basedir, md5hash, levels):
    """Build a nested subdirectory path from the leading characters of
    _md5hash_, one character per level, creating each level under
    _basedir_ if it does not exist yet.

    Returns the subdirectory path relative to _basedir_, or None when
    no levels are generated."""
    relative = None
    for depth in range(levels):
        piece = md5hash[depth]
        # The first level is just the character; later levels nest under it
        relative = os.path.join(relative, piece) if relative else piece
        target = os.path.join(basedir, relative)
        if not os.path.exists(target):
            os.mkdir(target)
    return relative
def try_pick_word(words, blacklist, verbose, nwords, min_length, max_length):
    """Make one attempt at producing a captcha word.

    words      -- list of candidate words, or None to use random letters
    blacklist  -- substrings that must not appear in the result
    verbose    -- when true, print the candidate and any rejection reason
    nwords     -- number of words from _words_ to concatenate (at least
                  one word is always used)
    min_length -- minimum acceptable length of the result
    max_length -- maximum acceptable length; a value <= 0 means no limit
                  when using a word list and a limit of 10 in random mode

    Returns the word, or None if the candidate was rejected.
    """
    if words is not None:
        word = ''.join(random.choice(words) for _ in range(max(nwords, 1)))
    else:
        max_length = max_length if max_length > 0 else 10
        length = random.randint(min_length, max_length)
        word = ''.join(chr(97 + random.randint(0, 25)) for _ in range(length))

    if verbose:
        print("word is %s" % word)

    if len(word) < min_length:
        if verbose:
            print("skipping word pair '%s' because it has fewer than %d characters" % (word, min_length))
        return None

    if max_length > 0 and len(word) > max_length:
        if verbose:
            print("skipping word pair '%s' because it has more than %d characters" % (word, max_length))
        return None

    if nonalpha.search(word):
        if verbose:
            print("skipping word pair '%s' because it contains non-alphabetic characters" % word)
        return None

    for naughty in blacklist:
        # An empty entry (e.g. from a blank line in the blacklist file) is
        # a substring of every word and would reject everything; ignore it.
        if naughty and naughty in word:
            if verbose:
                print("skipping word pair '%s' because it contains blacklisted word '%s'" % (word, naughty))
            return None

    return word
def pick_word(words, blacklist, verbose, nwords, min_length, max_length):
    """Return a word accepted by try_pick_word().

    Up to 1000 attempts are made, each forwarding all arguments to
    try_pick_word(). If none of them yields a non-empty word, the
    program exits with an error message.
    """
    attempts_left = 1000
    while attempts_left > 0:
        candidate = try_pick_word(words, blacklist, verbose, nwords,
                                  min_length, max_length)
        if candidate:
            return candidate
        attempts_left -= 1
    # If we can't find a valid combination in 1000 tries, just give up
    sys.exit("Unable to find valid word combinations")
def read_wordlist(filename):
    """Read a word list file and return its entries in lower case.

    filename -- path of a text file with one word per line

    Surrounding whitespace is stripped from every line. Blank lines are
    skipped: an empty entry is a substring of every word, so a stray
    blank line in a blacklist file would cause every candidate word to
    be rejected. The file is closed even if reading fails.
    """
    with open(filename) as f:
        stripped = (line.strip() for line in f)
        return [entry.lower() for entry in stripped if entry]
if __name__ == '__main__':
    # This grabs random words from the dictionary 'words' (one
    # word per line) and generates a captcha image for each one,
    # with a keyed salted hash of the correct answer in the filename.
    # To check a reply, hash it in the same way with the same salt and
    # secret key, then compare with the hash value given.
    script_dir = os.path.dirname(os.path.realpath(__file__))

    parser = OptionParser()
    parser.add_option(
        "--wordlist",
        help="A list of words (required)",
        metavar="WORDS.txt")
    parser.add_option(
        "--random",
        help="Use random charcters instead of a wordlist",
        action="store_true")
    parser.add_option(
        "--key",
        help="The passphrase set as $wgCaptchaSecret (required)",
        metavar="KEY")
    parser.add_option(
        "--output",
        help="The directory to put the images in - $wgCaptchaDirectory (required)",
        metavar="DIR")
    parser.add_option(
        "--font",
        help="The font to use (required)",
        metavar="FONT.ttf")
    parser.add_option(
        "--font-size",
        help="The font size (default 40)",
        metavar="N", type='int', default=40)
    parser.add_option(
        "--count",
        help="The maximum number of images to make (default 20)",
        metavar="N", type='int', default=20)
    parser.add_option(
        "--blacklist",
        help="A blacklist of words that should not be used",
        metavar="FILE", default=os.path.join(script_dir, "blacklist"))
    parser.add_option(
        "--fill",
        help="Fill the output directory to contain N files, overrides count, cannot be used with --dirs",
        metavar="N", type='int')
    parser.add_option(
        "--dirs",
        help="Put the images into subdirectories N levels deep - $wgCaptchaDirectoryLevels",
        metavar="N", type='int')
    parser.add_option(
        "--verbose", "-v",
        help="Show debugging information",
        action='store_true')
    parser.add_option(
        "--number-words",
        help="Number of words from the wordlist which make a captcha challenge (default 2)",
        type='int', default=2)
    parser.add_option(
        "--min-length",
        help="Minimum length for a captcha challenge",
        type='int', default=1)
    parser.add_option(
        "--max-length",
        help="Maximum length for a captcha challenge",
        type='int', default=-1)

    opts, args = parser.parse_args()

    # Validate the required options in a fixed order, so that the first
    # missing one determines the error message.
    if not opts.wordlist and not opts.random:
        sys.exit("Need to specify a wordlist")
    # A word list takes precedence over --random when both are given
    wordlist = opts.wordlist if opts.wordlist else None

    if not opts.key:
        sys.exit("Need to specify a key")
    key = opts.key

    if not opts.output:
        sys.exit("Need to specify an output directory")
    output = opts.output

    if not (opts.font and os.path.exists(opts.font)):
        sys.exit("Need to specify the location of a font")
    font = opts.font

    blacklist = read_wordlist(opts.blacklist)
    fill = opts.fill
    dirs = opts.dirs
    verbose = opts.verbose
    fontsize = opts.font_size

    # --fill overrides --count with the number of entries still missing
    # from the output directory
    if fill:
        count = max(0, fill - len(os.listdir(output)))
    else:
        count = opts.count

    words = None
    if wordlist:
        # Keep only 4- or 5-letter words that do not start with "f" and
        # whose first two and last two letters are not doubled
        words = []
        for candidate in read_wordlist(wordlist):
            if len(candidate) not in (4, 5):
                continue
            if candidate[0] == "f":
                continue
            if candidate[0] == candidate[1] or candidate[-1] == candidate[-2]:
                continue
            words.append(candidate)

    for _ in range(count):
        word = pick_word(words, blacklist, verbose, opts.number_words,
                         opts.min_length, opts.max_length)
        salt = "%08x" % random.randrange(2**32)
        # 64 bits of hash is plenty for this purpose
        digest = hashlib.md5(key + salt + word + key + salt).hexdigest()
        md5hash = digest[:16]
        filename = "image_%s_%s.png" % (salt, md5hash)
        if dirs:
            filename = os.path.join(gen_subdir(output, md5hash, dirs), filename)
        if verbose:
            print(filename)
        gen_captcha(word, font, fontsize, os.path.join(output, filename))