mirror of
https://gerrit.wikimedia.org/r/mediawiki/extensions/ConfirmEdit
synced 2024-11-14 11:16:50 +00:00
6f286e52db
This defeats naive thresholding, giving a Tesseract break rate of 0 out of 1000, even if a sensible threshold value is hand-chosen. Reduced the text value and noise to make room for the gradient, but kept an SNR of 1.3, as before, which provides good legibility. Obviously the gradient can be removed with custom preprocessing -- the point of these changes is to raise the bar from "unconfigured Tesseract" to "some small amount of developer effort". Change-Id: I30ebc904ca59bf29a2aa812f881a077a13493e68
271 lines
9.4 KiB
Python
271 lines
9.4 KiB
Python
#!/usr/bin/python
|
|
#
|
|
# Script to generate distorted text images for a captcha system.
|
|
#
|
|
# Copyright (C) 2005 Neil Harris
|
|
#
|
|
# This program is free software; you can redistribute it and/or modify
|
|
# it under the terms of the GNU General Public License as published by
|
|
# the Free Software Foundation; either version 2 of the License, or
|
|
# (at your option) any later version.
|
|
#
|
|
# This program is distributed in the hope that it will be useful,
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
# GNU General Public License for more details.
|
|
#
|
|
# You should have received a copy of the GNU General Public License along
|
|
# with this program; if not, write to the Free Software Foundation, Inc.,
|
|
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
|
# http://www.gnu.org/copyleft/gpl.html
|
|
#
|
|
# Further tweaks by Brion Vibber <brion@pobox.com>:
|
|
# 2006-01-26: Add command-line options for the various parameters
|
|
# 2007-02-19: Add --dirs param for hash subdirectory splits
|
|
# Tweaks by Greg Sabino Mullane <greg@turnstep.com>:
|
|
# 2008-01-06: Add regex check to skip words containing other than a-z
|
|
|
|
import random
|
|
import math
|
|
import hashlib
|
|
from optparse import OptionParser
|
|
import os
|
|
import sys
|
|
import re
|
|
|
|
try:
|
|
import Image
|
|
import ImageFont
|
|
import ImageDraw
|
|
import ImageEnhance
|
|
import ImageOps
|
|
import ImageMath
|
|
except:
|
|
sys.exit("This script requires the Python Imaging Library - http://www.pythonware.com/products/pil/")
|
|
|
|
nonalpha = re.compile('[^a-z]') # regex to test for suitability of words
|
|
|
|
# Does X-axis wobbly copy, sandwiched between two rotates
|
|
def wobbly_copy(src, wob, col, scale, ang):
    """Return a copy of ``src`` whose scan lines are shifted sideways.

    The image is rotated by roughly ``ang`` degrees, each row inside the
    rotated image's bounding box is pasted into a fresh black canvas at a
    sinusoidal-plus-random horizontal offset, and the canvas is then
    rotated back by the same angle and sharpened.

    src   -- source image (must offer size, rotate(), getbbox(), crop())
    wob   -- amplitude, in pixels, of the horizontal displacement
    col   -- unused by this function; kept so existing callers still work
    scale -- multiplier applied to the frequency of the sine wave
    ang   -- nominal rotation in degrees; +/-10 degrees of jitter is added

    Returns ``src`` itself, unmodified, when getbbox() on the rotated image
    yields None; otherwise returns the new, sharpened image.
    """
    width, height = src.size
    freq = random.uniform(4*scale, 5*scale)
    phase = random.uniform(0, math.pi*2)
    rr = ang + random.uniform(-10, 10)  # vary, but not too much
    int_d = Image.new('RGB', src.size, 0)  # a black rectangle
    rot = src.rotate(rr, Image.BILINEAR)

    # Do a cheap bounding-box op here to try to limit work below
    bbx = rot.getbbox()
    if bbx is None:
        return src
    # Only the vertical extent is needed: whole rows are copied.
    top, bottom = bbx[1], bbx[3]

    # and only do lines with content on
    for row in range(top, bottom + 1):
        # Drop a scan line in: smooth sine displacement plus per-row jitter
        xoff = int(math.sin(phase + (row*freq/height))*wob)
        xoff += int(random.uniform(-wob*0.5, wob*0.5))
        int_d.paste(rot.crop((0, row, width, row + 1)), (xoff, row))

    # try to stop blurring from building up
    int_d = int_d.rotate(-rr, Image.BILINEAR)
    enh = ImageEnhance.Sharpness(int_d)
    return enh.enhance(2)
|
|
|
|
|
|
def gen_captcha(text, fontname, fontsize, file_name):
    """Render ``text`` as a distorted, noisy captcha image and save it.

    text      -- the string to draw
    fontname  -- path of the TrueType font handed to ImageFont.truetype()
    fontsize  -- font size handed to ImageFont.truetype()
    file_name -- destination path; the image format is determined from it

    The text is drawn white on black in the middle of an oversized square
    canvas, stirred by repeated wobbly_copy() passes, cropped to its
    bounding box plus a border, overlaid with blocky noise carrying a
    left-to-right gradient, and finally inverted to dark text on a light
    background.

    All pixel arithmetic uses explicit floor division so coordinates stay
    integers regardless of the interpreter's division semantics (assumes
    font.getsize() reports integer pixel sizes, as the original code did).
    """
    # white text on a black background
    bgcolor = 0x0
    fgcolor = 0xffffff
    # create a font object
    font = ImageFont.truetype(fontname, fontsize)
    # determine dimensions of the text
    dim = font.getsize(text)
    # create a new image significantly larger than the text, so that the
    # rotations inside wobbly_copy() have room to work in
    edge = max(dim[0], dim[1]) + 2*min(dim[0], dim[1])
    im = Image.new('RGB', (edge, edge), bgcolor)
    d = ImageDraw.Draw(im)
    x, y = im.size
    # add the text to the image, centred
    d.text((x//2 - dim[0]//2, y//2 - dim[1]//2), text, font=font, fill=fgcolor)

    k = 2
    wob = 0.09*dim[1]
    rot = 45
    # Apply lots of small stirring operations, rather than a few large ones
    # in order to get some uniformity of treatment, whilst
    # maintaining randomness
    for i in range(k):
        im = wobbly_copy(im, wob, bgcolor, i*2+3, rot+0)
        im = wobbly_copy(im, wob, bgcolor, i*2+1, rot+45)
        im = wobbly_copy(im, wob, bgcolor, i*2+2, rot+90)
        rot += 30

    # now get the bounding box of the nonzero parts of the image
    bbox = im.getbbox()
    bord = min(dim[0], dim[1])//4  # a bit of a border
    im = im.crop((bbox[0]-bord, bbox[1]-bord, bbox[2]+bord, bbox[3]+bord))

    # Create noise on a grid nblock times coarser than the image
    nblock = 4
    nsize = (im.size[0] // nblock, im.size[1] // nblock)
    noise = Image.new('L', nsize, bgcolor)
    data = noise.load()
    for nx in range(nsize[0]):
        for ny in range(nsize[1]):
            r = random.randint(0, 65)
            # brightness ramp that grows with the column index
            gradient = 70 * nx // nsize[0]
            data[nx, ny] = r + gradient
    # Turn speckles into blobs
    noise = noise.resize(im.size, Image.BILINEAR)
    # Add to the image: the text is scaled to a third of its value first
    im = ImageMath.eval('convert(convert(a, "L") / 3 + b, "RGB")', a=im, b=noise)

    # and turn into black on white
    im = ImageOps.invert(im)

    # save the image, in format determined from filename
    im.save(file_name)
|
|
|
|
def gen_subdir(basedir, md5hash, levels):
    """Build a nested directory path from the leading hash characters.

    One directory level is created per character for the first ``levels``
    characters of ``md5hash``; any level that does not yet exist under
    ``basedir`` is created on the way.

    Returns the path relative to ``basedir``, or None when ``levels`` is 0.
    """
    relative = None
    for depth in range(levels):
        piece = md5hash[depth]
        # The first character starts the path; later ones nest beneath it.
        relative = os.path.join(relative, piece) if relative else piece
        target = os.path.join(basedir, relative)
        if os.path.exists(target):
            continue
        os.mkdir(target)
    return relative
|
|
|
|
def try_pick_word(words, blacklist, verbose, nwords, min_length, max_length):
    """Make a single attempt at producing a captcha answer.

    words      -- list of candidate words, or None to use random letters
    blacklist  -- iterable of substrings the answer must not contain
    verbose    -- when true, print what was picked and why it was rejected
    nwords     -- how many list words to concatenate (word-list mode only;
                  values below 1 still yield one word)
    min_length -- shortest acceptable answer
    max_length -- longest acceptable answer; a value <= 0 disables the
                  check in word-list mode and is replaced by 10 in
                  random-letter mode

    Returns the answer string, or None when this attempt was rejected (too
    short, too long, contains characters outside a-z, contains a
    blacklisted substring, or the word list is empty).
    """
    if words is not None:
        if not words:
            # Nothing to choose from; report a failed attempt rather than
            # crashing so the caller can end with its usual error message.
            return None
        word = ''.join(random.choice(words) for _ in range(max(nwords, 1)))
    else:
        max_length = max_length if max_length > 0 else 10
        length = random.randint(min_length, max_length)
        # 97 is ord('a'): draw lowercase letters a-z
        word = ''.join(chr(97 + random.randint(0, 25)) for _ in range(length))

    if verbose:
        print("word is %s" % word)

    if len(word) < min_length:
        if verbose:
            print("skipping word pair '%s' because it has fewer than %d characters" % (word, min_length))
        return None

    if max_length > 0 and len(word) > max_length:
        if verbose:
            print("skipping word pair '%s' because it has more than %d characters" % (word, max_length))
        return None

    if nonalpha.search(word):
        if verbose:
            print("skipping word pair '%s' because it contains non-alphabetic characters" % word)
        return None

    for naughty in blacklist:
        # An empty entry (read_wordlist() turns a blank line into '') is a
        # substring of every word and would reject everything, so skip it.
        if naughty and naughty in word:
            if verbose:
                print("skipping word pair '%s' because it contains blacklisted word '%s'" % (word, naughty))
            return None
    return word
|
|
|
|
def pick_word(words, blacklist, verbose, nwords, min_length, max_length):
    """Return an acceptable captcha answer, retrying up to 1000 times.

    Every argument is forwarded unchanged to try_pick_word(). If none of
    the attempts produces a truthy result, the program exits with an error
    message.
    """
    attempts_left = 1000  # give up after this many rejected candidates
    while attempts_left > 0:
        candidate = try_pick_word(words, blacklist, verbose, nwords, min_length, max_length)
        if candidate:
            return candidate
        attempts_left -= 1
    sys.exit("Unable to find valid word combinations")
|
|
|
|
def read_wordlist(filename):
    """Read ``filename`` and return its lines as a list of words.

    Each line is stripped of surrounding whitespace and lower-cased. Blank
    lines are kept and appear as empty strings in the result.

    The file is closed even if reading fails part-way through.
    """
    with open(filename) as f:
        return [line.strip().lower() for line in f]
|
|
|
|
if __name__ == '__main__':
    # Grab random words from a dictionary file (one word per line) and
    # generate a captcha image for each one, with a keyed salted hash of
    # the correct answer in the filename.
    #
    # To check a reply, hash it in the same way with the same salt and
    # secret key, then compare with the hash value given.
    script_dir = os.path.dirname(os.path.realpath(__file__))

    parser = OptionParser()
    parser.add_option("--wordlist", help="A list of words (required)", metavar="WORDS.txt")
    parser.add_option("--random", help="Use random charcters instead of a wordlist", action="store_true")
    parser.add_option("--key", help="The passphrase set as $wgCaptchaSecret (required)", metavar="KEY")
    parser.add_option("--output", help="The directory to put the images in - $wgCaptchaDirectory (required)", metavar="DIR")
    parser.add_option("--font", help="The font to use (required)", metavar="FONT.ttf")
    parser.add_option("--font-size", help="The font size (default 40)", metavar="N", type='int', default=40)
    parser.add_option("--count", help="The maximum number of images to make (default 20)", metavar="N", type='int', default=20)
    parser.add_option("--blacklist", help="A blacklist of words that should not be used", metavar="FILE", default=os.path.join(script_dir, "blacklist"))
    parser.add_option("--fill", help="Fill the output directory to contain N files, overrides count, cannot be used with --dirs", metavar="N", type='int')
    parser.add_option("--dirs", help="Put the images into subdirectories N levels deep - $wgCaptchaDirectoryLevels", metavar="N", type='int')
    parser.add_option("--verbose", "-v", help="Show debugging information", action='store_true')
    parser.add_option("--number-words", help="Number of words from the wordlist which make a captcha challenge (default 2)", type='int', default=2)
    parser.add_option("--min-length", help="Minimum length for a captcha challenge", type='int', default=1)
    parser.add_option("--max-length", help="Maximum length for a captcha challenge", type='int', default=-1)

    opts, args = parser.parse_args()

    # Check the required options one by one, bailing out on the first
    # missing one: word source, key, output directory, font.
    if not opts.wordlist and not opts.random:
        sys.exit("Need to specify a wordlist")
    wordlist = opts.wordlist or None  # None selects random-letter mode

    if not opts.key:
        sys.exit("Need to specify a key")
    key = opts.key

    if not opts.output:
        sys.exit("Need to specify an output directory")
    output = opts.output

    if not (opts.font and os.path.exists(opts.font)):
        sys.exit("Need to specify the location of a font")
    font = opts.font

    blacklist = read_wordlist(opts.blacklist)
    count, fill, dirs = opts.count, opts.fill, opts.dirs
    verbose = opts.verbose
    fontsize = opts.font_size

    if fill:
        # Only generate as many images as are needed to reach the target
        count = max(0, fill - len(os.listdir(output)))

    words = None
    if wordlist:
        # Keep only 4- or 5-letter words that do not start with "f" and do
        # not begin or end with a doubled letter.
        words = []
        for candidate in read_wordlist(wordlist):
            if len(candidate) not in (4, 5) or candidate[0] == "f":
                continue
            if candidate[0] == candidate[1] or candidate[-1] == candidate[-2]:
                continue
            words.append(candidate)

    for _ in range(count):
        word = pick_word(words, blacklist, verbose, opts.number_words, opts.min_length, opts.max_length)
        salt = "%08x" % random.randrange(2**32)
        digest = hashlib.md5(key + salt + word + key + salt).hexdigest()
        # 64 bits of hash is plenty for this purpose
        md5hash = digest[:16]
        filename = "image_%s_%s.png" % (salt, md5hash)
        if dirs:
            filename = os.path.join(gen_subdir(output, md5hash, dirs), filename)
        if verbose:
            print(filename)
        gen_captcha(word, font, fontsize, os.path.join(output, filename))
|
|
|