2006-01-27 10:22:37 +00:00
|
|
|
#!/usr/bin/python
|
|
|
|
#
|
|
|
|
# Script to generate distorted text images for a captcha system.
|
|
|
|
#
|
|
|
|
# Copyright (C) 2005 Neil Harris
|
|
|
|
#
|
|
|
|
# This program is free software; you can redistribute it and/or modify
|
|
|
|
# it under the terms of the GNU General Public License as published by
|
|
|
|
# the Free Software Foundation; either version 2 of the License, or
|
|
|
|
# (at your option) any later version.
|
|
|
|
#
|
|
|
|
# This program is distributed in the hope that it will be useful,
|
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
# GNU General Public License for more details.
|
|
|
|
#
|
|
|
|
# You should have received a copy of the GNU General Public License along
|
|
|
|
# with this program; if not, write to the Free Software Foundation, Inc.,
|
2010-06-21 13:45:17 +00:00
|
|
|
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
2006-01-27 10:22:37 +00:00
|
|
|
# http://www.gnu.org/copyleft/gpl.html
|
|
|
|
#
|
|
|
|
# Further tweaks by Brion Vibber <brion@pobox.com>:
|
|
|
|
# 2006-01-26: Add command-line options for the various parameters
|
2007-02-19 20:09:03 +00:00
|
|
|
# 2007-02-19: Add --dirs param for hash subdirectory splits
|
2008-01-07 03:28:38 +00:00
|
|
|
# Tweaks by Greg Sabino Mullane <greg@turnstep.com>:
|
|
|
|
# 2008-01-06: Add regex check to skip words containing other than a-z
|
2006-01-27 10:22:37 +00:00
|
|
|
|
|
|
|
import random
|
2009-09-08 01:11:52 +00:00
|
|
|
import math
|
|
|
|
import hashlib
|
|
|
|
from optparse import OptionParser
|
2006-01-27 10:22:37 +00:00
|
|
|
import os
|
|
|
|
import sys
|
2008-01-07 03:28:38 +00:00
|
|
|
import re
|
2017-02-10 18:04:12 +00:00
|
|
|
import multiprocessing
|
|
|
|
import time
|
2006-01-27 10:22:37 +00:00
|
|
|
|
2009-09-08 01:11:52 +00:00
|
|
|
# The Python Imaging Library (PIL / Pillow) is the only third-party
# dependency. Catch ImportError specifically so that unrelated failures
# (and Ctrl-C) are not disguised as a missing-library message.
try:
    from PIL import Image
    from PIL import ImageFont
    from PIL import ImageDraw
    from PIL import ImageEnhance
    from PIL import ImageOps
    from PIL import ImageMath
except ImportError:
    sys.exit("This script requires the Python Imaging Library - http://www.pythonware.com/products/pil/")

# Matches any character outside lowercase a-z; used to reject unsuitable words.
nonalpha = re.compile('[^a-z]') # regex to test for suitability of words
|
|
|
|
|
2006-01-27 10:22:37 +00:00
|
|
|
# Does X-axis wobbly copy, sandwiched between two rotates
|
|
|
|
def wobbly_copy(src, wob, col, scale, ang):
    """Return a horizontally "wobbled" copy of the image *src*.

    The image is rotated by roughly *ang*, each scan line is shifted
    sideways by a sine wave plus a little random jitter, and the result
    is rotated back and sharpened.

    src   -- source PIL image
    wob   -- amplitude of the sine displacement; the jitter is +/- wob/2
    col   -- accepted for call compatibility; not used by this function
    scale -- multiplier for the randomly chosen sine frequency
    ang   -- base rotation angle; a random offset in [-10, 10] is added

    If the rotated image has no bounding box (getbbox() returns None),
    *src* itself is returned unchanged.
    """
    width, height = src.size
    frequency = random.uniform(4 * scale, 5 * scale)
    phase = random.uniform(0, math.pi * 2)
    angle = ang + random.uniform(-10, 10)  # vary, but not too much

    rotated = src.rotate(angle, Image.BILINEAR)

    # Cheap bounding-box test so only rows with content are processed.
    bounds = rotated.getbbox()
    if bounds is None:
        return src
    top, bottom = bounds[1], bounds[3]

    canvas = Image.new('RGB', src.size, 0)  # a black rectangle
    for row in range(top, bottom + 1):
        # Drop one scan line in, displaced by sine wave plus jitter.
        shift = int(math.sin(phase + (row * frequency / height)) * wob)
        shift += int(random.uniform(-wob * 0.5, wob * 0.5))
        strip = rotated.crop((0, row, width, row + 1))
        canvas.paste(strip, (shift, row))

    # Undo the rotation, then sharpen to stop blurring from building up.
    restored = canvas.rotate(-angle, Image.BILINEAR)
    return ImageEnhance.Sharpness(restored).enhance(2)
|
|
|
|
|
|
|
|
|
|
|
|
def gen_captcha(text, fontname, fontsize, file_name):
    """Generate a captcha image for *text* and save it to *file_name*.

    text      -- the string to render
    fontname  -- path of the TrueType font passed to ImageFont.truetype
    fontsize  -- font size passed to ImageFont.truetype
    file_name -- output path; the image format is determined from it

    All pixel arithmetic uses floor division so that sizes and
    coordinates stay integers on Python 3 as well as Python 2 (plain
    "/" produces floats on Python 3, which Image.new() and range()
    reject).
    """
    # white text on a black background
    bgcolor = 0x0
    fgcolor = 0xffffff
    # create a font object
    font = ImageFont.truetype(fontname, fontsize)
    # determine dimensions of the text
    # NOTE(review): font.getsize() is reported as removed in recent Pillow
    # releases - confirm against the Pillow version in use.
    dim = font.getsize(text)
    # create a new image significantly larger than the text
    edge = max(dim[0], dim[1]) + 2*min(dim[0], dim[1])
    im = Image.new('RGB', (edge, edge), bgcolor)
    d = ImageDraw.Draw(im)
    x, y = im.size
    # add the text to the image, centred
    d.text((x//2 - dim[0]//2, y//2 - dim[1]//2), text, font=font, fill=fgcolor)
    k = 2
    wob = 0.09*dim[1]
    rot = 45
    # Apply lots of small stirring operations, rather than a few large ones
    # in order to get some uniformity of treatment, whilst
    # maintaining randomness
    for i in range(k):
        im = wobbly_copy(im, wob, bgcolor, i*2+3, rot+0)
        im = wobbly_copy(im, wob, bgcolor, i*2+1, rot+45)
        im = wobbly_copy(im, wob, bgcolor, i*2+2, rot+90)
        rot += 30

    # now get the bounding box of the nonzero parts of the image
    bbox = im.getbbox()
    bord = min(dim[0], dim[1]) // 4 # a bit of a border
    im = im.crop((bbox[0]-bord, bbox[1]-bord, bbox[2]+bord, bbox[3]+bord))

    # Create noise: a low-resolution greyscale image with random speckle
    # plus a left-to-right brightness gradient.
    nblock = 4
    nsize = (im.size[0] // nblock, im.size[1] // nblock)
    noise = Image.new('L', nsize, bgcolor)
    data = noise.load()
    for x in range(nsize[0]):
        # The gradient depends only on the column, so compute it once.
        gradient = 70 * x // nsize[0]
        for y in range(nsize[1]):
            r = random.randint(0, 65)
            data[x, y] = r + gradient
    # Turn speckles into blobs
    noise = noise.resize(im.size, Image.BILINEAR)
    # Add to the image (text dimmed to a third, then noise added on top)
    im = ImageMath.eval('convert(convert(a, "L") / 3 + b, "RGB")', a=im, b=noise)
    # and turn into black on white
    im = ImageOps.invert(im)
    # save the image, in format determined from filename
    im.save(file_name)
|
|
|
|
|
2009-09-08 01:11:52 +00:00
|
|
|
def gen_subdir(basedir, md5hash, levels):
    """Generate a subdirectory path out of the first _levels_
    characters of _md5hash_, and ensure the directories exist
    under _basedir_.

    Returns the relative subdirectory path (one path component per
    character), or None when _levels_ is 0.

    Directory creation is attempted directly rather than guarded by an
    existence check: several worker processes may create the same
    directory at the same time, and a check-then-create sequence would
    make the loser of that race crash.
    """
    subdir = None
    for i in range(0, levels):
        char = md5hash[i]
        subdir = os.path.join(subdir, char) if subdir else char
        fulldir = os.path.join(basedir, subdir)
        try:
            os.mkdir(fulldir)
        except OSError:
            # Fine if the directory is already there (created earlier or
            # by a concurrent worker); anything else is a real error.
            if not os.path.isdir(fulldir):
                raise
    return subdir
|
2007-06-29 19:57:01 +00:00
|
|
|
|
2012-07-27 23:54:17 +00:00
|
|
|
def try_pick_word(words, blacklist, verbose, nwords, min_length, max_length):
    """Make one attempt at producing a captcha challenge string.

    words      -- list of candidate words, or None to build a string of
                  random lowercase letters instead
    blacklist  -- iterable of substrings that must not occur in the result
    verbose    -- when true, print the candidate and any rejection reason
    nwords     -- number of words concatenated (at least one is always used)
    min_length -- minimum acceptable length of the result
    max_length -- maximum acceptable length; values <= 0 mean "no limit"
                  for word lists and default to 10 in random-letter mode

    Returns the challenge string, or None if this attempt was rejected
    (wrong length, non a-z characters, blacklisted substring, or an empty
    word list).
    """
    if words is not None:
        if not words:
            # Nothing to choose from; report a failed attempt instead of
            # crashing inside the random number generator.
            if verbose:
                print("skipping because the word list is empty")
            return None
        word = "".join(random.choice(words) for _ in range(max(1, nwords)))
    else:
        max_length = max_length if max_length > 0 else 10
        length = random.randint(min_length, max_length)
        word = "".join(chr(97 + random.randint(0, 25)) for _ in range(length))

    if verbose:
        print("word is %s" % word)

    if len(word) < min_length:
        if verbose:
            print("skipping word pair '%s' because it has fewer than %d characters" % (word, min_length))
        return None

    if max_length > 0 and len(word) > max_length:
        if verbose:
            print("skipping word pair '%s' because it has more than %d characters" % (word, max_length))
        return None

    if nonalpha.search(word):
        if verbose:
            print("skipping word pair '%s' because it contains non-alphabetic characters" % word)
        return None

    for naughty in blacklist:
        # An empty entry (e.g. from a blank line in the blacklist file) is
        # a substring of every word and would reject everything; ignore it.
        if naughty and naughty in word:
            if verbose:
                print("skipping word pair '%s' because it contains blacklisted word '%s'" % (word, naughty))
            return None
    return word
|
|
|
|
|
2012-07-27 23:54:17 +00:00
|
|
|
def pick_word(words, blacklist, verbose, nwords, min_length, max_length):
    """Return the first acceptable challenge produced by try_pick_word().

    Up to 1000 attempts are made with the given arguments; if none of
    them yields a non-empty result the script exits with an error message.
    """
    attempts_left = 1000  # if nothing valid turns up by then, just give up
    while attempts_left > 0:
        candidate = try_pick_word(words, blacklist, verbose, nwords, min_length, max_length)
        if candidate:
            return candidate
        attempts_left -= 1
    sys.exit("Unable to find valid word combinations")
|
2007-06-29 19:57:01 +00:00
|
|
|
|
|
|
|
def read_wordlist(filename):
    """Read a word list file and return its entries as a list.

    Each line is stripped of surrounding whitespace and lowercased.
    Blank lines are skipped: an empty entry is useless as a word and,
    in a blacklist, would match every candidate as a substring.

    The file is opened in a context manager so it is closed even if
    reading fails.
    """
    with open(filename) as f:
        stripped = (line.strip().lower() for line in f)
        return [word for word in stripped if word]
|
2007-06-29 19:57:01 +00:00
|
|
|
|
2017-02-10 18:04:12 +00:00
|
|
|
def run_in_thread(object):
    """Worker entry point: generate a batch of captcha images.

    *object* is a sequence of
    (count, words, blacklist, opts, font, fontsize), where *opts* is the
    parsed command-line options object. The parameter name shadows the
    builtin but is kept so existing callers are unaffected.

    The key, output directory, directory depth and verbosity are read
    from *opts* rather than from module globals: those globals are only
    assigned in the main process's ``__main__`` block and are not
    available in worker processes that do not inherit the parent's
    memory.
    """
    count, words, blacklist, opts, font, fontsize = object[:6]

    key = opts.key
    output = opts.output
    dirs = opts.dirs
    verbose = opts.verbose

    # Workers created by fork can start with a copy of the parent's RNG
    # state on interpreters that do not reseed after fork; reseed so that
    # workers do not all produce the same words and salts.
    random.seed()

    # int() tolerates a float count from callers that compute it with "/".
    for _ in range(int(count)):
        word = pick_word(words, blacklist, verbose, opts.number_words, opts.min_length, opts.max_length)
        salt = "%08x" % random.randrange(2**32)
        # 64 bits of hash is plenty for this purpose
        md5hash = hashlib.md5((key+salt+word+key+salt).encode('utf-8')).hexdigest()[:16]
        filename = "image_%s_%s.png" % (salt, md5hash)
        if dirs:
            subdir = gen_subdir(output, md5hash, dirs)
            filename = os.path.join(subdir, filename)
        if verbose:
            print(filename)
        gen_captcha(word, font, fontsize, os.path.join(output, filename))
|
|
|
|
|
2006-01-27 10:22:37 +00:00
|
|
|
if __name__ == '__main__':
    """This grabs random words from the dictionary 'words' (one
    word per line) and generates a captcha image for each one,
    with a keyed salted hash of the correct answer in the filename.

    To check a reply, hash it in the same way with the same salt and
    secret key, then compare with the hash value given.
    """
    script_dir = os.path.dirname(os.path.realpath(__file__))
    parser = OptionParser()
    parser.add_option("--wordlist", help="A list of words (required)", metavar="WORDS.txt")
    parser.add_option("--random", help="Use random charcters instead of a wordlist", action="store_true")
    parser.add_option("--key", help="The passphrase set as $wgCaptchaSecret (required)", metavar="KEY")
    parser.add_option("--output", help="The directory to put the images in - $wgCaptchaDirectory (required)", metavar="DIR")
    parser.add_option("--font", help="The font to use (required)", metavar="FONT.ttf")
    parser.add_option("--font-size", help="The font size (default 40)", metavar="N", type='int', default=40)
    parser.add_option("--count", help="The maximum number of images to make (default 20)", metavar="N", type='int', default=20)
    parser.add_option("--blacklist", help="A blacklist of words that should not be used", metavar="FILE", default=os.path.join(script_dir, "blacklist"))
    parser.add_option("--fill", help="Fill the output directory to contain N files, overrides count, cannot be used with --dirs", metavar="N", type='int')
    parser.add_option("--dirs", help="Put the images into subdirectories N levels deep - $wgCaptchaDirectoryLevels", metavar="N", type='int')
    parser.add_option("--verbose", "-v", help="Show debugging information", action='store_true')
    parser.add_option("--number-words", help="Number of words from the wordlist which make a captcha challenge (default 2)", type='int', default=2)
    parser.add_option("--min-length", help="Minimum length for a captcha challenge", type='int', default=1)
    parser.add_option("--max-length", help="Maximum length for a captcha challenge", type='int', default=-1)
    parser.add_option("--threads", help="Maximum number of threads to be used to generate captchas.", type='int', default=1)

    opts, args = parser.parse_args()

    # Validate the required options.
    if opts.wordlist:
        wordlist = opts.wordlist
    elif opts.random:
        wordlist = None
    else:
        sys.exit("Need to specify a wordlist")
    if opts.key:
        key = opts.key
    else:
        sys.exit("Need to specify a key")
    if opts.output:
        output = opts.output
    else:
        sys.exit("Need to specify an output directory")
    if opts.font and os.path.exists(opts.font):
        font = opts.font
    else:
        sys.exit("Need to specify the location of a font")

    blacklist = read_wordlist(opts.blacklist)
    count = opts.count
    fill = opts.fill
    dirs = opts.dirs
    verbose = opts.verbose
    fontsize = opts.font_size
    threads = opts.threads

    if fill:
        # Only generate as many images as are missing from the directory.
        count = max(0, fill - len(os.listdir(output)))

    words = None
    if wordlist:
        words = read_wordlist(wordlist)
        # Keep 4-5 letter words that do not start with "f" and do not
        # begin or end with a doubled letter.
        words = [x for x in words
                 if len(x) in (4, 5) and x[0] != "f"
                 and x[0] != x[1] and x[-1] != x[-2]]

    if count <= 0:
        sys.exit("No need to generate CAPTCHA images.")

    # Never start more workers than there are images, and always at least one.
    threads = max(1, min(threads, count))

    # Integer split of the work; the first `remainder` workers take one
    # extra image so that exactly `count` images are produced.
    chunks, remainder = divmod(count, threads)
    data = [[chunks + (1 if i < remainder else 0), words, blacklist, opts, font, fontsize]
            for i in range(threads)]

    per_chunk = chunks if not remainder else "%d-%d" % (chunks, chunks + 1)
    print("Generating %s CAPTCHA images separated in %s image(s) per chunk run by %s threads..." % (count, per_chunk, threads))

    p = multiprocessing.Pool(threads)
    try:
        p.map(run_in_thread, data)
    finally:
        # Release the worker processes whether or not generation succeeded.
        p.close()
        p.join()
|