mediawiki-extensions-Confir.../captcha.py
Reedy bdd127a276 captcha.py: Fix PIL 10 support again
Bug: T354099
Follows-Up: Ia17157d45995b78c6a73f844dfe7d20d09564748
Change-Id: I9d1fe7ad7f12fd79c960574daf79e558d88cb02c
2024-01-27 14:26:02 +00:00

433 lines
13 KiB
Python

#!/usr/bin/python3
#
# Script to generate distorted text images for a captcha system.
#
# Copyright (C) 2005 Neil Harris
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
# http://www.gnu.org/copyleft/gpl.html
#
# Further tweaks by Brion Vibber <brion@pobox.com>:
# 2006-01-26: Add command-line options for the various parameters
# 2007-02-19: Add --dirs param for hash subdirectory splits
# Tweaks by Greg Sabino Mullane <greg@turnstep.com>:
# 2008-01-06: Add regex check to skip words containing other than a-z
import random
import math
import hashlib
from optparse import OptionParser
import os
import sys
import re
import multiprocessing
try:
from PIL import Image
from PIL import ImageFont
from PIL import ImageDraw
from PIL import ImageEnhance
from PIL import ImageOps
except ImportError:
sys.exit(
"This script requires the Python Imaging Library - http://www.pythonware.com/products/pil/"
)
# Regex used to test the suitability of words: matches any character that is
# not a lowercase ASCII letter (try_pick_word() rejects words where it matches).
nonalpha = re.compile("[^a-z]")
# Letter combinations that are hard to read when rendered next to each other
# (e.g. "il", "rn").  Note that the final "[il]" alternative matches a lone
# "i" or "l", so any word containing either letter is rejected, whatever its
# neighbours are.
# NOTE(review): that makes the i/l cases of the first alternative redundant -
# confirm whether rejecting every "i"/"l" is intended.
confusedletters = re.compile(
    "[ijtlr][ijtl]|r[nompqr]|[il]"
)
# Pillow 9.2 added getbbox to replace getsize, and getsize() was removed in Pillow 10
# https://pillow.readthedocs.io/en/stable/releasenotes/10.0.0.html#font-size-and-offset-methods
# We don't have a requirements.txt, and therefore don't declare any specific supported or min version...
# So detect at import time which of the two font-measuring methods is available.
IMAGEFONT_HAS_GETBBOX = hasattr(ImageFont.ImageFont, "getbbox")
def wobbly_copy(src, wob, col, scale, ang):
    """Return an X-axis "wobbly" copy of *src*, sandwiched between two rotates.

    The image is rotated by *ang* plus a random offset of up to 30 degrees,
    every scan line inside the rotated content's bounding box is shifted
    sideways by a sine wave of amplitude *wob* plus random jitter, and the
    result is rotated back and sharpened.  *scale* sets the range the wave
    frequency is drawn from.  *col* is accepted but not used.

    If the rotated image has no content at all, *src* itself is returned.
    """
    width, height = src.size
    frequency = random.uniform(4 * scale, 5 * scale)
    phase = random.uniform(0, math.pi * 2)
    # vary the angle, but not too much
    angle = ang + random.uniform(-30, 30)
    # a black rectangle that receives the shifted scan lines
    canvas = Image.new("RGB", src.size, 0)
    rotated = src.rotate(angle, Image.BILINEAR)

    # Cheap bounding-box op to limit the work below to rows with content
    bounds = rotated.getbbox()
    if bounds is None:
        return src
    top, bottom = bounds[1], bounds[3]

    for row in range(top, bottom + 1):
        # Drop a scan line in, displaced by the wave plus some jitter
        shift = int(math.sin(phase + (row * frequency / height)) * wob)
        shift += int(random.uniform(-wob * 0.5, wob * 0.5))
        scan_line = rotated.crop((0, row, width, row + 1))
        canvas.paste(scan_line, (shift, row))

    # Rotate back, then sharpen to try to stop blurring from building up
    unrotated = canvas.rotate(-angle, Image.BILINEAR)
    return ImageEnhance.Sharpness(unrotated).enhance(2)
def _text_size(font, text):
    """Return the extent *font* reports for *text* as a 2-item sequence.

    Uses the (right, bottom) corner of getbbox() where Pillow provides it and
    falls back to getsize() on older Pillow versions.
    """
    if IMAGEFONT_HAS_GETBBOX:
        return font.getbbox(text)[2:]
    return font.getsize(text)


def gen_captcha(text, fontname, fontsize, file_name):
    """Generate a captcha image of *text* and save it to *file_name*.

    text      -- the characters to render
    fontname  -- font file handed to ImageFont.truetype()
    fontsize  -- font size handed to ImageFont.truetype()
    file_name -- output path; the image format is determined from it

    The characters are drawn one by one with negative kerning and vertical
    jitter, ten random arcs are drawn over them, and the result is cropped to
    its content plus a border and inverted to black on white.
    """
    # white text on a black background
    bgcolor = 0x0
    fgcolor = 0xFFFFFF
    # create a font object
    font = ImageFont.truetype(fontname, fontsize)
    # determine dimensions of the text
    dim = _text_size(font, text)
    # create a new image significantly larger than the text
    edge = max(dim[0], dim[1]) + 2 * min(dim[0], dim[1])
    im = Image.new("RGB", (edge, edge), bgcolor)
    d = ImageDraw.Draw(im)
    x, y = im.size

    # add the text to the image
    # Using between 5-6 pixels of negative kerning seemed
    # enough to confuse tesseract but still be very readable
    offset = 0
    for c in text:
        d.text(
            (x / 2 - dim[0] / 2 + offset, y / 2 - dim[1] / 2 + random.uniform(-3, 7)),
            c,
            font=font,
            fill=fgcolor,
        )
        offset += _text_size(font, c)[0]
        offset -= random.uniform(5, 6)

    for i in range(10):
        # The random draws happen in the same order as before the fix
        # (x0, y0, x1, y1, start, end).
        x0 = int(
            offset * ((i / 2) - 1) / 5
            + x / 2
            - dim[0] / 2
            + random.uniform(0, 10)
        )
        y0 = int(y / 2 - dim[1] + 30 + random.uniform(-10, 15))
        x1 = int(offset * i / 7 + x / 2 - dim[0] / 2 + random.uniform(-5, 5))
        y1 = int(y / 2 - dim[1] + 30 + random.uniform(-10, 30))
        start = int(random.uniform(-30, 30))
        end = int(random.uniform(160, 300))
        # The corners come from independent random offsets, so they can end up
        # in the wrong order.  Pillow requires x1 >= x0 and y1 >= y0 for arc()
        # and raises ValueError otherwise, so normalise the bounding box.
        d.arc(
            (min(x0, x1), min(y0, y1), max(x0, x1), max(y0, y1)),
            start,
            end,
            fill=fgcolor,
        )

    # now get the bounding box of the nonzero parts of the image
    bbox = im.getbbox()
    bord = min(dim[0], dim[1]) / 4  # a bit of a border
    im = im.crop((bbox[0] - bord, bbox[1] - bord, bbox[2] + bord, bbox[3] + bord))
    # and turn into black on white
    im = ImageOps.invert(im)
    # save the image, in format determined from filename
    im.save(file_name)
def gen_subdir(basedir, md5hash, levels):
    """Generate a subdirectory path out of the first _levels_
    characters of _md5hash_, and ensure the directories exist
    under _basedir_.

    Returns the relative subdirectory path (one directory level per
    character), or None when _levels_ is 0.
    """
    subdir = None
    for i in range(levels):
        char = md5hash[i]
        subdir = os.path.join(subdir, char) if subdir else char
        fulldir = os.path.join(basedir, subdir)
        # Several worker processes may try to create the same directory at
        # the same time, so checking for existence first is racy.  Just try
        # to create it and accept that it may already be there.
        try:
            os.mkdir(fulldir)
        except FileExistsError:
            pass
    return subdir
def try_pick_word(words, badwordlist, verbose, nwords, min_length, max_length):
    """Make one attempt at producing a captcha word.

    words       -- list of candidate words, or None to build a string of
                   random lowercase letters instead
    badwordlist -- substrings that must not occur in the result
    verbose     -- print the candidate and the reason it was rejected
    nwords      -- number of words from *words* to concatenate (at least one
                   word is always used)
    min_length  -- minimum accepted length
    max_length  -- maximum accepted length; values <= 0 mean "no limit" for
                   wordlist mode and 10 for random mode

    Returns the word, or None when the candidate was rejected (or when
    *words* is empty and no candidate can be built at all).
    """
    if words is not None:
        if not words:
            # random.randint(0, -1) would raise ValueError on an empty list;
            # report "no candidate" so the caller can give up gracefully.
            return None
        last = len(words) - 1
        word = "".join(
            words[random.randint(0, last)] for _ in range(max(1, nwords))
        )
    else:
        # Random mode always has an upper bound, which the length check
        # below then uses as well.
        max_length = max_length if max_length > 0 else 10
        length = random.randint(min_length, max_length)
        word = "".join(chr(97 + random.randint(0, 25)) for _ in range(length))

    if verbose:
        print("word is %s" % word)

    if len(word) < min_length:
        if verbose:
            print(
                "skipping word pair '%s' because it has fewer than %d characters"
                % (word, min_length)
            )
        return None

    if max_length > 0 and len(word) > max_length:
        if verbose:
            print(
                "skipping word pair '%s' because it has more than %d characters"
                % (word, max_length)
            )
        return None

    if nonalpha.search(word):
        if verbose:
            print(
                "skipping word pair '%s' because it contains non-alphabetic characters"
                % word
            )
        return None

    if confusedletters.search(word):
        if verbose:
            print(
                "skipping word pair '%s' because it contains confusing letters beside each other"
                % word
            )
        return None

    for naughty in badwordlist:
        if naughty in word:
            if verbose:
                print(
                    "skipping word pair '%s' because it contains word '%s'"
                    % (word, naughty)
                )
            return None
    return word
def pick_word(words, badwordlist, verbose, nwords, min_length, max_length):
    """Return a usable captcha word, retrying try_pick_word() as needed.

    All arguments are passed through to try_pick_word() unchanged.  The
    process is terminated via sys.exit() with an error message when no
    attempt yields a word.
    """
    attempts_left = 1000
    while attempts_left > 0:
        attempts_left -= 1
        candidate = try_pick_word(
            words, badwordlist, verbose, nwords, min_length, max_length
        )
        if candidate:
            return candidate
    # If we can't find a valid combination in 1000 tries, just give up
    sys.exit("Unable to find valid word combinations")
def read_wordlist(filename):
    """Read a word list file and return its words, stripped and lowercased.

    Returns an empty list when *filename* is not an existing file.  Blank
    lines are skipped: an empty string in the bad-word list would be a
    substring of every candidate word and so reject all of them.
    """
    if not os.path.isfile(filename):
        return []
    # The with-block closes the file even if reading fails part-way.
    with open(filename) as f:
        cleaned = (line.strip().lower() for line in f)
        return [word for word in cleaned if word]
def run_in_thread(object):
    """Worker entry point: generate one batch of captcha images.

    *object* is an indexable sequence holding, in order: the number of images
    to make, the word list (or None), the bad-word list, the parsed options,
    the font and the font size.

    Each image is named after a random salt and the first 16 hex digits of a
    keyed, salted MD5 hash of the word, and is written below opts.output
    (inside hash-based subdirectories when opts.dirs is set).
    """
    count, words, badwordlist = object[0], object[1], object[2]
    opts, font, fontsize = object[3], object[4], object[5]

    for _ in range(count):
        word = pick_word(
            words,
            badwordlist,
            opts.verbose,
            opts.number_words,
            opts.min_length,
            opts.max_length,
        )
        salt = "%08x" % random.randrange(2**32)
        keyed = (opts.key + salt + word + opts.key + salt).encode("utf-8")
        # 64 bits of hash is plenty for this purpose
        md5hash = hashlib.md5(keyed).hexdigest()[:16]
        filename = "image_%s_%s.png" % (salt, md5hash)
        if opts.dirs:
            filename = os.path.join(
                gen_subdir(opts.output, md5hash, opts.dirs), filename
            )
        if opts.verbose:
            print(filename)
        target = os.path.join(opts.output, filename)
        gen_captcha(word, font, fontsize, target)
if __name__ == "__main__":
    # This grabs random words from the dictionary 'words' (one
    # word per line) and generates a captcha image for each one,
    # with a keyed salted hash of the correct answer in the filename.
    # To check a reply, hash it in the same way with the same salt and
    # secret key, then compare with the hash value given.
    script_dir = os.path.dirname(os.path.realpath(__file__))
    parser = OptionParser()
    parser.add_option(
        "--wordlist", help="A list of words (required)", metavar="WORDS.txt"
    )
    parser.add_option(
        "--random",
        help="Use random characters instead of a wordlist",
        action="store_true",
    )
    parser.add_option(
        "--key", help="The passphrase set as $wgCaptchaSecret (required)", metavar="KEY"
    )
    parser.add_option(
        "--output",
        help="The directory to put the images in - $wgCaptchaDirectory (required)",
        metavar="DIR",
    )
    parser.add_option("--font", help="The font to use (required)", metavar="FONT.ttf")
    parser.add_option(
        "--font-size",
        help="The font size (default 40)",
        metavar="N",
        type="int",
        default=40,
    )
    parser.add_option(
        "--count",
        help="The maximum number of images to make (default 20)",
        metavar="N",
        type="int",
        default=20,
    )
    parser.add_option(
        "--badwordlist",
        help="A list of words that should not be used",
        metavar="FILE",
        default=os.path.join(script_dir, "badwordlist"),
    )
    parser.add_option(
        "--blacklist",
        help="DEPRECATED: list of words that should not be used",
        metavar="FILE",
        default=os.path.join(script_dir, "blacklist"),
    )
    parser.add_option(
        "--fill",
        help="Fill the output directory to contain N files, overrides count, cannot be used with --dirs",
        metavar="N",
        type="int",
    )
    parser.add_option(
        "--dirs",
        help="Put the images into subdirectories N levels deep - $wgCaptchaDirectoryLevels",
        metavar="N",
        type="int",
    )
    parser.add_option(
        "--verbose", "-v", help="Show debugging information", action="store_true"
    )
    parser.add_option(
        "--number-words",
        help="Number of words from the wordlist which make a captcha challenge (default 2)",
        type="int",
        default=2,
    )
    parser.add_option(
        "--min-length",
        help="Minimum length for a captcha challenge",
        type="int",
        default=1,
    )
    parser.add_option(
        "--max-length",
        help="Maximum length for a captcha challenge",
        type="int",
        default=-1,
    )
    parser.add_option(
        "--threads",
        help="Maximum number of threads to be used to generate captchas.",
        type="int",
        default=1,
    )

    opts, args = parser.parse_args()

    # Validate the required options, in the same order as the help text
    if opts.wordlist:
        wordlist = opts.wordlist
    elif opts.random:
        wordlist = None
    else:
        sys.exit("Need to specify a wordlist")
    if opts.key:
        key = opts.key
    else:
        sys.exit("Need to specify a key")
    if opts.output:
        output = opts.output
    else:
        sys.exit("Need to specify an output directory")
    if opts.font and os.path.exists(opts.font):
        font = opts.font
    else:
        sys.exit("Need to specify the location of a font")

    # The deprecated --blacklist file is still honoured alongside --badwordlist
    badwordlist = read_wordlist(opts.blacklist) + read_wordlist(opts.badwordlist)
    count = opts.count
    fill = opts.fill
    fontsize = opts.font_size
    threads = opts.threads

    if fill:
        # Only make as many images as are missing from the output directory
        count = max(0, fill - len(os.listdir(output)))

    words = None
    if wordlist:
        words = read_wordlist(wordlist)
        # Keep only 4- and 5-letter words that do not start with "f" and do
        # not begin or end with a doubled letter
        words = [
            x
            for x in words
            if len(x) in (4, 5) and x[0] != "f" and x[0] != x[1] and x[-1] != x[-2]
        ]

    if count <= 0:
        sys.exit("No need to generate CAPTCHA images.")

    if words is not None and not words:
        # Without this the workers would fail while picking from an empty list
        sys.exit("No usable words found in the wordlist")

    # Never start more workers than there are images to make, and always
    # start at least one (this also guards against --threads 0).
    threads = max(1, min(threads, count))
    # Spread the images over the workers so that exactly `count` are made:
    # the first `extra` workers each generate one image more than the rest.
    chunks, extra = divmod(count, threads)
    largest_chunk = chunks + (1 if extra else 0)

    print(
        "Generating %s CAPTCHA images separated in %s image(s) per chunk run by %s threads..."
        % (count, largest_chunk, threads)
    )
    data = [
        [chunks + (1 if i < extra else 0), words, badwordlist, opts, font, fontsize]
        for i in range(threads)
    ]

    # The with-block shuts the worker pool down once map() has returned
    with multiprocessing.Pool(threads) as p:
        p.map(run_in_thread, data)