mediawiki-extensions-Confir.../captcha.py

#!/usr/bin/python3
#
# Script to generate distorted text images for a captcha system.
#
# Copyright (C) 2005 Neil Harris
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
# http://www.gnu.org/copyleft/gpl.html
#
# Further tweaks by Brion Vibber <brion@pobox.com>:
# 2006-01-26: Add command-line options for the various parameters
# 2007-02-19: Add --dirs param for hash subdirectory splits
# Tweaks by Greg Sabino Mullane <greg@turnstep.com>:
# 2008-01-06: Add regex check to skip words containing other than a-z

from optparse import OptionParser
import hashlib
import json
import math
import multiprocessing
import os
import random
import re
import sys

try:
    from PIL import Image, ImageDraw, ImageEnhance, ImageFont, ImageOps
except ImportError:
    sys.exit(
        "This script requires the Python Imaging Library - http://www.pythonware.com/products/pil/"
    )

# Matches any character outside lowercase a-z; candidate words that match
# are rejected as unsuitable.
nonalpha = re.compile("[^a-z]")

# Letter combinations that are hard to read once rendered: one of "ijtlr"
# followed by one of "ijtl", or "r" followed by one of "nompqr".
# Note that the final "[il]" alternative additionally matches ANY word that
# contains an "i" or an "l" at all, so such words are always rejected.
confusedletters = re.compile(
    "[ijtlr][ijtl]|r[nompqr]|[il]"
)

# Pillow 9.2 added getbbox() to replace getsize(), and getsize() was removed in Pillow 10:
# https://pillow.readthedocs.io/en/stable/releasenotes/10.0.0.html#font-size-and-offset-methods
# There is no requirements.txt declaring a supported or minimum Pillow version,
# so detect at import time which of the two measuring APIs is available.
IMAGEFONT_HAS_GETBBOX = hasattr(ImageFont.ImageFont, "getbbox")


def wobbly_copy(src, wob, col, scale, ang):
    """Return an X-axis "wobbled" copy of ``src``, sandwiched between two rotates.

    The image is rotated by ``ang`` plus a random jitter of up to 30 degrees
    either way, each scan line that has content is pasted onto a black
    canvas with a sinusoidal plus random horizontal shift, and the canvas is
    rotated back by the same angle and sharpened.

    ``wob`` is the amplitude of the horizontal shift and ``scale`` controls
    the range the sine frequency is drawn from.  ``col`` is accepted but not
    used by this implementation.  If the rotated image is entirely empty,
    ``src`` itself is returned unchanged.
    """
    width, height = src.size
    frequency = random.uniform(4 * scale, 5 * scale)
    phase = random.uniform(0, math.pi * 2)
    # Vary the rotation, but not too much
    angle = ang + random.uniform(-30, 30)
    # Start from a black rectangle of the same size as the source
    canvas = Image.new("RGB", src.size, 0)
    rotated = src.rotate(angle, Image.BILINEAR)

    # Cheap bounding-box query so that only rows with content are processed
    bounds = rotated.getbbox()
    if bounds is None:
        return src
    _, top, _, bottom = bounds

    for row in range(top, bottom + 1):
        # Smooth sine displacement first, then a random jitter on top of it
        shift = int(math.sin(phase + (row * frequency / height)) * wob) + int(
            random.uniform(-wob * 0.5, wob * 0.5)
        )
        scan_line = rotated.crop((0, row, width, row + 1))
        canvas.paste(scan_line, (shift, row))

    # Undo the rotation, then sharpen to stop blurring from building up
    restored = canvas.rotate(-angle, Image.BILINEAR)
    return ImageEnhance.Sharpness(restored).enhance(2)


def gen_captcha(text, fontname, fontsize, file_name):
    """Generate a captcha image for ``text`` and save it to ``file_name``.

    The text is drawn character by character in white on a black square
    canvas with random vertical jitter and negative kerning, ten arcs are
    drawn over it, the result is cropped to its content plus a border,
    inverted to black-on-white, and written to disk.

    ``fontname`` and ``fontsize`` are passed straight to
    ``ImageFont.truetype``.  Nothing is returned.

    The output depends on the exact sequence of ``random.uniform`` calls
    below, so statement order matters for reproducibility under a seed.
    """
    # white text on a black background
    bgcolor = 0x0
    fgcolor = 0xFFFFFF
    # create a font object
    font = ImageFont.truetype(fontname, fontsize)

    # determine dimensions of the text
    # (with getbbox, [2:] keeps the last two values of the bounding box,
    # which are used below as the text's width and height)
    if IMAGEFONT_HAS_GETBBOX:
        dim = font.getbbox(text)[2:]
    else:
        dim = font.getsize(text)

    # create a new square image significantly larger than the text
    edge = max(dim[0], dim[1]) + 2 * min(dim[0], dim[1])
    im = Image.new("RGB", (edge, edge), bgcolor)
    d = ImageDraw.Draw(im)
    x, y = im.size
    # add the text to the image, one character at a time, starting from the
    # position that would centre the whole text on the canvas
    # Using between 5-6 pixels of negative kerning seemed
    # enough to confuse tesseract but still be very readable
    offset = 0
    for c in text:
        d.text(
            (x / 2 - dim[0] / 2 + offset, y / 2 - dim[1] / 2 + random.uniform(-3, 7)),
            c,
            font=font,
            fill=fgcolor,
        )
        # advance by the width of the character just drawn...
        if IMAGEFONT_HAS_GETBBOX:
            offset += font.getbbox(c)[2:][0]
        else:
            offset += font.getsize(c)[0]

        # ...then pull back by the negative kerning
        offset -= random.uniform(5, 6)

    # draw ten arcs in the foreground colour; their bounding boxes are
    # derived from the final text offset plus random jitter
    # (presumably as extra clutter against OCR - the code does not say)
    for i in range(10):
        x0 = int(
            offset * ((i / 2) - 1) / 5
            + x / 2
            - dim[0] / 2
            + random.uniform(0, 10)
        )
        y0 = int(y / 2 - dim[1] + 30 + random.uniform(-10, 15))

        x1 = int(offset * i / 7 + x / 2 - dim[0] / 2 + random.uniform(-5, 5))
        y1 = int(y / 2 - dim[1] + 30 + random.uniform(-10, 30))

        # normalise the box so that (x0, y0) is the top-left corner
        if x1 < x0:
            x0, x1 = x1, x0

        if y1 < y0:
            y0, y1 = y1, y0

        # random start and end angles for the arc
        d.arc(
            (x0, y0, x1, y1),
            int(random.uniform(-30, 30)),
            int(random.uniform(160, 300)),
            fill=fgcolor,
        )

    # now get the bounding box of the nonzero parts of the image
    bbox = im.getbbox()
    bord = min(dim[0], dim[1]) / 4  # a bit of a border
    im = im.crop((bbox[0] - bord, bbox[1] - bord, bbox[2] + bord, bbox[3] + bord))

    # and turn into black on white
    im = ImageOps.invert(im)

    # save the image, in format determined from filename
    im.save(file_name)


def gen_subdir(basedir, md5hash, levels):
    """Build a nested subdirectory path from a hash prefix and create it.

    One directory level is produced for each of the first ``levels``
    characters of ``md5hash`` (e.g. ``"abcdef"`` with ``levels=3`` gives
    ``a/b/c``), and every level is created under ``basedir`` if it is not
    already there.

    Returns the relative subdirectory path (without ``basedir``), or
    ``None`` when ``levels`` is 0.  Raises ``IndexError`` if ``md5hash`` has
    fewer than ``levels`` characters, and ``OSError`` if a directory cannot
    be created (for instance because ``basedir`` does not exist).
    """
    subdir = None
    for i in range(levels):
        char = md5hash[i]
        subdir = os.path.join(subdir, char) if subdir else char
        # Several worker processes may try to create the same directory at
        # the same time, so a check-then-create sequence is racy.  Attempt
        # the creation and treat "already exists" as success instead.
        try:
            os.mkdir(os.path.join(basedir, subdir))
        except FileExistsError:
            pass
    return subdir


def try_pick_word(words, badwordlist, verbose, nwords, min_length, max_length):
    """Make a single attempt at producing a captcha challenge word.

    When ``words`` is a list, ``nwords`` random entries (at least one) are
    concatenated.  When ``words`` is ``None``, a random string of lowercase
    letters is generated instead, with a length between ``min_length`` and
    ``max_length``; if ``max_length`` is not positive, 10 is used as the
    upper bound (or ``min_length`` if that is larger).

    The candidate is rejected, and ``None`` returned, if it is shorter than
    ``min_length``, longer than a positive ``max_length``, contains anything
    other than a-z, matches the confusing-letters pattern, or contains any
    entry of ``badwordlist`` as a substring.  ``None`` is also returned when
    the word list is empty or the length bounds cannot be satisfied, so the
    caller's retry loop can give up cleanly instead of crashing.

    With ``verbose`` set, the candidate and any rejection reason are printed.
    """
    if words is not None:
        if not words:
            # Nothing to choose from; random.choice would raise here.
            if verbose:
                print("skipping because the word list is empty")
            return None
        word = "".join(random.choice(words) for _ in range(max(nwords, 1)))
    else:
        if max_length <= 0:
            # No explicit maximum: default to 10, but never below the
            # requested minimum, which would make the range empty.
            max_length = max(10, min_length)
        elif min_length > max_length:
            if verbose:
                print(
                    "skipping because minimum length %d exceeds maximum length %d"
                    % (min_length, max_length)
                )
            return None
        length = random.randint(min_length, max_length)
        word = "".join(chr(97 + random.randint(0, 25)) for _ in range(length))

    if verbose:
        print("word is %s" % word)

    if len(word) < min_length:
        if verbose:
            print(
                "skipping word pair '%s' because it has fewer than %d characters"
                % (word, min_length)
            )
        return None

    if max_length > 0 and len(word) > max_length:
        if verbose:
            print(
                "skipping word pair '%s' because it has more than %d characters"
                % (word, max_length)
            )
        return None

    if nonalpha.search(word):
        if verbose:
            print(
                "skipping word pair '%s' because it contains non-alphabetic characters"
                % word
            )
        return None
    if confusedletters.search(word):
        if verbose:
            print(
                "skipping word pair '%s' because it contains confusing letters beside each other"
                % word
            )
        return None

    for naughty in badwordlist:
        # An empty entry (e.g. from a blank line in the bad word file) is a
        # substring of everything and would reject every candidate.
        if naughty and naughty in word:
            if verbose:
                print(
                    "skipping word pair '%s' because it contains word '%s'"
                    % (word, naughty)
                )
            return None
    return word


def pick_word(words, badwordlist, verbose, nwords, min_length, max_length):
    """Return a usable captcha word, retrying up to 1000 times.

    Each attempt is delegated to ``try_pick_word`` with the same arguments.
    If no attempt yields a word, the process exits with an error message.
    """
    attempts_left = 1000
    while attempts_left > 0:
        candidate = try_pick_word(
            words, badwordlist, verbose, nwords, min_length, max_length
        )
        if candidate:
            return candidate
        attempts_left -= 1
    # No valid combination found within the attempt budget: give up
    sys.exit("Unable to find valid word combinations")


def read_wordlist(filename):
    """Read a one-word-per-line file into a list of lowercase words.

    Each line is stripped of surrounding whitespace and lowercased.  Blank
    lines are skipped: an empty string in the bad word list would be a
    substring of every candidate and cause all of them to be rejected.

    Returns an empty list if ``filename`` is not an existing regular file.
    The file is opened with the platform's default text encoding.
    """
    if not os.path.isfile(filename):
        return []
    # The context manager closes the file even if reading fails part-way.
    with open(filename) as f:
        stripped = (line.strip().lower() for line in f)
        return [word for word in stripped if word]


def run_in_thread(object):
    """Worker entry point: generate one chunk of captcha images.

    ``object`` is a sequence holding, in order: the number of images to
    make, the word list (or ``None`` for random characters), the bad word
    list, the parsed command-line options, the font path, the font size and
    the shared filename-to-word mapping.

    For every image a word is picked, a random 32-bit salt is drawn, and the
    first 16 hex digits of an MD5 over key + salt + word + key + salt are
    embedded in the file name together with the salt.
    """
    count, words, badwordlist, opts, font, fontsize, jsonmap, *_ = object

    for _ in range(count):
        word = pick_word(
            words,
            badwordlist,
            opts.verbose,
            opts.number_words,
            opts.min_length,
            opts.max_length,
        )
        salt = "%08x" % random.randrange(2**32)
        keyed_text = opts.key + salt + word + opts.key + salt
        # 64 bits of hash is plenty for this purpose
        md5hash = hashlib.md5(keyed_text.encode("utf-8")).hexdigest()[:16]

        filename = "image_%s_%s.png" % (salt, md5hash)
        if opts.dirs:
            # Spread the images over hash-prefix subdirectories
            filename = os.path.join(
                gen_subdir(opts.output, md5hash, opts.dirs), filename
            )

        if opts.verbose:
            print(filename)
        if opts.jsonmap:
            jsonmap[filename] = word

        target = os.path.join(opts.output, filename)
        gen_captcha(word, font, fontsize, target)


if __name__ == "__main__":
    """This grabs random words from the dictionary 'words' (one
    word per line) and generates a captcha image for each one,
    with a keyed salted hash of the correct answer in the filename.

    To check a reply, hash it in the same way with the same salt and
    secret key, then compare with the hash value given.
    """
    script_dir = os.path.dirname(os.path.realpath(__file__))
    parser = OptionParser()
    parser.add_option(
        "--wordlist",
        help="A list of words (required)",
        metavar="WORDS.txt"
    )
    parser.add_option(
        "--random",
        help="Use random characters instead of a wordlist",
        action="store_true",
    )
    parser.add_option(
        "--key",
        help="The passphrase set as $wgCaptchaSecret (required)",
        metavar="KEY"
    )
    parser.add_option(
        "--output",
        help="The directory to put the images in - $wgCaptchaDirectory (required)",
        metavar="DIR",
    )
    parser.add_option(
        "--font",
        help="The font to use (required)",
        metavar="FONT.ttf"
    )
    parser.add_option(
        "--font-size",
        help="The font size (default 40)",
        metavar="N",
        type="int",
        default=40,
    )
    parser.add_option(
        "--count",
        help="The maximum number of images to make (default 20)",
        metavar="N",
        type="int",
        default=20,
    )
    parser.add_option(
        "--badwordlist",
        help="A list of words that should not be used",
        metavar="FILE",
        default=os.path.join(script_dir, "badwordlist"),
    )
    parser.add_option(
        "--fill",
        help="Fill the output directory to contain N files, overrides count, cannot be used with --dirs",
        metavar="N",
        type="int",
    )
    parser.add_option(
        "--dirs",
        help="Put the images into subdirectories N levels deep - $wgCaptchaDirectoryLevels",
        metavar="N",
        type="int",
    )
    parser.add_option(
        "--verbose",
        "-v",
        help="Show debugging information",
        action="store_true"
    )
    parser.add_option(
        "--number-words",
        help="Number of words from the wordlist which make a captcha challenge (default 2)",
        type="int",
        default=2,
    )
    parser.add_option(
        "--min-length",
        help="Minimum length for a captcha challenge",
        type="int",
        default=1,
    )
    parser.add_option(
        "--max-length",
        help="Maximum length for a captcha challenge",
        type="int",
        default=-1,
    )
    parser.add_option(
        "--threads",
        help="Maximum number of threads to be used to generate captchas",
        type="int",
        default=1,
    )
    parser.add_option(
        "--jsonmap",
        help="Outputs \"filename\": \"word\" mapping for test/debug purposes",
        action="store_true"
    )

    opts, args = parser.parse_args()

    # Validate the required options up front, exiting with a message on the
    # first one that is missing.
    if opts.wordlist:
        wordlist = opts.wordlist
    elif opts.random:
        wordlist = None
    else:
        sys.exit("Need to specify a wordlist")
    if not opts.key:
        sys.exit("Need to specify a key")
    if opts.output:
        output = opts.output
    else:
        sys.exit("Need to specify an output directory")
    if opts.font and os.path.exists(opts.font):
        font = opts.font
    else:
        sys.exit("Need to specify the location of a font")
    # A pool needs at least one worker; anything lower would otherwise fail
    # later with a division-by-zero or a pool construction error.
    if opts.threads < 1:
        sys.exit("Need to specify at least one thread")

    badwordlist = read_wordlist(opts.badwordlist)
    count = opts.count
    fill = opts.fill
    fontsize = opts.font_size
    threads = opts.threads

    if fill:
        # Only generate as many images as are missing from the directory
        count = max(0, fill - len(os.listdir(output)))

    words = None
    if wordlist:
        words = read_wordlist(wordlist)
        # Keep only 4-5 letter words that do not start with "f" and do not
        # begin or end with a doubled letter.
        words = [
            x
            for x in words
            if len(x) in (4, 5) and x[0] != "f" and x[0] != x[1] and x[-1] != x[-2]
        ]

    if count <= 0:
        sys.exit("No need to generate CAPTCHA images.")

    # Never start more workers than there are images, and spread the images
    # as evenly as possible so that exactly `count` are generated: the first
    # `remainder` workers each take one extra image.
    threads = min(threads, count)
    chunks, remainder = divmod(count, threads)
    chunk_sizes = [chunks + 1 if i < remainder else chunks for i in range(threads)]

    print(
        "Generating %s CAPTCHA images separated in up to %s image(s) per chunk run by %s threads..."
        % (count, max(chunk_sizes), threads)
    )

    # Both context managers release their helper processes on exit, including
    # when a worker fails.
    with multiprocessing.Pool(threads) as pool, multiprocessing.Manager() as manager:
        jsonmap = manager.dict()
        data = [
            [size, words, badwordlist, opts, font, fontsize, jsonmap]
            for size in chunk_sizes
        ]
        # map() blocks until every chunk is done and re-raises the first
        # exception raised in a worker, rather than silently ignoring it.
        pool.map(run_in_thread, data)
        # Take a plain-dict snapshot before the manager shuts down
        captcha_map = jsonmap.copy()

    if opts.jsonmap:
        with open("map.json", "w") as outfile:
            json.dump(captcha_map, outfile, indent=4)