#!/usr/bin/php
<?php
/**
 * Generates three Lua data files next to this script:
 *
 *  - lower.lua:    map from a character to its lowercase form
 *  - upper.lua:    map from a character to its uppercase form
 *  - charsets.lua: per-pattern-code membership tables, keyed by the byte
 *                  value of the pattern letter ('a', 'c', 'd', ...)
 *
 * Only characters whose case mapping differs from the character itself are
 * written to lower.lua / upper.lua. The character classes are computed with
 * PCRE, so the output depends on the Unicode data shipped with the PHP build
 * that runs this script.
 */

if ( PHP_SAPI !== 'cli' ) {
	die( "This script may only be executed from the command line.\n" );
}

// Code point => UTF-8 encoded character, for every Unicode scalar value.
$chars = array();
for ( $i = 0; $i <= 0x10ffff; $i++ ) {
	if ( $i < 0xd800 || $i > 0xdfff ) { // Skip UTF-16 surrogates
		$chars[$i] = mb_convert_encoding( pack( 'N', $i ), 'UTF-8', 'UTF-32BE' );
	}
}

### Uppercase and Lowercase mappings
echo "Creating upper and lower tables...\n";
$L = fopen( __DIR__ . '/lower.lua', 'w' );
if ( !$L ) {
	die( "Failed to open lower.lua\n" );
}
$U = fopen( __DIR__ . '/upper.lua', 'w' );
if ( !$U ) {
	die( "Failed to open upper.lua\n" );
}
fprintf( $L, "-- This file is automatically generated by make-tables.php\n" );
fprintf( $L, "return {\n" );
fprintf( $U, "-- This file is automatically generated by make-tables.php\n" );
fprintf( $U, "return {\n" );
foreach ( $chars as $i => $c ) {
	$l = mb_strtolower( $c, 'UTF-8' );
	$u = mb_strtoupper( $c, 'UTF-8' );
	// Identity mappings are omitted to keep the tables small.
	if ( $c !== $l ) {
		fprintf( $L, "\t[\"%s\"] = \"%s\",\n", $c, $l );
	}
	if ( $c !== $u ) {
		fprintf( $U, "\t[\"%s\"] = \"%s\",\n", $c, $u );
	}
}
fprintf( $L, "}\n" );
fprintf( $U, "}\n" );
fclose( $L );
fclose( $U );

### Pattern code mappings
echo "Creating charsets table...\n";
$X = fopen( __DIR__ . '/charsets.lua', 'w' );
if ( !$X ) {
	die( "Failed to open charsets.lua\n" );
}

// Pattern letter => array( regex for the class, letters of other classes to
// delegate to ). A null regex means the class is defined purely by delegation.
$pats = array(
	// These should match the expressions in UstringLibrary::patternToRegex()
	'a' => array( '\p{L}', 'lu' ),
	'c' => array( '\p{Cc}', null ),
	'd' => array( '\p{Nd}', null ),
	'l' => array( '\p{Ll}', null ),
	'p' => array( '\p{P}', null ),
	's' => array( '\p{Xps}', null ),
	'u' => array( '\p{Lu}', null ),
	'w' => array( null, 'da' ), # '[\p{L}\p{Nd}]' exactly matches 'a' + 'd'
	// ASCII hex digits plus their fullwidth forms (U+FF10-FF19, U+FF21-FF26,
	// U+FF41-FF46). The fullwidth ranges are written as \x{...} escapes so
	// they survive editors and tools that normalize or re-encode the file.
	'x' => array( '[0-9A-Fa-f\x{FF10}-\x{FF19}\x{FF21}-\x{FF26}\x{FF41}-\x{FF46}]', null ),
	'z' => array( '\0', null ),
);

// Pattern letter => list of Lua range conditions collected by addRange().
$ranges = array();

/**
 * Record that code points $start (inclusive) to $end (exclusive) belong to
 * the class for pattern letter $k.
 *
 * Runs of 10 or more code points that do not start in printable ASCII are
 * stored as a Lua comparison in the global $ranges (evaluated lazily through
 * a metatable); everything else is written out entry by entry to the global
 * file handle $X.
 *
 * @param string $k Pattern letter
 * @param int $start First code point in the run
 * @param int $end Code point just past the end of the run
 */
function addRange( $k, $start, $end ) {
	global $X, $ranges;
	// Speed/memory tradeoff
	if ( !( $start >= 0x20 && $start < 0x7f ) && $end - $start >= 10 ) {
		$ranges[$k][] = sprintf( "c >= 0x%06x and c < 0x%06x", $start, $end );
	} else {
		for ( $i = $start; $i < $end; $i++ ) {
			fprintf( $X, "\t\t[0x%06x] = 1,\n", $i );
		}
	}
}

fprintf( $X, "-- This file is automatically generated by make-tables.php\n" );
fprintf( $X, "local pats = {\n" );
foreach ( $pats as $k => $pp ) {
	$ranges[$k] = array();
	$re = $pp[0];
	if ( !$re ) {
		// Delegation-only class: emit an empty table, the metatable does the rest.
		fprintf( $X, "\t[0x%02x] = {},\n", ord( $k ) );
		continue;
	}

	// Characters already covered by a delegated class are excluded from this
	// class's own table. 'fail' can never match a single character, so it
	// makes the exclusion test a no-op when there is nothing to delegate to.
	$re2 = 'fail';
	if ( $pp[1] ) {
		$re2 = array();
		foreach ( str_split( $pp[1] ) as $p ) {
			$re2[] = $pats[$p][0];
		}
		$re2 = join( '|', $re2 );
	}

	fprintf( $X, "\t[0x%02x] = {\n", ord( $k ) );
	// Walk all code points in order and coalesce consecutive matches into runs.
	$rstart = null;
	foreach ( $chars as $i => $c ) {
		if ( preg_match( "/^$re$/u", $c ) && !preg_match( "/^$re2$/u", $c ) ) {
			if ( $rstart === null ) {
				$rstart = $i;
			}
		} else {
			if ( $rstart !== null ) {
				addRange( $k, $rstart, $i );
				$rstart = null;
			}
		}
	}
	if ( $rstart !== null ) {
		// Run still open at the end of the code space.
		addRange( $k, $rstart, 0x110000 );
	}
	fprintf( $X, "\t},\n" );
}
// Uppercase pattern letters are the complements; they start out empty.
foreach ( $pats as $k => $pp ) {
	$kk = strtoupper( $k );
	fprintf( $X, "\t[0x%02x] = {},\n", ord( $kk ) );
}
fprintf( $X, "}\n" );

// Lazy lookups: delegated classes and collected ranges are checked in __index.
foreach ( $pats as $k => $pp ) {
	$body = '';
	$check = array();
	if ( $pp[1] ) {
		foreach ( str_split( $pp[1] ) as $p ) {
			$check[] = sprintf( "pats[0x%02x][k]", ord( $p ) );
		}
	}
	if ( $ranges[$k] ) {
		// 0/0 is NaN in Lua, so every range comparison is false for non-numeric keys.
		$body = "\tlocal c = tonumber( k ) or 0/0;\n";
		$check = array_merge( $check, $ranges[$k] );
	}
	if ( $check ) {
		$body .= "\treturn " . join( " or\n\t\t", $check );
		fprintf( $X, "setmetatable( pats[0x%02x], { __index = function ( t, k )\n%s\nend } )\n",
			ord( $k ), $body );
	}
}
// Each uppercase class answers with the negation of its lowercase counterpart.
foreach ( $pats as $k => $pp ) {
	fprintf( $X, "setmetatable( pats[0x%02x], { __index = function ( t, k ) return k and not pats[0x%02x][k] end } )\n",
		ord( strtoupper( $k ) ), ord( $k ) );
}

fprintf( $X, "\n-- For speed, cache printable ASCII characters in main tables\n" );
fprintf( $X, "for k, t in pairs( pats ) do\n" );
fprintf( $X, "\tif k >= 0x61 then\n" );
fprintf( $X, "\t\tfor i = 0x20, 0x7e do\n" );
fprintf( $X, "\t\t\tt[i] = t[i] or false\n" );
fprintf( $X, "\t\tend\n" );
fprintf( $X, "\tend\n" );
fprintf( $X, "end\n" );
fprintf( $X, "\nreturn pats\n" );
fclose( $X );