#!/usr/bin/php
<?php

// This generator is a command-line tool; refuse to run under any other SAPI.
if ( PHP_SAPI !== 'cli' && PHP_SAPI !== 'phpdbg' ) {
	die( "This script may only be executed from the command line.\n" );
}

// Map every code point from 0 to 0x10ffff (except the surrogate block) to its
// UTF-8 encoding. Keys are the integer code points, inserted in ascending
// order, so iterating the array walks the code points in order.
$chars = [];
for ( $i = 0; $i <= 0x10ffff; $i++ ) {
	if ( $i < 0xd800 || $i > 0xdfff ) { // Skip UTF-16 surrogates
		// pack( 'N' ) gives the 4-byte big-endian form, i.e. the UTF-32BE encoding
		$chars[$i] = mb_convert_encoding( pack( 'N', $i ), 'UTF-8', 'UTF-32BE' );
	}
}

### Uppercase and Lowercase mappings
// Writes lower.lua and upper.lua next to this script. Each file returns a Lua
// table mapping a character to the result of mb_strtolower()/mb_strtoupper(),
// and lists only the characters that the conversion actually changes.
echo "Creating upper and lower tables...\n";
$L = fopen( __DIR__ . '/lower.lua', 'w' );
if ( !$L ) {
	die( "Failed to open lower.lua\n" );
}
$U = fopen( __DIR__ . '/upper.lua', 'w' );
if ( !$U ) {
	die( "Failed to open upper.lua\n" );
}
fprintf( $L, "-- This file is automatically generated by make-tables.php\n" );
fprintf( $L, "return {\n" );
fprintf( $U, "-- This file is automatically generated by make-tables.php\n" );
fprintf( $U, "return {\n" );
foreach ( $chars as $i => $c ) {
	$l = mb_strtolower( $c, 'UTF-8' );
	$u = mb_strtoupper( $c, 'UTF-8' );
	// Only emit an entry when the case conversion is not the identity
	if ( $c !== $l ) {
		fprintf( $L, "\t[\"%s\"] = \"%s\",\n", $c, $l );
	}
	if ( $c !== $u ) {
		fprintf( $U, "\t[\"%s\"] = \"%s\",\n", $c, $u );
	}
}
fprintf( $L, "}\n" );
fprintf( $U, "}\n" );
fclose( $L );
fclose( $U );

### Pattern code mappings
// Open the output file for the character-class tables; it is written next to
// this script and truncated if it already exists.
echo "Creating charsets table...\n";
$fh = fopen( __DIR__ . '/charsets.lua', 'w' );
if ( !$fh ) {
	die( "Failed to open charsets.lua\n" );
}

// Character classes keyed by pattern letter. Each value is [ regex, subclasses ]:
//  - regex: PCRE expression matching the class, or null when the class has no
//    expression of its own and consists only of its subclasses.
//  - subclasses: string of other class letters (or null). Characters matched
//    by those classes are left out of this class's own table; the generated
//    Lua looks them up in the subclasses' tables instead.
$pats = [
	// These should match the expressions in UstringLibrary::patternToRegex()
	'a' => [ '\p{L}', 'lu' ],
	'c' => [ '\p{Cc}', null ],
	'd' => [ '\p{Nd}', null ],
	'l' => [ '\p{Ll}', null ],
	'p' => [ '\p{P}', null ],
	's' => [ '\p{Xps}', null ],
	'u' => [ '\p{Lu}', null ],
	'w' => [ null, 'da' ], # '[\p{L}\p{Nd}]' exactly matches 'a' + 'd'
	// ASCII hex digits plus their fullwidth forms (U+FF10-FF19, U+FF21-FF26,
	// U+FF41-FF46). The fullwidth ranges are written as escapes so that text
	// normalization of this file cannot silently fold them into ASCII.
	// NOTE(review): confirm this agrees with UstringLibrary::patternToRegex().
	'x' => [ '[0-9A-Fa-f\x{FF10}-\x{FF19}\x{FF21}-\x{FF26}\x{FF41}-\x{FF46}]', null ],
	'z' => [ '\0', null ],
];

// Per-class lists of Lua range conditions, appended to by addRange()
$ranges = [];

/**
 * Record the code points in [$start, $end) as members of character class $k.
 *
 * A run of at least 10 code points whose start is outside 0x20..0x7e is
 * stored as a single Lua comparison in the global $ranges[$k]. Any other run
 * is written to the global $fh file handle as one table entry per code point.
 *
 * @param string $k Character class letter, used as the key into $ranges
 * @param int $start First code point of the run (inclusive)
 * @param int $end End of the run (exclusive)
 */
// @codingStandardsIgnoreLine MediaWiki.NamingConventions.PrefixedGlobalFunctions
function addRange( $k, $start, $end ) {
	// @codingStandardsIgnoreLine MediaWiki.NamingConventions.ValidGlobalName
	global $fh, $ranges;
	// Speed/memory tradeoff
	if ( !( $start >= 0x20 && $start < 0x7f ) && $end - $start >= 10 ) {
		$ranges[$k][] = sprintf( "c >= 0x%06x and c < 0x%06x", $start, $end );
	} else {
		for ( $i = $start; $i < $end; $i++ ) {
			fprintf( $fh, "\t\t[0x%06x] = 1,\n", $i );
		}
	}
}

// Emit the opening of the Lua `pats` table, then one sub-table per character
// class, keyed by the byte value of the class letter.
fprintf( $fh, "-- This file is automatically generated by make-tables.php\n" );
fprintf( $fh, "local pats = {\n" );
foreach ( $pats as $k => $pp ) {
	$ranges[$k] = [];
	$re = $pp[0];
	if ( !$re ) {
		// No expression of its own: the table starts out empty
		fprintf( $fh, "\t[0x%02x] = {},\n", ord( $k ) );
		continue;
	}

	// Expression for characters to leave out of this class's table. The
	// default 'fail' is four characters long, so it can never match the
	// single-character subjects tested below.
	$re2 = 'fail';
	if ( $pp[1] ) {
		$re2 = [];
		foreach ( str_split( $pp[1] ) as $p ) {
			$re2[] = $pats[$p][0];
		}
		$re2 = implode( '|', $re2 );
	}

	// Build both expressions once per class rather than once per character.
	// The (?:...) groups make ^ and $ anchor the whole alternation.
	$include = "/^(?:$re)$/u";
	$exclude = "/^(?:$re2)$/u";

	fprintf( $fh, "\t[0x%02x] = {\n", ord( $k ) );
	// Start of the current run of consecutive matching characters, or null
	$rstart = null;
	foreach ( $chars as $i => $c ) {
		if ( preg_match( $include, $c ) && !preg_match( $exclude, $c ) ) {
			if ( $rstart === null ) {
				$rstart = $i;
			}
		} else {
			if ( $rstart !== null ) {
				addRange( $k, $rstart, $i );
				$rstart = null;
			}
		}
	}
	// Flush a run that extends to the last code point
	if ( $rstart !== null ) {
		addRange( $k, $rstart, 0x110000 );
	}
	fprintf( $fh, "\t},\n" );
}

// For every class letter, also emit an empty table keyed by the byte value of
// its uppercase form, then close the Lua table constructor.
foreach ( $pats as $k => $pp ) {
	$kk = strtoupper( $k );
	fprintf( $fh, "\t[0x%02x] = {},\n", ord( $kk ) );
}
fprintf( $fh, "}\n" );

// Give each class table an __index fallback for keys it does not hold directly.
// The fallback is the "or" of (a) lookups in the tables of the class's
// subclasses and (b) the numeric range conditions collected in $ranges.
// Classes with neither get no metatable.
foreach ( $pats as $k => $pp ) {
	$body = '';
	$check = [];
	if ( $pp[1] ) {
		foreach ( str_split( $pp[1] ) as $p ) {
			$check[] = sprintf( "pats[0x%02x][k]", ord( $p ) );
		}
	}
	// @phan-suppress-next-line PhanImpossibleConditionInGlobalScope
	if ( $ranges[$k] ) {
		// Range conditions compare against `c`, so the generated function
		// must first convert the key to a number
		$body = "\tlocal c = tonumber( k ) or 0/0;\n";
		$check = array_merge( $check, $ranges[$k] );
	}
	if ( $check ) {
		$body .= "\treturn " . implode( " or\n\t\t", $check );
		fprintf( $fh, "setmetatable( pats[0x%02x], { __index = function ( t, k )\n%s\nend } )\n",
			ord( $k ), $body );
	}
}

// The table keyed by the uppercase class letter answers lookups with the
// negation of the corresponding lowercase class's table (for non-nil keys).
foreach ( $pats as $k => $pp ) {
	fprintf( $fh, "setmetatable( pats[0x%02x], { ", ord( strtoupper( $k ) ) );
	fprintf( $fh, "__index = function ( t, k ) return k and not pats[0x%02x][k] end", ord( $k ) );
	fprintf( $fh, " } )\n" );
}

// Emit a Lua loop that, for every table whose key is at least 0x61 ('a'),
// stores an explicit value (the looked-up result, or false) for each of the
// keys 0x20..0x7e, then return the table and finish the file.
fprintf( $fh, "\n-- For speed, cache printable ASCII characters in main tables\n" );
fprintf( $fh, "for k, t in pairs( pats ) do\n" );
fprintf( $fh, "\tif k >= 0x61 then\n" );
fprintf( $fh, "\t\tfor i = 0x20, 0x7e do\n" );
fprintf( $fh, "\t\t\tt[i] = t[i] or false\n" );
fprintf( $fh, "\t\tend\n" );
fprintf( $fh, "\tend\n" );
fprintf( $fh, "end\n" );
fprintf( $fh, "\nreturn pats\n" );
fclose( $fh );