mirror of
https://gerrit.wikimedia.org/r/mediawiki/extensions/Scribunto
synced 2024-11-30 11:04:17 +00:00
82820aafc8
The "%f[set]" frontier pattern has been in Lua 5.1 since the beginning, but was undocumented until Lua 5.2. And the code is even unchanged from 5.1.0 to 5.2.1. So there's no reason not to implement it in ustring too. Note the changes to UstringLibrary.php are somewhat large, because it splits the "convert a Lua bracketed charset to PCRE" code into a separate function and it changes the handling of mw.ustring.find's and mw.ustring.match's 'init' parameter from "substring, match from 0, then add back on $init" to "use preg_match's $offset and use \G instead of ^ where this matters". Both of these are necessary to properly support %f. This also fixes a bug in the pure-Lua code (not used in Scribunto) exposed by the unit tests for %f where %z was matching '\1' rather than '\0' and %Z everything except '\1' instead of everything except '\0'. Bug: 48331 Change-Id: Ie0b95ef5b734db53d6adc9de5dae4874f8944c08
152 lines
4.1 KiB
PHP
Executable file
152 lines
4.1 KiB
PHP
Executable file
#!/usr/bin/php
<?php
/**
 * Generates the Lua data tables lower.lua, upper.lua and charsets.lua
 * in this script's directory.
 *
 * The full "<?php" open tag is used rather than the short "<?" form,
 * because the short form is only recognised when the short_open_tag
 * ini setting is enabled; with it disabled the script would be echoed
 * as plain text instead of being executed.
 */

if ( PHP_SAPI !== 'cli' ) {
	die( "This script may only be executed from the command line.\n" );
}

// Map every Unicode code point (0 .. 0x10ffff) to its UTF-8 encoding.
// Each code point is packed as a 32-bit big-endian integer, i.e. one
// UTF-32BE code unit, and then converted to UTF-8.
$chars = array();
for ( $i = 0; $i <= 0x10ffff; $i++ ) {
	if ( $i < 0xd800 || $i > 0xdfff ) { // Skip UTF-16 surrogates
		$chars[$i] = mb_convert_encoding( pack( 'N', $i ), 'UTF-8', 'UTF-32BE' );
	}
}
### Uppercase and Lowercase mappings
echo "Creating upper and lower tables...\n";

// Open both output files up front; bail out as soon as one cannot be opened.
$lowerFile = fopen( __DIR__ . '/lower.lua', 'w' );
if ( !$lowerFile ) {
	die( "Failed to open lower.lua\n" );
}
$upperFile = fopen( __DIR__ . '/upper.lua', 'w' );
if ( !$upperFile ) {
	die( "Failed to open upper.lua\n" );
}

// Both files share the same preamble: a "generated" notice followed by
// the start of the Lua table constructor that the file returns.
foreach ( array( $lowerFile, $upperFile ) as $out ) {
	fprintf( $out, "-- This file is automatically generated by make-tables.php\n" );
	fprintf( $out, "return {\n" );
}

// Emit one table entry per character whose case mapping differs from the
// character itself; characters that map to themselves are omitted.
foreach ( $chars as $char ) {
	$lowered = mb_strtolower( $char, 'UTF-8' );
	$raised = mb_strtoupper( $char, 'UTF-8' );
	if ( $lowered !== $char ) {
		fprintf( $lowerFile, "\t[\"%s\"] = \"%s\",\n", $char, $lowered );
	}
	if ( $raised !== $char ) {
		fprintf( $upperFile, "\t[\"%s\"] = \"%s\",\n", $char, $raised );
	}
}

// Close the Lua table constructors and release the file handles.
foreach ( array( $lowerFile, $upperFile ) as $out ) {
	fprintf( $out, "}\n" );
}
fclose( $lowerFile );
fclose( $upperFile );
### Pattern code mappings
echo "Creating charsets table...\n";

// $X is the output handle for charsets.lua. It is also accessed as a
// global by addRange(), so the variable name must not change.
$X = fopen( __DIR__ . '/charsets.lua', 'w' );
if ( $X === false ) {
	die( "Failed to open charsets.lua\n" );
}
// Pattern character classes, keyed by the lowercase class letter.
// Each value is array( $regex, $subclasses ):
//  - $regex is a PCRE fragment matching a single character of the class,
//    or null when the class is composed entirely of other classes.
//  - $subclasses is a string of other class letters, or null. Characters
//    matched by those classes are left out of this class's own table and
//    are looked up through the listed classes instead.
$pats = array(
	// These should match the expressions in UstringLibrary::patternToRegex()
	'a' => array( '\p{L}', 'lu' ),
	'c' => array( '\p{Cc}', null ),
	'd' => array( '\p{Nd}', null ),
	'l' => array( '\p{Ll}', null ),
	'p' => array( '\p{P}', null ),
	's' => array( '\p{Xps}', null ),
	'u' => array( '\p{Lu}', null ),
	'w' => array( null, 'da' ), # '[\p{L}\p{Nd}]' exactly matches 'a' + 'd'
	// ASCII hex digits plus the fullwidth forms U+FF10-FF19 (digits),
	// U+FF21-FF26 (A-F) and U+FF41-FF46 (a-f). The fullwidth ranges are
	// written as \x{...} escapes so that they survive any re-encoding or
	// width-folding of this source file.
	// NOTE(review): confirm against UstringLibrary::patternToRegex().
	'x' => array( '[0-9A-Fa-f\x{FF10}-\x{FF19}\x{FF21}-\x{FF26}\x{FF41}-\x{FF46}]', null ),
	'z' => array( '\0', null ),
);

// Per-class list of Lua range-test expressions, filled in by addRange().
$ranges = array();
/**
 * Record that the code points $start (inclusive) to $end (exclusive)
 * belong to pattern class $k.
 *
 * Runs of at least 10 code points that do not begin in printable ASCII
 * (0x20..0x7e) are stored in the global $ranges[$k] as a Lua comparison
 * expression on "c". Every other run is written to the global output
 * handle $X as one explicit table entry per code point.
 *
 * @param string $k Pattern class letter, used as the key into $ranges
 * @param int $start First code point of the run
 * @param int $end Code point just past the end of the run
 */
function addRange( $k, $start, $end ) {
	global $X, $ranges;

	$startsInPrintableAscii = $start >= 0x20 && $start < 0x7f;
	$isLongRun = ( $end - $start ) >= 10;

	// Speed/memory tradeoff: only long runs outside printable ASCII are
	// collapsed into a range comparison.
	if ( $isLongRun && !$startsInPrintableAscii ) {
		$ranges[$k][] = sprintf( "c >= 0x%06x and c < 0x%06x", $start, $end );
		return;
	}

	$codepoint = $start;
	while ( $codepoint < $end ) {
		fprintf( $X, "\t\t[0x%06x] = 1,\n", $codepoint );
		$codepoint++;
	}
}
// Start charsets.lua: a local table "pats" keyed by the byte value of each
// pattern class letter. Each class table lists the code points that belong
// to the class; long runs are instead collected in $ranges by addRange().
fprintf( $X, "-- This file is automatically generated by make-tables.php\n" );
fprintf( $X, "local pats = {\n" );
foreach ( $pats as $k => $pp ) {
	$ranges[$k] = array();
	$re = $pp[0];
	if ( !$re ) {
		// No regex of its own: the class is defined purely in terms of
		// other classes, so its table starts out empty.
		fprintf( $X, "\t[0x%02x] = {},\n", ord( $k ) );
		continue;
	}

	// Build the regexes once per class rather than once per character.
	$includeRegex = "/^(?:$re)$/u";

	// Characters already covered by one of the listed sub-classes are
	// excluded from this table. The alternatives are wrapped in a group so
	// that "^" and "$" anchor the whole alternation; without the group the
	// anchors would each bind to only one alternative.
	$excludeRegex = null;
	if ( $pp[1] ) {
		$alternatives = array();
		foreach ( str_split( $pp[1] ) as $p ) {
			$alternatives[] = $pats[$p][0];
		}
		$excludeRegex = '/^(?:' . join( '|', $alternatives ) . ')$/u';
	}

	fprintf( $X, "\t[0x%02x] = {\n", ord( $k ) );

	// Walk all characters in code point order, grouping consecutive
	// members into runs and handing each completed run to addRange().
	$rstart = null;
	foreach ( $chars as $i => $c ) {
		$isMember = preg_match( $includeRegex, $c ) &&
			!( $excludeRegex !== null && preg_match( $excludeRegex, $c ) );
		if ( $isMember ) {
			if ( $rstart === null ) {
				$rstart = $i;
			}
		} elseif ( $rstart !== null ) {
			addRange( $k, $rstart, $i );
			$rstart = null;
		}
	}
	if ( $rstart !== null ) {
		// The run extends to the last code point.
		addRange( $k, $rstart, 0x110000 );
	}
	fprintf( $X, "\t},\n" );
}
// Every class letter also gets an (initially empty) table under its
// uppercase letter; then the "pats" table constructor is closed.
foreach ( array_keys( $pats ) as $lowerLetter ) {
	$upperCode = ord( strtoupper( $lowerLetter ) );
	fprintf( $X, "\t[0x%02x] = {},\n", $upperCode );
}
fprintf( $X, "}\n" );
// Give each lowercase class table an __index fallback that is consulted
// for keys missing from the table: it delegates to the class's
// sub-classes and/or tests the key against the collected ranges. Classes
// with neither sub-classes nor ranges get no metatable.
foreach ( $pats as $class => $spec ) {
	$conditions = array();
	if ( $spec[1] ) {
		foreach ( str_split( $spec[1] ) as $member ) {
			$conditions[] = sprintf( "pats[0x%02x][k]", ord( $member ) );
		}
	}

	$prologue = '';
	if ( $ranges[$class] ) {
		// Range tests compare against "c"; 0/0 (NaN) is substituted when
		// the key is not numeric.
		$prologue = "\tlocal c = tonumber( k ) or 0/0;\n";
		foreach ( $ranges[$class] as $rangeTest ) {
			$conditions[] = $rangeTest;
		}
	}

	if ( !$conditions ) {
		continue;
	}

	$luaBody = $prologue . "\treturn " . join( " or\n\t\t", $conditions );
	fprintf( $X, "setmetatable( pats[0x%02x], { __index = function ( t, k )\n%s\nend } )\n",
		ord( $class ), $luaBody );
}
// Each uppercase class table is the complement of its lowercase
// counterpart: a lookup succeeds when the key is not found in the
// lowercase table.
foreach ( array_keys( $pats ) as $lowerLetter ) {
	$lowerCode = ord( $lowerLetter );
	$upperCode = ord( strtoupper( $lowerLetter ) );
	fprintf( $X, "setmetatable( pats[0x%02x], { __index = function ( t, k ) return k and not pats[0x%02x][k] end } )\n",
		$upperCode, $lowerCode );
}
// Trailing Lua code: for every table whose key is >= 0x61 (the lowercase
// class letters), store the result for each code point 0x20..0x7e
// directly in the table, using false for non-members. Then return the
// finished "pats" table.
$trailer = array(
	"\n-- For speed, cache printable ASCII characters in main tables\n",
	"for k, t in pairs( pats ) do\n",
	"\tif k >= 0x61 then\n",
	"\t\tfor i = 0x20, 0x7e do\n",
	"\t\t\tt[i] = t[i] or false\n",
	"\t\tend\n",
	"\tend\n",
	"end\n",
	"\nreturn pats\n",
);
foreach ( $trailer as $luaLine ) {
	fwrite( $X, $luaLine );
}
fclose( $X );