mirror of
https://gerrit.wikimedia.org/r/mediawiki/extensions/Scribunto
synced 2024-11-30 02:54:26 +00:00
0a8757baba
This is a reimplementation of Lua's string library with support for UTF-8. The entire ustring library is implemented in pure Lua. PHP callbacks are also available for overrides: in LuaSandbox these are used for almost all functions, while in LuaStandalone they are used only for the pattern matching. Also, ustring.upper and ustring.lower are overridden using mw.language's .uc and .lc if available. It also includes a bunch of unit tests. Note that if you download the normalization tests, they may fail under LuaSandbox if you have PHP's intl extension installed and libicu on your system is too old. Change-Id: Ie76fdf8d3a85d0a3d2a41b0d3b7afe433f247af0
158 lines
4.2 KiB
PHP
Executable file
158 lines
4.2 KiB
PHP
Executable file
#!/usr/bin/php
<?php
// The full "<?php" open tag is used on purpose: the short "<?" form is only
// recognised when the short_open_tag ini setting is enabled, and without it
// the interpreter would print this file as plain text instead of running it.

// This script writes files next to itself, so refuse to run under any SAPI
// other than the command line.
if ( PHP_SAPI !== 'cli' ) {
	die( "This script may only be executed from the command line.\n" );
}

// $chars maps every code point from 0 to 0x10ffff (code point => UTF-8
// encoded character), leaving out the UTF-16 surrogate range.
$chars = array();
for ( $codepoint = 0; $codepoint <= 0x10ffff; $codepoint++ ) {
	// Skip UTF-16 surrogates
	if ( $codepoint >= 0xd800 && $codepoint <= 0xdfff ) {
		continue;
	}
	// Pack the code point as one big-endian 32-bit unit and transcode it
	$utf32 = pack( 'N', $codepoint );
	$chars[$codepoint] = mb_convert_encoding( $utf32, 'UTF-8', 'UTF-32BE' );
}

### Uppercase and Lowercase mappings
echo "Creating upper and lower tables...\n";

// Open lower.lua first and upper.lua second, aborting on the first failure.
$lowerOut = fopen( __DIR__ . '/lower.lua', 'w' );
if ( $lowerOut === false ) {
	die( "Failed to open lower.lua\n" );
}
$upperOut = fopen( __DIR__ . '/upper.lua', 'w' );
if ( $upperOut === false ) {
	die( "Failed to open upper.lua\n" );
}

// Each file is a Lua chunk that returns a single table literal.
foreach ( array( $lowerOut, $upperOut ) as $out ) {
	fprintf( $out, "-- This file is automatically generated by make-tables.php\n" );
	fprintf( $out, "return {\n" );
}

// Only characters whose mapping differs from themselves get an entry.
foreach ( $chars as $char ) {
	$lowered = mb_strtolower( $char, 'UTF-8' );
	$raised = mb_strtoupper( $char, 'UTF-8' );
	if ( $lowered !== $char ) {
		fprintf( $lowerOut, "\t[\"%s\"] = \"%s\",\n", $char, $lowered );
	}
	if ( $raised !== $char ) {
		fprintf( $upperOut, "\t[\"%s\"] = \"%s\",\n", $char, $raised );
	}
}

// Close the table literals, then the files.
foreach ( array( $lowerOut, $upperOut ) as $out ) {
	fprintf( $out, "}\n" );
}
foreach ( array( $lowerOut, $upperOut ) as $out ) {
	fclose( $out );
}

### Pattern code mappings
echo "Creating charsets table...\n";

// $X is the output handle for charsets.lua. The name is significant:
// addRange() reaches it through a "global" declaration.
$X = fopen( __DIR__ . '/charsets.lua', 'w' );
if ( $X === false ) {
	die( "Failed to open charsets.lua\n" );
}
// Definitions of the pattern character classes, keyed by class letter.
// Every value is array( regex, subclasses ):
//  - regex is a PCRE fragment tested against one character at a time, or
//    null when the class consists of nothing but its subclasses;
//  - subclasses is a string of class letters whose members are excluded from
//    this class's own table and looked up in the subclass tables instead, or
//    null when there are none.
// The iteration order of this array determines the order of the output.
$pats = array(
	// These should match the expressions in UstringLibrary::patternToRegex()
	'a' => array( '\p{L}',                  'lu' ),
	'c' => array( '\p{Cc}',                 null ),
	'd' => array( '\p{Nd}',                 null ),
	'l' => array( '\p{Ll}',                 null ),
	'p' => array( '\p{P}',                  null ),
	's' => array( '\p{Xps}',                null ),
	'u' => array( '\p{Lu}',                 null ),
	// '[\p{L}\p{Nd}]' exactly matches 'a' + 'd'
	'w' => array( null,                     'da' ),
	// NOTE(review): the ASCII ranges appear twice in this class; the second
	// copy may originally have been a different (non-ASCII) set of digits and
	// letters that was lost in transit. Confirm against
	// UstringLibrary::patternToRegex() before regenerating the tables.
	'x' => array( '[0-9A-Fa-f0-9A-Fa-f]',   null ),
	'z' => array( '\0',                     null ),
);

// Lua range-test expressions per class letter; appended to by addRange()
$ranges = array();

/**
 * Record the code points from $start (inclusive) to $end (exclusive) as
 * members of character class $k.
 *
 * Runs of at least 10 code points that do not begin in 0x20..0x7e are stored
 * as a Lua comparison in the global $ranges[$k] (speed/memory tradeoff).
 * Every other run is written to the global output handle $X as one explicit
 * "[0x......] = 1," table entry per code point.
 *
 * @param string $k Class letter, used as the key into $ranges
 * @param int $start First code point of the run (inclusive)
 * @param int $end End of the run (exclusive)
 * @param bool $arr Accepted for backward compatibility and ignored. It used
 *   to request positional "1," entries, but a Lua table constructor numbers
 *   positional items from 1, whereas the caller only sets this flag for a run
 *   that starts at code point 0. Every such entry therefore landed one index
 *   too high (the 'z' class marked index 1 instead of 0).
 */
function addRange( $k, $start, $end, $arr = false ) {
	global $X, $ranges;

	// Speed/memory tradeoff
	$startsInPrintableAscii = $start >= 0x20 && $start < 0x7f;
	if ( !$startsInPrintableAscii && $end - $start >= 10 ) {
		$ranges[$k][] = sprintf( "c >= 0x%06x and c < 0x%06x", $start, $end );
		return;
	}

	// Always use explicit keys so each entry sits at its own code point.
	for ( $i = $start; $i < $end; $i++ ) {
		fprintf( $X, "\t\t[0x%06x] = 1,\n", $i );
	}
}

// Start charsets.lua: a local table "pats" keyed by the byte value of each
// class letter.
fprintf( $X, "-- This file is automatically generated by make-tables.php\n" );
fprintf( $X, "local pats = {\n" );
foreach ( $pats as $k => $pp ) {
	$ranges[$k] = array();
	$re = $pp[0];
	if ( !$re ) {
		// No regex of its own: emit an empty table for this class
		fprintf( $X, "\t[0x%02x] = {},\n", ord( $k ) );
		continue;
	}

	// Build both patterns once per class instead of once per character.
	$includeRe = '/^(?:' . $re . ')$/u';
	$excludeRe = null;
	if ( $pp[1] ) {
		$subRes = array();
		foreach ( str_split( $pp[1] ) as $p ) {
			$subRes[] = $pats[$p][0];
		}
		// The alternation is grouped so that ^ and $ apply to every
		// alternative, not just to the first and the last one.
		$excludeRe = '/^(?:' . join( '|', $subRes ) . ')$/u';
	}

	fprintf( $X, "\t[0x%02x] = {\n", ord( $k ) );
	// $arr stays true only while every character seen so far has matched
	$arr = true;
	// First code point of the run currently being collected, or null
	$rstart = null;
	foreach ( $chars as $i => $c ) {
		// A member matches the class regex and none of the subclass regexes
		$isMember = preg_match( $includeRe, $c )
			&& !( $excludeRe !== null && preg_match( $excludeRe, $c ) );
		if ( $isMember ) {
			if ( $rstart === null ) {
				$rstart = $i;
			}
			continue;
		}
		// A non-member ends the current run, if there is one
		if ( $rstart !== null ) {
			addRange( $k, $rstart, $i, $arr );
			$rstart = null;
		}
		$arr = false;
	}
	// Flush a run that extends to the last code point
	if ( $rstart !== null ) {
		addRange( $k, $rstart, 0x110000, $arr );
	}
	fprintf( $X, "\t},\n" );
}
// Add one empty table per class under the upper-case form of its letter,
// then close the "pats" table literal.
foreach ( $pats as $letter => $unusedSpec ) {
	$upperCode = ord( strtoupper( $letter ) );
	fprintf( $X, "\t[0x%02x] = {},\n", $upperCode );
}
fprintf( $X, "}\n" );
// Give each lower-case class table an __index function covering whatever is
// not stored in the table itself: its subclasses and the ranges collected in
// $ranges. Classes with neither get no metatable.
foreach ( $pats as $letter => $spec ) {
	// Subclass lookups come first, in the order the letters are listed
	$tests = array();
	if ( $spec[1] ) {
		foreach ( str_split( $spec[1] ) as $subLetter ) {
			$tests[] = sprintf( "pats[0x%02x][k]", ord( $subLetter ) );
		}
	}

	// Range tests compare against a numeric "c"; 0/0 is the fallback used
	// when the key cannot be converted to a number
	$prologue = '';
	if ( $ranges[$letter] ) {
		$prologue = "\tlocal c = tonumber( k ) or 0/0;\n";
		foreach ( $ranges[$letter] as $rangeTest ) {
			$tests[] = $rangeTest;
		}
	}

	if ( !$tests ) {
		continue;
	}

	$luaBody = $prologue . "\treturn " . join( " or\n\t\t", $tests );
	fprintf( $X, "setmetatable( pats[0x%02x], { __index = function ( t, k )\n%s\nend } )\n",
		ord( $letter ), $luaBody );
}
// The table under each upper-case letter answers every lookup by negating
// the result from the matching lower-case table.
foreach ( $pats as $letter => $unusedSpec ) {
	$lowerCode = ord( $letter );
	$upperCode = ord( strtoupper( $letter ) );
	fprintf( $X, "setmetatable( pats[0x%02x], { __index = function ( t, k ) return k and not pats[0x%02x][k] end } )\n",
		$upperCode, $lowerCode );
}
// Trailer of charsets.lua. The generated Lua loop visits the tables whose key
// is at least 0x61 and stores "t[i] or false" for i from 0x20 to 0x7e; the
// chunk then returns the pats table.
$trailerLines = array(
	"\n-- For speed, cache printable ASCII characters in main tables\n",
	"for k, t in pairs( pats ) do\n",
	"\tif k >= 0x61 then\n",
	"\t\tfor i = 0x20, 0x7e do\n",
	"\t\t\tt[i] = t[i] or false\n",
	"\t\tend\n",
	"\tend\n",
	"end\n",
	"\nreturn pats\n",
);
foreach ( $trailerLines as $trailerLine ) {
	// None of these lines contains a conversion specification, so using
	// them as the format string prints them verbatim
	fprintf( $X, $trailerLine );
}
fclose( $X );