mediawiki-extensions-Scribunto/engines/LuaCommon/lualib/ustring/make-normalization-table.php
Brad Jorsch 0a8757baba Lua ustring implementation
This is a reimplementation of Lua's string library with support for
UTF-8.

The entire ustring library is implemented in pure Lua. PHP callbacks are
also available for overrides: in LuaSandbox these are used for almost
all functions, while in LuaStandalone they are used only for the pattern
matching. Also, ustring.upper and ustring.lower are overridden using
mw.language's .uc and .lc if available.

It also includes a bunch of unit tests.

Note that if you download the normalization tests, they may fail under
LuaSandbox if you have PHP's intl extension installed and libicu on your
system is too old.

Change-Id: Ie76fdf8d3a85d0a3d2a41b0d3b7afe433f247af0
2013-02-12 14:26:29 -05:00

186 lines
5.1 KiB
PHP
Executable file

#!/usr/bin/php
<?
if ( PHP_SAPI !== 'cli' ) {
die( "This script may only be executed from the command line.\n" );
}
$datafile = null;
if ( count( $argv ) > 1 ) {
$datafile = $argv[1];
if ( !file_exists( $datafile ) ) {
die( "The specified file '$datafile' does not exist\n" );
}
} else {
foreach( array(
__DIR__ . '/../../../../../core/includes/normal/UtfNormalData.inc',
__DIR__ . '/../../../../../includes/normal/UtfNormalData.inc',
) as $tryfile ) {
$tryfile = realpath( $tryfile );
if ( file_exists( $tryfile ) ) {
$datafile = $tryfile;
break;
}
}
if ( !$datafile ) {
die( "Cannot find UtfNormalData.inc. Please specify the path explicitly.\n" );
}
}
echo "Loading data file $datafile...\n";
class UtfNormal {
static $utfCheckNFC = null;
static $utfCombiningClass = null;
static $utfCanonicalDecomp = null;
static $utfCanonicalComp = null;
}
require_once( $datafile );
if ( !UtfNormal::$utfCheckNFC ||
!UtfNormal::$utfCombiningClass ||
!UtfNormal::$utfCanonicalDecomp ||
!UtfNormal::$utfCanonicalComp
) {
die( "Data file $datafile did not contain needed data.\n" );
}
function uord( $c, $firstOnly ) {
$ret = unpack( 'N*', mb_convert_encoding( $c, 'UTF-32BE', 'UTF-8' ) );
return $firstOnly ? $ret[1] : $ret;
}
echo "Creating normalization table...\n";
$X = fopen( __DIR__ . '/normalization-data.lua', 'w' );
if ( !$X ) {
die( "Failed to open normalization-data.lua\n" );
}
fprintf( $X, "-- This file is automatically generated by make-normalization-table.php\n" );
fprintf( $X, "local normal = {\n" );
fprintf( $X, "\t-- Characters that might change depending on the following combiner\n" );
fprintf( $X, "\t-- (minus any that are themselves combiners, those are added later)\n" );
fprintf( $X, "\tcheck = {\n" );
foreach ( UtfNormal::$utfCheckNFC as $k => $v ) {
if ( isset( UtfNormal::$utfCombiningClass[$k] ) ) {
// Skip, because it's in the other table already
continue;
}
fprintf( $X, "\t\t[0x%06x] = 1,\n", uord( $k, true ) );
}
fprintf( $X, "\t},\n\n" );
fprintf( $X, "\t-- Combining characters, mapped to combining class\n" );
fprintf( $X, "\tcombclass = {\n" );
$comb = array();
foreach ( UtfNormal::$utfCombiningClass as $k => $v ) {
$cp = uord( $k, true );
$comb[$cp] = 1;
fprintf( $X, "\t\t[0x%06x] = %d,\n", $cp, $v );
}
fprintf( $X, "\t},\n\n" );
fprintf( $X, "\t-- Characters mapped to what they decompose to\n" );
fprintf( $X, "\t-- Note Hangul to Jamo is done separately below\n" );
fprintf( $X, "\tdecomp = {\n" );
foreach ( UtfNormal::$utfCanonicalDecomp as $k => $v ) {
fprintf( $X, "\t\t[0x%06x] = { ", uord( $k, true ) );
$fmt = "0x%06x";
foreach ( uord( $v, false ) as $c ) {
fprintf( $X, $fmt, $c );
$fmt = ", 0x%06x";
}
fprintf( $X, " },\n" );
}
fprintf( $X, "\t},\n\n" );
fprintf( $X, "\t-- Character-pairs mapped to what they compose to\n" );
fprintf( $X, "\t-- Note Jamo to Hangul is done separately below\n" );
$t = array();
foreach ( UtfNormal::$utfCanonicalComp as $k => $v ) {
$k = uord( $k, false );
if ( count( $k ) == 1 ) {
// No idea why these are in the file
continue;
}
if ( isset( $comb[$k[1]] ) ) {
// Non-starter, no idea why these are in the file either
continue;
}
$t[$k[1]][$k[2]] = uord( $v, true );
}
fprintf( $X, "\tcomp = {\n" );
ksort( $t );
foreach ( $t as $k1 => $v1 ) {
fprintf( $X, "\t\t[0x%06x] = {\n", $k1 );
ksort( $v1 );
foreach ( $v1 as $k2 => $v2 ) {
if ( $k2 < 0 ) {
fprintf( $X, "\t\t\t[-1] = 0x%06x,\n", $v2 );
} else {
fprintf( $X, "\t\t\t[0x%06x] = 0x%06x,\n", $k2, $v2 );
}
}
fprintf( $X, "\t\t},\n" );
}
fprintf( $X, "\t},\n" );
fprintf( $X, "}\n" );
fprintf( $X, "\n%s\n", <<<LUA
-- All combining characters need to be checked, so just do that
setmetatable( normal.check, { __index = normal.combclass } )
-- Handle Hangul to Jamo decomposition
setmetatable( normal.decomp, { __index = function ( _, k )
if k >= 0xac00 and k <= 0xd7a3 then
-- Decompose a Hangul syllable into Jamo
k = k - 0xac00
local ret = {
0x1100 + math.floor( k / 588 ),
0x1161 + math.floor( ( k % 588 ) / 28 )
}
if k % 28 ~= 0 then
ret[3] = 0x11a7 + ( k % 28 )
end
return ret
end
return nil
end } )
-- Handle Jamo to Hangul composition
local jamo_l_v_mt = { __index = function ( t, k )
if k >= 0x1161 and k <= 0x1175 then
-- Jamo leading + Jamo vowel
return t.base + 28 * ( k - 0x1161 )
end
return nil
end }
local hangul_jamo_mt = { __index = function ( t, k )
if k >= 0x11a7 and k <= 0x11c2 then
-- Hangul + jamo final
return t.base + k - 0x11a7
end
return nil
end }
setmetatable( normal.comp, { __index = function ( t, k )
if k >= 0x1100 and k <= 0x1112 then
-- Jamo leading, return a second table that combines with a Jamo vowel
local t2 = { base = 0xac00 + 588 * ( k - 0x1100 ) }
setmetatable( t2, jamo_l_v_mt )
t[k] = t2 -- cache it
return t2
elseif k >= 0xac00 and k <= 0xd7a3 and k % 28 == 16 then
-- Hangul. "k % 28 == 16" picks out just the ones that are
-- Jamo leading + vowel, no final. Return a second table that combines
-- with a Jamo final.
local t2 = { base = k }
setmetatable( t2, hangul_jamo_mt )
t[k] = t2 -- cache it
return t2
end
return nil
end } )
return normal
LUA
);
fclose( $X );