mediawiki-extensions-Scribunto/includes/engines/LuaCommon/lualib/ustring/make-normalization-table.php
Brad Jorsch 164974c4b5 ustring: Replace UtfNormal hack with a different one
Ideally we'd just have composer.json require UtfNormal so we'd know
where it is and have an autoloader to load it for us, but that seems to
not be done in the world of MediaWiki extensions.

Previously we had been taking paths to the two data files from UtfNormal
and loading them into a stub class, but phan has started complaining
about the definition of the stub class colliding with the real UtfNormal.
So let's try loading the real UtfNormal\Validator and its data files.
Hopefully this continues to not try to pull in any other files via the
nonexistent autoloader.

Change-Id: I93baf20f0eef1892685e272793b4f99236e8c905
2019-06-11 00:09:15 +00:00

235 lines
6.8 KiB
PHP
Executable file

#!/usr/bin/php
<?php
// Generates normalization-data.lua for Scribunto's ustring library from the
// data tables shipped with the UtfNormal library.
use UtfNormal\Validator;
// Maintenance script: refuse to run under a web SAPI.
if ( PHP_SAPI !== 'cli' && PHP_SAPI !== 'phpdbg' ) {
die( "This script may only be executed from the command line.\n" );
}
$utfnormalDir = null;
if ( count( $argv ) > 1 ) {
// Path to UtfNormal was supplied as the first CLI argument.
$utfnormalDir = rtrim( $argv[1], '/' );
if ( !is_dir( $utfnormalDir ) ) {
die( "The specified UtfNormal directory '$utfnormalDir' does not exist\n" );
}
if ( file_exists( "$utfnormalDir/Validator.php" ) ) {
// Probably ok
} elseif ( file_exists( "$utfnormalDir/src/Validator.php" ) ) {
// Add the 'src' dir
$utfnormalDir = "$utfnormalDir/src";
} else {
// Doesn't look like UtfNormal, but honor the user's choice; the
// require_once calls below will fail loudly if the files are missing.
fprintf(
STDERR,
"Warning: Supplied path \"%s\" does not seem to contain UtfNormal. Trying it anyway.\n",
$utfnormalDir
);
}
} else {
// No path given: probe the usual locations relative to a MediaWiki checkout.
$trydirs = [
// Checkouts of mediawiki/core and mediawiki/extensions in the same directory
__DIR__ . '/../../../../../../../core/vendor/wikimedia/utfnormal/src',
// Scribunto checked out inside the 'extensions' directory of mediawiki/core
__DIR__ . '/../../../../../../../vendor/wikimedia/utfnormal/src',
];
// An explicit MW_INSTALL_PATH takes precedence over the relative guesses.
if ( getenv( 'MW_INSTALL_PATH' ) ) {
array_unshift( $trydirs, getenv( 'MW_INSTALL_PATH' ) . '/vendor/wikimedia/utfnormal/src' );
}
// Accept the first candidate that actually contains the data file.
foreach ( $trydirs as $trydir ) {
$trydir = realpath( $trydir );
if ( $trydir !== false && is_dir( $trydir ) && file_exists( "$trydir/UtfNormalData.inc" ) ) {
$utfnormalDir = $trydir;
break;
}
}
if ( !$utfnormalDir ) {
die( "Cannot find UtfNormal. Please specify the path explicitly.\n" );
}
}
//phpcs:disable MediaWiki.NamingConventions
/**
 * Load UtfNormal's Validator class and its two data files from $utfnormalDir.
 *
 * This function exists solely so we can suppress errors from static analysis
 * about the dynamic require_once paths.
 *
 * @suppress SecurityCheck-XSS
 * @suppress SecurityCheck-OTHER
 */
function loadDataFiles() {
global $utfnormalDir;
echo "Loading UtfNormal from $utfnormalDir...\n";
require_once "$utfnormalDir/Validator.php";
require_once "$utfnormalDir/UtfNormalData.inc";
require_once "$utfnormalDir/UtfNormalDataK.inc";
}
//phpcs:enable MediaWiki.NamingConventions
loadDataFiles();
// Sanity check: all five data tables must have been populated by the data
// files just loaded, otherwise the generated Lua table would be incomplete.
$neededTables = [
	Validator::$utfCheckNFC,
	Validator::$utfCombiningClass,
	Validator::$utfCanonicalDecomp,
	Validator::$utfCanonicalComp,
	Validator::$utfCompatibilityDecomp,
];
foreach ( $neededTables as $neededTable ) {
	if ( !$neededTable ) {
		die( "UtfNormal data files did not contain needed data.\n" );
	}
}
// @codingStandardsIgnoreLine MediaWiki.NamingConventions.PrefixedGlobalFunctions
/**
 * Convert a UTF-8 string to its Unicode code point(s).
 *
 * @param string $c UTF-8 string
 * @param bool $firstOnly True to return only the first code point
 * @return int|int[] First code point, or a 1-based array of all code points
 */
function uord( $c, $firstOnly ) {
	// Re-encode as UTF-32BE so each code point is one 32-bit big-endian
	// integer, then unpack them all (unpack() returns a 1-based array).
	$codepoints = unpack( 'N*', mb_convert_encoding( $c, 'UTF-32BE', 'UTF-8' ) );
	if ( $firstOnly ) {
		return $codepoints[1];
	}
	return $codepoints;
}
// Open the output file and emit the file header plus the start of the
// 'check' sub-table.
echo "Creating normalization table...\n";
$X = fopen( __DIR__ . '/normalization-data.lua', 'w' );
if ( !$X ) {
	die( "Failed to open normalization-data.lua\n" );
}
fwrite( $X, "-- This file is automatically generated by make-normalization-table.php\n" );
fwrite( $X, "local normal = {\n" );
fwrite( $X, "\t-- Characters that might change depending on the following combiner\n" );
fwrite( $X, "\t-- (minus any that are themselves combiners, those are added later)\n" );
fwrite( $X, "\tcheck = {\n" );
// NFC quick-check characters. Combining characters are omitted here; the
// generated Lua adds them back via a metatable fallback to combclass.
foreach ( array_keys( Validator::$utfCheckNFC ) as $char ) {
	if ( isset( Validator::$utfCombiningClass[$char] ) ) {
		// Skip, because it's in the other table already
		continue;
	}
	fprintf( $X, "\t\t[0x%06x] = 1,\n", uord( $char, true ) );
}
fwrite( $X, "\t},\n\n" );
// Emit the combining-class table; also record each combining character's
// code point in $comb, which the composition pass below consults.
fwrite( $X, "\t-- Combining characters, mapped to combining class\n" );
fwrite( $X, "\tcombclass = {\n" );
$comb = [];
foreach ( Validator::$utfCombiningClass as $char => $class ) {
	$cp = uord( $char, true );
	$comb[$cp] = 1;
	fprintf( $X, "\t\t[0x%06x] = %d,\n", $cp, $class );
}
fwrite( $X, "\t},\n\n" );
// Canonical decompositions: each character maps to the list of code points
// it decomposes to. Hangul syllables are decomposed algorithmically in the
// Lua epilogue instead of being listed here.
fwrite( $X, "\t-- Characters mapped to what they decompose to\n" );
fwrite( $X, "\t-- Note Hangul to Jamo is done separately below\n" );
fwrite( $X, "\tdecomp = {\n" );
foreach ( Validator::$utfCanonicalDecomp as $char => $decomposition ) {
	$hexParts = [];
	foreach ( uord( $decomposition, false ) as $cp ) {
		$hexParts[] = sprintf( '0x%06x', $cp );
	}
	fprintf( $X, "\t\t[0x%06x] = { %s },\n", uord( $char, true ), implode( ', ', $hexParts ) );
}
fwrite( $X, "\t},\n\n" );
// Compatibility (NFKD) decompositions, omitting entries identical to the
// canonical table; the Lua epilogue makes decompK fall back to decomp.
fwrite( $X, "\tdecompK = {\n" );
foreach ( Validator::$utfCompatibilityDecomp as $char => $decomposition ) {
	if ( isset( Validator::$utfCanonicalDecomp[$char] )
		&& Validator::$utfCanonicalDecomp[$char] === $decomposition
	) {
		// Skip duplicates
		continue;
	}
	$hexParts = [];
	foreach ( uord( $decomposition, false ) as $cp ) {
		$hexParts[] = sprintf( '0x%06x', $cp );
	}
	fprintf( $X, "\t\t[0x%06x] = { %s },\n", uord( $char, true ), implode( ', ', $hexParts ) );
}
fwrite( $X, "\t},\n\n" );
fwrite( $X, "\t-- Character-pairs mapped to what they compose to\n" );
fwrite( $X, "\t-- Note Jamo to Hangul is done separately below\n" );
// Build a nested map: first code point => second code point => composition.
$t = [];
foreach ( Validator::$utfCanonicalComp as $pair => $composed ) {
	// unpack()'s result is 1-based: [1] is the starter, [2] the combiner.
	$cps = uord( $pair, false );
	if ( count( $cps ) == 1 ) {
		// No idea why these are in the file
		continue;
	}
	if ( isset( $comb[$cps[1]] ) ) {
		// Non-starter, no idea why these are in the file either
		continue;
	}
	$t[$cps[1]][$cps[2]] = uord( $composed, true );
}
// Emit the composition table, sorted by code point for stable output, and
// close the top-level 'normal' table.
fwrite( $X, "\tcomp = {\n" );
ksort( $t );
foreach ( $t as $first => $seconds ) {
	fprintf( $X, "\t\t[0x%06x] = {\n", $first );
	ksort( $seconds );
	foreach ( $seconds as $second => $composed ) {
		if ( $second < 0 ) {
			// NOTE(review): a negative key presumably can't be a real code
			// point ('N' unpacks unsigned 32-bit values); branch preserved
			// from the original just in case — confirm whether it can fire.
			fprintf( $X, "\t\t\t[-1] = 0x%06x,\n", $composed );
		} else {
			fprintf( $X, "\t\t\t[0x%06x] = 0x%06x,\n", $second, $composed );
		}
	}
	fwrite( $X, "\t\t},\n" );
}
fwrite( $X, "\t},\n" );
fwrite( $X, "}\n" );
// Append the hand-written Lua epilogue verbatim: metatables that handle the
// algorithmic Hangul/Jamo (de)composition and the table fallbacks, plus the
// final 'return normal'. Do not edit the heredoc's indentation or content —
// it is written byte-for-byte into normalization-data.lua.
fprintf( $X, "\n%s\n", <<<LUA
-- All combining characters need to be checked, so just do that
setmetatable( normal.check, { __index = normal.combclass } )
-- Handle Hangul to Jamo decomposition
setmetatable( normal.decomp, { __index = function ( _, k )
if k >= 0xac00 and k <= 0xd7a3 then
-- Decompose a Hangul syllable into Jamo
k = k - 0xac00
local ret = {
0x1100 + math.floor( k / 588 ),
0x1161 + math.floor( ( k % 588 ) / 28 )
}
if k % 28 ~= 0 then
ret[3] = 0x11a7 + ( k % 28 )
end
return ret
end
return nil
end } )
-- Handle Jamo to Hangul composition
local jamo_l_v_mt = { __index = function ( t, k )
if k >= 0x1161 and k <= 0x1175 then
-- Jamo leading + Jamo vowel
return t.base + 28 * ( k - 0x1161 )
end
return nil
end }
local hangul_jamo_mt = { __index = function ( t, k )
if k >= 0x11a7 and k <= 0x11c2 then
-- Hangul + jamo final
return t.base + k - 0x11a7
end
return nil
end }
setmetatable( normal.comp, { __index = function ( t, k )
if k >= 0x1100 and k <= 0x1112 then
-- Jamo leading, return a second table that combines with a Jamo vowel
local t2 = { base = 0xac00 + 588 * ( k - 0x1100 ) }
setmetatable( t2, jamo_l_v_mt )
t[k] = t2 -- cache it
return t2
elseif k >= 0xac00 and k <= 0xd7a3 and k % 28 == 16 then
-- Hangul. "k % 28 == 16" picks out just the ones that are
-- Jamo leading + vowel, no final. Return a second table that combines
-- with a Jamo final.
local t2 = { base = k }
setmetatable( t2, hangul_jamo_mt )
t[k] = t2 -- cache it
return t2
end
return nil
end } )
-- Compatibility decomposition falls back to the normal decomposition
setmetatable( normal.decompK, { __index = normal.decomp } )
return normal
LUA
);
// All output written; close the generated file.
fclose( $X );