mirror of
https://gerrit.wikimedia.org/r/mediawiki/extensions/Scribunto
synced 2024-12-13 08:48:39 +00:00
164974c4b5
Ideally we'd just have composer.json require UtfNormal so we'd know where it is and have an autoloader to load it for us, but that seems to not be done in the world of MediaWiki extensions. Previously we had been taking paths to the two data files from UtfNormal and loading them into a stub class, but phan has started complaining about the definition of the stub class colliding with the real UtfNormal. So let's try loading the real UtfNormal\Validator and its data files. Hopefully this continues to not try to pull in any other files via the nonexistent autoloader. Change-Id: I93baf20f0eef1892685e272793b4f99236e8c905
235 lines
6.8 KiB
PHP
Executable file
#!/usr/bin/php
<?php

use UtfNormal\Validator;

// This is a maintenance/generator script; refuse to run under a web SAPI.
if ( !in_array( PHP_SAPI, [ 'cli', 'phpdbg' ], true ) ) {
	die( "This script may only be executed from the command line.\n" );
}
|
|
|
|
// Locate the UtfNormal library: either from an explicit command-line path,
// or by probing the directory layouts of common MediaWiki checkouts.
$utfnormalDir = null;
if ( count( $argv ) > 1 ) {
	$utfnormalDir = rtrim( $argv[1], '/' );
	if ( !is_dir( $utfnormalDir ) ) {
		die( "The specified UtfNormal directory '$utfnormalDir' does not exist\n" );
	}
	if ( file_exists( "$utfnormalDir/Validator.php" ) ) {
		// Looks like the library root (or its src dir) was given directly.
	} elseif ( file_exists( "$utfnormalDir/src/Validator.php" ) ) {
		// Caller gave the package root; point at its 'src' subdirectory.
		$utfnormalDir = "$utfnormalDir/src";
	} else {
		// Not fatal: the path might still work, so warn and continue.
		fprintf(
			STDERR,
			"Warning: Supplied path \"%s\" does not seem to contain UtfNormal. Trying it anyway.\n",
			$utfnormalDir
		);
	}
} else {
	$candidateDirs = [
		// Checkouts of mediawiki/core and mediawiki/extensions in the same directory
		__DIR__ . '/../../../../../../../core/vendor/wikimedia/utfnormal/src',
		// Scribunto checked out inside the 'extensions' directory of mediawiki/core
		__DIR__ . '/../../../../../../../vendor/wikimedia/utfnormal/src',
	];
	if ( getenv( 'MW_INSTALL_PATH' ) ) {
		// An explicitly configured MediaWiki install path takes precedence.
		array_unshift( $candidateDirs, getenv( 'MW_INSTALL_PATH' ) . '/vendor/wikimedia/utfnormal/src' );
	}
	foreach ( $candidateDirs as $candidate ) {
		$resolved = realpath( $candidate );
		// Require the data file, not just the directory, to be sure we found it.
		if ( $resolved !== false && is_dir( $resolved ) && file_exists( "$resolved/UtfNormalData.inc" ) ) {
			$utfnormalDir = $resolved;
			break;
		}
	}
	if ( !$utfnormalDir ) {
		die( "Cannot find UtfNormal. Please specify the path explicitly.\n" );
	}
}
|
|
|
|
//phpcs:disable MediaWiki.NamingConventions
/**
 * Load UtfNormal's Validator class and its data files.
 *
 * This function exists solely so that the phan suppressions below can be
 * scoped to these file loads (see the change description: no autoloader is
 * available for MediaWiki extensions here, so the files are required directly).
 *
 * @suppress SecurityCheck-XSS
 * @suppress SecurityCheck-OTHER
 */
function loadDataFiles() {
	// Resolved above from argv or by probing known checkout layouts.
	global $utfnormalDir;
	echo "Loading UtfNormal from $utfnormalDir...\n";
	// Validator.php declares the class; the two .inc files populate its
	// static data tables (Validator::$utf*), which are read below.
	require_once "$utfnormalDir/Validator.php";
	require_once "$utfnormalDir/UtfNormalData.inc";
	require_once "$utfnormalDir/UtfNormalDataK.inc";
}
//phpcs:enable MediaWiki.NamingConventions
loadDataFiles();
|
|
|
|
// Sanity check: the data files must have populated every table we read below.
$haveAllData = Validator::$utfCheckNFC &&
	Validator::$utfCombiningClass &&
	Validator::$utfCanonicalDecomp &&
	Validator::$utfCanonicalComp &&
	Validator::$utfCompatibilityDecomp;
if ( !$haveAllData ) {
	die( "UtfNormal data files did not contain needed data.\n" );
}
|
|
|
|
// @codingStandardsIgnoreLine MediaWiki.NamingConventions.PrefixedGlobalFunctions
/**
 * Convert a UTF-8 string to its Unicode code point(s).
 *
 * @param string $c Non-empty UTF-8 string
 * @param bool $firstOnly Whether to return only the first code point
 * @return int|int[] The first code point, or a 1-based array of all of them
 *  (1-based because that is what unpack( 'N*' ) produces)
 */
function uord( $c, $firstOnly ) {
	$codepoints = unpack( 'N*', mb_convert_encoding( $c, 'UTF-32BE', 'UTF-8' ) );
	if ( $firstOnly ) {
		return $codepoints[1];
	}
	return $codepoints;
}
|
|
|
|
echo "Creating normalization table...\n";

// Open the generated-Lua output file next to this script; $X is written to
// by everything below and closed at the end.
$X = fopen( __DIR__ . '/normalization-data.lua', 'w' );
if ( $X === false ) {
	die( "Failed to open normalization-data.lua\n" );
}
|
|
// Emit the file header and the 'check' table: characters whose normalized
// form can depend on a following combining character.
fwrite( $X, "-- This file is automatically generated by make-normalization-table.php\n" );
fwrite( $X, "local normal = {\n" );
fwrite( $X,
	"\t-- Characters that might change depending on the following combiner\n" .
	"\t-- (minus any that are themselves combiners, those are added later)\n" .
	"\tcheck = {\n"
);
foreach ( Validator::$utfCheckNFC as $char => $unused ) {
	// Combiners are omitted here; the Lua epilogue chains 'check' to
	// 'combclass' via a metatable, so they are covered there instead.
	if ( !isset( Validator::$utfCombiningClass[$char] ) ) {
		fprintf( $X, "\t\t[0x%06x] = 1,\n", uord( $char, true ) );
	}
}
fwrite( $X, "\t},\n\n" );
|
|
// Emit the 'combclass' table, and remember every combining character's
// code point in $comb for the composition pass below.
fwrite( $X, "\t-- Combining characters, mapped to combining class\n" );
fwrite( $X, "\tcombclass = {\n" );
$comb = [];
foreach ( Validator::$utfCombiningClass as $char => $class ) {
	$codepoint = uord( $char, true );
	$comb[$codepoint] = 1;
	fprintf( $X, "\t\t[0x%06x] = %d,\n", $codepoint, $class );
}
fwrite( $X, "\t},\n\n" );
|
|
// Emit the canonical decomposition table.
fwrite( $X, "\t-- Characters mapped to what they decompose to\n" );
fwrite( $X, "\t-- Note Hangul to Jamo is done separately below\n" );
fwrite( $X, "\tdecomp = {\n" );
foreach ( Validator::$utfCanonicalDecomp as $char => $decomposition ) {
	// Format the decomposition as a comma-separated hex code point list.
	$parts = [];
	foreach ( uord( $decomposition, false ) as $codepoint ) {
		$parts[] = sprintf( '0x%06x', $codepoint );
	}
	fprintf( $X, "\t\t[0x%06x] = { %s },\n", uord( $char, true ), implode( ', ', $parts ) );
}
fwrite( $X, "\t},\n\n" );
|
|
|
|
// Emit the compatibility decomposition table. Entries identical to the
// canonical table are omitted; the Lua epilogue makes 'decompK' fall back
// to 'decomp' via a metatable.
fwrite( $X, "\tdecompK = {\n" );
foreach ( Validator::$utfCompatibilityDecomp as $char => $decomposition ) {
	if ( isset( Validator::$utfCanonicalDecomp[$char] )
		&& Validator::$utfCanonicalDecomp[$char] === $decomposition
	) {
		// Skip duplicates
		continue;
	}
	$parts = [];
	foreach ( uord( $decomposition, false ) as $codepoint ) {
		$parts[] = sprintf( '0x%06x', $codepoint );
	}
	fprintf( $X, "\t\t[0x%06x] = { %s },\n", uord( $char, true ), implode( ', ', $parts ) );
}
fwrite( $X, "\t},\n\n" );
|
|
|
|
fwrite( $X, "\t-- Character-pairs mapped to what they compose to\n" );
fwrite( $X, "\t-- Note Jamo to Hangul is done separately below\n" );
// Build the pair table in $t first so it can be sorted before emission.
$t = [];
foreach ( Validator::$utfCanonicalComp as $pair => $composed ) {
	$codepoints = uord( $pair, false );
	if ( count( $codepoints ) == 1 ) {
		// No idea why these are in the file
		continue;
	}
	if ( isset( $comb[$codepoints[1]] ) ) {
		// Non-starter, no idea why these are in the file either
		continue;
	}
	// Map (first codepoint, second codepoint) -> composed codepoint.
	$t[$codepoints[1]][$codepoints[2]] = uord( $composed, true );
}
|
|
// Emit the nested composition table, sorted by code point at both levels.
fwrite( $X, "\tcomp = {\n" );
ksort( $t );
foreach ( $t as $first => $seconds ) {
	fprintf( $X, "\t\t[0x%06x] = {\n", $first );
	ksort( $seconds );
	foreach ( $seconds as $second => $composed ) {
		if ( $second < 0 ) {
			// Preserve the special-case emission for negative keys
			// (presumably a guard against integer wrap on 32-bit PHP —
			// not observed for valid code points; verify if it fires).
			fprintf( $X, "\t\t\t[-1] = 0x%06x,\n", $composed );
		} else {
			fprintf( $X, "\t\t\t[0x%06x] = 0x%06x,\n", $second, $composed );
		}
	}
	fwrite( $X, "\t\t},\n" );
}
fwrite( $X, "\t},\n" );

// Close the top-level 'normal' table literal.
fwrite( $X, "}\n" );
|
|
|
|
// Append the static Lua epilogue: metatables providing algorithmic
// Hangul <-> Jamo (de)composition plus table fallbacks, then return the
// 'normal' table. NOTE: the heredoc below is Lua source copied verbatim
// into the generated file — do not edit it as PHP.
fprintf( $X, "\n%s\n", <<<LUA
-- All combining characters need to be checked, so just do that
setmetatable( normal.check, { __index = normal.combclass } )

-- Handle Hangul to Jamo decomposition
setmetatable( normal.decomp, { __index = function ( _, k )
	if k >= 0xac00 and k <= 0xd7a3 then
		-- Decompose a Hangul syllable into Jamo
		k = k - 0xac00
		local ret = {
			0x1100 + math.floor( k / 588 ),
			0x1161 + math.floor( ( k % 588 ) / 28 )
		}
		if k % 28 ~= 0 then
			ret[3] = 0x11a7 + ( k % 28 )
		end
		return ret
	end
	return nil
end } )

-- Handle Jamo to Hangul composition
local jamo_l_v_mt = { __index = function ( t, k )
	if k >= 0x1161 and k <= 0x1175 then
		-- Jamo leading + Jamo vowel
		return t.base + 28 * ( k - 0x1161 )
	end
	return nil
end }
local hangul_jamo_mt = { __index = function ( t, k )
	if k >= 0x11a7 and k <= 0x11c2 then
		-- Hangul + jamo final
		return t.base + k - 0x11a7
	end
	return nil
end }
setmetatable( normal.comp, { __index = function ( t, k )
	if k >= 0x1100 and k <= 0x1112 then
		-- Jamo leading, return a second table that combines with a Jamo vowel
		local t2 = { base = 0xac00 + 588 * ( k - 0x1100 ) }
		setmetatable( t2, jamo_l_v_mt )
		t[k] = t2 -- cache it
		return t2
	elseif k >= 0xac00 and k <= 0xd7a3 and k % 28 == 16 then
		-- Hangul. "k % 28 == 16" picks out just the ones that are
		-- Jamo leading + vowel, no final. Return a second table that combines
		-- with a Jamo final.
		local t2 = { base = k }
		setmetatable( t2, hangul_jamo_mt )
		t[k] = t2 -- cache it
		return t2
	end
	return nil
end } )

-- Compatibility decomposition falls back to the normal decomposition
setmetatable( normal.decompK, { __index = normal.decomp } )

return normal
LUA
);
|
|
|
|
// All tables and the Lua epilogue have been written; close the output file.
fclose( $X );