mirror of
https://gerrit.wikimedia.org/r/mediawiki/extensions/Scribunto
synced 2025-01-05 19:44:12 +00:00
1eecdac6de
Change-Id: I6d730d67decc859fd130fee5ec92b1cfb8d9ef64
232 lines
6.9 KiB
PHP
Executable file
232 lines
6.9 KiB
PHP
Executable file
#!/usr/bin/php
|
|
<?php
|
|
|
|
use UtfNormal\Validator;
|
|
|
|
// Only the command-line SAPIs (cli and phpdbg) may run this script.
if ( !in_array( PHP_SAPI, [ 'cli', 'phpdbg' ], true ) ) {
	die( "This script may only be executed from the command line.\n" );
}
|
|
|
|
// Locate the UtfNormal sources. A path given as the first command-line
// argument is used as-is (optionally descending into its 'src' subdirectory);
// without an argument a few candidate locations are probed in order.
$utfnormalDir = null;
if ( count( $argv ) < 2 ) {
	$candidates = [
		// Checkouts of mediawiki/core and mediawiki/extensions in the same directory
		__DIR__ . '/../../../../../../../core/vendor/wikimedia/utfnormal/src',
		// Scribunto checked out inside the 'extensions' directory of mediawiki/core
		__DIR__ . '/../../../../../../../vendor/wikimedia/utfnormal/src',
	];
	$installPath = getenv( 'MW_INSTALL_PATH' );
	if ( $installPath ) {
		// A configured install path is tried before the relative guesses
		array_unshift( $candidates, $installPath . '/vendor/wikimedia/utfnormal/src' );
	}

	foreach ( $candidates as $candidate ) {
		$resolved = realpath( $candidate );
		if ( $resolved === false || !is_dir( $resolved ) || !file_exists( "$resolved/UtfNormalData.inc" ) ) {
			continue;
		}
		// First candidate that holds the data file wins
		$utfnormalDir = $resolved;
		break;
	}

	if ( !$utfnormalDir ) {
		die( "Cannot find UtfNormal. Please specify the path explicitly.\n" );
	}
} else {
	$utfnormalDir = rtrim( $argv[1], '/' );
	if ( !is_dir( $utfnormalDir ) ) {
		// @phan-suppress-next-line SecurityCheck-XSS
		die( "The specified UtfNormal directory '$utfnormalDir' does not exist\n" );
	}

	// If Validator.php sits directly in the directory, it is used unchanged
	$hasValidator = file_exists( "$utfnormalDir/Validator.php" );
	if ( !$hasValidator && file_exists( "$utfnormalDir/src/Validator.php" ) ) {
		// Add the 'src' dir
		$utfnormalDir = "$utfnormalDir/src";
	} elseif ( !$hasValidator ) {
		fprintf(
			STDERR,
			"Warning: Supplied path \"%s\" does not seem to contain UtfNormal. Trying it anyway.\n",
			$utfnormalDir
		);
	}
}
|
|
|
|
// @phan-suppress-next-line SecurityCheck-XSS
echo "Loading UtfNormal from $utfnormalDir...\n";
// Pull in the Validator class first, then the two data files, in this order.
foreach ( [ 'Validator.php', 'UtfNormalData.inc', 'UtfNormalDataK.inc' ] as $utfnormalFile ) {
	// @phan-suppress-next-line SecurityCheck-PathTraversal
	require_once "$utfnormalDir/$utfnormalFile";
}
|
|
|
|
// Every table read further down must be present and non-empty after loading.
$haveAllTables = Validator::$utfCheckNFC
	&& Validator::$utfCombiningClass
	&& Validator::$utfCanonicalDecomp
	&& Validator::$utfCanonicalComp
	&& Validator::$utfCompatibilityDecomp;
if ( !$haveAllTables ) {
	die( "UtfNormal data files did not contain needed data.\n" );
}
|
|
|
|
/**
 * Convert a UTF-8 string into Unicode code point numbers.
 *
 * The string is re-encoded as UTF-32BE and unpacked as big-endian 32-bit
 * integers, one per code point.
 *
 * @param string $c UTF-8 encoded input
 * @param bool $firstOnly True to return just the first code point
 * @return int[]|int The first code point when $firstOnly is true; otherwise
 *  all code points in an array indexed from 1, as produced by unpack()
 */
function uord( $c, $firstOnly ) { // phpcs:ignore MediaWiki.NamingConventions.PrefixedGlobalFunctions
	$utf32 = mb_convert_encoding( $c, 'UTF-32BE', 'UTF-8' );
	$codepoints = unpack( 'N*', $utf32 );
	if ( $firstOnly ) {
		return $codepoints[1];
	}
	return $codepoints;
}
|
|
|
|
echo "Creating normalization table...\n";
$X = fopen( __DIR__ . '/normalization-data.lua', 'w' );
if ( $X === false ) {
	die( "Failed to open normalization-data.lua\n" );
}

// File header, start of the Lua table, and the opening of the 'check' sub-table
fwrite(
	$X,
	"-- This file is automatically generated by make-normalization-table.php\n" .
	"local normal = {\n" .
	"\t-- Characters that might change depending on the following combiner\n" .
	"\t-- (minus any that are themselves combiners, those are added later)\n" .
	"\tcheck = {\n"
);
foreach ( array_keys( Validator::$utfCheckNFC ) as $char ) {
	// Characters with a combining class are skipped here, because they are
	// in the other table already
	if ( !isset( Validator::$utfCombiningClass[$char] ) ) {
		fprintf( $X, "\t\t[0x%06x] = 1,\n", uord( $char, true ) );
	}
}
fwrite( $X, "\t},\n\n" );
|
|
fwrite(
	$X,
	"\t-- Combining characters, mapped to combining class\n" .
	"\tcombclass = {\n"
);
// $comb records, by code point, every character that has a combining class
$comb = [];
foreach ( Validator::$utfCombiningClass as $char => $class ) {
	$codepoint = uord( $char, true );
	fprintf( $X, "\t\t[0x%06x] = %d,\n", $codepoint, $class );
	$comb[$codepoint] = 1;
}
fwrite( $X, "\t},\n\n" );
|
|
fwrite(
	$X,
	"\t-- Characters mapped to what they decompose to\n" .
	"\t-- Note Hangul to Jamo is done separately below\n" .
	"\tdecomp = {\n"
);
foreach ( Validator::$utfCanonicalDecomp as $char => $decomposed ) {
	// Render the decomposition as a comma-separated list of hex code points
	$parts = [];
	foreach ( uord( $decomposed, false ) as $codepoint ) {
		$parts[] = sprintf( '0x%06x', $codepoint );
	}
	fprintf( $X, "\t\t[0x%06x] = { %s },\n", uord( $char, true ), implode( ', ', $parts ) );
}
fwrite( $X, "\t},\n\n" );
|
|
|
|
fwrite( $X, "\tdecompK = {\n" );
foreach ( Validator::$utfCompatibilityDecomp as $char => $decomposed ) {
	// Only emit entries that differ from the canonical table; identical ones
	// would be duplicates
	$isDuplicate = isset( Validator::$utfCanonicalDecomp[$char] )
		&& Validator::$utfCanonicalDecomp[$char] === $decomposed;
	if ( !$isDuplicate ) {
		$hexList = array_map(
			static function ( $codepoint ) {
				return sprintf( '0x%06x', $codepoint );
			},
			uord( $decomposed, false )
		);
		fprintf( $X, "\t\t[0x%06x] = { %s },\n", uord( $char, true ), implode( ', ', $hexList ) );
	}
}
fwrite( $X, "\t},\n\n" );
|
|
|
|
fprintf( $X, "\t-- Character-pairs mapped to what they compose to\n" );
fprintf( $X, "\t-- Note Jamo to Hangul is done separately below\n" );

// Build a two-level map: first code point => second code point => composed
// code point.
$t = [];
foreach ( Validator::$utfCanonicalComp as $k => $v ) {
	$k = uord( $k, false );
	if ( count( $k ) == 1 ) {
		// No idea why these are in the file
		continue;
	}
	if ( isset( $comb[$k[1]] ) ) {
		// Non-starter, no idea why these are in the file either
		continue;
	}
	$t[$k[1]][$k[2]] = uord( $v, true );
}

fprintf( $X, "\tcomp = {\n" );
// Sort both levels by code point so the generated file is stable
ksort( $t );
foreach ( $t as $k1 => $v1 ) {
	fprintf( $X, "\t\t[0x%06x] = {\n", $k1 );
	ksort( $v1 );
	foreach ( $v1 as $k2 => $v2 ) {
		// Keys are code points from unpack( 'N*' ) and single-code-point keys
		// were skipped above, so no negative "[-1]" sentinel key can occur.
		fprintf( $X, "\t\t\t[0x%06x] = 0x%06x,\n", $k2, $v2 );
	}
	fprintf( $X, "\t\t},\n" );
}
fprintf( $X, "\t},\n" );

// Close the "local normal = {" table
fprintf( $X, "}\n" );
|
|
|
|
// Append the hand-written Lua that wires metatables onto the generated tables
// (algorithmic Hangul/Jamo handling and the decompK fallback), then close the
// output file.
//
// The Jamo-final range starts at U+11A8: U+11A7 is only the arithmetic base
// (TBase) and has trailing index 0, which must not compose. Accepting it
// would turn <LV syllable, U+11A7> into the bare LV syllable and silently
// drop the second character.
fprintf( $X, "\n%s\n", <<<LUA
-- All combining characters need to be checked, so just do that
setmetatable( normal.check, { __index = normal.combclass } )

-- Handle Hangul to Jamo decomposition
setmetatable( normal.decomp, { __index = function ( _, k )
	if k >= 0xac00 and k <= 0xd7a3 then
		-- Decompose a Hangul syllable into Jamo
		k = k - 0xac00
		local ret = {
			0x1100 + math.floor( k / 588 ),
			0x1161 + math.floor( ( k % 588 ) / 28 )
		}
		if k % 28 ~= 0 then
			ret[3] = 0x11a7 + ( k % 28 )
		end
		return ret
	end
	return nil
end } )

-- Handle Jamo to Hangul composition
local jamo_l_v_mt = { __index = function ( t, k )
	if k >= 0x1161 and k <= 0x1175 then
		-- Jamo leading + Jamo vowel
		return t.base + 28 * ( k - 0x1161 )
	end
	return nil
end }
local hangul_jamo_mt = { __index = function ( t, k )
	-- 0x11a7 is only the offset base; real Jamo finals start at 0x11a8
	if k >= 0x11a8 and k <= 0x11c2 then
		-- Hangul + jamo final
		return t.base + k - 0x11a7
	end
	return nil
end }
setmetatable( normal.comp, { __index = function ( t, k )
	if k >= 0x1100 and k <= 0x1112 then
		-- Jamo leading, return a second table that combines with a Jamo vowel
		local t2 = { base = 0xac00 + 588 * ( k - 0x1100 ) }
		setmetatable( t2, jamo_l_v_mt )
		t[k] = t2 -- cache it
		return t2
	elseif k >= 0xac00 and k <= 0xd7a3 and k % 28 == 16 then
		-- Hangul. "k % 28 == 16" picks out just the ones that are
		-- Jamo leading + vowel, no final. Return a second table that combines
		-- with a Jamo final.
		local t2 = { base = k }
		setmetatable( t2, hangul_jamo_mt )
		t[k] = t2 -- cache it
		return t2
	end
	return nil
end } )

-- Compatibility decomposition falls back to the normal decomposition
setmetatable( normal.decompK, { __index = normal.decomp } )

return normal
LUA
);

fclose( $X );
|