mirror of
https://gerrit.wikimedia.org/r/mediawiki/extensions/Scribunto
synced 2024-11-27 17:50:06 +00:00
Add toNFKC and toNFKD to mw.ustring
This also makes some updates to make-normalization-table.php to handle the move of UtfNormal to a separate library. Bug: T126427 Change-Id: Id4985c3ca441cf92f08ba1f1af85c762ba43d7d2
This commit is contained in:
parent
22153435a2
commit
b3da8a698d
|
@ -59,6 +59,8 @@ class Scribunto_LuaUstringLibrary extends Scribunto_LuaLibraryBase {
|
|||
'codepoint' => array( $this, 'ustringCodepoint' ),
|
||||
'toNFC' => array( $this, 'ustringToNFC' ),
|
||||
'toNFD' => array( $this, 'ustringToNFD' ),
|
||||
'toNFKC' => array( $this, 'ustringToNFKC' ),
|
||||
'toNFKD' => array( $this, 'ustringToNFKD' ),
|
||||
'char' => array( $this, 'ustringChar' ),
|
||||
'len' => array( $this, 'ustringLen' ),
|
||||
'sub' => array( $this, 'ustringSub' ),
|
||||
|
@ -173,6 +175,22 @@ class Scribunto_LuaUstringLibrary extends Scribunto_LuaLibraryBase {
|
|||
return array( UtfNormal::toNFD( $s ) );
|
||||
}
|
||||
|
||||
/**
 * Handler registered as 'toNFKC' in the mw.ustring interface.
 *
 * @param string $s Value passed from Lua; checked with checkString() first
 * @return array Single-element array holding UtfNormal::toNFKC( $s ), or
 *  null when checkEncoding() rejects $s
 */
public function ustringToNFKC( $s ) {
	$this->checkString( 'toNFKC', $s, false );

	// Only normalize input that passes the encoding check
	$normalized = $this->checkEncoding( $s ) ? UtfNormal::toNFKC( $s ) : null;

	return array( $normalized );
}
|
||||
|
||||
/**
 * Handler registered as 'toNFKD' in the mw.ustring interface.
 *
 * @param string $s Value passed from Lua; checked with checkString() first
 * @return array Single-element array holding UtfNormal::toNFKD( $s ), or
 *  null when checkEncoding() rejects $s
 */
public function ustringToNFKD( $s ) {
	$this->checkString( 'toNFKD', $s, false );

	// Only normalize input that passes the encoding check
	$normalized = $this->checkEncoding( $s ) ? UtfNormal::toNFKD( $s ) : null;

	return array( $normalized );
}
|
||||
|
||||
public function ustringChar() {
|
||||
$args = func_get_args();
|
||||
if ( count( $args ) > $this->stringLengthLimit ) {
|
||||
|
|
|
@ -13,6 +13,8 @@ if ( count( $argv ) > 1 ) {
|
|||
}
|
||||
} else {
|
||||
foreach( array(
|
||||
__DIR__ . '/../../../../../core/vendor/wikimedia/utfnormal/src/UtfNormalData.inc',
|
||||
__DIR__ . '/../../../../../vendor/wikimedia/utfnormal/src/UtfNormalData.inc',
|
||||
__DIR__ . '/../../../../../core/includes/libs/normal/UtfNormalData.inc',
|
||||
__DIR__ . '/../../../../../includes/libs/normal/UtfNormalData.inc',
|
||||
) as $tryfile ) {
|
||||
|
@ -27,15 +29,46 @@ if ( count( $argv ) > 1 ) {
|
|||
}
|
||||
}
|
||||
|
||||
echo "Loading data file $datafile...\n";
|
||||
// Locate the compatibility-decomposition data file (UtfNormalDataK.inc).
// An explicit path may be given as the second command-line argument;
// otherwise look next to $datafile first, then in the same candidate
// directories that are searched for UtfNormalData.inc.
$datafileK = null;
if ( count( $argv ) > 2 ) {
	$datafileK = $argv[2];
	if ( !file_exists( $datafileK ) ) {
		die( "The specified file '$datafileK' does not exist\n" );
	}
} else {
	// Every candidate must name the K file: picking up UtfNormalData.inc here
	// would leave UtfNormal::$utfCompatibilityDecomp unset after loading.
	foreach( array(
		dirname( $datafile ) . '/UtfNormalDataK.inc',
		__DIR__ . '/../../../../../core/vendor/wikimedia/utfnormal/src/UtfNormalDataK.inc',
		__DIR__ . '/../../../../../vendor/wikimedia/utfnormal/src/UtfNormalDataK.inc',
		__DIR__ . '/../../../../../core/includes/libs/normal/UtfNormalDataK.inc',
		__DIR__ . '/../../../../../includes/libs/normal/UtfNormalDataK.inc',
	) as $tryfile ) {
		// realpath() yields false for a path that does not resolve
		$tryfile = realpath( $tryfile );
		if ( $tryfile !== false && file_exists( $tryfile ) ) {
			$datafileK = $tryfile;
			break;
		}
	}
	if ( !$datafileK ) {
		die( "Cannot find UtfNormalDataK.inc. Please specify the path explicitly.\n" );
	}
}
|
||||
|
||||
/**
 * Minimal stand-in for the normalizer class: it only provides the static
 * slots that this script reads back after requiring the data files.
 */
class UtfNormal {
	public static $utfCheckNFC;
	public static $utfCombiningClass;
	public static $utfCanonicalDecomp;
	public static $utfCanonicalComp;
	public static $utfCompatibilityDecomp;
}

// Make the stub reachable under the namespaced name as well, so either
// class name refers to the same static slots.
class_alias( 'UtfNormal', 'UtfNormal\\Validator' );
|
||||
|
||||
// Pull in the canonical data first, then the compatibility (K) data. The
// files are expected to fill in the static members of the UtfNormal stub,
// which are checked after loading.
echo "Loading data file $datafile...\n";
require_once $datafile;

echo "Loading data file $datafileK...\n";
require_once $datafileK;
|
||||
|
||||
if ( !UtfNormal::$utfCheckNFC ||
|
||||
!UtfNormal::$utfCombiningClass ||
|
||||
!UtfNormal::$utfCanonicalDecomp ||
|
||||
|
@ -43,6 +76,9 @@ if ( !UtfNormal::$utfCheckNFC ||
|
|||
) {
|
||||
die( "Data file $datafile did not contain needed data.\n" );
|
||||
}
|
||||
// The compatibility table is reported against the K data file, so a missing
// or empty table names $datafileK in the error.
if ( empty( UtfNormal::$utfCompatibilityDecomp ) ) {
	die( "Data file $datafileK did not contain needed data.\n" );
}
|
||||
|
||||
function uord( $c, $firstOnly ) {
|
||||
$ret = unpack( 'N*', mb_convert_encoding( $c, 'UTF-32BE', 'UTF-8' ) );
|
||||
|
@ -90,6 +126,22 @@ foreach ( UtfNormal::$utfCanonicalDecomp as $k => $v ) {
|
|||
}
|
||||
fprintf( $X, "\t},\n\n" );
|
||||
|
||||
// Write the decompK table: one entry per compatibility decomposition, keyed
// by the source codepoint and listing the codepoints it decomposes to.
fprintf( $X, "\tdecompK = {\n" );
foreach ( UtfNormal::$utfCompatibilityDecomp as $char => $decomposed ) {
	// Entries identical to the canonical decomposition are left out; the
	// generated Lua gives decompK an __index fallback to decomp.
	$sameAsCanonical = isset( UtfNormal::$utfCanonicalDecomp[$char] )
		&& UtfNormal::$utfCanonicalDecomp[$char] === $decomposed;
	if ( $sameAsCanonical ) {
		continue;
	}

	$key = uord( $char, true );
	$hex = array();
	foreach ( uord( $decomposed, false ) as $codepoint ) {
		$hex[] = sprintf( '0x%06x', $codepoint );
	}
	fprintf( $X, "\t\t[0x%06x] = { %s },\n", $key, implode( ', ', $hex ) );
}
fprintf( $X, "\t},\n\n" );
|
||||
|
||||
fprintf( $X, "\t-- Character-pairs mapped to what they compose to\n" );
|
||||
fprintf( $X, "\t-- Note Jamo to Hangul is done separately below\n" );
|
||||
$t = array();
|
||||
|
@ -178,6 +230,9 @@ setmetatable( normal.comp, { __index = function ( t, k )
|
|||
return nil
|
||||
end } )
|
||||
|
||||
-- Compatibility decomposition falls back to the normal decomposition
|
||||
setmetatable( normal.decompK, { __index = normal.decomp } )
|
||||
|
||||
return normal
|
||||
LUA
|
||||
);
|
||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -1011,14 +1011,14 @@ end
|
|||
---- Unicode Normalization ----
|
||||
-- These functions load a conversion table when called
|
||||
|
||||
local function internalToNFD( cps )
|
||||
local function internalDecompose( cps, decomp )
|
||||
local cp = {}
|
||||
local normal = require 'ustring/normalization-data'
|
||||
|
||||
-- Decompose into cp, using the lookup table and logic for hangul
|
||||
for i = 1, cps.len do
|
||||
local c = cps.codepoints[i]
|
||||
local m = normal.decomp[c]
|
||||
local m = decomp[c]
|
||||
if m then
|
||||
for j = 0, #m do
|
||||
cp[#cp + 1] = m[j]
|
||||
|
@ -1048,6 +1048,49 @@ local function internalToNFD( cps )
|
|||
return cp, 1, l
|
||||
end
|
||||
|
||||
-- Canonically compose an already-decomposed codepoint sequence.
--
-- Composition can never make the sequence longer, so the result is written
-- back into cp and only the end index changes.
--
-- @param cp table Array of codepoints; modified in place
-- @param _ any Ignored (lets the results of the decompose step be passed straight in)
-- @param l number Index of the last codepoint in cp to process
-- @return table cp
-- @return number Always 1
-- @return number Index of the last codepoint after composition (0 if l < 1)
local function internalCompose( cp, _, l )
	-- Nothing to compose: report an empty range instead of a phantom element
	if l < 1 then
		return cp, 1, 0
	end

	local normal = require 'ustring/normalization-data'
	local compTable, classOf = normal.comp, normal.combclass

	local comp = compTable[cp[1]] -- compositions available for the current starter
	local sc = 1                  -- index in cp of the current starter
	local j = 1                   -- index of the last codepoint written
	local lastclass = 0           -- combining class of the last uncombined combiner
	for i = 2, l do
		local c = cp[i]
		local ccc = classOf[c]

		-- A combiner may merge with the starter only if its class is higher
		-- than that of the last combiner kept; a starter may merge only if no
		-- combiner has been kept since the current starter.
		local mayCombine
		if ccc then
			mayCombine = lastclass < ccc
		else
			mayCombine = lastclass == 0
		end

		local merged = comp and mayCombine and comp[c]
		if merged then
			-- Fold this codepoint into the starter
			cp[sc] = merged
			comp = compTable[merged]
		elseif ccc then
			-- Combiner that does not compose: copy it to its output position
			j = j + 1
			cp[j] = c
			lastclass = ccc
		else
			-- New starter that does not compose with the previous one
			j = j + 1
			cp[j] = c
			comp = compTable[c]
			sc = j
			lastclass = 0
		end
	end

	return cp, 1, j
end
|
||||
|
||||
-- Normalize a string to NFC
|
||||
--
|
||||
-- Based on MediaWiki's UtfNormal class. Returns nil if the string is not valid
|
||||
|
@ -1082,47 +1125,8 @@ function ustring.toNFC( s )
|
|||
return s
|
||||
end
|
||||
|
||||
-- Next, expand to NFD
|
||||
local cp, _, l = internalToNFD( cps )
|
||||
|
||||
-- Then combine to NFC. Since NFD->NFC can never expand a character
|
||||
-- sequence, we can do this in-place.
|
||||
local comp = normal.comp[cp[1]]
|
||||
local sc = 1
|
||||
local j = 1
|
||||
local lastclass = 0
|
||||
for i = 2, l do
|
||||
local c = cp[i]
|
||||
local ccc = normal.combclass[c]
|
||||
if ccc then
|
||||
-- Trying a combiner with the starter
|
||||
if comp and lastclass < ccc and comp[c] then
|
||||
-- Yes!
|
||||
c = comp[c]
|
||||
cp[sc] = c
|
||||
comp = normal.comp[c]
|
||||
else
|
||||
-- No, copy it to the right place for output
|
||||
j = j + 1
|
||||
cp[j] = c
|
||||
lastclass = ccc
|
||||
end
|
||||
elseif comp and lastclass == 0 and comp[c] then
|
||||
-- Combining two adjacent starters
|
||||
c = comp[c]
|
||||
cp[sc] = c
|
||||
comp = normal.comp[c]
|
||||
else
|
||||
-- New starter, doesn't combine
|
||||
j = j + 1
|
||||
cp[j] = c
|
||||
comp = normal.comp[c]
|
||||
sc = j
|
||||
lastclass = 0
|
||||
end
|
||||
end
|
||||
|
||||
return internalChar( cp, 1, j )
|
||||
-- Next, expand to NFD then recompose
|
||||
return internalChar( internalCompose( internalDecompose( cps, normal.decomp ) ) )
|
||||
end
|
||||
|
||||
-- Normalize a string to NFD
|
||||
|
@ -1135,7 +1139,7 @@ end
|
|||
function ustring.toNFD( s )
|
||||
checkString( 'toNFD', s )
|
||||
|
||||
-- ASCII is always NFC
|
||||
-- ASCII is always NFD
|
||||
if not S.find( s, '[\128-\255]' ) then
|
||||
return s
|
||||
end
|
||||
|
@ -1145,7 +1149,57 @@ function ustring.toNFD( s )
|
|||
return nil
|
||||
end
|
||||
|
||||
return internalChar( internalToNFD( cps ) )
|
||||
local normal = require 'ustring/normalization-data'
|
||||
return internalChar( internalDecompose( cps, normal.decomp ) )
|
||||
end
|
||||
|
||||
-- Convert a string to Unicode Normalization Form KC: decompose using the
-- compatibility table, then recompose.
--
-- Based on MediaWiki's UtfNormal class. Returns nil if the string is not valid
-- UTF-8.
--
-- @param s string
-- @return string|nil
function ustring.toNFKC( s )
	checkString( 'toNFKC', s )

	-- Input without any high bytes is pure ASCII, which is already NFKC
	if not S.find( s, '[\128-\255]' ) then
		return s
	end

	local codepoints = utf8_explode( s )
	if codepoints == nil then
		return nil
	end

	-- Expand to NFKD, recompose, and turn the codepoints back into a string
	local data = require 'ustring/normalization-data'
	local cp, first, last = internalCompose( internalDecompose( codepoints, data.decompK ) )
	return internalChar( cp, first, last )
end
|
||||
|
||||
-- Convert a string to Unicode Normalization Form KD: decompose using the
-- compatibility table, without recomposing.
--
-- Based on MediaWiki's UtfNormal class. Returns nil if the string is not valid
-- UTF-8.
--
-- @param s string
-- @return string|nil
function ustring.toNFKD( s )
	checkString( 'toNFKD', s )

	-- Input without any high bytes is pure ASCII, which is already NFKD
	if not S.find( s, '[\128-\255]' ) then
		return s
	end

	local codepoints = utf8_explode( s )
	if codepoints == nil then
		return nil
	end

	-- Expand with the compatibility table and turn the codepoints back into a string
	local data = require 'ustring/normalization-data'
	local cp, first, last = internalDecompose( codepoints, data.decompK )
	return internalChar( cp, first, last )
end
|
||||
|
||||
return ustring
|
||||
|
|
|
@ -18,6 +18,16 @@ return {
|
|||
tohex( mw.ustring.toNFD( c2 ) ),
|
||||
tohex( mw.ustring.toNFD( c3 ) ),
|
||||
tohex( mw.ustring.toNFD( c4 ) ),
|
||||
tohex( mw.ustring.toNFD( c5 ) )
|
||||
tohex( mw.ustring.toNFD( c5 ) ),
|
||||
tohex( mw.ustring.toNFKC( c1 ) ),
|
||||
tohex( mw.ustring.toNFKC( c2 ) ),
|
||||
tohex( mw.ustring.toNFKC( c3 ) ),
|
||||
tohex( mw.ustring.toNFKC( c4 ) ),
|
||||
tohex( mw.ustring.toNFKC( c5 ) ),
|
||||
tohex( mw.ustring.toNFKD( c1 ) ),
|
||||
tohex( mw.ustring.toNFKD( c2 ) ),
|
||||
tohex( mw.ustring.toNFKD( c3 ) ),
|
||||
tohex( mw.ustring.toNFKD( c4 ) ),
|
||||
tohex( mw.ustring.toNFKD( c5 ) )
|
||||
end
|
||||
}
|
||||
|
|
|
@ -41,7 +41,12 @@ class Scribunto_LuaUstringLibraryTests extends Scribunto_LuaEngineTestBase {
|
|||
public function testUstringLibraryNormalizationTests( $name, $c1, $c2, $c3, $c4, $c5 ) {
|
||||
$this->luaTestName = "UstringLibraryNormalization: $name";
|
||||
$dataProvider = $this->provideUstringLibraryNormalizationTests();
|
||||
$expected = array( $c2, $c2, $c2, $c4, $c4, $c3, $c3, $c3, $c5, $c5 );
|
||||
$expected = array(
|
||||
$c2, $c2, $c2, $c4, $c4, // NFC
|
||||
$c3, $c3, $c3, $c5, $c5, // NFD
|
||||
$c4, $c4, $c4, $c4, $c4, // NFKC
|
||||
$c5, $c5, $c5, $c5, $c5, // NFKD
|
||||
);
|
||||
foreach ( $expected as &$e ) {
|
||||
$chars = array_values( unpack( 'N*', mb_convert_encoding( $e, 'UTF-32BE', 'UTF-8' ) ) );
|
||||
foreach ( $chars as &$c ) {
|
||||
|
|
Loading…
Reference in a new issue