Add toNFKC and toNFKD to mw.ustring

This also makes some updates to make-normalization-table.php to handle
the move of UtfNormal to a separate library.

Bug: T126427
Change-Id: Id4985c3ca441cf92f08ba1f1af85c762ba43d7d2
This commit is contained in:
Brad Jorsch 2016-04-01 13:54:42 +03:00 committed by BryanDavis
parent 22153435a2
commit b3da8a698d
6 changed files with 5755 additions and 48 deletions

View file

@ -59,6 +59,8 @@ class Scribunto_LuaUstringLibrary extends Scribunto_LuaLibraryBase {
'codepoint' => array( $this, 'ustringCodepoint' ),
'toNFC' => array( $this, 'ustringToNFC' ),
'toNFD' => array( $this, 'ustringToNFD' ),
'toNFKC' => array( $this, 'ustringToNFKC' ),
'toNFKD' => array( $this, 'ustringToNFKD' ),
'char' => array( $this, 'ustringChar' ),
'len' => array( $this, 'ustringLen' ),
'sub' => array( $this, 'ustringSub' ),
@ -173,6 +175,22 @@ class Scribunto_LuaUstringLibrary extends Scribunto_LuaLibraryBase {
return array( UtfNormal::toNFD( $s ) );
}
/**
 * Handler behind the 'toNFKC' entry of the library's function table:
 * converts a string to Unicode Normalization Form KC.
 *
 * @param string $s String to normalize
 * @return array One-element array containing the NFKC form of $s, or
 *  null when checkEncoding() rejects $s
 */
public function ustringToNFKC( $s ) {
	$this->checkString( 'toNFKC', $s, false );
	$normalized = $this->checkEncoding( $s ) ? UtfNormal::toNFKC( $s ) : null;
	return array( $normalized );
}
/**
 * Handler behind the 'toNFKD' entry of the library's function table:
 * converts a string to Unicode Normalization Form KD.
 *
 * @param string $s String to normalize
 * @return array One-element array containing the NFKD form of $s, or
 *  null when checkEncoding() rejects $s
 */
public function ustringToNFKD( $s ) {
	$this->checkString( 'toNFKD', $s, false );
	$normalized = $this->checkEncoding( $s ) ? UtfNormal::toNFKD( $s ) : null;
	return array( $normalized );
}
public function ustringChar() {
$args = func_get_args();
if ( count( $args ) > $this->stringLengthLimit ) {

View file

@ -13,6 +13,8 @@ if ( count( $argv ) > 1 ) {
}
} else {
foreach( array(
__DIR__ . '/../../../../../core/vendor/wikimedia/utfnormal/src/UtfNormalData.inc',
__DIR__ . '/../../../../../vendor/wikimedia/utfnormal/src/UtfNormalData.inc',
__DIR__ . '/../../../../../core/includes/libs/normal/UtfNormalData.inc',
__DIR__ . '/../../../../../includes/libs/normal/UtfNormalData.inc',
) as $tryfile ) {
@ -27,15 +29,46 @@ if ( count( $argv ) > 1 ) {
}
}
echo "Loading data file $datafile...\n";
// Locate the compatibility-decomposition data file (UtfNormalDataK.inc).
// An explicit path may be passed as the second command-line argument;
// otherwise look next to the main data file first, then in the same
// install locations that are searched for UtfNormalData.inc.
$datafileK = null;
if ( count( $argv ) > 2 ) {
	$datafileK = $argv[2];
	if ( !file_exists( $datafileK ) ) {
		die( "The specified file '$datafileK' does not exist\n" );
	}
} else {
	// Every candidate must name the "K" file: picking up UtfNormalData.inc
	// here would leave $utfCompatibilityDecomp unset and fail later with a
	// misleading "did not contain needed data" error.
	foreach ( array(
		dirname( $datafile ) . '/UtfNormalDataK.inc',
		__DIR__ . '/../../../../../core/vendor/wikimedia/utfnormal/src/UtfNormalDataK.inc',
		__DIR__ . '/../../../../../vendor/wikimedia/utfnormal/src/UtfNormalDataK.inc',
		__DIR__ . '/../../../../../core/includes/libs/normal/UtfNormalDataK.inc',
		__DIR__ . '/../../../../../includes/libs/normal/UtfNormalDataK.inc',
	) as $tryfile ) {
		// realpath() returns false when the path cannot be resolved
		$tryfile = realpath( $tryfile );
		if ( $tryfile !== false && file_exists( $tryfile ) ) {
			$datafileK = $tryfile;
			break;
		}
	}
	if ( !$datafileK ) {
		die( "Cannot find UtfNormalDataK.inc. Please specify the path explicitly.\n" );
	}
}
/**
 * Minimal holder class declared by this script itself. It carries only the
 * static properties that the script reads once the data files are loaded;
 * each one starts out as null.
 */
class UtfNormal {
	public static $utfCheckNFC;
	public static $utfCombiningClass;
	public static $utfCanonicalDecomp;
	public static $utfCanonicalComp;
	public static $utfCompatibilityDecomp;
}
// Let the namespaced name UtfNormal\Validator resolve to the UtfNormal
// holder class declared in this script.
// NOTE(review): presumably the data files shipped with the separate
// utfnormal library assign to UtfNormal\Validator::$... -- confirm.
class_alias( 'UtfNormal', 'UtfNormal\\Validator' );
// Load the main data file, then the compatibility ("K") data file. The
// checks that follow verify that the static properties were filled in.
echo "Loading data file $datafile...\n";
require_once( $datafile );
echo "Loading data file $datafileK...\n";
require_once( $datafileK );
if ( !UtfNormal::$utfCheckNFC ||
!UtfNormal::$utfCombiningClass ||
!UtfNormal::$utfCanonicalDecomp ||
@ -43,6 +76,9 @@ if ( !UtfNormal::$utfCheckNFC ||
) {
die( "Data file $datafile did not contain needed data.\n" );
}
// The decompK table is generated from $utfCompatibilityDecomp, so give up
// if the second data file left it empty.
if ( empty( UtfNormal::$utfCompatibilityDecomp ) ) {
	die( "Data file $datafileK did not contain needed data.\n" );
}
function uord( $c, $firstOnly ) {
$ret = unpack( 'N*', mb_convert_encoding( $c, 'UTF-32BE', 'UTF-8' ) );
@ -90,6 +126,22 @@ foreach ( UtfNormal::$utfCanonicalDecomp as $k => $v ) {
}
fprintf( $X, "\t},\n\n" );
// Emit the decompK table: one entry per compatibility decomposition,
// keyed by the source codepoint and listing the codepoints it expands to.
fprintf( $X, "\tdecompK = {\n" );
foreach ( UtfNormal::$utfCompatibilityDecomp as $char => $compat ) {
	// Entries identical to the canonical decomposition are left out.
	$sameAsCanonical = isset( UtfNormal::$utfCanonicalDecomp[$char] )
		&& UtfNormal::$utfCanonicalDecomp[$char] === $compat;
	if ( !$sameAsCanonical ) {
		fprintf( $X, "\t\t[0x%06x] = { ", uord( $char, true ) );
		// The first codepoint is written bare, later ones with a leading comma.
		$first = true;
		foreach ( uord( $compat, false ) as $cp ) {
			fprintf( $X, $first ? "0x%06x" : ", 0x%06x", $cp );
			$first = false;
		}
		fprintf( $X, " },\n" );
	}
}
fprintf( $X, "\t},\n\n" );
fprintf( $X, "\t-- Character-pairs mapped to what they compose to\n" );
fprintf( $X, "\t-- Note Jamo to Hangul is done separately below\n" );
$t = array();
@ -178,6 +230,9 @@ setmetatable( normal.comp, { __index = function ( t, k )
return nil
end } )
-- Compatibility decomposition falls back to the normal decomposition
setmetatable( normal.decompK, { __index = normal.decomp } )
return normal
LUA
);

File diff suppressed because it is too large Load diff

View file

@ -1011,14 +1011,14 @@ end
---- Unicode Normalization ----
-- These functions load a conversion table when called
local function internalToNFD( cps )
local function internalDecompose( cps, decomp )
local cp = {}
local normal = require 'ustring/normalization-data'
-- Decompose into cp, using the lookup table and logic for hangul
for i = 1, cps.len do
local c = cps.codepoints[i]
local m = normal.decomp[c]
local m = decomp[c]
if m then
for j = 0, #m do
cp[#cp + 1] = m[j]
@ -1048,6 +1048,49 @@ local function internalToNFD( cps )
return cp, 1, l
end
-- Recompose a decomposed codepoint array in place.
--
-- Walks cp[2..l], merging each codepoint into the most recent starter
-- whenever the composition table has an entry for the pair and the
-- codepoint is not blocked by the combining class seen before it.
-- Composition never lengthens the sequence, so the output is written back
-- into the front of the same table.
--
-- @param cp table Array of codepoints (modified in place)
-- @param _ Ignored; lets the results of the decompose step be passed straight in
-- @param l number Index of the last codepoint in cp
-- @return table, number, number cp, 1, and the index of the last output codepoint
local function internalCompose( cp, _, l )
	local normal = require 'ustring/normalization-data'
	local compTable, classes = normal.comp, normal.combclass

	local starterPos = 1 -- where the current starter lives in the output
	local outLen = 1 -- number of codepoints written so far
	local prevClass = 0 -- combining class of the last combiner that was kept
	local partners = compTable[cp[1]] -- what the current starter can compose with

	for i = 2, l do
		local c = cp[i]
		local class = classes[c]

		-- A combiner may merge only if its class is higher than the last
		-- kept one; a starter may merge only if nothing was kept in between.
		local merged
		if class then
			merged = partners and prevClass < class and partners[c]
		else
			merged = partners and prevClass == 0 and partners[c]
		end

		if merged then
			-- Replace the starter with the composed character
			cp[starterPos] = merged
			partners = compTable[merged]
		else
			-- Keep the codepoint at the next output slot
			outLen = outLen + 1
			cp[outLen] = c
			if class then
				prevClass = class
			else
				-- It becomes the new starter
				starterPos = outLen
				partners = compTable[c]
				prevClass = 0
			end
		end
	end

	return cp, 1, outLen
end
-- Normalize a string to NFC
--
-- Based on MediaWiki's UtfNormal class. Returns nil if the string is not valid
@ -1082,47 +1125,8 @@ function ustring.toNFC( s )
return s
end
-- Next, expand to NFD
local cp, _, l = internalToNFD( cps )
-- Then combine to NFC. Since NFD->NFC can never expand a character
-- sequence, we can do this in-place.
local comp = normal.comp[cp[1]]
local sc = 1
local j = 1
local lastclass = 0
for i = 2, l do
local c = cp[i]
local ccc = normal.combclass[c]
if ccc then
-- Trying a combiner with the starter
if comp and lastclass < ccc and comp[c] then
-- Yes!
c = comp[c]
cp[sc] = c
comp = normal.comp[c]
else
-- No, copy it to the right place for output
j = j + 1
cp[j] = c
lastclass = ccc
end
elseif comp and lastclass == 0 and comp[c] then
-- Combining two adjacent starters
c = comp[c]
cp[sc] = c
comp = normal.comp[c]
else
-- New starter, doesn't combine
j = j + 1
cp[j] = c
comp = normal.comp[c]
sc = j
lastclass = 0
end
end
return internalChar( cp, 1, j )
-- Next, expand to NFD then recompose
return internalChar( internalCompose( internalDecompose( cps, normal.decomp ) ) )
end
-- Normalize a string to NFD
@ -1135,7 +1139,7 @@ end
function ustring.toNFD( s )
checkString( 'toNFD', s )
-- ASCII is always NFC
-- ASCII is always NFD
if not S.find( s, '[\128-\255]' ) then
return s
end
@ -1145,7 +1149,57 @@ function ustring.toNFD( s )
return nil
end
return internalChar( internalToNFD( cps ) )
local normal = require 'ustring/normalization-data'
return internalChar( internalDecompose( cps, normal.decomp ) )
end
-- Normalize a string to NFKC (compatibility decomposition followed by
-- composition).
--
-- Based on MediaWiki's UtfNormal class. Returns nil if the string is not valid
-- UTF-8.
--
-- @param s string
-- @return string|nil
function ustring.toNFKC( s )
	checkString( 'toNFKC', s )

	-- Only strings containing a byte >= 128 need any work; pure ASCII is
	-- already NFKC.
	if S.find( s, '[\128-\255]' ) then
		local cps = utf8_explode( s )
		if cps ~= nil then
			local normal = require 'ustring/normalization-data'
			-- Expand to NFKD using the compatibility table, then recompose
			local decomposed, first, last = internalDecompose( cps, normal.decompK )
			return internalChar( internalCompose( decomposed, first, last ) )
		end
		return nil
	end

	return s
end
-- Normalize a string to NFKD (compatibility decomposition).
--
-- Based on MediaWiki's UtfNormal class. Returns nil if the string is not valid
-- UTF-8.
--
-- @param s string
-- @return string|nil
function ustring.toNFKD( s )
	checkString( 'toNFKD', s )

	-- Only strings containing a byte >= 128 need any work; pure ASCII is
	-- already NFKD.
	if S.find( s, '[\128-\255]' ) then
		local cps = utf8_explode( s )
		if cps ~= nil then
			local normal = require 'ustring/normalization-data'
			-- Decompose using the compatibility table
			return internalChar( internalDecompose( cps, normal.decompK ) )
		end
		return nil
	end

	return s
end
return ustring

View file

@ -18,6 +18,16 @@ return {
tohex( mw.ustring.toNFD( c2 ) ),
tohex( mw.ustring.toNFD( c3 ) ),
tohex( mw.ustring.toNFD( c4 ) ),
tohex( mw.ustring.toNFD( c5 ) )
tohex( mw.ustring.toNFD( c5 ) ),
tohex( mw.ustring.toNFKC( c1 ) ),
tohex( mw.ustring.toNFKC( c2 ) ),
tohex( mw.ustring.toNFKC( c3 ) ),
tohex( mw.ustring.toNFKC( c4 ) ),
tohex( mw.ustring.toNFKC( c5 ) ),
tohex( mw.ustring.toNFKD( c1 ) ),
tohex( mw.ustring.toNFKD( c2 ) ),
tohex( mw.ustring.toNFKD( c3 ) ),
tohex( mw.ustring.toNFKD( c4 ) ),
tohex( mw.ustring.toNFKD( c5 ) )
end
}

View file

@ -41,7 +41,12 @@ class Scribunto_LuaUstringLibraryTests extends Scribunto_LuaEngineTestBase {
public function testUstringLibraryNormalizationTests( $name, $c1, $c2, $c3, $c4, $c5 ) {
$this->luaTestName = "UstringLibraryNormalization: $name";
$dataProvider = $this->provideUstringLibraryNormalizationTests();
$expected = array( $c2, $c2, $c2, $c4, $c4, $c3, $c3, $c3, $c5, $c5 );
$expected = array(
$c2, $c2, $c2, $c4, $c4, // NFC
$c3, $c3, $c3, $c5, $c5, // NFD
$c4, $c4, $c4, $c4, $c4, // NFKC
$c5, $c5, $c5, $c5, $c5, // NFKD
);
foreach ( $expected as &$e ) {
$chars = array_values( unpack( 'N*', mb_convert_encoding( $e, 'UTF-32BE', 'UTF-8' ) ) );
foreach ( $chars as &$c ) {