Add toNFKC and toNFKD to mw.ustring

This also makes some updates to make-normalization-table.php to handle the move of UtfNormal to a separate library.

Bug: T126427
Change-Id: Id4985c3ca441cf92f08ba1f1af85c762ba43d7d2
2024-11-27 17:50:06 +00:00 · 2016-04-01 13:54:42 +03:00 · 2016-04-01 13:54:42 +03:00 · b3da8a698d
parent 22153435a2
commit b3da8a698d
6 changed files with 5755 additions and 48 deletions
--- a/engines/LuaCommon/UstringLibrary.php
+++ b/engines/LuaCommon/UstringLibrary.php
@ -59,6 +59,8 @@ class Scribunto_LuaUstringLibrary extends Scribunto_LuaLibraryBase {
 				'codepoint' => array( $this, 'ustringCodepoint' ),
 				'toNFC' => array( $this, 'ustringToNFC' ),
 				'toNFD' => array( $this, 'ustringToNFD' ),
+				'toNFKC' => array( $this, 'ustringToNFKC' ),
+				'toNFKD' => array( $this, 'ustringToNFKD' ),
 				'char' => array( $this, 'ustringChar' ),
 				'len' => array( $this, 'ustringLen' ),
 				'sub' => array( $this, 'ustringSub' ),
@ -173,6 +175,22 @@ class Scribunto_LuaUstringLibrary extends Scribunto_LuaLibraryBase {
 		return array( UtfNormal::toNFD( $s ) );
 	}

+	public function ustringToNFKC( $s ) { // callback registered under the 'toNFKC' key
+		$this->checkString( 'toNFKC', $s, false ); // presumably raises on a bad argument — TODO confirm
+		if ( !$this->checkEncoding( $s ) ) {
+			return array( null ); // encoding check failed: yield a single null, skip normalization
+		}
+		return array( UtfNormal::toNFKC( $s ) ); // one-element result: the NFKC form of $s
+	}
+
+	public function ustringToNFKD( $s ) { // callback registered under the 'toNFKD' key
+		$this->checkString( 'toNFKD', $s, false ); // presumably raises on a bad argument — TODO confirm
+		if ( !$this->checkEncoding( $s ) ) {
+			return array( null ); // encoding check failed: yield a single null, skip normalization
+		}
+		return array( UtfNormal::toNFKD( $s ) ); // one-element result: the NFKD form of $s
+	}
+
 	public function ustringChar() {
 		$args = func_get_args();
 		if ( count( $args ) > $this->stringLengthLimit ) {
--- a/engines/LuaCommon/lualib/ustring/make-normalization-table.php
+++ b/engines/LuaCommon/lualib/ustring/make-normalization-table.php
@ -13,6 +13,8 @@ if ( count( $argv ) > 1 ) {
 	}
 } else {
 	foreach( array(
+		__DIR__ . '/../../../../../core/vendor/wikimedia/utfnormal/src/UtfNormalData.inc',
+		__DIR__ . '/../../../../../vendor/wikimedia/utfnormal/src/UtfNormalData.inc',
 		__DIR__ . '/../../../../../core/includes/libs/normal/UtfNormalData.inc',
 		__DIR__ . '/../../../../../includes/libs/normal/UtfNormalData.inc',
 	) as $tryfile ) {
@ -27,15 +29,46 @@ if ( count( $argv ) > 1 ) {
 	}
 }

-echo "Loading data file $datafile...\n";
+$datafileK = null; // path of the compatibility-decomposition data file (UtfNormalDataK.inc)
+if ( count( $argv ) > 2 ) {
+	$datafileK = $argv[2]; // an explicit second CLI argument bypasses the search below
+	if ( !file_exists( $datafileK ) ) {
+		die( "The specified file '$datafileK' does not exist\n" );
+	}
+} else {
+	foreach( array(
+		dirname( $datafile ) . '/UtfNormalDataK.inc', // preferred: next to the canonical data file
+		__DIR__ . '/../../../../../core/vendor/wikimedia/utfnormal/src/UtfNormalDataK.inc',
+		__DIR__ . '/../../../../../vendor/wikimedia/utfnormal/src/UtfNormalDataK.inc',
+		__DIR__ . '/../../../../../core/includes/libs/normal/UtfNormalDataK.inc',
+		__DIR__ . '/../../../../../includes/libs/normal/UtfNormalDataK.inc',
+	) as $tryfile ) {
+		$tryfile = realpath( $tryfile ); // false when the candidate does not resolve
+		if ( file_exists( $tryfile ) ) {
+			$datafileK = $tryfile; // first existing candidate wins
+			break;
+		}
+	}
+	if ( !$datafileK ) {
+		die( "Cannot find UtfNormalDataK.inc. Please specify the path explicitly.\n" );
+	}
+}
+
 class UtfNormal {
 	static $utfCheckNFC = null;
 	static $utfCombiningClass = null;
 	static $utfCanonicalDecomp = null;
 	static $utfCanonicalComp = null;
+	static $utfCompatibilityDecomp = null; // expected to be filled by $datafileK; verified after loading
 }
+class_alias( 'UtfNormal', 'UtfNormal\\Validator' ); // presumably lets data files written for the namespaced class fill this stub — TODO confirm
+
+echo "Loading data file $datafile...\n";
 require_once( $datafile );

+echo "Loading data file $datafileK...\n";
+require_once( $datafileK );
+
 if ( !UtfNormal::$utfCheckNFC ||
 	!UtfNormal::$utfCombiningClass ||
 	!UtfNormal::$utfCanonicalDecomp ||
@ -43,6 +76,9 @@ if ( !UtfNormal::$utfCheckNFC ||
 ) {
 	die( "Data file $datafile did not contain needed data.\n" );
 }
+if ( !UtfNormal::$utfCompatibilityDecomp ) { // still null/empty after loading $datafileK
+	die( "Data file $datafileK did not contain needed data.\n" );
+}

 function uord( $c, $firstOnly ) {
 	$ret = unpack( 'N*', mb_convert_encoding( $c, 'UTF-32BE', 'UTF-8' ) );
@ -90,6 +126,22 @@ foreach ( UtfNormal::$utfCanonicalDecomp as $k => $v ) {
 }
 fprintf( $X, "\t},\n\n" );

+fprintf( $X, "\tdecompK = {\n" ); // open the Lua table of compatibility decompositions
+foreach ( UtfNormal::$utfCompatibilityDecomp as $k => $v ) {
+	if ( isset( UtfNormal::$utfCanonicalDecomp[$k] ) && UtfNormal::$utfCanonicalDecomp[$k] === $v ) {
+		// Skip entries identical to the canonical mapping (decompK falls back to decomp in the generated Lua)
+		continue;
+	}
+	fprintf( $X, "\t\t[0x%06x] = { ", uord( $k, true ) ); // table key; presumably the first code point of $k
+	$fmt = "0x%06x"; // first element is written without a separator
+	foreach ( uord( $v, false ) as $c ) {
+		fprintf( $X, $fmt, $c );
+		$fmt = ", 0x%06x"; // every later element is comma-prefixed
+	}
+	fprintf( $X, " },\n" );
+}
+fprintf( $X, "\t},\n\n" );
+
 fprintf( $X, "\t-- Character-pairs mapped to what they compose to\n" );
 fprintf( $X, "\t-- Note Jamo to Hangul is done separately below\n" );
 $t = array();
@ -178,6 +230,9 @@ setmetatable( normal.comp, { __index = function ( t, k )
 	return nil
 end } )

+-- Compatibility decomposition falls back to the normal decomposition
+setmetatable( normal.decompK, { __index = normal.decomp } )
+
 return normal
 LUA
 );
--- a/engines/LuaCommon/lualib/ustring/normalization-data.lua
+++ b/engines/LuaCommon/lualib/ustring/normalization-data.lua
--- a/engines/LuaCommon/lualib/ustring/ustring.lua
+++ b/engines/LuaCommon/lualib/ustring/ustring.lua
@ -1011,14 +1011,14 @@ end
 ---- Unicode Normalization ----
 -- These functions load a conversion table when called

-local function internalToNFD( cps )
+local function internalDecompose( cps, decomp )
 	local cp = {}
 	local normal = require 'ustring/normalization-data'

 	-- Decompose into cp, using the lookup table and logic for hangul
 	for i = 1, cps.len do
 		local c = cps.codepoints[i]
-		local m = normal.decomp[c]
+		local m = decomp[c]
 		if m then
 			for j = 0, #m do
 				cp[#cp + 1] = m[j]
@ -1048,6 +1048,49 @@ local function internalToNFD( cps )
 	return cp, 1, l
 end

+local function internalCompose( cp, _, l ) -- accepts the ( cp, 1, l ) triple returned by internalDecompose
+	local normal = require 'ustring/normalization-data'
+
+	-- Composition (NFD->NFC or NFKD->NFKC) can never expand a character
+	-- sequence, so the output is written back into cp in-place.
+	local comp = normal.comp[cp[1]] -- composition candidates for the current starter, if any
+	local sc = 1 -- index in cp of the current starter
+	local j = 1 -- index of the last output slot written
+	local lastclass = 0 -- combining class of the last combiner kept after the starter
+	for i = 2, l do
+		local c = cp[i]
+		local ccc = normal.combclass[c]
+		if ccc then
+			-- c has a combining class: try it against the starter unless a kept combiner of class >= ccc blocks it
+			if comp and lastclass < ccc and comp[c] then
+				-- It composes: overwrite the starter with the composite and continue from there
+				c = comp[c]
+				cp[sc] = c
+				comp = normal.comp[c]
+			else
+				-- It does not compose: copy it to the next output slot and remember its class
+				j = j + 1
+				cp[j] = c
+				lastclass = ccc
+			end
+		elseif comp and lastclass == 0 and comp[c] then
+			-- Two adjacent starters (no combiner kept in between) that compose
+			c = comp[c]
+			cp[sc] = c
+			comp = normal.comp[c]
+		else
+			-- New starter that does not combine: emit it and make it the current starter
+			j = j + 1
+			cp[j] = c
+			comp = normal.comp[c]
+			sc = j
+			lastclass = 0
+		end
+	end
+
+	return cp, 1, j -- same triple shape as the input; j is the new end index
+end
+
 -- Normalize a string to NFC
 --
 -- Based on MediaWiki's UtfNormal class. Returns nil if the string is not valid
@ -1082,47 +1125,8 @@ function ustring.toNFC( s )
 		return s
 	end

-	-- Next, expand to NFD
-	local cp, _, l = internalToNFD( cps )
-
-	-- Then combine to NFC. Since NFD->NFC can never expand a character
-	-- sequence, we can do this in-place.
-	local comp = normal.comp[cp[1]]
-	local sc = 1
-	local j = 1
-	local lastclass = 0
-	for i = 2, l do
-		local c = cp[i]
-		local ccc = normal.combclass[c]
-		if ccc then
-			-- Trying a combiner with the starter
-			if comp and lastclass < ccc and comp[c] then
-				-- Yes!
-				c = comp[c]
-				cp[sc] = c
-				comp = normal.comp[c]
-			else
-				-- No, copy it to the right place for output
-				j = j + 1
-				cp[j] = c
-				lastclass = ccc
-			end
-		elseif comp and lastclass == 0 and comp[c] then
-			-- Combining two adjacent starters
-			c = comp[c]
-			cp[sc] = c
-			comp = normal.comp[c]
-		else
-			-- New starter, doesn't combine
-			j = j + 1
-			cp[j] = c
-			comp = normal.comp[c]
-			sc = j
-			lastclass = 0
-		end
-	end
-
-	return internalChar( cp, 1, j )
+	-- Next, expand to NFD then recompose
+	return internalChar( internalCompose( internalDecompose( cps, normal.decomp ) ) )
 end

 -- Normalize a string to NFD
@ -1135,7 +1139,7 @@ end
 function ustring.toNFD( s )
 	checkString( 'toNFD', s )

-	-- ASCII is always NFC
+	-- ASCII is always NFD
 	if not S.find( s, '[\128-\255]' ) then
 		return s
 	end
@ -1145,7 +1149,57 @@ function ustring.toNFD( s )
 		return nil
 	end

-	return internalChar( internalToNFD( cps ) )
+	local normal = require 'ustring/normalization-data'
+	return internalChar( internalDecompose( cps, normal.decomp ) )
+end
+
+-- Normalize a string to NFKC (compatibility decomposition, then composition)
+--
+-- Based on MediaWiki's UtfNormal class. Returns nil if the string is not valid
+-- UTF-8.
+--
+-- @param s string
+-- @return string|nil
+function ustring.toNFKC( s )
+	checkString( 'toNFKC', s )
+
+	-- No byte in 0x80-0xFF means pure ASCII, which is always NFKC: return as-is
+	if not S.find( s, '[\128-\255]' ) then
+		return s
+	end
+
+	local cps = utf8_explode( s )
+	if cps == nil then
+		return nil
+	end
+	local normal = require 'ustring/normalization-data'
+
+	-- Decompose with the compatibility table (NFKD), recompose, then hand the result to internalChar
+	return internalChar( internalCompose( internalDecompose( cps, normal.decompK ) ) )
+end
+
+-- Normalize a string to NFKD (compatibility decomposition, no recomposition)
+--
+-- Based on MediaWiki's UtfNormal class. Returns nil if the string is not valid
+-- UTF-8.
+--
+-- @param s string
+-- @return string|nil
+function ustring.toNFKD( s )
+	checkString( 'toNFKD', s )
+
+	-- No byte in 0x80-0xFF means pure ASCII, which is always NFKD: return as-is
+	if not S.find( s, '[\128-\255]' ) then
+		return s
+	end
+
+	local cps = utf8_explode( s )
+	if cps == nil then
+		return nil
+	end
+
+	local normal = require 'ustring/normalization-data'
+	return internalChar( internalDecompose( cps, normal.decompK ) )
+end

 return ustring
--- a/tests/engines/LuaCommon/UstringLibraryNormalizationTests.lua
+++ b/tests/engines/LuaCommon/UstringLibraryNormalizationTests.lua
@ -18,6 +18,16 @@ return {
 			tohex( mw.ustring.toNFD( c2 ) ),
 			tohex( mw.ustring.toNFD( c3 ) ),
 			tohex( mw.ustring.toNFD( c4 ) ),
-			tohex( mw.ustring.toNFD( c5 ) )
+			tohex( mw.ustring.toNFD( c5 ) ),
+			tohex( mw.ustring.toNFKC( c1 ) ),
+			tohex( mw.ustring.toNFKC( c2 ) ),
+			tohex( mw.ustring.toNFKC( c3 ) ),
+			tohex( mw.ustring.toNFKC( c4 ) ),
+			tohex( mw.ustring.toNFKC( c5 ) ),
+			tohex( mw.ustring.toNFKD( c1 ) ),
+			tohex( mw.ustring.toNFKD( c2 ) ),
+			tohex( mw.ustring.toNFKD( c3 ) ),
+			tohex( mw.ustring.toNFKD( c4 ) ),
+			tohex( mw.ustring.toNFKD( c5 ) )
 	end
 }
--- a/tests/engines/LuaCommon/UstringLibraryTest.php
+++ b/tests/engines/LuaCommon/UstringLibraryTest.php
@ -41,7 +41,12 @@ class Scribunto_LuaUstringLibraryTests extends Scribunto_LuaEngineTestBase {
 	public function testUstringLibraryNormalizationTests( $name, $c1, $c2, $c3, $c4, $c5 ) {
 		$this->luaTestName = "UstringLibraryNormalization: $name";
 		$dataProvider = $this->provideUstringLibraryNormalizationTests();
-		$expected = array( $c2, $c2, $c2, $c4, $c4, $c3, $c3, $c3, $c5, $c5 );
+		$expected = array(
+			$c2, $c2, $c2, $c4, $c4, // NFC
+			$c3, $c3, $c3, $c5, $c5, // NFD
+			$c4, $c4, $c4, $c4, $c4, // NFKC
+			$c5, $c5, $c5, $c5, $c5, // NFKD
+		);
 		foreach ( $expected as &$e ) {
 			$chars = array_values( unpack( 'N*', mb_convert_encoding( $e, 'UTF-32BE', 'UTF-8' ) ) );
 			foreach ( $chars as &$c ) {