Fix pure-Lua ustring and empty patterns

An empty pattern isn't "safe" since it could match in between the bytes of a UTF-8 character. Also, it turns out there's a bug in PHP <5.6.9 preg_replace() that we need to work around too. Change-Id: I282e5909e4663461d60c5386693db182de2fd44c
2024-11-24 00:05:00 +00:00 · 2015-09-23 13:31:54 -04:00 · 2015-09-23 13:31:54 -04:00 · 629f11d0dd
parent c48bda0698
commit 629f11d0dd
3 changed files with 86 additions and 13 deletions
--- a/engines/LuaCommon/UstringLibrary.php
+++ b/engines/LuaCommon/UstringLibrary.php
@ -22,6 +22,13 @@ class Scribunto_LuaUstringLibrary extends Scribunto_LuaLibraryBase {
 	 */
 	private $manualCheckForU110000AndUp = false;
 	/**
 	 * PHP versions before 5.6.9 are buggy when the regex in preg_replace and
 	 * preg_match_all matches the empty string.
 	 * @var boolean
 	 */
 	private $phpBug53823 = false;
 	/**
 	 * A cache of patterns and the regexes they generate.
 	 * @var array
@ -35,6 +42,7 @@ class Scribunto_LuaUstringLibrary extends Scribunto_LuaLibraryBase {
 		}
 		$this->manualCheckForU110000AndUp = mb_check_encoding( "\xf4\x90\x80\x80", "UTF-8" );
 		$this->phpBug53823 = preg_replace( '//us', 'x', "\xc3\xa1" ) === "x\xc3x\xa1x";
 		$this->patternRegexCache = new MapCacheLRU( 100 );
 		parent::__construct( $engine );
@ -331,6 +339,7 @@ class Scribunto_LuaUstringLibrary extends Scribunto_LuaLibraryBase {
 			$captparen = array();
 			$opencapt = array();
 			$bct = 0;
 			for ( $i = 0; $i < $len; $i++ ) {
 				$ii = $i + 1;
 				$q = false;
@ -608,30 +617,45 @@ class Scribunto_LuaUstringLibrary extends Scribunto_LuaLibraryBase {
 		if ( $n === null ) {
 			$n = -1;
-		} elseif ( $n < 0 ) {
+		} elseif ( $n < 1 ) {
-			$n = 0;
+			return array( $s, 0 );
 		}
 		list( $re, $capt, $anypos ) = $this->patternToRegex( $pattern, '^', 'gsub' );
 		$captures = array();
 		if ( $this->phpBug53823 ) {
 			// PHP bug 53823 means that a zero-length match before a UTF-8
 			// character will match again before every byte of that character.
 			// The workaround is to capture the first "character" of/after the
 			// match and verify that its first byte is legal to start a UTF-8
 			// character.
 			$re = '/(?=(?<phpBug53823>.|$))' . substr( $re, 1 );
 		}
 		if ( $anypos ) {
 			// preg_replace_callback doesn't take a "flags" argument, so we
 			// can't pass PREG_OFFSET_CAPTURE to it, which is needed to handle
 			// position captures. So instead we have to do a preg_match_all and
 			// handle the captures ourselves.
 			$ct = preg_match_all( $re, $s, $mm, PREG_OFFSET_CAPTURE | PREG_SET_ORDER );
 			if ( $n >= 0 ) {
 				$ct = min( $ct, $n );
 			}
 			for ( $i = 0; $i < $ct; $i++ ) {
 				$m = $mm[$i];
 				if ( $this->phpBug53823 ) {
 					$c = ord( $m['phpBug53823'][0] );
 					if ( $c >= 0x80 && $c <= 0xbf ) {
 						continue;
 					}
 				}
 				$c = array( $m[0][0] );
 				foreach ( $this->addCapturesFromMatch( array(), $s, $m, $capt, false ) as $k => $v ) {
 					$k++;
 					$c["m$k"] = $v;
 				}
 				$captures[] = $c;
 				if ( $n >= 0 && count( $captures ) >= $n ) {
 					break;
 				}
 			}
 		}
@ -693,12 +717,31 @@ class Scribunto_LuaUstringLibrary extends Scribunto_LuaLibraryBase {
 			$this->checkType( 'gsub', 3, $repl, 'function or table or string' );
 		}
 		$skippedMatches = 0;
 		if ( $this->phpBug53823 ) {
 			// Since we're having bogus matches, we need to keep track of the
 			// necessary adjustment and stop manually once we hit the limit.
 			$maxMatches = $n < 0 ? INF : $n;
 			$n = -1;
 			$realCallback = $cb;
 			$cb = function ( $m ) use ( $realCallback, &$skippedMatches, &$maxMatches ) {
 				$c = ord( $m['phpBug53823'] );
 				if ( $c >= 0x80 && $c <= 0xbf || $maxMatches <= 0 ) {
 					$skippedMatches++;
 					return $m[0];
 				} else {
 					$maxMatches--;
 					return $realCallback( $m );
 				}
 			};
 		}
 		$count = 0;
 		$s2 = preg_replace_callback( $re, $cb, $s, $n, $count );
 		if ( $s2 === null ) {
 			self::handlePCREError( preg_last_error(), $pattern );
 		}
-		return array( $s2, $count );
+		return array( $s2, $count - $skippedMatches );
 	}
 	/**
--- a/engines/LuaCommon/lualib/ustring/ustring.lua
+++ b/engines/LuaCommon/lualib/ustring/ustring.lua
@ -750,16 +750,23 @@ end
 --    matches a partial UTF-8 character, but the others will happily enough
 --    match a whole UTF-8 character thinking it's 2, 3 or 4.
 --  * If it contains position-captures.
 --  * If it matches the empty string.
 --
 -- @param string pattern
 -- @return boolean
 local function patternIsSimple( pattern )
 	local findWithPcall = function ( ... )
 		local ok, ret = pcall( S.find, ... )
 		return ok and ret
 	end
 	return not (
 		S.find( pattern, '[\128-\255]' ) or
 		S.find( pattern, '%[%^' ) or
 		S.find( pattern, '%%[acdlpsuwxACDLPSUWXZ]' ) or
 		S.find( pattern, '%.[^*+-]' ) or S.find( pattern, '%.$' ) or
-		S.find( pattern, '()', 1, true )
+		S.find( pattern, '()', 1, true ) or
 		pattern == '' or findWithPcall( '', pattern )
 	)
 end
@ -923,6 +930,14 @@ function ustring.gsub( s, pattern, repl, n )
 		end
 	end
 	if n == nil then
 		n = 1e100
 	end
 	if n < 1 then
 		-- No replacement
 		return s, 0
 	end
 	local cps = utf8_explode( s )
 	if cps == nil then
 		error( "bad argument #1 for 'gsub' (string is not UTF-8)", 2 )
@ -931,9 +946,6 @@ function ustring.gsub( s, pattern, repl, n )
 	if pat == nil then
 		error( "bad argument #2 for 'gsub' (string is not UTF-8)", 2 )
 	end
 	if n == nil then
 		n = 1e100
 	end
 	if pat.codepoints[1] == 0x5e then -- '^': Pattern is anchored
 		-- There can be only the one match, so make that explicit
@ -957,8 +969,9 @@ function ustring.gsub( s, pattern, repl, n )
 	local init = 1
 	local ct = 0
 	local ret = {}
-	while init < cps.len + 1 and ct < n do
+	local zeroAdjustment = 0
-		local m = { find( s, cps, pattern, pat, init ) }
+	repeat
 		local m = { find( s, cps, pattern, pat, init + zeroAdjustment ) }
 		if not m[1] then
 			break
 		end
@ -1001,7 +1014,8 @@ function ustring.gsub( s, pattern, repl, n )
 		ret[#ret + 1] = val or mm
 		init = m[2] + 1
 		ct = ct + 1
-	end
+		zeroAdjustment = m[2] < m[1] and 1 or 0
 	until init > cps.len or ct >= n
 	if init <= cps.len then
 		ret[#ret + 1] = sub( s, cps, init, cps.len )
 	end
--- a/tests/engines/LuaCommon/UstringLibraryTests.lua
+++ b/tests/engines/LuaCommon/UstringLibraryTests.lua
@ -515,6 +515,22 @@ return testframework.getTestProvider( {
 	  args = { 'á', 'á', 'X' },
 	  expect = { 'X', 1 }
 	},
 	{ name = 'gsub: (one char string, empty pattern)', func = mw.ustring.gsub,
 	  args = { 'á', '', 'X' },
 	  expect = { 'XáX', 2 }
 	},
 	{ name = 'gsub: (empty pattern with position captures)', func = mw.ustring.gsub,
 	  args = { 'ábć', '()', '%1' },
 	  expect = { '1á2b3ć4', 4 }
 	},
 	{ name = 'gsub: (limited to 1 replacement)', func = mw.ustring.gsub,
 	  args = { 'áá', 'á', 'X', 1 },
 	  expect = { 'Xá', 1 }
 	},
 	{ name = 'gsub: (limited to 0 replacements)', func = mw.ustring.gsub,
 	  args = { 'áá', 'á', 'X', 0 },
 	  expect = { 'áá', 0 }
 	},
 	{ name = 'gsub: (string 1)', func = mw.ustring.gsub,
 	  args = { str2, 'f%a+', 'X' },
 	  expect = { 'X bar X X baz X X X', 6 }