Fix pure-Lua ustring and empty patterns

An empty pattern isn't "safe" since it could match in between the
bytes of a UTF-8 character.

Also, it turns out there's a bug in PHP <5.6.9 preg_replace() that we
need to work around too.

Change-Id: I282e5909e4663461d60c5386693db182de2fd44c
This commit is contained in:
Brad Jorsch 2015-09-23 13:31:54 -04:00
parent c48bda0698
commit 629f11d0dd
3 changed files with 86 additions and 13 deletions

View file

@ -22,6 +22,13 @@ class Scribunto_LuaUstringLibrary extends Scribunto_LuaLibraryBase {
*/ */
private $manualCheckForU110000AndUp = false; private $manualCheckForU110000AndUp = false;
/**
* PHP versions before 5.6.9 are buggy when the regex passed to preg_replace
* or preg_match_all matches the empty string.
* @var boolean
*/
private $phpBug53823 = false;
/** /**
* A cache of patterns and the regexes they generate. * A cache of patterns and the regexes they generate.
* @var array * @var array
@ -35,6 +42,7 @@ class Scribunto_LuaUstringLibrary extends Scribunto_LuaLibraryBase {
} }
$this->manualCheckForU110000AndUp = mb_check_encoding( "\xf4\x90\x80\x80", "UTF-8" ); $this->manualCheckForU110000AndUp = mb_check_encoding( "\xf4\x90\x80\x80", "UTF-8" );
$this->phpBug53823 = preg_replace( '//us', 'x', "\xc3\xa1" ) === "x\xc3x\xa1x";
$this->patternRegexCache = new MapCacheLRU( 100 ); $this->patternRegexCache = new MapCacheLRU( 100 );
parent::__construct( $engine ); parent::__construct( $engine );
@ -331,6 +339,7 @@ class Scribunto_LuaUstringLibrary extends Scribunto_LuaLibraryBase {
$captparen = array(); $captparen = array();
$opencapt = array(); $opencapt = array();
$bct = 0; $bct = 0;
for ( $i = 0; $i < $len; $i++ ) { for ( $i = 0; $i < $len; $i++ ) {
$ii = $i + 1; $ii = $i + 1;
$q = false; $q = false;
@ -608,30 +617,45 @@ class Scribunto_LuaUstringLibrary extends Scribunto_LuaLibraryBase {
if ( $n === null ) { if ( $n === null ) {
$n = -1; $n = -1;
} elseif ( $n < 0 ) { } elseif ( $n < 1 ) {
$n = 0; return array( $s, 0 );
} }
list( $re, $capt, $anypos ) = $this->patternToRegex( $pattern, '^', 'gsub' ); list( $re, $capt, $anypos ) = $this->patternToRegex( $pattern, '^', 'gsub' );
$captures = array(); $captures = array();
if ( $this->phpBug53823 ) {
// PHP bug 53823 means that a zero-length match before a UTF-8
// character will match again before every byte of that character.
// The workaround is to capture the first "character" of/after the
// match and verify that its first byte is legal to start a UTF-8
// character.
$re = '/(?=(?<phpBug53823>.|$))' . substr( $re, 1 );
}
if ( $anypos ) { if ( $anypos ) {
// preg_replace_callback doesn't take a "flags" argument, so we // preg_replace_callback doesn't take a "flags" argument, so we
// can't pass PREG_OFFSET_CAPTURE to it, which is needed to handle // can't pass PREG_OFFSET_CAPTURE to it, which is needed to handle
// position captures. So instead we have to do a preg_match_all and // position captures. So instead we have to do a preg_match_all and
// handle the captures ourselves. // handle the captures ourselves.
$ct = preg_match_all( $re, $s, $mm, PREG_OFFSET_CAPTURE | PREG_SET_ORDER ); $ct = preg_match_all( $re, $s, $mm, PREG_OFFSET_CAPTURE | PREG_SET_ORDER );
if ( $n >= 0 ) {
$ct = min( $ct, $n );
}
for ( $i = 0; $i < $ct; $i++ ) { for ( $i = 0; $i < $ct; $i++ ) {
$m = $mm[$i]; $m = $mm[$i];
if ( $this->phpBug53823 ) {
$c = ord( $m['phpBug53823'][0] );
if ( $c >= 0x80 && $c <= 0xbf ) {
continue;
}
}
$c = array( $m[0][0] ); $c = array( $m[0][0] );
foreach ( $this->addCapturesFromMatch( array(), $s, $m, $capt, false ) as $k => $v ) { foreach ( $this->addCapturesFromMatch( array(), $s, $m, $capt, false ) as $k => $v ) {
$k++; $k++;
$c["m$k"] = $v; $c["m$k"] = $v;
} }
$captures[] = $c; $captures[] = $c;
if ( $n >= 0 && count( $captures ) >= $n ) {
break;
}
} }
} }
@ -693,12 +717,31 @@ class Scribunto_LuaUstringLibrary extends Scribunto_LuaLibraryBase {
$this->checkType( 'gsub', 3, $repl, 'function or table or string' ); $this->checkType( 'gsub', 3, $repl, 'function or table or string' );
} }
$skippedMatches = 0;
if ( $this->phpBug53823 ) {
// Since the regex produces bogus matches, we need to keep track of the
// necessary adjustment and stop manually once we hit the limit.
$maxMatches = $n < 0 ? INF : $n;
$n = -1;
$realCallback = $cb;
$cb = function ( $m ) use ( $realCallback, &$skippedMatches, &$maxMatches ) {
$c = ord( $m['phpBug53823'] );
if ( $c >= 0x80 && $c <= 0xbf || $maxMatches <= 0 ) {
$skippedMatches++;
return $m[0];
} else {
$maxMatches--;
return $realCallback( $m );
}
};
}
$count = 0; $count = 0;
$s2 = preg_replace_callback( $re, $cb, $s, $n, $count ); $s2 = preg_replace_callback( $re, $cb, $s, $n, $count );
if ( $s2 === null ) { if ( $s2 === null ) {
self::handlePCREError( preg_last_error(), $pattern ); self::handlePCREError( preg_last_error(), $pattern );
} }
return array( $s2, $count ); return array( $s2, $count - $skippedMatches );
} }
/** /**

View file

@ -750,16 +750,23 @@ end
-- matches a partial UTF-8 character, but the others will happily enough -- matches a partial UTF-8 character, but the others will happily enough
-- match a whole UTF-8 character thinking it's 2, 3 or 4. -- match a whole UTF-8 character thinking it's 2, 3 or 4.
-- * If it contains position-captures. -- * If it contains position-captures.
-- * If it matches the empty string.
-- --
-- @param string pattern -- @param string pattern
-- @return boolean -- @return boolean
local function patternIsSimple( pattern ) local function patternIsSimple( pattern )
local findWithPcall = function ( ... )
local ok, ret = pcall( S.find, ... )
return ok and ret
end
return not ( return not (
S.find( pattern, '[\128-\255]' ) or S.find( pattern, '[\128-\255]' ) or
S.find( pattern, '%[%^' ) or S.find( pattern, '%[%^' ) or
S.find( pattern, '%%[acdlpsuwxACDLPSUWXZ]' ) or S.find( pattern, '%%[acdlpsuwxACDLPSUWXZ]' ) or
S.find( pattern, '%.[^*+-]' ) or S.find( pattern, '%.$' ) or S.find( pattern, '%.[^*+-]' ) or S.find( pattern, '%.$' ) or
S.find( pattern, '()', 1, true ) S.find( pattern, '()', 1, true ) or
pattern == '' or findWithPcall( '', pattern )
) )
end end
@ -923,6 +930,14 @@ function ustring.gsub( s, pattern, repl, n )
end end
end end
if n == nil then
n = 1e100
end
if n < 1 then
-- No replacement
return s, 0
end
local cps = utf8_explode( s ) local cps = utf8_explode( s )
if cps == nil then if cps == nil then
error( "bad argument #1 for 'gsub' (string is not UTF-8)", 2 ) error( "bad argument #1 for 'gsub' (string is not UTF-8)", 2 )
@ -931,9 +946,6 @@ function ustring.gsub( s, pattern, repl, n )
if pat == nil then if pat == nil then
error( "bad argument #2 for 'gsub' (string is not UTF-8)", 2 ) error( "bad argument #2 for 'gsub' (string is not UTF-8)", 2 )
end end
if n == nil then
n = 1e100
end
if pat.codepoints[1] == 0x5e then -- '^': Pattern is anchored if pat.codepoints[1] == 0x5e then -- '^': Pattern is anchored
-- There can be only the one match, so make that explicit -- There can be only the one match, so make that explicit
@ -957,8 +969,9 @@ function ustring.gsub( s, pattern, repl, n )
local init = 1 local init = 1
local ct = 0 local ct = 0
local ret = {} local ret = {}
while init < cps.len + 1 and ct < n do local zeroAdjustment = 0
local m = { find( s, cps, pattern, pat, init ) } repeat
local m = { find( s, cps, pattern, pat, init + zeroAdjustment ) }
if not m[1] then if not m[1] then
break break
end end
@ -1001,7 +1014,8 @@ function ustring.gsub( s, pattern, repl, n )
ret[#ret + 1] = val or mm ret[#ret + 1] = val or mm
init = m[2] + 1 init = m[2] + 1
ct = ct + 1 ct = ct + 1
end zeroAdjustment = m[2] < m[1] and 1 or 0
until init > cps.len or ct >= n
if init <= cps.len then if init <= cps.len then
ret[#ret + 1] = sub( s, cps, init, cps.len ) ret[#ret + 1] = sub( s, cps, init, cps.len )
end end

View file

@ -515,6 +515,22 @@ return testframework.getTestProvider( {
args = { 'á', 'á', 'X' }, args = { 'á', 'á', 'X' },
expect = { 'X', 1 } expect = { 'X', 1 }
}, },
{ name = 'gsub: (one char string, empty pattern)', func = mw.ustring.gsub,
args = { 'á', '', 'X' },
expect = { 'XáX', 2 }
},
{ name = 'gsub: (empty pattern with position captures)', func = mw.ustring.gsub,
args = { 'ábć', '()', '%1' },
expect = { '1á2b3ć4', 4 }
},
-- With n = 1 only the first 'á' is replaced; the second 'á' must be
-- left in place, so the result is 'Xá' with a replacement count of 1.
{ name = 'gsub: (limited to 1 replacement)', func = mw.ustring.gsub,
args = { 'áá', 'á', 'X', 1 },
expect = { 'Xá', 1 }
},
{ name = 'gsub: (limited to 0 replacements)', func = mw.ustring.gsub,
args = { 'áá', 'á', 'X', 0 },
expect = { 'áá', 0 }
},
{ name = 'gsub: (string 1)', func = mw.ustring.gsub, { name = 'gsub: (string 1)', func = mw.ustring.gsub,
args = { str2, 'f%a+', 'X' }, args = { str2, 'f%a+', 'X' },
expect = { 'X bar X X baz X X X', 6 } expect = { 'X bar X X baz X X X', 6 }