mirror of
https://gerrit.wikimedia.org/r/mediawiki/extensions/Scribunto
synced 2024-11-24 00:05:00 +00:00
Fix pure-Lua ustring and empty patterns
An empty pattern isn't "safe" since it could match in between the bytes of a UTF-8 character. Also, it turns out there's a bug in PHP <5.6.9 preg_replace() that we need to work around too. Change-Id: I282e5909e4663461d60c5386693db182de2fd44c
This commit is contained in:
parent
c48bda0698
commit
629f11d0dd
|
@ -22,6 +22,13 @@ class Scribunto_LuaUstringLibrary extends Scribunto_LuaLibraryBase {
|
|||
*/
|
||||
private $manualCheckForU110000AndUp = false;
|
||||
|
||||
/**
|
||||
* PHP versions before 5.6.9 are buggy when the regex in preg_replace and
|
||||
* preg_match_all matches the empty string.
|
||||
* @var boolean
|
||||
*/
|
||||
private $phpBug53823 = false;
|
||||
|
||||
/**
|
||||
* A cache of patterns and the regexes they generate.
|
||||
* @var array
|
||||
|
@ -35,6 +42,7 @@ class Scribunto_LuaUstringLibrary extends Scribunto_LuaLibraryBase {
|
|||
}
|
||||
|
||||
$this->manualCheckForU110000AndUp = mb_check_encoding( "\xf4\x90\x80\x80", "UTF-8" );
|
||||
$this->phpBug53823 = preg_replace( '//us', 'x', "\xc3\xa1" ) === "x\xc3x\xa1x";
|
||||
$this->patternRegexCache = new MapCacheLRU( 100 );
|
||||
|
||||
parent::__construct( $engine );
|
||||
|
@ -331,6 +339,7 @@ class Scribunto_LuaUstringLibrary extends Scribunto_LuaLibraryBase {
|
|||
$captparen = array();
|
||||
$opencapt = array();
|
||||
$bct = 0;
|
||||
|
||||
for ( $i = 0; $i < $len; $i++ ) {
|
||||
$ii = $i + 1;
|
||||
$q = false;
|
||||
|
@ -608,30 +617,45 @@ class Scribunto_LuaUstringLibrary extends Scribunto_LuaLibraryBase {
|
|||
|
||||
if ( $n === null ) {
|
||||
$n = -1;
|
||||
} elseif ( $n < 0 ) {
|
||||
$n = 0;
|
||||
} elseif ( $n < 1 ) {
|
||||
return array( $s, 0 );
|
||||
}
|
||||
|
||||
list( $re, $capt, $anypos ) = $this->patternToRegex( $pattern, '^', 'gsub' );
|
||||
$captures = array();
|
||||
|
||||
if ( $this->phpBug53823 ) {
|
||||
// PHP bug 53823 means that a zero-length match before a UTF-8
|
||||
// character will match again before every byte of that character.
|
||||
// The workaround is to capture the first "character" of/after the
|
||||
// match and verify that its first byte is legal to start a UTF-8
|
||||
// character.
|
||||
$re = '/(?=(?<phpBug53823>.|$))' . substr( $re, 1 );
|
||||
}
|
||||
|
||||
if ( $anypos ) {
|
||||
// preg_replace_callback doesn't take a "flags" argument, so we
|
||||
// can't pass PREG_OFFSET_CAPTURE to it, which is needed to handle
|
||||
// position captures. So instead we have to do a preg_match_all and
|
||||
// handle the captures ourselves.
|
||||
$ct = preg_match_all( $re, $s, $mm, PREG_OFFSET_CAPTURE | PREG_SET_ORDER );
|
||||
if ( $n >= 0 ) {
|
||||
$ct = min( $ct, $n );
|
||||
}
|
||||
for ( $i = 0; $i < $ct; $i++ ) {
|
||||
$m = $mm[$i];
|
||||
if ( $this->phpBug53823 ) {
|
||||
$c = ord( $m['phpBug53823'][0] );
|
||||
if ( $c >= 0x80 && $c <= 0xbf ) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
$c = array( $m[0][0] );
|
||||
foreach ( $this->addCapturesFromMatch( array(), $s, $m, $capt, false ) as $k => $v ) {
|
||||
$k++;
|
||||
$c["m$k"] = $v;
|
||||
}
|
||||
$captures[] = $c;
|
||||
if ( $n >= 0 && count( $captures ) >= $n ) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -693,12 +717,31 @@ class Scribunto_LuaUstringLibrary extends Scribunto_LuaLibraryBase {
|
|||
$this->checkType( 'gsub', 3, $repl, 'function or table or string' );
|
||||
}
|
||||
|
||||
$skippedMatches = 0;
|
||||
if ( $this->phpBug53823 ) {
|
||||
// Since we're having bogus matches, we need to keep track of the
|
||||
// necessary adjustment and stop manually once we hit the limit.
|
||||
$maxMatches = $n < 0 ? INF : $n;
|
||||
$n = -1;
|
||||
$realCallback = $cb;
|
||||
$cb = function ( $m ) use ( $realCallback, &$skippedMatches, &$maxMatches ) {
|
||||
$c = ord( $m['phpBug53823'] );
|
||||
if ( $c >= 0x80 && $c <= 0xbf || $maxMatches <= 0 ) {
|
||||
$skippedMatches++;
|
||||
return $m[0];
|
||||
} else {
|
||||
$maxMatches--;
|
||||
return $realCallback( $m );
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
$count = 0;
|
||||
$s2 = preg_replace_callback( $re, $cb, $s, $n, $count );
|
||||
if ( $s2 === null ) {
|
||||
self::handlePCREError( preg_last_error(), $pattern );
|
||||
}
|
||||
return array( $s2, $count );
|
||||
return array( $s2, $count - $skippedMatches );
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -750,16 +750,23 @@ end
|
|||
-- matches a partial UTF-8 character, but the others will happily enough
|
||||
-- match a whole UTF-8 character thinking it's 2, 3 or 4.
|
||||
-- * If it contains position-captures.
|
||||
-- * If it matches the empty string.
|
||||
--
|
||||
-- @param string pattern
|
||||
-- @return boolean
|
||||
local function patternIsSimple( pattern )
|
||||
local findWithPcall = function ( ... )
|
||||
local ok, ret = pcall( S.find, ... )
|
||||
return ok and ret
|
||||
end
|
||||
|
||||
return not (
|
||||
S.find( pattern, '[\128-\255]' ) or
|
||||
S.find( pattern, '%[%^' ) or
|
||||
S.find( pattern, '%%[acdlpsuwxACDLPSUWXZ]' ) or
|
||||
S.find( pattern, '%.[^*+-]' ) or S.find( pattern, '%.$' ) or
|
||||
S.find( pattern, '()', 1, true )
|
||||
S.find( pattern, '()', 1, true ) or
|
||||
pattern == '' or findWithPcall( '', pattern )
|
||||
)
|
||||
end
|
||||
|
||||
|
@ -923,6 +930,14 @@ function ustring.gsub( s, pattern, repl, n )
|
|||
end
|
||||
end
|
||||
|
||||
if n == nil then
|
||||
n = 1e100
|
||||
end
|
||||
if n < 1 then
|
||||
-- No replacement
|
||||
return s, 0
|
||||
end
|
||||
|
||||
local cps = utf8_explode( s )
|
||||
if cps == nil then
|
||||
error( "bad argument #1 for 'gsub' (string is not UTF-8)", 2 )
|
||||
|
@ -931,9 +946,6 @@ function ustring.gsub( s, pattern, repl, n )
|
|||
if pat == nil then
|
||||
error( "bad argument #2 for 'gsub' (string is not UTF-8)", 2 )
|
||||
end
|
||||
if n == nil then
|
||||
n = 1e100
|
||||
end
|
||||
|
||||
if pat.codepoints[1] == 0x5e then -- '^': Pattern is anchored
|
||||
-- There can be only the one match, so make that explicit
|
||||
|
@ -957,8 +969,9 @@ function ustring.gsub( s, pattern, repl, n )
|
|||
local init = 1
|
||||
local ct = 0
|
||||
local ret = {}
|
||||
while init < cps.len + 1 and ct < n do
|
||||
local m = { find( s, cps, pattern, pat, init ) }
|
||||
local zeroAdjustment = 0
|
||||
repeat
|
||||
local m = { find( s, cps, pattern, pat, init + zeroAdjustment ) }
|
||||
if not m[1] then
|
||||
break
|
||||
end
|
||||
|
@ -1001,7 +1014,8 @@ function ustring.gsub( s, pattern, repl, n )
|
|||
ret[#ret + 1] = val or mm
|
||||
init = m[2] + 1
|
||||
ct = ct + 1
|
||||
end
|
||||
zeroAdjustment = m[2] < m[1] and 1 or 0
|
||||
until init > cps.len or ct >= n
|
||||
if init <= cps.len then
|
||||
ret[#ret + 1] = sub( s, cps, init, cps.len )
|
||||
end
|
||||
|
|
|
@ -515,6 +515,22 @@ return testframework.getTestProvider( {
|
|||
args = { 'á', 'á', 'X' },
|
||||
expect = { 'X', 1 }
|
||||
},
|
||||
{ name = 'gsub: (one char string, empty pattern)', func = mw.ustring.gsub,
|
||||
args = { 'á', '', 'X' },
|
||||
expect = { 'XáX', 2 }
|
||||
},
|
||||
{ name = 'gsub: (empty pattern with position captures)', func = mw.ustring.gsub,
|
||||
args = { 'ábć', '()', '%1' },
|
||||
expect = { '1á2b3ć4', 4 }
|
||||
},
|
||||
{ name = 'gsub: (limited to 1 replacement)', func = mw.ustring.gsub,
|
||||
args = { 'áá', 'á', 'X', 1 },
|
||||
expect = { 'Xá', 1 }
|
||||
},
|
||||
{ name = 'gsub: (limited to 0 replacements)', func = mw.ustring.gsub,
|
||||
args = { 'áá', 'á', 'X', 0 },
|
||||
expect = { 'áá', 0 }
|
||||
},
|
||||
{ name = 'gsub: (string 1)', func = mw.ustring.gsub,
|
||||
args = { str2, 'f%a+', 'X' },
|
||||
expect = { 'X bar X X baz X X X', 6 }
|
||||
|
|
Loading…
Reference in a new issue