mirror of
https://gerrit.wikimedia.org/r/mediawiki/extensions/Scribunto
synced 2024-11-24 00:05:00 +00:00
Fix pure-Lua ustring and empty patterns
An empty pattern isn't "safe" since it could match in between the bytes of a UTF-8 character. Also, it turns out there's a bug in PHP <5.6.9 preg_replace() that we need to work around too. Change-Id: I282e5909e4663461d60c5386693db182de2fd44c
This commit is contained in:
parent
c48bda0698
commit
629f11d0dd
|
@ -22,6 +22,13 @@ class Scribunto_LuaUstringLibrary extends Scribunto_LuaLibraryBase {
|
||||||
*/
|
*/
|
||||||
private $manualCheckForU110000AndUp = false;
|
private $manualCheckForU110000AndUp = false;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* PHP versions before 5.6.9 are buggy when the regex in preg_replace and
|
||||||
|
* preg_match_all matches the empty string.
|
||||||
|
* @var boolean
|
||||||
|
*/
|
||||||
|
private $phpBug53823 = false;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A cache of patterns and the regexes they generate.
|
* A cache of patterns and the regexes they generate.
|
||||||
* @var array
|
* @var array
|
||||||
|
@ -35,6 +42,7 @@ class Scribunto_LuaUstringLibrary extends Scribunto_LuaLibraryBase {
|
||||||
}
|
}
|
||||||
|
|
||||||
$this->manualCheckForU110000AndUp = mb_check_encoding( "\xf4\x90\x80\x80", "UTF-8" );
|
$this->manualCheckForU110000AndUp = mb_check_encoding( "\xf4\x90\x80\x80", "UTF-8" );
|
||||||
|
$this->phpBug53823 = preg_replace( '//us', 'x', "\xc3\xa1" ) === "x\xc3x\xa1x";
|
||||||
$this->patternRegexCache = new MapCacheLRU( 100 );
|
$this->patternRegexCache = new MapCacheLRU( 100 );
|
||||||
|
|
||||||
parent::__construct( $engine );
|
parent::__construct( $engine );
|
||||||
|
@ -331,6 +339,7 @@ class Scribunto_LuaUstringLibrary extends Scribunto_LuaLibraryBase {
|
||||||
$captparen = array();
|
$captparen = array();
|
||||||
$opencapt = array();
|
$opencapt = array();
|
||||||
$bct = 0;
|
$bct = 0;
|
||||||
|
|
||||||
for ( $i = 0; $i < $len; $i++ ) {
|
for ( $i = 0; $i < $len; $i++ ) {
|
||||||
$ii = $i + 1;
|
$ii = $i + 1;
|
||||||
$q = false;
|
$q = false;
|
||||||
|
@ -608,30 +617,45 @@ class Scribunto_LuaUstringLibrary extends Scribunto_LuaLibraryBase {
|
||||||
|
|
||||||
if ( $n === null ) {
|
if ( $n === null ) {
|
||||||
$n = -1;
|
$n = -1;
|
||||||
} elseif ( $n < 0 ) {
|
} elseif ( $n < 1 ) {
|
||||||
$n = 0;
|
return array( $s, 0 );
|
||||||
}
|
}
|
||||||
|
|
||||||
list( $re, $capt, $anypos ) = $this->patternToRegex( $pattern, '^', 'gsub' );
|
list( $re, $capt, $anypos ) = $this->patternToRegex( $pattern, '^', 'gsub' );
|
||||||
$captures = array();
|
$captures = array();
|
||||||
|
|
||||||
|
if ( $this->phpBug53823 ) {
|
||||||
|
// PHP bug 53823 means that a zero-length match before a UTF-8
|
||||||
|
// character will match again before every byte of that character.
|
||||||
|
// The workaround is to capture the first "character" of/after the
|
||||||
|
// match and verify that its first byte is legal to start a UTF-8
|
||||||
|
// character.
|
||||||
|
$re = '/(?=(?<phpBug53823>.|$))' . substr( $re, 1 );
|
||||||
|
}
|
||||||
|
|
||||||
if ( $anypos ) {
|
if ( $anypos ) {
|
||||||
// preg_replace_callback doesn't take a "flags" argument, so we
|
// preg_replace_callback doesn't take a "flags" argument, so we
|
||||||
// can't pass PREG_OFFSET_CAPTURE to it, which is needed to handle
|
// can't pass PREG_OFFSET_CAPTURE to it, which is needed to handle
|
||||||
// position captures. So instead we have to do a preg_match_all and
|
// position captures. So instead we have to do a preg_match_all and
|
||||||
// handle the captures ourselves.
|
// handle the captures ourselves.
|
||||||
$ct = preg_match_all( $re, $s, $mm, PREG_OFFSET_CAPTURE | PREG_SET_ORDER );
|
$ct = preg_match_all( $re, $s, $mm, PREG_OFFSET_CAPTURE | PREG_SET_ORDER );
|
||||||
if ( $n >= 0 ) {
|
|
||||||
$ct = min( $ct, $n );
|
|
||||||
}
|
|
||||||
for ( $i = 0; $i < $ct; $i++ ) {
|
for ( $i = 0; $i < $ct; $i++ ) {
|
||||||
$m = $mm[$i];
|
$m = $mm[$i];
|
||||||
|
if ( $this->phpBug53823 ) {
|
||||||
|
$c = ord( $m['phpBug53823'][0] );
|
||||||
|
if ( $c >= 0x80 && $c <= 0xbf ) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
$c = array( $m[0][0] );
|
$c = array( $m[0][0] );
|
||||||
foreach ( $this->addCapturesFromMatch( array(), $s, $m, $capt, false ) as $k => $v ) {
|
foreach ( $this->addCapturesFromMatch( array(), $s, $m, $capt, false ) as $k => $v ) {
|
||||||
$k++;
|
$k++;
|
||||||
$c["m$k"] = $v;
|
$c["m$k"] = $v;
|
||||||
}
|
}
|
||||||
$captures[] = $c;
|
$captures[] = $c;
|
||||||
|
if ( $n >= 0 && count( $captures ) >= $n ) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -693,12 +717,31 @@ class Scribunto_LuaUstringLibrary extends Scribunto_LuaLibraryBase {
|
||||||
$this->checkType( 'gsub', 3, $repl, 'function or table or string' );
|
$this->checkType( 'gsub', 3, $repl, 'function or table or string' );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
$skippedMatches = 0;
|
||||||
|
if ( $this->phpBug53823 ) {
|
||||||
|
// Since we're having bogus matches, we need to keep track of the
|
||||||
|
// necessary adjustment and stop manually once we hit the limit.
|
||||||
|
$maxMatches = $n < 0 ? INF : $n;
|
||||||
|
$n = -1;
|
||||||
|
$realCallback = $cb;
|
||||||
|
$cb = function ( $m ) use ( $realCallback, &$skippedMatches, &$maxMatches ) {
|
||||||
|
$c = ord( $m['phpBug53823'] );
|
||||||
|
if ( $c >= 0x80 && $c <= 0xbf || $maxMatches <= 0 ) {
|
||||||
|
$skippedMatches++;
|
||||||
|
return $m[0];
|
||||||
|
} else {
|
||||||
|
$maxMatches--;
|
||||||
|
return $realCallback( $m );
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
$count = 0;
|
$count = 0;
|
||||||
$s2 = preg_replace_callback( $re, $cb, $s, $n, $count );
|
$s2 = preg_replace_callback( $re, $cb, $s, $n, $count );
|
||||||
if ( $s2 === null ) {
|
if ( $s2 === null ) {
|
||||||
self::handlePCREError( preg_last_error(), $pattern );
|
self::handlePCREError( preg_last_error(), $pattern );
|
||||||
}
|
}
|
||||||
return array( $s2, $count );
|
return array( $s2, $count - $skippedMatches );
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -750,16 +750,23 @@ end
|
||||||
-- matches a partial UTF-8 character, but the others will happily enough
|
-- matches a partial UTF-8 character, but the others will happily enough
|
||||||
-- match a whole UTF-8 character thinking it's 2, 3 or 4.
|
-- match a whole UTF-8 character thinking it's 2, 3 or 4.
|
||||||
-- * If it contains position-captures.
|
-- * If it contains position-captures.
|
||||||
|
-- * If it matches the empty string
|
||||||
--
|
--
|
||||||
-- @param string pattern
|
-- @param string pattern
|
||||||
-- @return boolean
|
-- @return boolean
|
||||||
local function patternIsSimple( pattern )
|
local function patternIsSimple( pattern )
|
||||||
|
local findWithPcall = function ( ... )
|
||||||
|
local ok, ret = pcall( S.find, ... )
|
||||||
|
return ok and ret
|
||||||
|
end
|
||||||
|
|
||||||
return not (
|
return not (
|
||||||
S.find( pattern, '[\128-\255]' ) or
|
S.find( pattern, '[\128-\255]' ) or
|
||||||
S.find( pattern, '%[%^' ) or
|
S.find( pattern, '%[%^' ) or
|
||||||
S.find( pattern, '%%[acdlpsuwxACDLPSUWXZ]' ) or
|
S.find( pattern, '%%[acdlpsuwxACDLPSUWXZ]' ) or
|
||||||
S.find( pattern, '%.[^*+-]' ) or S.find( pattern, '%.$' ) or
|
S.find( pattern, '%.[^*+-]' ) or S.find( pattern, '%.$' ) or
|
||||||
S.find( pattern, '()', 1, true )
|
S.find( pattern, '()', 1, true ) or
|
||||||
|
pattern == '' or findWithPcall( '', pattern )
|
||||||
)
|
)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -923,6 +930,14 @@ function ustring.gsub( s, pattern, repl, n )
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
if n == nil then
|
||||||
|
n = 1e100
|
||||||
|
end
|
||||||
|
if n < 1 then
|
||||||
|
-- No replacement
|
||||||
|
return s, 0
|
||||||
|
end
|
||||||
|
|
||||||
local cps = utf8_explode( s )
|
local cps = utf8_explode( s )
|
||||||
if cps == nil then
|
if cps == nil then
|
||||||
error( "bad argument #1 for 'gsub' (string is not UTF-8)", 2 )
|
error( "bad argument #1 for 'gsub' (string is not UTF-8)", 2 )
|
||||||
|
@ -931,9 +946,6 @@ function ustring.gsub( s, pattern, repl, n )
|
||||||
if pat == nil then
|
if pat == nil then
|
||||||
error( "bad argument #2 for 'gsub' (string is not UTF-8)", 2 )
|
error( "bad argument #2 for 'gsub' (string is not UTF-8)", 2 )
|
||||||
end
|
end
|
||||||
if n == nil then
|
|
||||||
n = 1e100
|
|
||||||
end
|
|
||||||
|
|
||||||
if pat.codepoints[1] == 0x5e then -- '^': Pattern is anchored
|
if pat.codepoints[1] == 0x5e then -- '^': Pattern is anchored
|
||||||
-- There can be only the one match, so make that explicit
|
-- There can be only the one match, so make that explicit
|
||||||
|
@ -957,8 +969,9 @@ function ustring.gsub( s, pattern, repl, n )
|
||||||
local init = 1
|
local init = 1
|
||||||
local ct = 0
|
local ct = 0
|
||||||
local ret = {}
|
local ret = {}
|
||||||
while init < cps.len + 1 and ct < n do
|
local zeroAdjustment = 0
|
||||||
local m = { find( s, cps, pattern, pat, init ) }
|
repeat
|
||||||
|
local m = { find( s, cps, pattern, pat, init + zeroAdjustment ) }
|
||||||
if not m[1] then
|
if not m[1] then
|
||||||
break
|
break
|
||||||
end
|
end
|
||||||
|
@ -1001,7 +1014,8 @@ function ustring.gsub( s, pattern, repl, n )
|
||||||
ret[#ret + 1] = val or mm
|
ret[#ret + 1] = val or mm
|
||||||
init = m[2] + 1
|
init = m[2] + 1
|
||||||
ct = ct + 1
|
ct = ct + 1
|
||||||
end
|
zeroAdjustment = m[2] < m[1] and 1 or 0
|
||||||
|
until init > cps.len or ct >= n
|
||||||
if init <= cps.len then
|
if init <= cps.len then
|
||||||
ret[#ret + 1] = sub( s, cps, init, cps.len )
|
ret[#ret + 1] = sub( s, cps, init, cps.len )
|
||||||
end
|
end
|
||||||
|
|
|
@ -515,6 +515,22 @@ return testframework.getTestProvider( {
|
||||||
args = { 'á', 'á', 'X' },
|
args = { 'á', 'á', 'X' },
|
||||||
expect = { 'X', 1 }
|
expect = { 'X', 1 }
|
||||||
},
|
},
|
||||||
|
{ name = 'gsub: (one char string, empty pattern)', func = mw.ustring.gsub,
|
||||||
|
args = { 'á', '', 'X' },
|
||||||
|
expect = { 'XáX', 2 }
|
||||||
|
},
|
||||||
|
{ name = 'gsub: (empty pattern with position captures)', func = mw.ustring.gsub,
|
||||||
|
args = { 'ábć', '()', '%1' },
|
||||||
|
expect = { '1á2b3ć4', 4 }
|
||||||
|
},
|
||||||
|
{ name = 'gsub: (limited to 1 replacement)', func = mw.ustring.gsub,
|
||||||
|
args = { 'áá', 'á', 'X', 1 },
|
||||||
|
expect = { 'Xá', 1 }
|
||||||
|
},
|
||||||
|
{ name = 'gsub: (limited to 0 replacements)', func = mw.ustring.gsub,
|
||||||
|
args = { 'áá', 'á', 'X', 0 },
|
||||||
|
expect = { 'áá', 0 }
|
||||||
|
},
|
||||||
{ name = 'gsub: (string 1)', func = mw.ustring.gsub,
|
{ name = 'gsub: (string 1)', func = mw.ustring.gsub,
|
||||||
args = { str2, 'f%a+', 'X' },
|
args = { str2, 'f%a+', 'X' },
|
||||||
expect = { 'X bar X X baz X X X', 6 }
|
expect = { 'X bar X X baz X X X', 6 }
|
||||||
|
|
Loading…
Reference in a new issue