mirror of
https://gerrit.wikimedia.org/r/mediawiki/extensions/Scribunto
synced 2024-11-24 00:05:00 +00:00
Fix mw.ustring.gmatch and patterns with '^'
The Lua manual says this: "For this function, a '^' at the start of a pattern does not work as an anchor, as this would prevent the iteration." I had interpreted that to mean that a pattern starting with '^' would never match in gmatch. But further testing reveals that the '^' is just treated as a literal character: string.gmatch( "foo ^bar baz", "^%a+" ) will match "^bar".

Change-Id: Id91d6ee2db753ce1d6a4f6ae27764691d9e9fdc4
This commit is contained in:
parent
c98cc64545
commit
4dcac2fcd9
|
@ -232,7 +232,7 @@ class Scribunto_LuaUstringLibrary extends Scribunto_LuaLibraryBase {
|
|||
}
|
||||
|
||||
/* Convert a Lua pattern into a PCRE regex */
|
||||
private function patternToRegex( $pattern ) {
|
||||
private function patternToRegex( $pattern, $noAnchor = false ) {
|
||||
$pat = preg_split( '//us', $pattern, null, PREG_SPLIT_NO_EMPTY );
|
||||
|
||||
static $charsets = null, $brcharsets = null;
|
||||
|
@ -295,7 +295,7 @@ class Scribunto_LuaUstringLibrary extends Scribunto_LuaLibraryBase {
|
|||
switch ( $pat[$i] ) {
|
||||
case '^':
|
||||
$q = $i;
|
||||
$re .= $q ? '\\^' : '^';
|
||||
$re .= ( $noAnchor || $q ) ? '\\^' : '^';
|
||||
break;
|
||||
|
||||
case '$':
|
||||
|
@ -497,7 +497,7 @@ class Scribunto_LuaUstringLibrary extends Scribunto_LuaLibraryBase {
|
|||
$this->checkString( 'gmatch', $s );
|
||||
$this->checkPattern( 'gmatch', $pattern );
|
||||
|
||||
list( $re, $capt ) = $this->patternToRegex( $pattern );
|
||||
list( $re, $capt ) = $this->patternToRegex( $pattern, true );
|
||||
return array( $re, $capt );
|
||||
}
|
||||
|
||||
|
|
|
@ -14,10 +14,6 @@ local function php_gmatch( s, pattern )
|
|||
checkType( 'gmatch', 1, s, 'string' )
|
||||
checkType( 'gmatch', 2, pattern, 'string' )
|
||||
|
||||
if string.sub( pattern, 1, 1 ) == '^' then
|
||||
return function() return nil end, nil, nil
|
||||
end
|
||||
|
||||
local re, capt = gmatch_init( s, pattern )
|
||||
local pos = 0
|
||||
return function()
|
||||
|
|
|
@ -397,10 +397,11 @@ setmetatable( charset_cache, { __weak = 'kv' } )
|
|||
-- @param rawpat string Pattern
|
||||
-- @param pattern table Exploded pattern
|
||||
-- @param init int Starting index
|
||||
-- @param noAnchor boolean True to treat a leading '^' as a literal character instead of an anchor
|
||||
-- @return int starting index of the match
|
||||
-- @return int ending index of the match
|
||||
-- @return string|int* captures
|
||||
local function find( s, cps, rawpat, pattern, init )
|
||||
local function find( s, cps, rawpat, pattern, init, noAnchor )
|
||||
local charsets = require 'ustring/charsets'
|
||||
local anchor = false
|
||||
local ncapt, captures
|
||||
|
@ -676,7 +677,7 @@ local function find( s, cps, rawpat, pattern, init )
|
|||
-- match.
|
||||
local sp = init
|
||||
local pp = 1
|
||||
if pattern.codepoints[1] == 0x5e then -- '^': Pattern is anchored
|
||||
if not noAnchor and pattern.codepoints[1] == 0x5e then -- '^': Pattern is anchored
|
||||
anchor = true
|
||||
pp = 2
|
||||
end
|
||||
|
@ -838,15 +839,8 @@ function ustring.gmatch( s, pattern )
|
|||
end
|
||||
local init = 1
|
||||
|
||||
if pat.codepoints[1] == 0x5e then -- '^': Pattern is anchored
|
||||
-- Lua special-cases this to never match
|
||||
return function ()
|
||||
return nil
|
||||
end
|
||||
end
|
||||
|
||||
return function ()
|
||||
local m = { find( s, cps, pattern, pat, init ) }
|
||||
local m = { find( s, cps, pattern, pat, init, true ) }
|
||||
if not m[1] then
|
||||
return nil
|
||||
end
|
||||
|
|
|
@ -396,8 +396,8 @@ return testframework.getTestProvider( {
|
|||
type = 'Iterator'
|
||||
},
|
||||
{ name = 'gmatch: anchored', func = mw.ustring.gmatch,
|
||||
args = { str2, '^f%a+' },
|
||||
expect = {},
|
||||
args = { "fóó1 ^fóó2 fóó3 ^fóó4", '^fóó%d+' },
|
||||
expect = { { "^fóó2" }, { "^fóó4" } },
|
||||
type = 'Iterator'
|
||||
},
|
||||
|
||||
|
|
Loading…
Reference in a new issue