Fix mw.ustring.gmatch and patterns with '^'

The Lua manual says this:

 For this function, a '^' at the start of a pattern does not work as an
 anchor, as this would prevent the iteration.

I had interpreted that to mean that a pattern starting with '^' would
never match in gmatch. But further testing reveals that a leading '^' is
simply treated as a literal character: string.gmatch( "foo ^bar baz", "^%a+" )
will match "^bar".

Make mw.ustring.gmatch behave the same way in both the PHP-backed and
the pure-Lua implementations.

Change-Id: Id91d6ee2db753ce1d6a4f6ae27764691d9e9fdc4
This commit is contained in:
Brad Jorsch 2013-02-14 14:20:57 -05:00
parent c98cc64545
commit 4dcac2fcd9
4 changed files with 9 additions and 19 deletions

View file

@ -232,7 +232,7 @@ class Scribunto_LuaUstringLibrary extends Scribunto_LuaLibraryBase {
}
/* Convert a Lua pattern into a PCRE regex */
private function patternToRegex( $pattern ) {
private function patternToRegex( $pattern, $noAnchor = false ) {
$pat = preg_split( '//us', $pattern, null, PREG_SPLIT_NO_EMPTY );
static $charsets = null, $brcharsets = null;
@ -295,7 +295,7 @@ class Scribunto_LuaUstringLibrary extends Scribunto_LuaLibraryBase {
switch ( $pat[$i] ) {
case '^':
$q = $i;
$re .= $q ? '\\^' : '^';
$re .= ( $noAnchor || $q ) ? '\\^' : '^';
break;
case '$':
@ -497,7 +497,7 @@ class Scribunto_LuaUstringLibrary extends Scribunto_LuaLibraryBase {
$this->checkString( 'gmatch', $s );
$this->checkPattern( 'gmatch', $pattern );
list( $re, $capt ) = $this->patternToRegex( $pattern );
list( $re, $capt ) = $this->patternToRegex( $pattern, true );
return array( $re, $capt );
}

View file

@ -14,10 +14,6 @@ local function php_gmatch( s, pattern )
checkType( 'gmatch', 1, s, 'string' )
checkType( 'gmatch', 2, pattern, 'string' )
if string.sub( pattern, 1, 1 ) == '^' then
return function() return nil end, nil, nil
end
local re, capt = gmatch_init( s, pattern )
local pos = 0
return function()

View file

@ -397,10 +397,11 @@ setmetatable( charset_cache, { __weak = 'kv' } )
-- @param rawpat string Pattern
-- @param pattern table Exploded pattern
-- @param init int Starting index
-- @param noAnchor boolean True to treat a leading '^' as a literal character instead of an anchor
-- @return int starting index of the match
-- @return int ending index of the match
-- @return string|int* captures
local function find( s, cps, rawpat, pattern, init )
local function find( s, cps, rawpat, pattern, init, noAnchor )
local charsets = require 'ustring/charsets'
local anchor = false
local ncapt, captures
@ -676,7 +677,7 @@ local function find( s, cps, rawpat, pattern, init )
-- match.
local sp = init
local pp = 1
if pattern.codepoints[1] == 0x5e then -- '^': Pattern is anchored
if not noAnchor and pattern.codepoints[1] == 0x5e then -- '^': Pattern is anchored
anchor = true
pp = 2
end
@ -838,15 +839,8 @@ function ustring.gmatch( s, pattern )
end
local init = 1
if pat.codepoints[1] == 0x5e then -- '^': Pattern is anchored
-- Lua special-cases this to never match
return function ()
return nil
end
end
return function ()
local m = { find( s, cps, pattern, pat, init ) }
local m = { find( s, cps, pattern, pat, init, true ) }
if not m[1] then
return nil
end

View file

@ -396,8 +396,8 @@ return testframework.getTestProvider( {
type = 'Iterator'
},
{ name = 'gmatch: anchored', func = mw.ustring.gmatch,
args = { str2, '^f%a+' },
expect = {},
args = { "fóó1 ^fóó2 fóó3 ^fóó4", '^fóó%d+' },
expect = { { "^fóó2" }, { "^fóó4" } },
type = 'Iterator'
},