Merge "ustring: Handle "empty" charset like Lua does (part 2)"

This commit is contained in:
jenkins-bot 2015-10-30 16:34:54 +00:00 committed by Gerrit Code Review
commit b8830a3e57
3 changed files with 113 additions and 30 deletions

View file

@ -445,7 +445,7 @@ class Scribunto_LuaUstringLibrary extends Scribunto_LuaLibraryBase {
$re .= '^';
$i++;
}
for (; $i < $len && $pat[$i] !== ']'; $i++ ) {
for ( $j = $i; $i < $len && ( $j == $i || $pat[$i] !== ']' ); $i++ ) {
if ( $pat[$i] === '%' ) {
$i++;
if ( $i >= $len ) {
@ -456,21 +456,30 @@ class Scribunto_LuaUstringLibrary extends Scribunto_LuaLibraryBase {
} else {
$re .= preg_quote( $pat[$i], '/' );
}
} elseif ( $i + 2 < $len && $pat[$i + 1] === '-' && $pat[$i + 2] !== ']' ) {
$re .= preg_quote( $pat[$i], '/' ) . '-' . preg_quote( $pat[$i+2], '/' );
} elseif ( $i + 2 < $len && $pat[$i + 1] === '-' && $pat[$i + 2] !== ']' && $pat[$i + 2] !== '%' ) {
if ( $pat[$i] <= $pat[$i+2] ) {
$re .= preg_quote( $pat[$i], '/' ) . '-' . preg_quote( $pat[$i+2], '/' );
}
$i += 2;
} else {
$re .= preg_quote( $pat[$i], '/' );
}
}
if ( $re === '[' || $re === '[^' ) {
// This is the error Lua's string functions throw in this situation
throw new Scribunto_LuaError( "malformed pattern (missing ']')" );
}
if ( $i >= $len ) {
throw new Scribunto_LuaError( "Missing close-bracket for character set beginning at pattern character $ii" );
}
$re .= ']';
// Lua just ignores invalid ranges, while pcre throws an error.
// We filter them out above, but then we need to special-case empty sets
if ( $re === '[]' ) {
// Can't directly quantify (*FAIL), so wrap it.
// "(?!)" would be simpler and could be quantified if not for a bug in PCRE 8.13 to 8.33
$re = '(?:(*FAIL))';
} elseif ( $re === '[^]' ) {
$re = '.'; // 's' modifier is always used, so this works
}
return array( $i, $re );
}

View file

@ -558,7 +558,14 @@ local function find( s, cps, rawpat, pattern, init, noAnchor )
-- Returns the position after the set and a table holding the matching characters
parse_charset = function ( pp )
local _, ep
local epp = pattern.bytepos[pp]
local epp = pattern.bytepos[pp] + 1
if S.sub( rawpat, epp, epp ) == '^' then
epp = epp + 1
end
if S.sub( rawpat, epp, epp ) == ']' then
-- Lua's string module effectively does this
epp = epp + 1
end
repeat
_, ep = S.find( rawpat, ']', epp, true )
if not ep then
@ -567,10 +574,6 @@ local function find( s, cps, rawpat, pattern, init, noAnchor )
epp = ep + 1
until S.byte( rawpat, ep - 1 ) ~= 0x25 or S.byte( rawpat, ep - 2 ) == 0x25
local key = S.sub( rawpat, pattern.bytepos[pp], ep )
if key == '[]' or key == '[^]' then
-- This is the error Lua's string functions throw in this situation
error( "malformed pattern (missing ']')" )
end
if charset_cache[key] then
local pl, cs = unpack( charset_cache[key] )
return pp + pl, cs
@ -585,9 +588,13 @@ local function find( s, cps, rawpat, pattern, init, noAnchor )
invert = true
pp = pp + 1
end
local first = true
while true do
local c = pattern.codepoints[pp]
if c == 0x25 then -- '%'
if not first and c == 0x5d then -- closing ']'
pp = pp + 1
break
elseif c == 0x25 then -- '%'
c = pattern.codepoints[pp + 1]
if charsets[c] then
csrefs[#csrefs + 1] = charsets[c]
@ -600,15 +607,13 @@ local function find( s, cps, rawpat, pattern, init, noAnchor )
cs[i] = 1
end
pp = pp + 3
elseif c == 0x5d then -- closing ']'
pp = pp + 1
break
elseif not c then -- Should never get here, but Just In Case...
error( 'Missing close-bracket', 3 )
else
cs[c] = 1
pp = pp + 1
end
first = false
end
local ret
@ -799,12 +804,19 @@ function ustring.find( s, pattern, init, plain )
if init and init > cps.len + 1 then
init = cps.len + 1
end
local m = { S.find( s, pattern, cps.bytepos[init], plain ) }
if m[1] then
m[1] = cpoffset( cps, m[1] )
m[2] = cpoffset( cps, m[2] )
local m
if plain then
m = { true, S.find( s, pattern, cps.bytepos[init], plain ) }
else
m = { pcall( S.find, s, pattern, cps.bytepos[init], plain ) }
end
if m[1] then
if m[2] then
m[2] = cpoffset( cps, m[2] )
m[3] = cpoffset( cps, m[3] )
end
return unpack( m, 2 )
end
return unpack( m )
end
return find( s, cps, pattern, pat, init )
@ -832,7 +844,10 @@ function ustring.match( s, pattern, init )
end
if patternIsSimple( pattern ) then
return S.match( s, pattern, cps.bytepos[init] )
local ret = { pcall( S.match, s, pattern, cps.bytepos[init] ) }
if ret[1] then
return unpack( ret, 2 )
end
end
local m = { find( s, cps, pattern, pat, init ) }
@ -858,7 +873,10 @@ function ustring.gmatch( s, pattern )
checkString( 'gmatch', s )
checkPattern( 'gmatch', pattern )
if patternIsSimple( pattern ) then
return S.gmatch( s, pattern )
local ret = { pcall( S.gmatch, s, pattern ) }
if ret[1] then
return unpack( ret, 2 )
end
end
local cps = utf8_explode( s )
@ -899,7 +917,10 @@ function ustring.gsub( s, pattern, repl, n )
checkPattern( 'gsub', pattern )
checkType( 'gsub', 4, n, 'number', true )
if patternIsSimple( pattern ) then
return S.gsub( s, pattern, repl, n )
local ret = { pcall( S.gsub, s, pattern, repl, n ) }
if ret[1] then
return unpack( ret, 2 )
end
end
local cps = utf8_explode( s )

View file

@ -429,13 +429,53 @@ return testframework.getTestProvider( {
args = { "fóó", '([^a-z])' },
expect = { 2, 2, 'ó' }
},
{ name = 'find: empty character set', func = mw.ustring.find,
args = { "fóó", '[]' },
expect = "malformed pattern (missing ']')"
{ name = 'find: Bracket at start of a character set doesn\'t close', func = mw.ustring.find,
args = { "fóó", '()[]' },
expect = "Missing close-bracket for character set beginning at pattern character 3"
},
{ name = 'find: empty negated character set', func = mw.ustring.find,
args = { "fóó", '[^]' },
expect = "malformed pattern (missing ']')"
{ name = 'find: Bracket at start of a negated character set doesn\'t close', func = mw.ustring.find,
args = { "fóó", '()[^]' },
expect = "Missing close-bracket for character set beginning at pattern character 3"
},
{ name = 'find: Bracket at start of a character set is literal', func = mw.ustring.find,
args = { "foo]bar¿", '()([]])' },
expect = { 4, 4, 4, ']' }
},
{ name = 'find: Bracket at start of a negated character set is literal', func = mw.ustring.find,
args = { "]bar¿", '()([^]])' },
expect = { 2, 2, 2, 'b' }
},
{ name = 'find: Bracket at start of a character set can be a range endpoint', func = mw.ustring.find,
args = { "foo]bar¿", '()([]-z]+)' },
expect = { 1, 7, 1, 'foo]bar' }
},
{ name = 'find: Bracket at start of a negated character can be a range endpoint', func = mw.ustring.find,
args = { "fOO]bar¿", '()([^]-z]+)' },
expect = { 2, 3, 2, 'OO' }
},
{ name = 'find: Weird edge-case that was failing (1)', func = mw.ustring.find,
args = { "foo]ba-]r¿", '()([a]-%]+)' },
expect = { 4, 4, 4, ']' }
},
{ name = 'find: Weird edge-case that was failing (2)', func = mw.ustring.find,
args = { "foo¿", '()[!-%]' },
expect = "Missing close-bracket for character set beginning at pattern character 3"
},
{ name = 'find: Inverted range (1)', func = mw.ustring.find,
args = { "foo¿", '()([z-a]+)' },
expect = { nil }
},
{ name = 'find: Inverted range (2)', func = mw.ustring.find,
args = { "foo¿", '()([^z-a]+)' },
expect = { 1, 4, 1, 'foo¿' }
},
{ name = 'find: Inverted range (3)', func = mw.ustring.find,
args = { "foo¿", '()(f[z-a]o)' },
expect = { nil }
},
{ name = 'find: Inverted range (4)', func = mw.ustring.find,
args = { "foo¿", '()(f[z-a]*o)' },
expect = { 1, 2, 1, 'fo' }
},
{ name = 'match: (1)', func = mw.ustring.match,
@ -611,6 +651,19 @@ return testframework.getTestProvider( {
type = 'Iterator'
},
{ name = 'find: Pure-lua version, non-native error message', func = mw.ustring.find,
args = { "fóó", '[]' },
expect = "Missing close-bracket for character set beginning at pattern character 1"
},
{ name = 'match: Pure-lua version, non-native error message', func = mw.ustring.match,
args = { "fóó", '[]' },
expect = "Missing close-bracket for character set beginning at pattern character 1"
},
{ name = 'gsub: Pure-lua version, non-native error message', func = mw.ustring.gsub,
args = { "fóó", '[]', '' },
expect = "Missing close-bracket for character set beginning at pattern character 1"
},
{ name = 'string length limit',
func = function ()
local s = string.rep( "x", mw.ustring.maxStringLength + 1 )