mirror of
https://gerrit.wikimedia.org/r/mediawiki/extensions/Scribunto
synced 2024-09-24 10:49:52 +00:00
Merge "ustring: Handle "empty" charset like Lua does (part 2)"
This commit is contained in:
commit
b8830a3e57
|
@ -445,7 +445,7 @@ class Scribunto_LuaUstringLibrary extends Scribunto_LuaLibraryBase {
|
|||
$re .= '^';
|
||||
$i++;
|
||||
}
|
||||
for (; $i < $len && $pat[$i] !== ']'; $i++ ) {
|
||||
for ( $j = $i; $i < $len && ( $j == $i || $pat[$i] !== ']' ); $i++ ) {
|
||||
if ( $pat[$i] === '%' ) {
|
||||
$i++;
|
||||
if ( $i >= $len ) {
|
||||
|
@ -456,21 +456,30 @@ class Scribunto_LuaUstringLibrary extends Scribunto_LuaLibraryBase {
|
|||
} else {
|
||||
$re .= preg_quote( $pat[$i], '/' );
|
||||
}
|
||||
} elseif ( $i + 2 < $len && $pat[$i + 1] === '-' && $pat[$i + 2] !== ']' ) {
|
||||
} elseif ( $i + 2 < $len && $pat[$i + 1] === '-' && $pat[$i + 2] !== ']' && $pat[$i + 2] !== '%' ) {
|
||||
if ( $pat[$i] <= $pat[$i+2] ) {
|
||||
$re .= preg_quote( $pat[$i], '/' ) . '-' . preg_quote( $pat[$i+2], '/' );
|
||||
}
|
||||
$i += 2;
|
||||
} else {
|
||||
$re .= preg_quote( $pat[$i], '/' );
|
||||
}
|
||||
}
|
||||
if ( $re === '[' || $re === '[^' ) {
|
||||
// This is the error Lua's string functions throw in this situation
|
||||
throw new Scribunto_LuaError( "malformed pattern (missing ']')" );
|
||||
}
|
||||
if ( $i >= $len ) {
|
||||
throw new Scribunto_LuaError( "Missing close-bracket for character set beginning at pattern character $ii" );
|
||||
}
|
||||
$re .= ']';
|
||||
|
||||
// Lua just ignores invalid ranges, while pcre throws an error.
|
||||
// We filter them out above, but then we need to special-case empty sets
|
||||
if ( $re === '[]' ) {
|
||||
// Can't directly quantify (*FAIL), so wrap it.
|
||||
// "(?!)" would be simpler and could be quantified if not for a bug in PCRE 8.13 to 8.33
|
||||
$re = '(?:(*FAIL))';
|
||||
} elseif ( $re === '[^]' ) {
|
||||
$re = '.'; // 's' modifier is always used, so this works
|
||||
}
|
||||
|
||||
return array( $i, $re );
|
||||
}
|
||||
|
||||
|
|
|
@ -558,7 +558,14 @@ local function find( s, cps, rawpat, pattern, init, noAnchor )
|
|||
-- Returns the position after the set and a table holding the matching characters
|
||||
parse_charset = function ( pp )
|
||||
local _, ep
|
||||
local epp = pattern.bytepos[pp]
|
||||
local epp = pattern.bytepos[pp] + 1
|
||||
if S.sub( rawpat, epp, epp ) == '^' then
|
||||
epp = epp + 1
|
||||
end
|
||||
if S.sub( rawpat, epp, epp ) == ']' then
|
||||
-- Lua's string module effectively does this
|
||||
epp = epp + 1
|
||||
end
|
||||
repeat
|
||||
_, ep = S.find( rawpat, ']', epp, true )
|
||||
if not ep then
|
||||
|
@ -567,10 +574,6 @@ local function find( s, cps, rawpat, pattern, init, noAnchor )
|
|||
epp = ep + 1
|
||||
until S.byte( rawpat, ep - 1 ) ~= 0x25 or S.byte( rawpat, ep - 2 ) == 0x25
|
||||
local key = S.sub( rawpat, pattern.bytepos[pp], ep )
|
||||
if key == '[]' or key == '[^]' then
|
||||
-- This is the error Lua's string functions throw in this situation
|
||||
error( "malformed pattern (missing ']')" )
|
||||
end
|
||||
if charset_cache[key] then
|
||||
local pl, cs = unpack( charset_cache[key] )
|
||||
return pp + pl, cs
|
||||
|
@ -585,9 +588,13 @@ local function find( s, cps, rawpat, pattern, init, noAnchor )
|
|||
invert = true
|
||||
pp = pp + 1
|
||||
end
|
||||
local first = true
|
||||
while true do
|
||||
local c = pattern.codepoints[pp]
|
||||
if c == 0x25 then -- '%'
|
||||
if not first and c == 0x5d then -- closing ']'
|
||||
pp = pp + 1
|
||||
break
|
||||
elseif c == 0x25 then -- '%'
|
||||
c = pattern.codepoints[pp + 1]
|
||||
if charsets[c] then
|
||||
csrefs[#csrefs + 1] = charsets[c]
|
||||
|
@ -600,15 +607,13 @@ local function find( s, cps, rawpat, pattern, init, noAnchor )
|
|||
cs[i] = 1
|
||||
end
|
||||
pp = pp + 3
|
||||
elseif c == 0x5d then -- closing ']'
|
||||
pp = pp + 1
|
||||
break
|
||||
elseif not c then -- Should never get here, but Just In Case...
|
||||
error( 'Missing close-bracket', 3 )
|
||||
else
|
||||
cs[c] = 1
|
||||
pp = pp + 1
|
||||
end
|
||||
first = false
|
||||
end
|
||||
|
||||
local ret
|
||||
|
@ -799,12 +804,19 @@ function ustring.find( s, pattern, init, plain )
|
|||
if init and init > cps.len + 1 then
|
||||
init = cps.len + 1
|
||||
end
|
||||
local m = { S.find( s, pattern, cps.bytepos[init], plain ) }
|
||||
if m[1] then
|
||||
m[1] = cpoffset( cps, m[1] )
|
||||
m[2] = cpoffset( cps, m[2] )
|
||||
local m
|
||||
if plain then
|
||||
m = { true, S.find( s, pattern, cps.bytepos[init], plain ) }
|
||||
else
|
||||
m = { pcall( S.find, s, pattern, cps.bytepos[init], plain ) }
|
||||
end
|
||||
if m[1] then
|
||||
if m[2] then
|
||||
m[2] = cpoffset( cps, m[2] )
|
||||
m[3] = cpoffset( cps, m[3] )
|
||||
end
|
||||
return unpack( m, 2 )
|
||||
end
|
||||
return unpack( m )
|
||||
end
|
||||
|
||||
return find( s, cps, pattern, pat, init )
|
||||
|
@ -832,7 +844,10 @@ function ustring.match( s, pattern, init )
|
|||
end
|
||||
|
||||
if patternIsSimple( pattern ) then
|
||||
return S.match( s, pattern, cps.bytepos[init] )
|
||||
local ret = { pcall( S.match, s, pattern, cps.bytepos[init] ) }
|
||||
if ret[1] then
|
||||
return unpack( ret, 2 )
|
||||
end
|
||||
end
|
||||
|
||||
local m = { find( s, cps, pattern, pat, init ) }
|
||||
|
@ -858,7 +873,10 @@ function ustring.gmatch( s, pattern )
|
|||
checkString( 'gmatch', s )
|
||||
checkPattern( 'gmatch', pattern )
|
||||
if patternIsSimple( pattern ) then
|
||||
return S.gmatch( s, pattern )
|
||||
local ret = { pcall( S.gmatch, s, pattern ) }
|
||||
if ret[1] then
|
||||
return unpack( ret, 2 )
|
||||
end
|
||||
end
|
||||
|
||||
local cps = utf8_explode( s )
|
||||
|
@ -899,7 +917,10 @@ function ustring.gsub( s, pattern, repl, n )
|
|||
checkPattern( 'gsub', pattern )
|
||||
checkType( 'gsub', 4, n, 'number', true )
|
||||
if patternIsSimple( pattern ) then
|
||||
return S.gsub( s, pattern, repl, n )
|
||||
local ret = { pcall( S.gsub, s, pattern, repl, n ) }
|
||||
if ret[1] then
|
||||
return unpack( ret, 2 )
|
||||
end
|
||||
end
|
||||
|
||||
local cps = utf8_explode( s )
|
||||
|
|
|
@ -429,13 +429,53 @@ return testframework.getTestProvider( {
|
|||
args = { "fóó", '([^a-z])' },
|
||||
expect = { 2, 2, 'ó' }
|
||||
},
|
||||
{ name = 'find: empty character set', func = mw.ustring.find,
|
||||
args = { "fóó", '[]' },
|
||||
expect = "malformed pattern (missing ']')"
|
||||
{ name = 'find: Bracket at start of a character set doesn\'t close', func = mw.ustring.find,
|
||||
args = { "fóó", '()[]' },
|
||||
expect = "Missing close-bracket for character set beginning at pattern character 3"
|
||||
},
|
||||
{ name = 'find: empty negated character set', func = mw.ustring.find,
|
||||
args = { "fóó", '[^]' },
|
||||
expect = "malformed pattern (missing ']')"
|
||||
{ name = 'find: Bracket at start of a negated character set doesn\'t close', func = mw.ustring.find,
|
||||
args = { "fóó", '()[^]' },
|
||||
expect = "Missing close-bracket for character set beginning at pattern character 3"
|
||||
},
|
||||
{ name = 'find: Bracket at start of a character set is literal', func = mw.ustring.find,
|
||||
args = { "foo]bar¿", '()([]])' },
|
||||
expect = { 4, 4, 4, ']' }
|
||||
},
|
||||
{ name = 'find: Bracket at start of a negated character set is literal', func = mw.ustring.find,
|
||||
args = { "]bar¿", '()([^]])' },
|
||||
expect = { 2, 2, 2, 'b' }
|
||||
},
|
||||
{ name = 'find: Bracket at start of a character set can be a range endpoint', func = mw.ustring.find,
|
||||
args = { "foo]bar¿", '()([]-z]+)' },
|
||||
expect = { 1, 7, 1, 'foo]bar' }
|
||||
},
|
||||
{ name = 'find: Bracket at start of a negated character can be a range endpoint', func = mw.ustring.find,
|
||||
args = { "fOO]bar¿", '()([^]-z]+)' },
|
||||
expect = { 2, 3, 2, 'OO' }
|
||||
},
|
||||
{ name = 'find: Weird edge-case that was failing (1)', func = mw.ustring.find,
|
||||
args = { "foo]ba-]r¿", '()([a]-%]+)' },
|
||||
expect = { 4, 4, 4, ']' }
|
||||
},
|
||||
{ name = 'find: Weird edge-case that was failing (2)', func = mw.ustring.find,
|
||||
args = { "foo¿", '()[!-%]' },
|
||||
expect = "Missing close-bracket for character set beginning at pattern character 3"
|
||||
},
|
||||
{ name = 'find: Inverted range (1)', func = mw.ustring.find,
|
||||
args = { "foo¿", '()([z-a]+)' },
|
||||
expect = { nil }
|
||||
},
|
||||
{ name = 'find: Inverted range (2)', func = mw.ustring.find,
|
||||
args = { "foo¿", '()([^z-a]+)' },
|
||||
expect = { 1, 4, 1, 'foo¿' }
|
||||
},
|
||||
{ name = 'find: Inverted range (3)', func = mw.ustring.find,
|
||||
args = { "foo¿", '()(f[z-a]o)' },
|
||||
expect = { nil }
|
||||
},
|
||||
{ name = 'find: Inverted range (4)', func = mw.ustring.find,
|
||||
args = { "foo¿", '()(f[z-a]*o)' },
|
||||
expect = { 1, 2, 1, 'fo' }
|
||||
},
|
||||
|
||||
{ name = 'match: (1)', func = mw.ustring.match,
|
||||
|
@ -611,6 +651,19 @@ return testframework.getTestProvider( {
|
|||
type = 'Iterator'
|
||||
},
|
||||
|
||||
{ name = 'find: Pure-lua version, non-native error message', func = mw.ustring.find,
|
||||
args = { "fóó", '[]' },
|
||||
expect = "Missing close-bracket for character set beginning at pattern character 1"
|
||||
},
|
||||
{ name = 'match: Pure-lua version, non-native error message', func = mw.ustring.match,
|
||||
args = { "fóó", '[]' },
|
||||
expect = "Missing close-bracket for character set beginning at pattern character 1"
|
||||
},
|
||||
{ name = 'gsub: Pure-lua version, non-native error message', func = mw.ustring.gsub,
|
||||
args = { "fóó", '[]', '' },
|
||||
expect = "Missing close-bracket for character set beginning at pattern character 1"
|
||||
},
|
||||
|
||||
{ name = 'string length limit',
|
||||
func = function ()
|
||||
local s = string.rep( "x", mw.ustring.maxStringLength + 1 )
|
||||
|
|
Loading…
Reference in a new issue