mirror of
https://gerrit.wikimedia.org/r/mediawiki/extensions/Scribunto
synced 2024-11-24 00:05:00 +00:00
Add frontier pattern (%f[set]) to ustring
The "%f[set]" frontier pattern has been in Lua 5.1 since the beginning, but was undocumented until Lua 5.2, and the code is even unchanged from 5.1.0 to 5.2.1, so there's no reason not to implement it in ustring too. Note the changes to UstringLibrary.php are somewhat large, because it splits the "convert a Lua bracketed charset to PCRE" code into a separate function and it changes the handling of mw.ustring.find's and mw.ustring.match's 'init' parameter from "substring, match from 0, then add back on $init" to "use preg_match's $offset and use \G instead of ^ where this matters". Both of these are necessary to properly support %f. This also fixes a bug in the pure-Lua code (not used in Scribunto) exposed by the unit tests for %f, where %z was matching '\1' rather than '\0' and %Z was matching everything except '\1' instead of everything except '\0'. Bug: 48331 Change-Id: Ie0b95ef5b734db53d6adc9de5dae4874f8944c08
This commit is contained in:
parent
d6f3633428
commit
82820aafc8
|
@ -232,7 +232,7 @@ class Scribunto_LuaUstringLibrary extends Scribunto_LuaLibraryBase {
|
|||
}
|
||||
|
||||
/* Convert a Lua pattern into a PCRE regex */
|
||||
private function patternToRegex( $pattern, $noAnchor = false ) {
|
||||
private function patternToRegex( $pattern, $anchor ) {
|
||||
$pat = preg_split( '//us', $pattern, null, PREG_SPLIT_NO_EMPTY );
|
||||
|
||||
static $charsets = null, $brcharsets = null;
|
||||
|
@ -295,7 +295,7 @@ class Scribunto_LuaUstringLibrary extends Scribunto_LuaLibraryBase {
|
|||
switch ( $pat[$i] ) {
|
||||
case '^':
|
||||
$q = $i;
|
||||
$re .= ( $noAnchor || $q ) ? '\\^' : '^';
|
||||
$re .= ( $anchor === false || $q ) ? '\\^' : $anchor;
|
||||
break;
|
||||
|
||||
case '$':
|
||||
|
@ -345,6 +345,19 @@ class Scribunto_LuaUstringLibrary extends Scribunto_LuaLibraryBase {
|
|||
$bct++;
|
||||
$re .= "(?<b$bct>$d1(?:(?>[^$d1$d2]+)|(?P>b$bct))*$d2)";
|
||||
}
|
||||
} elseif ( $pat[$i] === 'f' ) {
|
||||
if ( $i + 1 >= $len || $pat[++$i] !== '[' ) {
|
||||
throw new Scribunto_LuaError( "missing '[' after %f in pattern at pattern character $ii" );
|
||||
}
|
||||
list( $i, $re2 ) = $this->bracketedCharSetToRegex( $pat, $i, $len, $brcharsets );
|
||||
// Because %f considers the beginning and end of the string
|
||||
// to be \0, determine if $re2 matches that and take it
|
||||
// into account with "^" and "$".
|
||||
if ( preg_match( "/$re2/us", "\0" ) ) {
|
||||
$re .= "(?<!^)(?<!$re2)(?=$re2|$)";
|
||||
} else {
|
||||
$re .= "(?<!$re2)(?=$re2)";
|
||||
}
|
||||
} elseif ( $pat[$i] >= '0' && $pat[$i] <= '9' ) {
|
||||
$n = ord( $pat[$i] ) - 0x30;
|
||||
if ( $n === 0 || $n > count( $capt ) || in_array( $n, $opencapt ) ) {
|
||||
|
@ -358,34 +371,8 @@ class Scribunto_LuaUstringLibrary extends Scribunto_LuaLibraryBase {
|
|||
break;
|
||||
|
||||
case '[':
|
||||
$re .= '[';
|
||||
$i++;
|
||||
if ( $i < $len && $pat[$i] === '^' ) {
|
||||
$re .= '^';
|
||||
$i++;
|
||||
}
|
||||
for ( ; $i < $len && $pat[$i] !== ']'; $i++ ) {
|
||||
if ( $pat[$i] === '%' ) {
|
||||
$i++;
|
||||
if ( $i >= $len ) {
|
||||
break;
|
||||
}
|
||||
if ( isset( $brcharsets[$pat[$i]] ) ) {
|
||||
$re .= $brcharsets[$pat[$i]];
|
||||
} else {
|
||||
$re .= preg_quote( $pat[$i], '/' );
|
||||
}
|
||||
} elseif( $i + 2 < $len && $pat[$i + 1] === '-' && $pat[$i + 2] !== ']' ) {
|
||||
$re .= preg_quote( $pat[$i], '/' ) . '-' . preg_quote( $pat[$i+2], '/' );
|
||||
$i += 2;
|
||||
} else {
|
||||
$re .= preg_quote( $pat[$i], '/' );
|
||||
}
|
||||
}
|
||||
if ( $i >= $len ) {
|
||||
throw new Scribunto_LuaError( "Missing close-bracket for character set beginning at pattern character $ii" );
|
||||
}
|
||||
$re .= ']';
|
||||
list( $i, $re2 ) = $this->bracketedCharSetToRegex( $pat, $i, $len, $brcharsets );
|
||||
$re .= $re2;
|
||||
$q = true;
|
||||
break;
|
||||
|
||||
|
@ -424,11 +411,44 @@ class Scribunto_LuaUstringLibrary extends Scribunto_LuaLibraryBase {
|
|||
return array( $re, $capt, $anypos );
|
||||
}
|
||||
|
||||
private function addCapturesFromMatch( $arr, $s, $m, $capt, $offset, $m0_if_no_captures ) {
|
||||
private function bracketedCharSetToRegex( $pat, $i, $len, $brcharsets ){
|
||||
$ii = $i + 1;
|
||||
$re = '[';
|
||||
$i++;
|
||||
if ( $i < $len && $pat[$i] === '^' ) {
|
||||
$re .= '^';
|
||||
$i++;
|
||||
}
|
||||
for ( ; $i < $len && $pat[$i] !== ']'; $i++ ) {
|
||||
if ( $pat[$i] === '%' ) {
|
||||
$i++;
|
||||
if ( $i >= $len ) {
|
||||
break;
|
||||
}
|
||||
if ( isset( $brcharsets[$pat[$i]] ) ) {
|
||||
$re .= $brcharsets[$pat[$i]];
|
||||
} else {
|
||||
$re .= preg_quote( $pat[$i], '/' );
|
||||
}
|
||||
} elseif( $i + 2 < $len && $pat[$i + 1] === '-' && $pat[$i + 2] !== ']' ) {
|
||||
$re .= preg_quote( $pat[$i], '/' ) . '-' . preg_quote( $pat[$i+2], '/' );
|
||||
$i += 2;
|
||||
} else {
|
||||
$re .= preg_quote( $pat[$i], '/' );
|
||||
}
|
||||
}
|
||||
if ( $i >= $len ) {
|
||||
throw new Scribunto_LuaError( "Missing close-bracket for character set beginning at pattern character $ii" );
|
||||
}
|
||||
$re .= ']';
|
||||
return array( $i, $re );
|
||||
}
|
||||
|
||||
private function addCapturesFromMatch( $arr, $s, $m, $capt, $m0_if_no_captures ) {
|
||||
if ( count( $capt ) ) {
|
||||
foreach ( $capt as $n => $pos ) {
|
||||
if ( $pos ) {
|
||||
$o = mb_strlen( substr( $s, 0, $m["m$n"][1] ), 'UTF-8' ) + $offset;
|
||||
$o = mb_strlen( substr( $s, 0, $m["m$n"][1] ), 'UTF-8' ) + 1;
|
||||
$arr[] = $o;
|
||||
} else {
|
||||
$arr[] = $m["m$n"][0];
|
||||
|
@ -454,31 +474,32 @@ class Scribunto_LuaUstringLibrary extends Scribunto_LuaLibraryBase {
|
|||
}
|
||||
|
||||
if ( $init > 1 ) {
|
||||
$s = mb_substr( $s, $init - 1, $len - $init + 1, 'UTF-8' );
|
||||
$offset = strlen( mb_substr( $s, 0, $init - 1, 'UTF-8' ) );
|
||||
} else {
|
||||
$init = 1;
|
||||
$offset = 0;
|
||||
}
|
||||
|
||||
if ( $plain ) {
|
||||
if ( $pattern !== '' ) {
|
||||
$ret = mb_strpos( $s, $pattern, 0, 'UTF-8' );
|
||||
$ret = mb_strpos( $s, $pattern, $init - 1, 'UTF-8' );
|
||||
} else {
|
||||
$ret = 0;
|
||||
$ret = $init - 1;
|
||||
}
|
||||
if ( $ret === false ) {
|
||||
return array( null );
|
||||
} else {
|
||||
return array( $ret + $init, $ret + $init + mb_strlen( $pattern ) - 1 );
|
||||
return array( $ret + 1, $ret + mb_strlen( $pattern ) );
|
||||
}
|
||||
}
|
||||
|
||||
list( $re, $capt ) = $this->patternToRegex( $pattern );
|
||||
if ( !preg_match( $re, $s, $m, PREG_OFFSET_CAPTURE ) ) {
|
||||
list( $re, $capt ) = $this->patternToRegex( $pattern, '\G' );
|
||||
if ( !preg_match( $re, $s, $m, PREG_OFFSET_CAPTURE, $offset ) ) {
|
||||
return array( null );
|
||||
}
|
||||
$o = mb_strlen( substr( $s, 0, $m[0][1] ), 'UTF-8' ) + $init;
|
||||
$ret = array( $o, $o + mb_strlen( $m[0][0], 'UTF-8' ) - 1 );
|
||||
return $this->addCapturesFromMatch( $ret, $s, $m, $capt, $init, false );
|
||||
$o = mb_strlen( substr( $s, 0, $m[0][1] ), 'UTF-8' );
|
||||
$ret = array( $o + 1, $o + mb_strlen( $m[0][0], 'UTF-8' ) );
|
||||
return $this->addCapturesFromMatch( $ret, $s, $m, $capt, false );
|
||||
}
|
||||
|
||||
public function ustringMatch( $s, $pattern, $init = 1 ) {
|
||||
|
@ -493,23 +514,23 @@ class Scribunto_LuaUstringLibrary extends Scribunto_LuaLibraryBase {
|
|||
$init = $len + 1;
|
||||
}
|
||||
if ( $init > 1 ) {
|
||||
$s = mb_substr( $s, $init - 1, $len - $init + 1, 'UTF-8' );
|
||||
$offset = strlen( mb_substr( $s, 0, $init - 1, 'UTF-8' ) );
|
||||
} else {
|
||||
$init = 1;
|
||||
$offset = 0;
|
||||
}
|
||||
|
||||
list( $re, $capt ) = $this->patternToRegex( $pattern );
|
||||
if ( !preg_match( $re, $s, $m, PREG_OFFSET_CAPTURE ) ) {
|
||||
list( $re, $capt ) = $this->patternToRegex( $pattern, '\G' );
|
||||
if ( !preg_match( $re, $s, $m, PREG_OFFSET_CAPTURE, $offset ) ) {
|
||||
return array( null );
|
||||
}
|
||||
return $this->addCapturesFromMatch( array(), $s, $m, $capt, $init, true );
|
||||
return $this->addCapturesFromMatch( array(), $s, $m, $capt, true );
|
||||
}
|
||||
|
||||
public function ustringGmatchInit( $s, $pattern ) {
|
||||
$this->checkString( 'gmatch', $s );
|
||||
$this->checkPattern( 'gmatch', $pattern );
|
||||
|
||||
list( $re, $capt ) = $this->patternToRegex( $pattern, true );
|
||||
list( $re, $capt ) = $this->patternToRegex( $pattern, false );
|
||||
return array( $re, $capt );
|
||||
}
|
||||
|
||||
|
@ -518,7 +539,7 @@ class Scribunto_LuaUstringLibrary extends Scribunto_LuaLibraryBase {
|
|||
return array( $pos, array() );
|
||||
}
|
||||
$pos = $m[0][1] + strlen( $m[0][0] );
|
||||
return array( $pos, $this->addCapturesFromMatch( array( null ), $s, $m, $capt, 1, true ) );
|
||||
return array( $pos, $this->addCapturesFromMatch( array( null ), $s, $m, $capt, true ) );
|
||||
}
|
||||
|
||||
public function ustringGsub( $s, $pattern, $repl, $n = null ) {
|
||||
|
@ -532,7 +553,7 @@ class Scribunto_LuaUstringLibrary extends Scribunto_LuaLibraryBase {
|
|||
$n = 0;
|
||||
}
|
||||
|
||||
list( $re, $capt, $anypos ) = $this->patternToRegex( $pattern );
|
||||
list( $re, $capt, $anypos ) = $this->patternToRegex( $pattern, '^' );
|
||||
$captures = array();
|
||||
|
||||
if ( $anypos ) {
|
||||
|
@ -547,7 +568,7 @@ class Scribunto_LuaUstringLibrary extends Scribunto_LuaLibraryBase {
|
|||
for ( $i = 0; $i < $ct; $i++ ) {
|
||||
$m = $mm[$i];
|
||||
$c = array( $m[0][0] );
|
||||
foreach ( $this->addCapturesFromMatch( array(), $s, $m, $capt, 1, false ) as $k => $v ) {
|
||||
foreach ( $this->addCapturesFromMatch( array(), $s, $m, $capt, false ) as $k => $v ) {
|
||||
$k++;
|
||||
$c["m$k"] = $v;
|
||||
}
|
||||
|
|
|
@ -2779,7 +2779,7 @@ local pats = {
|
|||
[0x00ff46] = 1,
|
||||
},
|
||||
[0x7a] = {
|
||||
1,
|
||||
[0x000000] = 1,
|
||||
},
|
||||
[0x41] = {},
|
||||
[0x43] = {},
|
||||
|
|
|
@ -62,18 +62,14 @@ $pats = array(
|
|||
);
|
||||
|
||||
$ranges = array();
|
||||
function addRange( $k, $start, $end, $arr ) {
|
||||
function addRange( $k, $start, $end ) {
|
||||
global $X, $ranges;
|
||||
// Speed/memory tradeoff
|
||||
if ( !( $start >= 0x20 && $start < 0x7f ) && $end - $start >= 10 ) {
|
||||
$ranges[$k][] = sprintf( "c >= 0x%06x and c < 0x%06x", $start, $end );
|
||||
} else {
|
||||
for ( $i = $start; $i < $end; $i++ ) {
|
||||
if ( $arr ) {
|
||||
fprintf( $X, "\t\t1,\n" );
|
||||
} else {
|
||||
fprintf( $X, "\t\t[0x%06x] = 1,\n", $i );
|
||||
}
|
||||
fprintf( $X, "\t\t[0x%06x] = 1,\n", $i );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -98,7 +94,6 @@ foreach ( $pats as $k => $pp ) {
|
|||
}
|
||||
|
||||
fprintf( $X, "\t[0x%02x] = {\n", ord( $k ) );
|
||||
$arr = true;
|
||||
$rstart = null;
|
||||
foreach ( $chars as $i => $c ) {
|
||||
if ( preg_match( "/^$re$/u", $c ) && !preg_match( "/^$re2$/u", $c ) ) {
|
||||
|
@ -107,14 +102,13 @@ foreach ( $pats as $k => $pp ) {
|
|||
}
|
||||
} else {
|
||||
if ( $rstart !== null ) {
|
||||
addRange( $k, $rstart, $i, $arr );
|
||||
addRange( $k, $rstart, $i );
|
||||
$rstart = null;
|
||||
}
|
||||
$arr = false;
|
||||
}
|
||||
}
|
||||
if ( $rstart !== null ) {
|
||||
addRange( $k, $rstart, 0x110000, $arr );
|
||||
addRange( $k, $rstart, 0x110000 );
|
||||
}
|
||||
fprintf( $X, "\t},\n" );
|
||||
}
|
||||
|
|
|
@ -476,8 +476,8 @@ local function find( s, cps, rawpat, pattern, init, noAnchor )
|
|||
if charsets[c] then -- A character set like '%a'
|
||||
return match_charset( sp, pp + 2, charsets[c] )
|
||||
elseif c == 0x62 then -- '%b': balanced delimiter match
|
||||
d1 = pattern.codepoints[pp + 2]
|
||||
d2 = pattern.codepoints[pp + 3]
|
||||
local d1 = pattern.codepoints[pp + 2]
|
||||
local d2 = pattern.codepoints[pp + 3]
|
||||
if not d1 or not d2 then
|
||||
error( 'malformed pattern (missing arguments to \'%b\')', 3 )
|
||||
end
|
||||
|
@ -500,6 +500,18 @@ local function find( s, cps, rawpat, pattern, init, noAnchor )
|
|||
ct = ct + 1
|
||||
end
|
||||
end
|
||||
elseif c == 0x66 then -- '%f': frontier pattern match
|
||||
if pattern.codepoints[pp + 2] ~= 0x5b then
|
||||
error( 'missing \'[\' after %f in pattern at pattern character ' .. pp, 3 )
|
||||
end
|
||||
local pp, charset = parse_charset( pp + 2 )
|
||||
local c1 = cps.codepoints[sp - 1] or 0
|
||||
local c2 = cps.codepoints[sp] or 0
|
||||
if not charset[c1] and charset[c2] then
|
||||
return match( sp, pp )
|
||||
else
|
||||
return nil
|
||||
end
|
||||
elseif c >= 0x30 and c <= 0x39 then -- '%0' to '%9': backreference
|
||||
local m, l = getcapt( c - 0x30, 'invalid capture index %' .. c .. ' at pattern character ' .. pp, 3 )
|
||||
local ep = math.min( cps.len + 1, sp + l )
|
||||
|
@ -702,7 +714,7 @@ local function find( s, cps, rawpat, pattern, init, noAnchor )
|
|||
return sp, ep - 1, unpack( captures )
|
||||
end
|
||||
sp = sp + 1
|
||||
until anchor or sp > cps.len
|
||||
until anchor or sp > cps.len + 1
|
||||
return nil
|
||||
end
|
||||
|
||||
|
|
|
@ -253,6 +253,42 @@ return testframework.getTestProvider( {
|
|||
args = { "bar ¡foo¡foo¡ bar", '%b¡¡' },
|
||||
expect = { 5, 9 }
|
||||
},
|
||||
{ name = 'find: (%f)', func = mw.ustring.find,
|
||||
args = { "foo ¡foobar ¡foo bar baz", '¡.-%f[%s]' },
|
||||
expect = { 5, 11 }
|
||||
},
|
||||
{ name = 'find: (%f 2)', func = mw.ustring.find,
|
||||
args = { "foo ¡foobar ¡foo bar baz", '¡foo%f[%s]' },
|
||||
expect = { 13, 16 }
|
||||
},
|
||||
{ name = 'find: (%f 3)', func = mw.ustring.find,
|
||||
args = { "foo foo¡foobar ¡foo bar baz", '%f[%S]¡.-%f[%s]' },
|
||||
expect = { 16, 19 }
|
||||
},
|
||||
{ name = 'find: (%f 4)', func = mw.ustring.find,
|
||||
args = { "foo foo¡foobar ¡foo bar baz", '%f[%S]¡.-%f[%s]', 16 },
|
||||
expect = { 16, 19 }
|
||||
},
|
||||
{ name = 'find: (%f 5)', func = mw.ustring.find,
|
||||
args = { "foo ¡bar baz", '%f[%Z]' },
|
||||
expect = { 1, 0 }
|
||||
},
|
||||
{ name = 'find: (%f 6)', func = mw.ustring.find,
|
||||
args = { "foo ¡bar baz", '%f[%z]' },
|
||||
expect = { 13, 12 }
|
||||
},
|
||||
{ name = 'find: (%f 7)', func = mw.ustring.find,
|
||||
args = { "foo ¡b\0r baz", '%f[%Z]', 2 },
|
||||
expect = { 8, 7 }
|
||||
},
|
||||
{ name = 'find: (%f 8)', func = mw.ustring.find,
|
||||
args = { "\0foo ¡b\0r baz", '%f[%z]' },
|
||||
expect = { 8, 7 }
|
||||
},
|
||||
{ name = 'find: (%f 9)', func = mw.ustring.find,
|
||||
args = { "\0foo ¡b\0r baz", '%f[%Z]' },
|
||||
expect = { 2, 1 }
|
||||
},
|
||||
{ name = 'find: (%A)', func = mw.ustring.find,
|
||||
args = { "fóó? bar", '%A+' },
|
||||
expect = { 4, 5 }
|
||||
|
|
Loading…
Reference in a new issue