Add frontier pattern (%f[set]) to ustring

The "%f[set]" frontier pattern has been in Lua 5.1 since the beginning,
but was undocumented until Lua 5.2. And the code is even unchanged from
5.1.0 to 5.2.1. So there's no reason not to implement it in ustring too.

Note the changes to UstringLibrary.php are somewhat large, because it
splits the "convert a Lua bracketed charset to PCRE" code into a
separate function and it changes the handling of mw.ustring.find's and
mw.ustring.match's 'init' parameter from "substring, match from 0, then
add back on $init" to "use preg_match's $offset and use \G instead of ^
where this matters". Both of these are necessary to properly support
%f.

This also fixes a bug in the pure-Lua code (not used in Scribunto),
exposed by the unit tests for %f, where %z was matching '\1' rather than
'\0' and %Z was matching everything except '\1' instead of everything
except '\0'.

Bug: 48331
Change-Id: Ie0b95ef5b734db53d6adc9de5dae4874f8944c08
This commit is contained in:
Brad Jorsch 2013-04-19 15:26:45 -04:00
parent d6f3633428
commit 82820aafc8
5 changed files with 127 additions and 64 deletions

View file

@ -232,7 +232,7 @@ class Scribunto_LuaUstringLibrary extends Scribunto_LuaLibraryBase {
}
/* Convert a Lua pattern into a PCRE regex */
private function patternToRegex( $pattern, $noAnchor = false ) {
private function patternToRegex( $pattern, $anchor ) {
$pat = preg_split( '//us', $pattern, null, PREG_SPLIT_NO_EMPTY );
static $charsets = null, $brcharsets = null;
@ -295,7 +295,7 @@ class Scribunto_LuaUstringLibrary extends Scribunto_LuaLibraryBase {
switch ( $pat[$i] ) {
case '^':
$q = $i;
$re .= ( $noAnchor || $q ) ? '\\^' : '^';
$re .= ( $anchor === false || $q ) ? '\\^' : $anchor;
break;
case '$':
@ -345,6 +345,19 @@ class Scribunto_LuaUstringLibrary extends Scribunto_LuaLibraryBase {
$bct++;
$re .= "(?<b$bct>$d1(?:(?>[^$d1$d2]+)|(?P>b$bct))*$d2)";
}
} elseif ( $pat[$i] === 'f' ) {
if ( $i + 1 >= $len || $pat[++$i] !== '[' ) {
throw new Scribunto_LuaError( "missing '[' after %f in pattern at pattern character $ii" );
}
list( $i, $re2 ) = $this->bracketedCharSetToRegex( $pat, $i, $len, $brcharsets );
// Because %f considers the beginning and end of the string
// to be \0, determine if $re2 matches that and take it
// into account with "^" and "$".
if ( preg_match( "/$re2/us", "\0" ) ) {
$re .= "(?<!^)(?<!$re2)(?=$re2|$)";
} else {
$re .= "(?<!$re2)(?=$re2)";
}
} elseif ( $pat[$i] >= '0' && $pat[$i] <= '9' ) {
$n = ord( $pat[$i] ) - 0x30;
if ( $n === 0 || $n > count( $capt ) || in_array( $n, $opencapt ) ) {
@ -358,34 +371,8 @@ class Scribunto_LuaUstringLibrary extends Scribunto_LuaLibraryBase {
break;
case '[':
$re .= '[';
$i++;
if ( $i < $len && $pat[$i] === '^' ) {
$re .= '^';
$i++;
}
for ( ; $i < $len && $pat[$i] !== ']'; $i++ ) {
if ( $pat[$i] === '%' ) {
$i++;
if ( $i >= $len ) {
break;
}
if ( isset( $brcharsets[$pat[$i]] ) ) {
$re .= $brcharsets[$pat[$i]];
} else {
$re .= preg_quote( $pat[$i], '/' );
}
} elseif( $i + 2 < $len && $pat[$i + 1] === '-' && $pat[$i + 2] !== ']' ) {
$re .= preg_quote( $pat[$i], '/' ) . '-' . preg_quote( $pat[$i+2], '/' );
$i += 2;
} else {
$re .= preg_quote( $pat[$i], '/' );
}
}
if ( $i >= $len ) {
throw new Scribunto_LuaError( "Missing close-bracket for character set beginning at pattern character $ii" );
}
$re .= ']';
list( $i, $re2 ) = $this->bracketedCharSetToRegex( $pat, $i, $len, $brcharsets );
$re .= $re2;
$q = true;
break;
@ -424,11 +411,44 @@ class Scribunto_LuaUstringLibrary extends Scribunto_LuaLibraryBase {
return array( $re, $capt, $anypos );
}
private function addCapturesFromMatch( $arr, $s, $m, $capt, $offset, $m0_if_no_captures ) {
/**
 * Convert a Lua bracketed character set ("[...]" or "[^...]") into the
 * equivalent PCRE character class.
 *
 * As in Lua, the first member of the set is always taken literally, so
 * "[]]" is a set containing "]" and "[^]]" is its complement.
 *
 * @param array $pat The Lua pattern, split into individual characters
 * @param int $i Index in $pat of the opening '['
 * @param int $len Upper bound for indexes into $pat (presumably
 *  count( $pat ); confirm against the caller)
 * @param array $brcharsets Map from the character following '%' to the
 *  PCRE fragment that is emitted for it inside a bracketed class
 * @return array Two elements: the index in $pat of the closing ']', and
 *  the PCRE character class including its surrounding brackets
 * @throws Scribunto_LuaError If the set is not terminated by ']'
 */
private function bracketedCharSetToRegex( $pat, $i, $len, $brcharsets ) {
	// 1-based position of the '[' for use in the error message.
	$ii = $i + 1;
	$re = '[';
	$i++;
	if ( $i < $len && $pat[$i] === '^' ) {
		$re .= '^';
		$i++;
	}
	// $first is the index of the first set member. Lua never treats that
	// character as the closing bracket, so neither do we; it falls through
	// to preg_quote() below and is emitted as an escaped literal.
	for ( $first = $i; $i < $len && ( $i === $first || $pat[$i] !== ']' ); $i++ ) {
		if ( $pat[$i] === '%' ) {
			// '%x': either a character class or an escaped literal.
			$i++;
			if ( $i >= $len ) {
				// Dangling '%' at the end of the pattern; reported as a
				// missing close-bracket below.
				break;
			}
			if ( isset( $brcharsets[$pat[$i]] ) ) {
				$re .= $brcharsets[$pat[$i]];
			} else {
				$re .= preg_quote( $pat[$i], '/' );
			}
		} elseif ( $i + 2 < $len && $pat[$i + 1] === '-' && $pat[$i + 2] !== ']' ) {
			// 'a-z' range. A '-' immediately before the closing ']' is a
			// literal and is handled by the final branch instead.
			$re .= preg_quote( $pat[$i], '/' ) . '-' . preg_quote( $pat[$i + 2], '/' );
			$i += 2;
		} else {
			$re .= preg_quote( $pat[$i], '/' );
		}
	}
	if ( $i >= $len ) {
		throw new Scribunto_LuaError( "Missing close-bracket for character set beginning at pattern character $ii" );
	}
	$re .= ']';
	return array( $i, $re );
}
private function addCapturesFromMatch( $arr, $s, $m, $capt, $m0_if_no_captures ) {
if ( count( $capt ) ) {
foreach ( $capt as $n => $pos ) {
if ( $pos ) {
$o = mb_strlen( substr( $s, 0, $m["m$n"][1] ), 'UTF-8' ) + $offset;
$o = mb_strlen( substr( $s, 0, $m["m$n"][1] ), 'UTF-8' ) + 1;
$arr[] = $o;
} else {
$arr[] = $m["m$n"][0];
@ -454,31 +474,32 @@ class Scribunto_LuaUstringLibrary extends Scribunto_LuaLibraryBase {
}
if ( $init > 1 ) {
$s = mb_substr( $s, $init - 1, $len - $init + 1, 'UTF-8' );
$offset = strlen( mb_substr( $s, 0, $init - 1, 'UTF-8' ) );
} else {
$init = 1;
$offset = 0;
}
if ( $plain ) {
if ( $pattern !== '' ) {
$ret = mb_strpos( $s, $pattern, 0, 'UTF-8' );
$ret = mb_strpos( $s, $pattern, $init - 1, 'UTF-8' );
} else {
$ret = 0;
$ret = $init - 1;
}
if ( $ret === false ) {
return array( null );
} else {
return array( $ret + $init, $ret + $init + mb_strlen( $pattern ) - 1 );
return array( $ret + 1, $ret + mb_strlen( $pattern ) );
}
}
list( $re, $capt ) = $this->patternToRegex( $pattern );
if ( !preg_match( $re, $s, $m, PREG_OFFSET_CAPTURE ) ) {
list( $re, $capt ) = $this->patternToRegex( $pattern, '\G' );
if ( !preg_match( $re, $s, $m, PREG_OFFSET_CAPTURE, $offset ) ) {
return array( null );
}
$o = mb_strlen( substr( $s, 0, $m[0][1] ), 'UTF-8' ) + $init;
$ret = array( $o, $o + mb_strlen( $m[0][0], 'UTF-8' ) - 1 );
return $this->addCapturesFromMatch( $ret, $s, $m, $capt, $init, false );
$o = mb_strlen( substr( $s, 0, $m[0][1] ), 'UTF-8' );
$ret = array( $o + 1, $o + mb_strlen( $m[0][0], 'UTF-8' ) );
return $this->addCapturesFromMatch( $ret, $s, $m, $capt, false );
}
public function ustringMatch( $s, $pattern, $init = 1 ) {
@ -493,23 +514,23 @@ class Scribunto_LuaUstringLibrary extends Scribunto_LuaLibraryBase {
$init = $len + 1;
}
if ( $init > 1 ) {
$s = mb_substr( $s, $init - 1, $len - $init + 1, 'UTF-8' );
$offset = strlen( mb_substr( $s, 0, $init - 1, 'UTF-8' ) );
} else {
$init = 1;
$offset = 0;
}
list( $re, $capt ) = $this->patternToRegex( $pattern );
if ( !preg_match( $re, $s, $m, PREG_OFFSET_CAPTURE ) ) {
list( $re, $capt ) = $this->patternToRegex( $pattern, '\G' );
if ( !preg_match( $re, $s, $m, PREG_OFFSET_CAPTURE, $offset ) ) {
return array( null );
}
return $this->addCapturesFromMatch( array(), $s, $m, $capt, $init, true );
return $this->addCapturesFromMatch( array(), $s, $m, $capt, true );
}
public function ustringGmatchInit( $s, $pattern ) {
$this->checkString( 'gmatch', $s );
$this->checkPattern( 'gmatch', $pattern );
list( $re, $capt ) = $this->patternToRegex( $pattern, true );
list( $re, $capt ) = $this->patternToRegex( $pattern, false );
return array( $re, $capt );
}
@ -518,7 +539,7 @@ class Scribunto_LuaUstringLibrary extends Scribunto_LuaLibraryBase {
return array( $pos, array() );
}
$pos = $m[0][1] + strlen( $m[0][0] );
return array( $pos, $this->addCapturesFromMatch( array( null ), $s, $m, $capt, 1, true ) );
return array( $pos, $this->addCapturesFromMatch( array( null ), $s, $m, $capt, true ) );
}
public function ustringGsub( $s, $pattern, $repl, $n = null ) {
@ -532,7 +553,7 @@ class Scribunto_LuaUstringLibrary extends Scribunto_LuaLibraryBase {
$n = 0;
}
list( $re, $capt, $anypos ) = $this->patternToRegex( $pattern );
list( $re, $capt, $anypos ) = $this->patternToRegex( $pattern, '^' );
$captures = array();
if ( $anypos ) {
@ -547,7 +568,7 @@ class Scribunto_LuaUstringLibrary extends Scribunto_LuaLibraryBase {
for ( $i = 0; $i < $ct; $i++ ) {
$m = $mm[$i];
$c = array( $m[0][0] );
foreach ( $this->addCapturesFromMatch( array(), $s, $m, $capt, 1, false ) as $k => $v ) {
foreach ( $this->addCapturesFromMatch( array(), $s, $m, $capt, false ) as $k => $v ) {
$k++;
$c["m$k"] = $v;
}

View file

@ -2779,7 +2779,7 @@ local pats = {
[0x00ff46] = 1,
},
[0x7a] = {
1,
[0x000000] = 1,
},
[0x41] = {},
[0x43] = {},

View file

@ -62,18 +62,14 @@ $pats = array(
);
$ranges = array();
function addRange( $k, $start, $end, $arr ) {
function addRange( $k, $start, $end ) {
global $X, $ranges;
// Speed/memory tradeoff
if ( !( $start >= 0x20 && $start < 0x7f ) && $end - $start >= 10 ) {
$ranges[$k][] = sprintf( "c >= 0x%06x and c < 0x%06x", $start, $end );
} else {
for ( $i = $start; $i < $end; $i++ ) {
if ( $arr ) {
fprintf( $X, "\t\t1,\n" );
} else {
fprintf( $X, "\t\t[0x%06x] = 1,\n", $i );
}
fprintf( $X, "\t\t[0x%06x] = 1,\n", $i );
}
}
}
@ -98,7 +94,6 @@ foreach ( $pats as $k => $pp ) {
}
fprintf( $X, "\t[0x%02x] = {\n", ord( $k ) );
$arr = true;
$rstart = null;
foreach ( $chars as $i => $c ) {
if ( preg_match( "/^$re$/u", $c ) && !preg_match( "/^$re2$/u", $c ) ) {
@ -107,14 +102,13 @@ foreach ( $pats as $k => $pp ) {
}
} else {
if ( $rstart !== null ) {
addRange( $k, $rstart, $i, $arr );
addRange( $k, $rstart, $i );
$rstart = null;
}
$arr = false;
}
}
if ( $rstart !== null ) {
addRange( $k, $rstart, 0x110000, $arr );
addRange( $k, $rstart, 0x110000 );
}
fprintf( $X, "\t},\n" );
}

View file

@ -476,8 +476,8 @@ local function find( s, cps, rawpat, pattern, init, noAnchor )
if charsets[c] then -- A character set like '%a'
return match_charset( sp, pp + 2, charsets[c] )
elseif c == 0x62 then -- '%b': balanced delimiter match
d1 = pattern.codepoints[pp + 2]
d2 = pattern.codepoints[pp + 3]
local d1 = pattern.codepoints[pp + 2]
local d2 = pattern.codepoints[pp + 3]
if not d1 or not d2 then
error( 'malformed pattern (missing arguments to \'%b\')', 3 )
end
@ -500,6 +500,18 @@ local function find( s, cps, rawpat, pattern, init, noAnchor )
ct = ct + 1
end
end
elseif c == 0x66 then -- '%f': frontier pattern match
if pattern.codepoints[pp + 2] ~= 0x5b then
error( 'missing \'[\' after %f in pattern at pattern character ' .. pp, 3 )
end
local pp, charset = parse_charset( pp + 2 )
local c1 = cps.codepoints[sp - 1] or 0
local c2 = cps.codepoints[sp] or 0
if not charset[c1] and charset[c2] then
return match( sp, pp )
else
return nil
end
elseif c >= 0x30 and c <= 0x39 then -- '%0' to '%9': backreference
local m, l = getcapt( c - 0x30, 'invalid capture index %' .. c .. ' at pattern character ' .. pp, 3 )
local ep = math.min( cps.len + 1, sp + l )
@ -702,7 +714,7 @@ local function find( s, cps, rawpat, pattern, init, noAnchor )
return sp, ep - 1, unpack( captures )
end
sp = sp + 1
until anchor or sp > cps.len
until anchor or sp > cps.len + 1
return nil
end

View file

@ -253,6 +253,42 @@ return testframework.getTestProvider( {
args = { "bar ¡foo¡foo¡ bar", '%b¡¡' },
expect = { 5, 9 }
},
{ name = 'find: (%f)', func = mw.ustring.find,
args = { "foo ¡foobar ¡foo bar baz", '¡.-%f[%s]' },
expect = { 5, 11 }
},
{ name = 'find: (%f 2)', func = mw.ustring.find,
args = { "foo ¡foobar ¡foo bar baz", '¡foo%f[%s]' },
expect = { 13, 16 }
},
{ name = 'find: (%f 3)', func = mw.ustring.find,
args = { "foo foo¡foobar ¡foo bar baz", '%f[%S]¡.-%f[%s]' },
expect = { 16, 19 }
},
{ name = 'find: (%f 4)', func = mw.ustring.find,
args = { "foo foo¡foobar ¡foo bar baz", '%f[%S]¡.-%f[%s]', 16 },
expect = { 16, 19 }
},
{ name = 'find: (%f 5)', func = mw.ustring.find,
args = { "foo ¡bar baz", '%f[%Z]' },
expect = { 1, 0 }
},
{ name = 'find: (%f 6)', func = mw.ustring.find,
args = { "foo ¡bar baz", '%f[%z]' },
expect = { 13, 12 }
},
{ name = 'find: (%f 7)', func = mw.ustring.find,
args = { "foo ¡b\0r baz", '%f[%Z]', 2 },
expect = { 8, 7 }
},
{ name = 'find: (%f 8)', func = mw.ustring.find,
args = { "\0foo ¡b\0r baz", '%f[%z]' },
expect = { 8, 7 }
},
{ name = 'find: (%f 9)', func = mw.ustring.find,
args = { "\0foo ¡b\0r baz", '%f[%Z]' },
expect = { 2, 1 }
},
{ name = 'find: (%A)', func = mw.ustring.find,
args = { "fóó? bar", '%A+' },
expect = { 4, 5 }