diff --git a/engines/LuaCommon/UstringLibrary.php b/engines/LuaCommon/UstringLibrary.php index b253e0be..5c96a9ad 100644 --- a/engines/LuaCommon/UstringLibrary.php +++ b/engines/LuaCommon/UstringLibrary.php @@ -232,7 +232,7 @@ class Scribunto_LuaUstringLibrary extends Scribunto_LuaLibraryBase { } /* Convert a Lua pattern into a PCRE regex */ - private function patternToRegex( $pattern, $noAnchor = false ) { + private function patternToRegex( $pattern, $anchor ) { $pat = preg_split( '//us', $pattern, null, PREG_SPLIT_NO_EMPTY ); static $charsets = null, $brcharsets = null; @@ -295,7 +295,7 @@ class Scribunto_LuaUstringLibrary extends Scribunto_LuaLibraryBase { switch ( $pat[$i] ) { case '^': $q = $i; - $re .= ( $noAnchor || $q ) ? '\\^' : '^'; + $re .= ( $anchor === false || $q ) ? '\\^' : $anchor; break; case '$': @@ -345,6 +345,19 @@ class Scribunto_LuaUstringLibrary extends Scribunto_LuaLibraryBase { $bct++; $re .= "(?$d1(?:(?>[^$d1$d2]+)|(?P>b$bct))*$d2)"; } + } elseif ( $pat[$i] === 'f' ) { + if ( $i + 1 >= $len || $pat[++$i] !== '[' ) { + throw new Scribunto_LuaError( "missing '[' after %f in pattern at pattern character $ii" ); + } + list( $i, $re2 ) = $this->bracketedCharSetToRegex( $pat, $i, $len, $brcharsets ); + // Because %f considers the beginning and end of the string + // to be \0, determine if $re2 matches that and take it + // into account with "^" and "$". + if ( preg_match( "/$re2/us", "\0" ) ) { + $re .= "(?= '0' && $pat[$i] <= '9' ) { $n = ord( $pat[$i] ) - 0x30; if ( $n === 0 || $n > count( $capt ) || in_array( $n, $opencapt ) ) { @@ -358,34 +371,8 @@ class Scribunto_LuaUstringLibrary extends Scribunto_LuaLibraryBase { break; case '[': - $re .= '['; - $i++; - if ( $i < $len && $pat[$i] === '^' ) { - $re .= '^'; - $i++; - } - for ( ; $i < $len && $pat[$i] !== ']'; $i++ ) { - if ( $pat[$i] === '%' ) { - $i++; - if ( $i >= $len ) { - break; - } - if ( isset( $brcharsets[$pat[$i]] ) ) { - $re .= $brcharsets[$pat[$i]]; - } else { - $re .= preg_quote( $pat[$i], '/' ); - } - } elseif( $i + 2 < $len && $pat[$i + 1] === '-' && $pat[$i + 2] !== ']' ) { - $re .= preg_quote( $pat[$i], '/' ) . '-' . preg_quote( $pat[$i+2], '/' ); - $i += 2; - } else { - $re .= preg_quote( $pat[$i], '/' ); - } - } - if ( $i >= $len ) { - throw new Scribunto_LuaError( "Missing close-bracket for character set beginning at pattern character $ii" ); - } - $re .= ']'; + list( $i, $re2 ) = $this->bracketedCharSetToRegex( $pat, $i, $len, $brcharsets ); + $re .= $re2; $q = true; break; @@ -424,11 +411,44 @@ class Scribunto_LuaUstringLibrary extends Scribunto_LuaLibraryBase { return array( $re, $capt, $anypos ); } - private function addCapturesFromMatch( $arr, $s, $m, $capt, $offset, $m0_if_no_captures ) { + private function bracketedCharSetToRegex( $pat, $i, $len, $brcharsets ){ + $ii = $i + 1; + $re = '['; + $i++; + if ( $i < $len && $pat[$i] === '^' ) { + $re .= '^'; + $i++; + } + for ( ; $i < $len && $pat[$i] !== ']'; $i++ ) { + if ( $pat[$i] === '%' ) { + $i++; + if ( $i >= $len ) { + break; + } + if ( isset( $brcharsets[$pat[$i]] ) ) { + $re .= $brcharsets[$pat[$i]]; + } else { + $re .= preg_quote( $pat[$i], '/' ); + } + } elseif( $i + 2 < $len && $pat[$i + 1] === '-' && $pat[$i + 2] !== ']' ) { + $re .= preg_quote( $pat[$i], '/' ) . '-' . preg_quote( $pat[$i+2], '/' ); + $i += 2; + } else { + $re .= preg_quote( $pat[$i], '/' ); + } + } + if ( $i >= $len ) { + throw new Scribunto_LuaError( "Missing close-bracket for character set beginning at pattern character $ii" ); + } + $re .= ']'; + return array( $i, $re ); + } + + private function addCapturesFromMatch( $arr, $s, $m, $capt, $m0_if_no_captures ) { if ( count( $capt ) ) { foreach ( $capt as $n => $pos ) { if ( $pos ) { - $o = mb_strlen( substr( $s, 0, $m["m$n"][1] ), 'UTF-8' ) + $offset; + $o = mb_strlen( substr( $s, 0, $m["m$n"][1] ), 'UTF-8' ) + 1; $arr[] = $o; } else { $arr[] = $m["m$n"][0]; @@ -454,31 +474,32 @@ class Scribunto_LuaUstringLibrary extends Scribunto_LuaLibraryBase { } if ( $init > 1 ) { - $s = mb_substr( $s, $init - 1, $len - $init + 1, 'UTF-8' ); + $offset = strlen( mb_substr( $s, 0, $init - 1, 'UTF-8' ) ); } else { $init = 1; + $offset = 0; } if ( $plain ) { if ( $pattern !== '' ) { - $ret = mb_strpos( $s, $pattern, 0, 'UTF-8' ); + $ret = mb_strpos( $s, $pattern, $init - 1, 'UTF-8' ); } else { - $ret = 0; + $ret = $init - 1; } if ( $ret === false ) { return array( null ); } else { - return array( $ret + $init, $ret + $init + mb_strlen( $pattern ) - 1 ); + return array( $ret + 1, $ret + mb_strlen( $pattern ) ); } } - list( $re, $capt ) = $this->patternToRegex( $pattern ); - if ( !preg_match( $re, $s, $m, PREG_OFFSET_CAPTURE ) ) { + list( $re, $capt ) = $this->patternToRegex( $pattern, '\G' ); + if ( !preg_match( $re, $s, $m, PREG_OFFSET_CAPTURE, $offset ) ) { return array( null ); } - $o = mb_strlen( substr( $s, 0, $m[0][1] ), 'UTF-8' ) + $init; - $ret = array( $o, $o + mb_strlen( $m[0][0], 'UTF-8' ) - 1 ); - return $this->addCapturesFromMatch( $ret, $s, $m, $capt, $init, false ); + $o = mb_strlen( substr( $s, 0, $m[0][1] ), 'UTF-8' ); + $ret = array( $o + 1, $o + mb_strlen( $m[0][0], 'UTF-8' ) ); + return $this->addCapturesFromMatch( $ret, $s, $m, $capt, false ); } public function ustringMatch( $s, $pattern, $init = 1 ) { @@ -493,23 +514,23 @@ class Scribunto_LuaUstringLibrary extends Scribunto_LuaLibraryBase { $init = $len + 1; } if ( $init > 1 ) { - $s = mb_substr( $s, $init - 1, $len - $init + 1, 'UTF-8' ); + $offset = strlen( mb_substr( $s, 0, $init - 1, 'UTF-8' ) ); } else { - $init = 1; + $offset = 0; } - list( $re, $capt ) = $this->patternToRegex( $pattern ); - if ( !preg_match( $re, $s, $m, PREG_OFFSET_CAPTURE ) ) { + list( $re, $capt ) = $this->patternToRegex( $pattern, '\G' ); + if ( !preg_match( $re, $s, $m, PREG_OFFSET_CAPTURE, $offset ) ) { return array( null ); } - return $this->addCapturesFromMatch( array(), $s, $m, $capt, $init, true ); + return $this->addCapturesFromMatch( array(), $s, $m, $capt, true ); } public function ustringGmatchInit( $s, $pattern ) { $this->checkString( 'gmatch', $s ); $this->checkPattern( 'gmatch', $pattern ); - list( $re, $capt ) = $this->patternToRegex( $pattern, true ); + list( $re, $capt ) = $this->patternToRegex( $pattern, false ); return array( $re, $capt ); } @@ -518,7 +539,7 @@ class Scribunto_LuaUstringLibrary extends Scribunto_LuaLibraryBase { return array( $pos, array() ); } $pos = $m[0][1] + strlen( $m[0][0] ); - return array( $pos, $this->addCapturesFromMatch( array( null ), $s, $m, $capt, 1, true ) ); + return array( $pos, $this->addCapturesFromMatch( array( null ), $s, $m, $capt, true ) ); } public function ustringGsub( $s, $pattern, $repl, $n = null ) { @@ -532,7 +553,7 @@ class Scribunto_LuaUstringLibrary extends Scribunto_LuaLibraryBase { $n = 0; } - list( $re, $capt, $anypos ) = $this->patternToRegex( $pattern ); + list( $re, $capt, $anypos ) = $this->patternToRegex( $pattern, '^' ); $captures = array(); if ( $anypos ) { @@ -547,7 +568,7 @@ class Scribunto_LuaUstringLibrary extends Scribunto_LuaLibraryBase { for ( $i = 0; $i < $ct; $i++ ) { $m = $mm[$i]; $c = array( $m[0][0] ); - foreach ( $this->addCapturesFromMatch( array(), $s, $m, $capt, 1, false ) as $k => $v ) { + foreach ( $this->addCapturesFromMatch( array(), $s, $m, $capt, false ) as $k => $v ) { $k++; $c["m$k"] = $v; } diff --git a/engines/LuaCommon/lualib/ustring/charsets.lua b/engines/LuaCommon/lualib/ustring/charsets.lua index 19038763..5f0f48ce 100644 --- a/engines/LuaCommon/lualib/ustring/charsets.lua +++ b/engines/LuaCommon/lualib/ustring/charsets.lua @@ -2779,7 +2779,7 @@ local pats = { [0x00ff46] = 1, }, [0x7a] = { - 1, + [0x000000] = 1, }, [0x41] = {}, [0x43] = {}, diff --git a/engines/LuaCommon/lualib/ustring/make-tables.php b/engines/LuaCommon/lualib/ustring/make-tables.php index 9727dce2..4caff536 100755 --- a/engines/LuaCommon/lualib/ustring/make-tables.php +++ b/engines/LuaCommon/lualib/ustring/make-tables.php @@ -62,18 +62,14 @@ $pats = array( ); $ranges = array(); -function addRange( $k, $start, $end, $arr ) { +function addRange( $k, $start, $end ) { global $X, $ranges; // Speed/memory tradeoff if ( !( $start >= 0x20 && $start < 0x7f ) && $end - $start >= 10 ) { $ranges[$k][] = sprintf( "c >= 0x%06x and c < 0x%06x", $start, $end ); } else { for ( $i = $start; $i < $end; $i++ ) { - if ( $arr ) { - fprintf( $X, "\t\t1,\n" ); - } else { - fprintf( $X, "\t\t[0x%06x] = 1,\n", $i ); - } + fprintf( $X, "\t\t[0x%06x] = 1,\n", $i ); } } } @@ -98,7 +94,6 @@ foreach ( $pats as $k => $pp ) { } fprintf( $X, "\t[0x%02x] = {\n", ord( $k ) ); - $arr = true; $rstart = null; foreach ( $chars as $i => $c ) { if ( preg_match( "/^$re$/u", $c ) && !preg_match( "/^$re2$/u", $c ) ) { @@ -107,14 +102,13 @@ foreach ( $pats as $k => $pp ) { } } else { if ( $rstart !== null ) { - addRange( $k, $rstart, $i, $arr ); + addRange( $k, $rstart, $i ); $rstart = null; } - $arr = false; } } if ( $rstart !== null ) { - addRange( $k, $rstart, 0x110000, $arr ); + addRange( $k, $rstart, 0x110000 ); } fprintf( $X, "\t},\n" ); } diff --git a/engines/LuaCommon/lualib/ustring/ustring.lua b/engines/LuaCommon/lualib/ustring/ustring.lua index 87f3b4ab..9b090ae6 100644 --- a/engines/LuaCommon/lualib/ustring/ustring.lua +++ b/engines/LuaCommon/lualib/ustring/ustring.lua @@ -476,8 +476,8 @@ local function find( s, cps, rawpat, pattern, init, noAnchor ) if charsets[c] then -- A character set like '%a' return match_charset( sp, pp + 2, charsets[c] ) elseif c == 0x62 then -- '%b': balanced delimiter match - d1 = pattern.codepoints[pp + 2] - d2 = pattern.codepoints[pp + 3] + local d1 = pattern.codepoints[pp + 2] + local d2 = pattern.codepoints[pp + 3] if not d1 or not d2 then error( 'malformed pattern (missing arguments to \'%b\')', 3 ) end @@ -500,6 +500,18 @@ local function find( s, cps, rawpat, pattern, init, noAnchor ) ct = ct + 1 end end + elseif c == 0x66 then -- '%f': frontier pattern match + if pattern.codepoints[pp + 2] ~= 0x5b then + error( 'missing \'[\' after %f in pattern at pattern character ' .. pp, 3 ) + end + local pp, charset = parse_charset( pp + 2 ) + local c1 = cps.codepoints[sp - 1] or 0 + local c2 = cps.codepoints[sp] or 0 + if not charset[c1] and charset[c2] then + return match( sp, pp ) + else + return nil + end elseif c >= 0x30 and c <= 0x39 then -- '%0' to '%9': backreference local m, l = getcapt( c - 0x30, 'invalid capture index %' .. c .. ' at pattern character ' .. pp, 3 ) local ep = math.min( cps.len + 1, sp + l ) @@ -702,7 +714,7 @@ local function find( s, cps, rawpat, pattern, init, noAnchor ) return sp, ep - 1, unpack( captures ) end sp = sp + 1 - until anchor or sp > cps.len + until anchor or sp > cps.len + 1 return nil end diff --git a/tests/engines/LuaCommon/UstringLibraryTests.lua b/tests/engines/LuaCommon/UstringLibraryTests.lua index bc16642a..c5a40596 100644 --- a/tests/engines/LuaCommon/UstringLibraryTests.lua +++ b/tests/engines/LuaCommon/UstringLibraryTests.lua @@ -253,6 +253,42 @@ return testframework.getTestProvider( { args = { "bar ¡foo¡foo¡ bar", '%b¡¡' }, expect = { 5, 9 } }, + { name = 'find: (%f)', func = mw.ustring.find, + args = { "foo ¡foobar ¡foo bar baz", '¡.-%f[%s]' }, + expect = { 5, 11 } + }, + { name = 'find: (%f 2)', func = mw.ustring.find, + args = { "foo ¡foobar ¡foo bar baz", '¡foo%f[%s]' }, + expect = { 13, 16 } + }, + { name = 'find: (%f 3)', func = mw.ustring.find, + args = { "foo foo¡foobar ¡foo bar baz", '%f[%S]¡.-%f[%s]' }, + expect = { 16, 19 } + }, + { name = 'find: (%f 4)', func = mw.ustring.find, + args = { "foo foo¡foobar ¡foo bar baz", '%f[%S]¡.-%f[%s]', 16 }, + expect = { 16, 19 } + }, + { name = 'find: (%f 5)', func = mw.ustring.find, + args = { "foo ¡bar baz", '%f[%Z]' }, + expect = { 1, 0 } + }, + { name = 'find: (%f 6)', func = mw.ustring.find, + args = { "foo ¡bar baz", '%f[%z]' }, + expect = { 13, 12 } + }, + { name = 'find: (%f 7)', func = mw.ustring.find, + args = { "foo ¡b\0r baz", '%f[%Z]', 2 }, + expect = { 8, 7 } + }, + { name = 'find: (%f 8)', func = mw.ustring.find, + args = { "\0foo ¡b\0r baz", '%f[%z]' }, + expect = { 8, 7 } + }, + { name = 'find: (%f 9)', func = mw.ustring.find, + args = { "\0foo ¡b\0r baz", '%f[%Z]' }, + expect = { 2, 1 } + }, { name = 'find: (%A)', func = mw.ustring.find, args = { "fóó? bar", '%A+' }, expect = { 4, 5 }