Fix deceptively-simple pattern in pure-Lua ustring

The pure-Lua ustring pattern matching functions short-circuit to the
much faster string library when the pattern would produce the same match
when applied to the raw bytes.

A pattern like "[^a-z]" can match a partial UTF-8 character when applied
bytewise, and so must be detected as unsafe.

Let's also directly test the pure-Lua module, instead of me having to
comment out lines in Scribunto_LuaUstringLibrary::register() whenever I
want to test it.

Change-Id: I91ed3374aadfea379b9db2e13b4248ab20df509e
This commit is contained in:
Brad Jorsch 2014-07-10 15:21:58 -04:00 committed by Jackmcbarn
parent 40985a1672
commit 0367e9bddd
3 changed files with 26 additions and 0 deletions

View file

@ -735,6 +735,7 @@ end
-- * If it contains any bytes over 0x7f. We could skip these if they're not
-- inside brackets and aren't followed by quantifiers and aren't part of a
-- '%b', but that's too complicated to check.
-- * If it contains a negated character set.
-- * If it contains "%a" or any of the other %-prefixed character sets except
-- %z or %Z.
-- * If it contains a '.' not followed by '*', '+', or '-'. A bare '.' or '.?'
@ -747,6 +748,7 @@ end
local function patternIsSimple( pattern )
return not (
S.find( pattern, '[\128-\255]' ) or
S.find( pattern, '%[%^' ) or
S.find( pattern, '%%[acdlpsuwxACDLPSUWX]' ) or
S.find( pattern, '%.[^*+-]' ) or
S.find( pattern, '()', 1, true )

View file

@ -0,0 +1,20 @@
<?php
require_once( __DIR__ . '/UstringLibraryTest.php' );
/**
 * Re-runs the inherited Scribunto_LuaUstringLibraryTests suite with
 * mw.ustring replaced by the module that require( "ustring" ) returns,
 * so the pure-Lua implementation is exercised directly.
 */
class Scribunto_LuaUstringLibraryPureLuaTests extends Scribunto_LuaUstringLibraryTests {
	/**
	 * Runs the parent's setup, then swaps mw.ustring inside the Lua
	 * interpreter for the pure-Lua module, copying over the
	 * maxStringLength and maxPatternLength values from the original
	 * mw.ustring first.
	 */
	public function setUp() {
		parent::setUp();

		// Lua chunk that overrides mw.ustring with the pure-Lua version.
		// Its lines are deliberately flush-left: they are part of the string
		// literal, and the chunk text is passed to the interpreter verbatim.
		$overrideChunk = '
local ustring = require( "ustring" )
ustring.maxStringLength = mw.ustring.maxStringLength
ustring.maxPatternLength = mw.ustring.maxPatternLength
mw.ustring = ustring
';

		$lua = $this->getEngine()->getInterpreter();

		// Compile the chunk under the name 'fortest' and execute it at once.
		$lua->callFunction( $lua->loadString( $overrideChunk, 'fortest' ) );
	}
}

View file

@ -425,6 +425,10 @@ return testframework.getTestProvider( {
args = { "foo foofóó foófoó bar", '(f%a+)%1' },
expect = { 12, 17, 'foó' }
},
{ name = 'find: deceptively-simple pattern', func = mw.ustring.find,
args = { "fóó", '([^a-z])' },
expect = { 2, 2, 'ó' }
},
{ name = 'match: (1)', func = mw.ustring.match,
args = { "bar fóo bar", 'f%a+' },