stringLengthLimit === null ) {
			global $wgMaxArticleSize;
			$this->stringLengthLimit = $wgMaxArticleSize * 1024;
		}

		// "\xf4\x90\x80\x80" is the UTF-8-style encoding of U+110000, which is
		// past the end of Unicode. If mb_check_encoding() accepts it, then
		// checkEncoding() has to reject such sequences by hand.
		$this->manualCheckForU110000AndUp = mb_check_encoding( "\xf4\x90\x80\x80", "UTF-8" );

		// Detect PHP bug 53823: an empty pattern in 'u' mode should not match
		// between the two bytes of a single UTF-8 character.
		$this->phpBug53823 = preg_replace( '//us', 'x', "\xc3\xa1" ) === "x\xc3x\xa1x";

		$this->patternRegexCache = new MapCacheLRU( 100 );

		parent::__construct( $engine );
	}

	/**
	 * Register the PHP callbacks backing the 'mw.ustring.lua' interface.
	 *
	 * When calls into PHP require serialization, only the pattern-matching
	 * functions are exposed; otherwise the full set is exposed.
	 *
	 * @return mixed Whatever the engine's registerInterface() returns
	 */
	function register() {
		$perf = $this->getEngine()->getPerformanceCharacteristics();

		if ( $perf['phpCallsRequireSerialization'] ) {
			$lib = array(
				// Pattern matching is still much faster in PHP, even with the
				// overhead of serialization
				'find' => array( $this, 'ustringFind' ),
				'match' => array( $this, 'ustringMatch' ),
				'gmatch_init' => array( $this, 'ustringGmatchInit' ),
				'gmatch_callback' => array( $this, 'ustringGmatchCallback' ),
				'gsub' => array( $this, 'ustringGsub' ),
			);
		} else {
			$lib = array(
				'isutf8' => array( $this, 'ustringIsUtf8' ),
				'byteoffset' => array( $this, 'ustringByteoffset' ),
				'codepoint' => array( $this, 'ustringCodepoint' ),
				'gcodepoint_init' => array( $this, 'ustringGcodepointInit' ),
				'toNFC' => array( $this, 'ustringToNFC' ),
				'toNFD' => array( $this, 'ustringToNFD' ),
				'toNFKC' => array( $this, 'ustringToNFKC' ),
				'toNFKD' => array( $this, 'ustringToNFKD' ),
				'char' => array( $this, 'ustringChar' ),
				'len' => array( $this, 'ustringLen' ),
				'sub' => array( $this, 'ustringSub' ),
				'upper' => array( $this, 'ustringUpper' ),
				'lower' => array( $this, 'ustringLower' ),
				'find' => array( $this, 'ustringFind' ),
				'match' => array( $this, 'ustringMatch' ),
				'gmatch_init' => array( $this, 'ustringGmatchInit' ),
				'gmatch_callback' => array( $this, 'ustringGmatchCallback' ),
				'gsub' => array( $this, 'ustringGsub' ),
			);
		}

		return $this->getEngine()->registerInterface(
			'mw.ustring.lua',
			$lib,
			array(
				'stringLengthLimit' => $this->stringLengthLimit,
				'patternLengthLimit' => $this->patternLengthLimit,
			)
		);
	}

	/**
	 * Check whether a byte string is valid UTF-8.
	 *
	 * Once we no longer support PHP < 5.4, calls to this method may be
	 * replaced with mb_check_encoding( $s, 'UTF-8' )
	 *
	 * @param string $s
	 * @return bool
	 */
	private function checkEncoding( $s ) {
		$ok = mb_check_encoding( $s, 'UTF-8' );
		if ( $ok && $this->manualCheckForU110000AndUp ) {
			// Reject lead bytes that can only begin a sequence >= U+110000
			$ok = !preg_match( '/\xf4[\x90-\xbf]|[\xf5-\xff]/', $s );
		}
		return $ok;
	}

	/**
	 * Validate the subject-string argument (argument #1) of a ustring function.
	 *
	 * Lua numbers are accepted without further checks. Anything else must be
	 * a string, optionally valid UTF-8, and no longer than stringLengthLimit
	 * bytes.
	 *
	 * @param string $name Lua function name, used in error messages
	 * @param mixed $s Value to check
	 * @param bool $checkEncoding Whether to require valid UTF-8
	 * @throws Scribunto_LuaError
	 */
	private function checkString( $name, $s, $checkEncoding = true ) {
		if ( $this->getLuaType( $s ) == 'number' ) {
			// $s is passed by value, so casting it here would not be visible
			// to the caller; numbers are simply let through.
			return;
		}

		$this->checkType( $name, 1, $s, 'string' );
		if ( $checkEncoding && !$this->checkEncoding( $s ) ) {
			throw new Scribunto_LuaError( "bad argument #1 to '$name' (string is not UTF-8)" );
		}
		if ( strlen( $s ) > $this->stringLengthLimit ) {
			throw new Scribunto_LuaError(
				"bad argument #1 to '$name' (string is longer than $this->stringLengthLimit bytes)"
			);
		}
	}

	/**
	 * Lua callback: report whether $s is valid UTF-8.
	 *
	 * @param string $s
	 * @return array Single-element array holding a bool
	 */
	public function ustringIsUtf8( $s ) {
		$this->checkString( 'isutf8', $s, false );
		return array( $this->checkEncoding( $s ) );
	}

	/**
	 * Lua callback: find the 1-based byte offset of a character.
	 *
	 * Starting from the character that contains byte $i (negative $i counts
	 * from the end of the string), move $l characters and return the byte
	 * offset at which the resulting character begins.
	 *
	 * @param string $s
	 * @param int $l Character offset relative to the character at byte $i
	 * @param int $i 1-based byte position; negative counts from the end
	 * @return array Single-element array: the byte offset, or null if out of range
	 */
	public function ustringByteoffset( $s, $l = 1, $i = 1 ) {
		$this->checkString( 'byteoffset', $s );
		$this->checkTypeOptional( 'byteoffset', 2, $l, 'number', 1 );
		$this->checkTypeOptional( 'byteoffset', 3, $i, 'number', 1 );

		$bytelen = strlen( $s );
		if ( $i < 0 ) {
			$i = $bytelen + $i + 1;
		}
		if ( $i < 1 || $i > $bytelen ) {
			return array( null );
		}

		// Switch to 0-based and back up over continuation bytes (10xxxxxx)
		// to the first byte of the character.
		$i--;
		$j = $i;
		while ( ( ord( $s[$i] ) & 0xc0 ) === 0x80 ) {
			$i--;
		}
		// If $i was already on a character boundary, a positive $l counts
		// that character as the first one.
		if ( $l > 0 && $j === $i ) {
			$l--;
		}

		$char = mb_strlen( substr( $s, 0, $i ), 'UTF-8' ) + $l;
		if ( $char < 0 || $char >= mb_strlen( $s, 'UTF-8' ) ) {
			return array( null );
		} else {
			return array( strlen( mb_substr( $s, 0, $char, 'UTF-8' ) ) + 1 );
		}
	}

	/**
	 * Lua callback: return the code points of characters $i through $j.
	 *
	 * @param string $s
	 * @param int $i 1-based start character; negative counts from the end
	 * @param int|null $j 1-based end character; the default is supplied via
	 *  checkTypeOptional() with $i as the fallback value
	 * @return array Code points as produced by unpack( 'N*', ... ), or an
	 *  empty array when $j < $i
	 */
	public function ustringCodepoint( $s, $i = 1, $j = null ) {
		$this->checkString( 'codepoint', $s );
		$this->checkTypeOptional( 'codepoint', 2, $i, 'number', 1 );
		$this->checkTypeOptional( 'codepoint', 3, $j, 'number', $i );

		$l = mb_strlen( $s, 'UTF-8' );
		if ( $i < 0 ) {
			$i = $l + $i + 1;
		}
		if ( $j < 0 ) {
			$j = $l + $j + 1;
		}
		if ( $j < $i ) {
			return array();
		}

		// Clamp both ends into [1, length + 1]
		$i = max( 1, min( $i, $l + 1 ) );
		$j = max( 1, min( $j, $l + 1 ) );
		$s = mb_substr( $s, $i - 1, $j - $i + 1, 'UTF-8' );

		// UTF-32BE is four bytes per code point, which is exactly what 'N*' unpacks
		return unpack( 'N*', mb_convert_encoding( $s, 'UTF-32BE', 'UTF-8' ) );
	}

	/**
	 * Lua callback: initialise gcodepoint by computing all requested code
	 * points up front.
	 *
	 * @param string $s
	 * @param int $i
	 * @param int|null $j
	 * @return array Single-element array wrapping ustringCodepoint()'s result
	 */
	public function ustringGcodepointInit( $s, $i = 1, $j = null ) {
		return array( $this->ustringCodepoint( $s, $i, $j ) );
	}

	/**
	 * Lua callback: normalize to NFC.
	 *
	 * @param string $s
	 * @return array Single-element array: normalized string, or null if $s is not UTF-8
	 */
	public function ustringToNFC( $s ) {
		$this->checkString( 'toNFC', $s, false );
		if ( !$this->checkEncoding( $s ) ) {
			return array( null );
		}
		return array( UtfNormal::toNFC( $s ) );
	}

	/**
	 * Lua callback: normalize to NFD.
	 *
	 * @param string $s
	 * @return array Single-element array: normalized string, or null if $s is not UTF-8
	 */
	public function ustringToNFD( $s ) {
		$this->checkString( 'toNFD', $s, false );
		if ( !$this->checkEncoding( $s ) ) {
			return array( null );
		}
		return array( UtfNormal::toNFD( $s ) );
	}

	/**
	 * Lua callback: normalize to NFKC.
	 *
	 * @param string $s
	 * @return array Single-element array: normalized string, or null if $s is not UTF-8
	 */
	public function ustringToNFKC( $s ) {
		$this->checkString( 'toNFKC', $s, false );
		if ( !$this->checkEncoding( $s ) ) {
			return array( null );
		}
		return array( UtfNormal::toNFKC( $s ) );
	}

	/**
	 * Lua callback: normalize to NFKD.
	 *
	 * @param string $s
	 * @return array Single-element array: normalized string, or null if $s is not UTF-8
	 */
	public function ustringToNFKD( $s ) {
		$this->checkString( 'toNFKD', $s, false );
		if ( !$this->checkEncoding( $s ) ) {
			return array( null );
		}
		return array( UtfNormal::toNFKD( $s ) );
	}

	/**
	 * Lua callback: build a UTF-8 string from a list of code points.
	 *
	 * Takes a variable number of numeric arguments, each floored to an
	 * integer and required to be within 0..0x10ffff.
	 *
	 * @return array Single-element array holding the resulting string
	 * @throws Scribunto_LuaError On a non-numeric or out-of-range argument,
	 *  or when the argument count or result size exceeds stringLengthLimit
	 */
	public function ustringChar() {
		$args = func_get_args();
		if ( count( $args ) > $this->stringLengthLimit ) {
			throw new Scribunto_LuaError( "too many arguments to 'char'" );
		}

		foreach ( $args as $k => &$v ) {
			if ( !is_numeric( $v ) ) {
				$this->checkType( 'char', $k + 1, $v, 'number' );
			}
			$v = (int)floor( $v );
			if ( $v < 0 || $v > 0x10ffff ) {
				$k++;
				throw new Scribunto_LuaError( "bad argument #$k to 'char' (value out of range)" );
			}
		}
		// Break the reference so later writes to $args cannot alias $v
		unset( $v );

		// Pack as big-endian 32-bit integers (i.e. UTF-32BE), then transcode
		array_unshift( $args, 'N*' );
		$s = call_user_func_array( 'pack', $args );
		$s = mb_convert_encoding( $s, 'UTF-8', 'UTF-32BE' );
		if ( strlen( $s ) > $this->stringLengthLimit ) {
			throw new Scribunto_LuaError( "result too long for 'char'" );
		}
		return array( $s );
	}

	/**
	 * Lua callback: length of $s in characters.
	 *
	 * @param string $s
	 * @return array Single-element array: the length, or null if $s is not UTF-8
	 */
	public function ustringLen( $s ) {
		$this->checkString( 'len', $s, false );
		if ( !$this->checkEncoding( $s ) ) {
			return array( null );
		}
		return array( mb_strlen( $s, 'UTF-8' ) );
	}

	/**
	 * Lua callback: substring by 1-based, inclusive character positions.
	 *
	 * @param string $s
	 * @param int $i Start position; negative counts from the end
	 * @param int $j End position; negative counts from the end
	 * @return array Single-element array holding the substring
	 */
	public function ustringSub( $s, $i=1, $j=-1 ) {
		$this->checkString( 'sub', $s );
		$this->checkTypeOptional( 'sub', 2, $i, 'number', 1 );
		$this->checkTypeOptional( 'sub', 3, $j, 'number', -1 );

		$len = mb_strlen( $s, 'UTF-8' );
		if ( $i < 0 ) {
			$i = $len + $i + 1;
		}
		if ( $j < 0 ) {
			$j = $len + $j + 1;
		}
		if ( $j < $i ) {
			return array( '' );
		}

		// Clamp both ends into [1, length + 1]
		$i = max( 1, min( $i, $len + 1 ) );
		$j = max( 1, min( $j, $len + 1 ) );
		$s = mb_substr( $s, $i - 1, $j - $i + 1, 'UTF-8' );
		return array( $s );
	}

	/**
	 * Lua callback: uppercase $s.
	 *
	 * @param string $s
	 * @return array Single-element array holding the converted string
	 */
	public function ustringUpper( $s ) {
		$this->checkString( 'upper', $s );
		return array( mb_strtoupper( $s, 'UTF-8' ) );
	}

	/**
	 * Lua callback: lowercase $s.
	 *
	 * @param string $s
	 * @return array Single-element array holding the converted string
	 */
	public function ustringLower( $s ) {
		$this->checkString( 'lower', $s );
		return array( mb_strtolower( $s, 'UTF-8' ) );
	}

	/**
	 * Validate the pattern argument (argument #2) of a ustring function.
	 *
	 * @param string $name Lua function name, used in error messages
	 * @param mixed $pattern
	 * @throws Scribunto_LuaError If not a string/number, not UTF-8, or longer
	 *  than patternLengthLimit bytes
	 */
	private function checkPattern( $name, $pattern ) {
		if ( $this->getLuaType( $pattern ) == 'number' ) {
			$pattern = (string)$pattern;
		}
		$this->checkType( $name, 2, $pattern, 'string' );
		if ( !$this->checkEncoding( $pattern ) ) {
			throw new Scribunto_LuaError( "bad argument #2 to '$name' (string is not UTF-8)" );
		}
		if ( strlen( $pattern ) > $this->patternLengthLimit ) {
			throw new Scribunto_LuaError(
				"bad argument #2 to '$name' (pattern is longer than $this->patternLengthLimit bytes)"
			);
		}
	}

	/**
	 * Convert a Lua pattern into a PCRE regex.
	 *
	 * Results are memoised in patternRegexCache, keyed on pattern and anchor.
	 * Lua capture number N becomes the named group "mN", which is what
	 * addCapturesFromMatch() and the gsub callbacks look up.
	 *
	 * @param string $pattern Lua pattern
	 * @param string|bool $anchor Regex text to emit for a leading '^', or
	 *  false to treat a leading '^' as a literal
	 * @param string $name Lua function name, used in error messages
	 * @return array ( regex, capture map N => is-position-capture, whether
	 *  any position capture exists )
	 * @throws Scribunto_LuaError On a malformed pattern
	 */
	private function patternToRegex( $pattern, $anchor, $name ) {
		$cacheKey = serialize( array( $pattern, $anchor ) );
		if ( !$this->patternRegexCache->has( $cacheKey ) ) {
			$this->checkPattern( $name, $pattern );
			// Split into an array of single UTF-8 characters
			$pat = preg_split( '//us', $pattern, null, PREG_SPLIT_NO_EMPTY );

			static $charsets = null, $brcharsets = null;
			if ( $charsets === null ) {
				// Hex digits are ASCII plus the fullwidth forms U+FF10-FF19,
				// U+FF21-FF26 and U+FF41-FF46; this is exactly the complement
				// of the bracket-set 'X' entry below.
				$hex = '0-9A-Fa-f\x{ff10}-\x{ff19}\x{ff21}-\x{ff26}\x{ff41}-\x{ff46}';

				$charsets = array(
					// If you change these, also change lualib/ustring/make-tables.php
					// (and run it to regenerate charsets.lua)
					'a' => '\p{L}',
					'c' => '\p{Cc}',
					'd' => '\p{Nd}',
					'l' => '\p{Ll}',
					'p' => '\p{P}',
					's' => '\p{Xps}',
					'u' => '\p{Lu}',
					'w' => '[\p{L}\p{Nd}]',
					'x' => "[$hex]",
					'z' => '\0',

					// These *must* be the inverse of the above
					'A' => '\P{L}',
					'C' => '\P{Cc}',
					'D' => '\P{Nd}',
					'L' => '\P{Ll}',
					'P' => '\P{P}',
					'S' => '\P{Xps}',
					'U' => '\P{Lu}',
					'W' => '[^\p{L}\p{Nd}]',
					'X' => "[^$hex]",
					'Z' => '[^\0]',
				);

				// Variants usable inside a [...] set; anything not overridden
				// here falls back to $charsets.
				$brcharsets = array(
					'w' => '\p{L}\p{Nd}',
					'x' => $hex,

					// Negated sets that are not expressable as a simple \P{} are
					// unfortunately complicated.

					// Xan is L plus N, so ^Xan plus Nl plus No is anything that's not L or Nd
					'W' => '\P{Xan}\p{Nl}\p{No}',

					// Manually constructed. Fun.
					'X' => '\x00-\x2f\x3a-\x40\x47-\x60\x67-\x{ff0f}' .
						'\x{ff1a}-\x{ff20}\x{ff27}-\x{ff40}\x{ff47}-\x{10ffff}',

					// Ha!
					'Z' => '\x01-\x{10ffff}',
				) + $charsets;
			}

			$re = '/';
			$len = count( $pat );
			$capt = array();      // capture number => true if position capture "()"
			$anypos = false;      // whether any position capture was seen
			$captparen = array(); // capture number => 1-based pattern position of its '('
			$opencapt = array();  // stack of captures not yet closed
			$bct = 0;             // counter for uniquely naming %b recursion groups

			for ( $i = 0; $i < $len; $i++ ) {
				$ii = $i + 1; // 1-based position for error messages
				$q = false;   // whether the item just emitted may take a quantifier
				switch ( $pat[$i] ) {
				case '^':
					// Only an anchor at the very start of the pattern
					$q = $i;
					$re .= ( $anchor === false || $q ) ? '\\^' : $anchor;
					break;

				case '$':
					// Only an anchor at the very end of the pattern
					$q = ( $i < $len - 1 );
					$re .= $q ? '\\$' : '$';
					break;

				case '(':
					if ( $i + 1 >= $len ) {
						throw new Scribunto_LuaError( "Unmatched open-paren at pattern character $ii" );
					}
					$n = count( $capt ) + 1;
					$capt[$n] = ( $pat[$i + 1] === ')' );
					if ( $capt[$n] ) {
						$anypos = true;
					}
					// Named group "mN", looked up later as $m["m$n"]
					$re .= "(?<m$n>";
					$opencapt[] = $n;
					$captparen[$n] = $ii;
					break;

				case ')':
					if ( count( $opencapt ) <= 0 ) {
						throw new Scribunto_LuaError( "Unmatched close-paren at pattern character $ii" );
					}
					array_pop( $opencapt );
					$re .= $pat[$i];
					break;

				case '%':
					$i++;
					if ( $i >= $len ) {
						throw new Scribunto_LuaError( "malformed pattern (ends with '%')" );
					}
					if ( isset( $charsets[$pat[$i]] ) ) {
						$re .= $charsets[$pat[$i]];
						$q = true;
					} elseif ( $pat[$i] === 'b' ) {
						if ( $i + 2 >= $len ) {
							throw new Scribunto_LuaError( "malformed pattern (missing arguments to '%b')" );
						}
						$d1 = preg_quote( $pat[++$i], '/' );
						$d2 = preg_quote( $pat[++$i], '/' );
						if ( $d1 === $d2 ) {
							$re .= "{$d1}[^$d1]*$d1";
						} else {
							// Balanced match via recursion into a uniquely-named group
							$bct++;
							$re .= "(?<b$bct>$d1(?:(?>[^$d1$d2]+)|(?P>b$bct))*$d2)";
						}
					} elseif ( $pat[$i] === 'f' ) {
						if ( $i + 1 >= $len || $pat[++$i] !== '[' ) {
							throw new Scribunto_LuaError(
								"missing '[' after %f in pattern at pattern character $ii"
							);
						}
						list( $i, $re2 ) = $this->bracketedCharSetToRegex( $pat, $i, $len, $brcharsets );
						// Because %f considers the beginning and end of the string
						// to be \0, determine if $re2 matches that and take it
						// into account with "^" and "$".
						// NOTE(review): these assertions were reconstructed from the
						// frontier semantics (previous character not in the set,
						// next character in the set) - confirm against upstream.
						if ( preg_match( "/$re2/us", "\0" ) ) {
							$re .= "(?<!^)(?<!$re2)(?=$re2|$)";
						} else {
							$re .= "(?<!$re2)(?=$re2)";
						}
					} elseif ( $pat[$i] >= '0' && $pat[$i] <= '9' ) {
						// Back-reference to an already-closed capture
						$n = ord( $pat[$i] ) - 0x30;
						if ( $n === 0 || $n > count( $capt ) || in_array( $n, $opencapt ) ) {
							throw new Scribunto_LuaError(
								"invalid capture index %$n at pattern character $ii"
							);
						}
						$re .= "\\g{m$n}";
					} else {
						$re .= preg_quote( $pat[$i], '/' );
						$q = true;
					}
					break;

				case '[':
					list( $i, $re2 ) = $this->bracketedCharSetToRegex( $pat, $i, $len, $brcharsets );
					$re .= $re2;
					$q = true;
					break;

				case ']':
					throw new Scribunto_LuaError( "Unmatched close-bracket at pattern character $ii" );

				case '.':
					$re .= $pat[$i];
					$q = true;
					break;

				default:
					$re .= preg_quote( $pat[$i], '/' );
					$q = true;
					break;
				}

				// Translate a trailing Lua quantifier, if the item allows one
				if ( $q && $i + 1 < $len ) {
					switch ( $pat[$i + 1] ) {
					case '*':
					case '+':
					case '?':
						$re .= $pat[++$i];
						break;
					case '-':
						// Lua's '-' is the non-greedy form of '*'
						$re .= '*?';
						$i++;
						break;
					}
				}
			}

			if ( count( $opencapt ) ) {
				$ii = $captparen[$opencapt[0]];
				throw new Scribunto_LuaError( "Unclosed capture beginning at pattern character $ii" );
			}
			$re .= '/us';

			$this->patternRegexCache->set( $cacheKey, array( $re, $capt, $anypos ) );
		}
		return $this->patternRegexCache->get( $cacheKey );
	}

	/**
	 * Convert a Lua bracketed character set ("[...]") into PCRE.
	 *
	 * @param array $pat Pattern split into characters
	 * @param int $i Index of the opening '[' in $pat
	 * @param int $len count( $pat )
	 * @param array $brcharsets Map of %-class letter => regex usable inside [...]
	 * @return array ( index of the closing ']', regex fragment )
	 * @throws Scribunto_LuaError If the set is never closed
	 */
	private function bracketedCharSetToRegex( $pat, $i, $len, $brcharsets ) {
		$ii = $i + 1; // 1-based position for error messages
		$re = '[';
		$i++;
		if ( $i < $len && $pat[$i] === '^' ) {
			$re .= '^';
			$i++;
		}

		// A ']' in the very first position ($j == $i) is a literal member,
		// not the end of the set.
		for ( $j = $i; $i < $len && ( $j == $i || $pat[$i] !== ']' ); $i++ ) {
			if ( $pat[$i] === '%' ) {
				$i++;
				if ( $i >= $len ) {
					break;
				}
				if ( isset( $brcharsets[$pat[$i]] ) ) {
					$re .= $brcharsets[$pat[$i]];
				} else {
					$re .= preg_quote( $pat[$i], '/' );
				}
			} elseif ( $i + 2 < $len &&
				$pat[$i + 1] === '-' && $pat[$i + 2] !== ']' && $pat[$i + 2] !== '%'
			) {
				// Character range; inverted ranges are dropped (see below)
				if ( $pat[$i] <= $pat[$i+2] ) {
					$re .= preg_quote( $pat[$i], '/' ) . '-' . preg_quote( $pat[$i+2], '/' );
				}
				$i += 2;
			} else {
				$re .= preg_quote( $pat[$i], '/' );
			}
		}

		if ( $i >= $len ) {
			throw new Scribunto_LuaError(
				"Missing close-bracket for character set beginning at pattern character $ii"
			);
		}
		$re .= ']';

		// Lua just ignores invalid ranges, while pcre throws an error.
		// We filter them out above, but then we need to special-case empty sets
		if ( $re === '[]' ) {
			// Can't directly quantify (*FAIL), so wrap it.
			// "(?!)" would be simpler and could be quantified if not for a bug in PCRE 8.13 to 8.33
			$re = '(?:(*FAIL))';
		} elseif ( $re === '[^]' ) {
			$re = '.'; // 's' modifier is always used, so this works
		}

		return array( $i, $re );
	}

	/**
	 * Append a match's captures to a result array.
	 *
	 * @param array $arr Array to append to
	 * @param string $s Subject string
	 * @param array $m Match data captured with PREG_OFFSET_CAPTURE
	 * @param array $capt Capture map from patternToRegex()
	 * @param bool $m0_if_no_captures Append the whole match when the pattern
	 *  has no captures
	 * @return array $arr with captures appended; position captures become
	 *  1-based character offsets
	 */
	private function addCapturesFromMatch( $arr, $s, $m, $capt, $m0_if_no_captures ) {
		if ( count( $capt ) ) {
			foreach ( $capt as $n => $pos ) {
				if ( $pos ) {
					// Convert the byte offset into a 1-based character offset
					$o = mb_strlen( substr( $s, 0, $m["m$n"][1] ), 'UTF-8' ) + 1;
					$arr[] = $o;
				} else {
					$arr[] = $m["m$n"][0];
				}
			}
		} elseif ( $m0_if_no_captures ) {
			$arr[] = $m[0][0];
		}
		return $arr;
	}

	/**
	 * Lua callback: find the first match of $pattern in $s.
	 *
	 * @param string $s
	 * @param string $pattern Lua pattern, or literal text when $plain is true
	 * @param int $init 1-based character position to start at; negative
	 *  counts from the end
	 * @param bool $plain Treat $pattern as plain text
	 * @return array ( start, end, captures... ) in 1-based character
	 *  positions, or ( null ) when nothing matches
	 * @throws Scribunto_LuaError On a bad argument or a PCRE failure
	 */
	public function ustringFind( $s, $pattern, $init = 1, $plain = false ) {
		$this->checkString( 'find', $s );
		$this->checkTypeOptional( 'find', 3, $init, 'number', 1 );
		$this->checkTypeOptional( 'find', 4, $plain, 'boolean', false );

		$len = mb_strlen( $s, 'UTF-8' );
		if ( $init < 0 ) {
			$init = $len + $init + 1;
		} elseif ( $init > $len + 1 ) {
			$init = $len + 1;
		}

		// $offset is the byte offset corresponding to character position $init
		if ( $init > 1 ) {
			$offset = strlen( mb_substr( $s, 0, $init - 1, 'UTF-8' ) );
		} else {
			$init = 1;
			$offset = 0;
		}

		if ( $plain ) {
			$this->checkPattern( 'find', $pattern );
			if ( $pattern !== '' ) {
				$ret = mb_strpos( $s, $pattern, $init - 1, 'UTF-8' );
			} else {
				// The empty string matches right at the start position
				$ret = $init - 1;
			}
			if ( $ret === false ) {
				return array( null );
			} else {
				return array( $ret + 1, $ret + mb_strlen( $pattern, 'UTF-8' ) );
			}
		} else {
			list( $re, $capt ) = $this->patternToRegex( $pattern, '\G', 'find' );
			$matched = preg_match( $re, $s, $m, PREG_OFFSET_CAPTURE, $offset );
			if ( $matched === false ) {
				// A PCRE failure is not the same thing as "no match"
				$this->handlePCREError( preg_last_error(), $pattern );
			}
			if ( !$matched ) {
				return array( null );
			}
			$o = mb_strlen( substr( $s, 0, $m[0][1] ), 'UTF-8' );
			$ret = array( $o + 1, $o + mb_strlen( $m[0][0], 'UTF-8' ) );
			return $this->addCapturesFromMatch( $ret, $s, $m, $capt, false );
		}
	}

	/**
	 * Lua callback: return the captures of the first match of $pattern in $s.
	 *
	 * @param string $s
	 * @param string $pattern Lua pattern
	 * @param int $init 1-based character position to start at; negative
	 *  counts from the end
	 * @return array Captures (or the whole match if the pattern has none),
	 *  or ( null ) when nothing matches
	 * @throws Scribunto_LuaError On a bad argument or a PCRE failure
	 */
	public function ustringMatch( $s, $pattern, $init = 1 ) {
		$this->checkString( 'match', $s );
		$this->checkTypeOptional( 'match', 3, $init, 'number', 1 );

		$len = mb_strlen( $s, 'UTF-8' );
		if ( $init < 0 ) {
			$init = $len + $init + 1;
		} elseif ( $init > $len + 1 ) {
			$init = $len + 1;
		}
		if ( $init > 1 ) {
			$offset = strlen( mb_substr( $s, 0, $init - 1, 'UTF-8' ) );
		} else {
			$offset = 0;
		}

		list( $re, $capt ) = $this->patternToRegex( $pattern, '\G', 'match' );
		$matched = preg_match( $re, $s, $m, PREG_OFFSET_CAPTURE, $offset );
		if ( $matched === false ) {
			// A PCRE failure is not the same thing as "no match"
			$this->handlePCREError( preg_last_error(), $pattern );
		}
		if ( !$matched ) {
			return array( null );
		}
		return $this->addCapturesFromMatch( array(), $s, $m, $capt, true );
	}

	/**
	 * Lua callback: compile the pattern for a gmatch iteration.
	 *
	 * @param string $s
	 * @param string $pattern Lua pattern
	 * @return array ( regex, capture map ), to be handed back to
	 *  ustringGmatchCallback()
	 */
	public function ustringGmatchInit( $s, $pattern ) {
		$this->checkString( 'gmatch', $s );

		list( $re, $capt ) = $this->patternToRegex( $pattern, false, 'gmatch' );
		return array( $re, $capt );
	}

	/**
	 * Lua callback: produce the next gmatch result.
	 *
	 * @param string $s
	 * @param string $re Regex from ustringGmatchInit()
	 * @param array $capt Capture map from ustringGmatchInit()
	 * @param int $pos Byte offset at which to resume matching
	 * @return array ( new byte offset, captures ); the captures array is
	 *  empty when there are no more matches
	 * @throws Scribunto_LuaError On a PCRE failure
	 */
	public function ustringGmatchCallback( $s, $re, $capt, $pos ) {
		$matched = preg_match( $re, $s, $m, PREG_OFFSET_CAPTURE, $pos );
		if ( $matched === false ) {
			// Only the compiled regex is available here, so report that
			$this->handlePCREError( preg_last_error(), $re );
		}
		if ( !$matched ) {
			return array( $pos, array() );
		}
		$pos = $m[0][1] + strlen( $m[0][0] );
		return array( $pos, $this->addCapturesFromMatch( array( null ), $s, $m, $capt, true ) );
	}

	/**
	 * Lua callback: global substitution.
	 *
	 * @param string $s
	 * @param string $pattern Lua pattern
	 * @param string|int|float|array|mixed $repl Replacement: a string or number
	 *  (with %0-%9 and %% escapes), a table, or a Lua function
	 * @param int|null $n Maximum number of replacements; null for unlimited
	 * @return array ( resulting string, number of matches replaced )
	 * @throws Scribunto_LuaError On a bad argument, a bad capture index in
	 *  the replacement string, or a PCRE failure
	 */
	public function ustringGsub( $s, $pattern, $repl, $n = null ) {
		$this->checkString( 'gsub', $s );
		$this->checkTypeOptional( 'gsub', 4, $n, 'number', null );

		if ( $n === null ) {
			$n = -1;
		} elseif ( $n < 1 ) {
			return array( $s, 0 );
		}

		list( $re, $capt, $anypos ) = $this->patternToRegex( $pattern, '^', 'gsub' );
		$captures = array();

		if ( $this->phpBug53823 ) {
			// PHP bug 53823 means that a zero-length match before a UTF-8
			// character will match again before every byte of that character.
			// The workaround is to capture the first "character" of/after the
			// match and verify that its first byte is legal to start a UTF-8
			// character.
			$re = '/(?=(?<phpBug53823>.|$))' . substr( $re, 1 );
		}

		if ( $anypos ) {
			// preg_replace_callback doesn't take a "flags" argument, so we
			// can't pass PREG_OFFSET_CAPTURE to it, which is needed to handle
			// position captures. So instead we have to do a preg_match_all and
			// handle the captures ourself.
			$ct = preg_match_all( $re, $s, $mm, PREG_OFFSET_CAPTURE | PREG_SET_ORDER );
			for ( $i = 0; $i < $ct; $i++ ) {
				$m = $mm[$i];
				if ( $this->phpBug53823 ) {
					// Skip bogus matches that begin on a continuation byte
					$c = ord( $m['phpBug53823'][0] );
					if ( $c >= 0x80 && $c <= 0xbf ) {
						continue;
					}
				}
				// Rebuild the match in the shape the callbacks expect:
				// [0] => whole match, "mN" => capture N
				$c = array( $m[0][0] );
				foreach ( $this->addCapturesFromMatch( array(), $s, $m, $capt, false ) as $k => $v ) {
					$k++;
					$c["m$k"] = $v;
				}
				$captures[] = $c;
				if ( $n >= 0 && count( $captures ) >= $n ) {
					break;
				}
			}
		}

		switch ( $this->getLuaType( $repl ) ) {
		case 'string':
		case 'number':
			$cb = function ( $m ) use ( $repl, $anypos, &$captures ) {
				if ( $anypos ) {
					$m = array_shift( $captures );
				}
				// Expand %%, %0 and %1-%9 in the replacement string
				return preg_replace_callback( '/%([%0-9])/', function ( $m2 ) use ( $m ) {
					$x = $m2[1];
					if ( $x === '%' ) {
						return '%';
					} elseif ( $x === '0' ) {
						return $m[0];
					} elseif ( isset( $m["m$x"] ) ) {
						return $m["m$x"];
					} else {
						throw new Scribunto_LuaError( "invalid capture index %$x in replacement string" );
					}
				}, $repl );
			};
			break;

		case 'table':
			$cb = function ( $m ) use ( $repl, $anypos, &$captures ) {
				if ( $anypos ) {
					$m = array_shift( $captures );
				}
				// Key is the first capture, or the whole match if there is none
				$x = isset( $m['m1'] ) ? $m['m1'] : $m[0];
				return isset( $repl[$x] ) ? $repl[$x] : $m[0];
			};
			break;

		case 'function':
			$interpreter = $this->getInterpreter();
			$cb = function ( $m ) use ( $interpreter, $capt, $repl, $anypos, &$captures ) {
				if ( $anypos ) {
					$m = array_shift( $captures );
				}
				$args = array( $repl );
				if ( count( $capt ) ) {
					foreach ( $capt as $i => $pos ) {
						$args[] = $m["m$i"];
					}
				} else {
					$args[] = $m[0];
				}
				$ret = call_user_func_array( array( $interpreter, 'callFunction' ), $args );
				// No return value or nil keeps the original text
				if ( count( $ret ) === 0 || $ret[0] === null ) {
					return $m[0];
				}
				return $ret[0];
			};
			break;

		default:
			$this->checkType( 'gsub', 3, $repl, 'function or table or string' );
		}

		$skippedMatches = 0;
		if ( $this->phpBug53823 ) {
			// Since we're having bogus matches, we need to keep track of the
			// necessary adjustment and stop manually once we hit the limit.
			$maxMatches = $n < 0 ? INF : $n;
			$n = -1;
			$realCallback = $cb;
			$cb = function ( $m ) use ( $realCallback, &$skippedMatches, &$maxMatches ) {
				$c = ord( $m['phpBug53823'] );
				if ( $c >= 0x80 && $c <= 0xbf || $maxMatches <= 0 ) {
					$skippedMatches++;
					return $m[0];
				} else {
					$maxMatches--;
					return $realCallback( $m );
				}
			};
		}

		$count = 0;
		$s2 = preg_replace_callback( $re, $cb, $s, $n, $count );
		if ( $s2 === null ) {
			$this->handlePCREError( preg_last_error(), $pattern );
		}
		return array( $s2, $count - $skippedMatches );
	}

	/**
	 * Handle a PCRE error
	 * @param int $error From preg_last_error()
	 * @param string $pattern Pattern being matched
	 * @throws Scribunto_LuaError
	 */
	private function handlePCREError( $error, $pattern ) {
		// PREG_JIT_STACKLIMIT_ERROR does not exist on older PHP versions; the
		// string fallback never equals a non-zero integer error code.
		$PREG_JIT_STACKLIMIT_ERROR = defined( 'PREG_JIT_STACKLIMIT_ERROR' )
			? PREG_JIT_STACKLIMIT_ERROR
			: 'PREG_JIT_STACKLIMIT_ERROR';

		switch ( $error ) {
			case PREG_NO_ERROR:
				// Huh?
				break;
			case PREG_INTERNAL_ERROR:
				throw new Scribunto_LuaError( "PCRE internal error" );
			case PREG_BACKTRACK_LIMIT_ERROR:
				throw new Scribunto_LuaError(
					"PCRE backtrack limit reached while matching pattern '$pattern'"
				);
			case PREG_RECURSION_LIMIT_ERROR:
				throw new Scribunto_LuaError(
					"PCRE recursion limit reached while matching pattern '$pattern'"
				);
			case PREG_BAD_UTF8_ERROR:
				// Should have already been caught, but just in case
				throw new Scribunto_LuaError( "PCRE bad UTF-8 error" );
			case PREG_BAD_UTF8_OFFSET_ERROR:
				// Shouldn't happen, but just in case
				throw new Scribunto_LuaError( "PCRE bad UTF-8 offset error" );
			case $PREG_JIT_STACKLIMIT_ERROR:
				throw new Scribunto_LuaError(
					"PCRE JIT stack limit reached while matching pattern '$pattern'"
				);
			default:
				throw new Scribunto_LuaError(
					"PCRE error code $error while matching pattern '$pattern'"
				);
		}
	}
}