2015-08-25 19:57:23 +00:00
|
|
|
<?php
|
2017-04-17 14:40:11 +00:00
|
|
|
|
2020-12-03 22:22:43 +00:00
|
|
|
namespace MediaWiki\Extension\AbuseFilter\Parser;
|
|
|
|
|
2021-08-29 22:58:17 +00:00
|
|
|
use MediaWiki\Extension\AbuseFilter\Parser\Exception\UserVisibleException;
|
2024-10-19 19:55:40 +00:00
|
|
|
use Wikimedia\ObjectCache\BagOStuff;
|
2018-12-31 16:40:54 +00:00
|
|
|
|
2015-08-25 19:57:23 +00:00
|
|
|
/**
 * Tokenizer for AbuseFilter rules.
 *
 * Splits the source code of a filter into a list of AFPToken objects. The token
 * list for a given piece of code is stored in the BagOStuff passed to the
 * constructor, keyed on CACHE_VERSION and a checksum of the code.
 */
class AbuseFilterTokenizer {
	/** @var int Tokenizer cache version. Increment this when changing the syntax. */
	public const CACHE_VERSION = 4;

	/** Optional whitespace followed by a comment opener, anchored at the current offset. */
	private const COMMENT_START_RE = '/\s*\/\*/A';

	/** A run of ASCII letters, digits and underscores, anchored at the current offset. */
	private const ID_SYMBOL_RE = '/[0-9A-Za-z_]+/A';

	/** Any operator, anchored at the current offset. Longer alternatives are listed first. */
	public const OPERATOR_RE =
		'/(\!\=\=|\!\=|\!|\*\*|\*|\/|\+|\-|%|&|\||\^|\:\=|\?|\:|\<\=|\<|\>\=|\>|\=\=\=|\=\=|\=)/A';

	/** Radix prefix of a number literal; the letter is captured as "base". */
	private const BASE = '0(?<base>[xbo])';

	/** A single digit in the widest supported radix (hexadecimal). */
	private const DIGIT = '[0-9A-Fa-f]';

	/** Digits with an optional fractional part, or a fractional part on its own. */
	private const DIGITS = self::DIGIT . '+' . '(?:\.\d*)?|\.\d+';

	/** A complete number literal which is not immediately followed by a word character. */
	private const RADIX_RE = '/(?:' . self::BASE . ')?(?<input>' . self::DIGITS . ')(?!\w)/Au';

	/** Characters skipped between tokens: tab, LF, VT, FF, CR and space. */
	private const WHITESPACE = "\011\012\013\014\015\040";

	// Order is important. The punctuation-matching regex requires that
	// ** comes before *, etc. They are sorted to make it easy to spot
	// such errors.
	public const OPERATORS = [
		// Inequality
		'!==', '!=', '!',
		// Multiplication/exponentiation
		'**', '*',
		// Other arithmetic
		'/', '+', '-', '%',
		// Logic
		'&', '|', '^',
		// Setting
		':=',
		// Ternary
		'?', ':',
		// Less than
		'<=', '<',
		// Greater than
		'>=', '>',
		// Equality
		'===', '==', '=',
	];

	/** Map of single punctuation characters to the token type they produce. */
	public const PUNCTUATION = [
		',' => AFPToken::TCOMMA,
		'(' => AFPToken::TBRACE,
		')' => AFPToken::TBRACE,
		'[' => AFPToken::TSQUAREBRACKET,
		']' => AFPToken::TSQUAREBRACKET,
		';' => AFPToken::TSTATEMENTSEPARATOR,
	];

	/** Map of radix prefix letters (as captured by BASE) to their numeric base. */
	public const BASES = [
		'b' => 2,
		'x' => 16,
		'o' => 8
	];

	/** For each supported base, the regex that the digits of a literal must fully match. */
	public const BASE_CHARS_RES = [
		2 => '/^[01]+$/',
		8 => '/^[0-7]+$/',
		16 => '/^[0-9A-Fa-f]+$/',
		10 => '/^[0-9.]+$/',
	];

	/** Identifiers which are reported as TKEYWORD rather than TID. */
	public const KEYWORDS = [
		'in', 'like', 'true', 'false', 'null', 'contains', 'matches',
		'rlike', 'irlike', 'regex', 'if', 'then', 'else', 'end',
	];

	/**
	 * @var BagOStuff
	 */
	private $cache;

	/**
	 * @param BagOStuff $cache Cache used to store the token lists
	 */
	public function __construct( BagOStuff $cache ) {
		$this->cache = $cache;
	}

	/**
	 * Get a cache key used to store the tokenized code
	 *
	 * The key is built from the class name, CACHE_VERSION and the CRC32 checksum of the code.
	 *
	 * @param string $code Not yet tokenized
	 * @return string
	 * @internal
	 */
	public function getCacheKey( $code ) {
		return $this->cache->makeGlobalKey( __CLASS__, self::CACHE_VERSION, crc32( $code ) );
	}

	/**
	 * Get the tokens for the given code.
	 *
	 * The token list is fetched from the cache under getCacheKey( $code ); the callback
	 * tokenizes the code, and the entry is requested with a TTL of one day.
	 *
	 * @param string $code
	 * @return array[]
	 * @phan-return array<int,array{0:AFPToken,1:int}>
	 */
	public function getTokens( string $code ): array {
		return $this->cache->getWithSetCallback(
			$this->getCacheKey( $code ),
			BagOStuff::TTL_DAY,
			function () use ( $code ) {
				return $this->tokenize( $code );
			}
		);
	}

	/**
	 * Tokenize the whole code.
	 *
	 * Each element of the result is keyed on the start position of the token, and holds the
	 * token itself plus the offset right after it (i.e. where the next token is read from).
	 *
	 * @param string $code
	 * @return array[]
	 * @phan-return array<int,array{0:AFPToken,1:int}>
	 */
	private function tokenize( string $code ): array {
		$tokens = [];
		$curPos = 0;

		// Keep reading until a call to nextToken() no longer advances the offset
		do {
			$prevPos = $curPos;
			$token = $this->nextToken( $code, $curPos );
			$tokens[ $token->pos ] = [ $token, $curPos ];
		} while ( $curPos !== $prevPos );

		return $tokens;
	}

	/**
	 * Read a single token starting at $offset, and move $offset right after it.
	 *
	 * Leading comments and whitespace are skipped. The position stored in the returned
	 * token is the value that $offset had on entry, i.e. before skipping them.
	 *
	 * @param string $code
	 * @param int &$offset
	 * @return AFPToken A TNONE token if the end of the code was reached
	 * @throws UserVisibleException For unclosed comments/strings and unrecognised tokens
	 */
	private function nextToken( string $code, int &$offset ): AFPToken {
		$matches = [];
		$start = $offset;

		// Read past comments
		while ( preg_match( self::COMMENT_START_RE, $code, $matches, 0, $offset ) ) {
			// NOTE(review): the terminator is searched from $offset, not from the end of
			// the opener, so the sequence "/*/" is treated as a complete comment.
			$commentEnd = strpos( $code, '*/', $offset );
			if ( $commentEnd === false ) {
				throw new UserVisibleException(
					'unclosedcomment', $offset, [] );
			}
			$offset = $commentEnd + 2;
		}

		// Spaces
		$offset += strspn( $code, self::WHITESPACE, $offset );
		if ( $offset >= strlen( $code ) ) {
			return new AFPToken( AFPToken::TNONE, '', $start );
		}

		$chr = $code[$offset];

		// Punctuation
		if ( isset( self::PUNCTUATION[$chr] ) ) {
			$offset++;
			return new AFPToken( self::PUNCTUATION[$chr], $chr, $start );
		}

		// String literal
		if ( $chr === '"' || $chr === "'" ) {
			return self::readStringLiteral( $code, $offset, $start );
		}

		$matches = [];

		// Operators
		if ( preg_match( self::OPERATOR_RE, $code, $matches, 0, $offset ) ) {
			$token = $matches[0];
			$offset += strlen( $token );
			return new AFPToken( AFPToken::TOP, $token, $start );
		}

		// Numbers
		$numberMatch = [];
		if ( preg_match( self::RADIX_RE, $code, $numberMatch, 0, $offset ) ) {
			$token = $numberMatch[0];
			$baseChar = $numberMatch['base'];
			$input = $numberMatch['input'];
			$base = $baseChar ? self::BASES[$baseChar] : 10;
			// If the digits are not valid for the base, fall through to the identifier check
			if ( preg_match( self::BASE_CHARS_RES[$base], $input ) ) {
				$num = $base !== 10 ? base_convert( $input, $base, 10 ) : $input;
				$offset += strlen( $token );
				return ( strpos( $input, '.' ) !== false )
					? new AFPToken( AFPToken::TFLOAT, floatval( $num ), $start )
					: new AFPToken( AFPToken::TINT, intval( $num ), $start );
			}
		}

		// IDs / Keywords
		if ( preg_match( self::ID_SYMBOL_RE, $code, $matches, 0, $offset ) ) {
			$token = $matches[0];
			$offset += strlen( $token );
			// Strict comparison: both sides are always strings
			$type = in_array( $token, self::KEYWORDS, true )
				? AFPToken::TKEYWORD
				: AFPToken::TID;
			return new AFPToken( $type, $token, $start );
		}

		throw new UserVisibleException(
			'unrecognisedtoken', $start, [ substr( $code, $start ) ] );
	}

	/**
	 * Read a string literal whose opening quote is at $offset, and move $offset right
	 * after the closing quote.
	 *
	 * The recognised escape sequences are \\, the escaped quote character, \n, \r, \t and
	 * \xXX (two hexadecimal digits). Any other escape sequence is kept verbatim, backslash
	 * included.
	 *
	 * @param string $code
	 * @param int &$offset
	 * @param int $start Position to store in the returned token
	 * @return AFPToken
	 * @throws UserVisibleException If the end of the code is reached before the closing quote
	 */
	private static function readStringLiteral( string $code, int &$offset, int $start ): AFPToken {
		$type = $code[$offset];
		$offset++;
		$length = strlen( $code );
		$token = '';
		while ( $offset < $length ) {
			if ( $code[$offset] === $type ) {
				$offset++;
				return new AFPToken( AFPToken::TSTRING, $token, $start );
			}

			// Performance: Use a PHP function (implemented in C)
			// to scan ahead.
			$addLength = strcspn( $code, $type . "\\", $offset );
			if ( $addLength ) {
				$token .= substr( $code, $offset, $addLength );
				$offset += $addLength;
			} elseif ( $code[$offset] === '\\' ) {
				// The backslash may be the very last character of the code. Reading past
				// the end of the string would emit an "Uninitialized string offset"
				// diagnostic, so fall back to an empty string: the loop then ends and the
				// literal is reported as unclosed below.
				$escaped = $code[$offset + 1] ?? '';
				switch ( $escaped ) {
					case '\\':
						$token .= '\\';
						break;
					case $type:
						$token .= $type;
						break;
					case 'n':
						$token .= "\n";
						break;
					case 'r':
						$token .= "\r";
						break;
					case 't':
						$token .= "\t";
						break;
					case 'x':
						$chr = substr( $code, $offset + 2, 2 );

						if ( preg_match( '/^[0-9A-Fa-f]{2}$/', $chr ) ) {
							$token .= chr( hexdec( $chr ) );
							// \xXX -- 2 done later
							$offset += 2;
						} else {
							$token .= '\\x';
						}
						break;
					default:
						$token .= "\\" . $escaped;
				}

				// Skip the backslash and the character following it
				$offset += 2;

			} else {
				// Should never happen
				// @codeCoverageIgnoreStart
				$token .= $code[$offset];
				$offset++;
				// @codeCoverageIgnoreEnd
			}
		}
		throw new UserVisibleException( 'unclosedstring', $offset, [] );
	}
}
|