mediawiki-extensions-AbuseF.../AbuseFilterTokenizer.php

<?php
/**
 * Tokenizer for AbuseFilter rules.
 */
/**
 * Tokenizer for AbuseFilter rules.
 *
 * tokenize() scans a complete rule and caches the resulting token list;
 * nextToken() reads a single token starting at a given byte offset, and
 * readStringLiteral() handles quoted strings and their escape sequences.
 * All offsets are byte offsets into the rule text.
 */
class AbuseFilterTokenizer {

	/**
	 * @var int Tokenizer cache version. Increment this when changing the syntax.
	 *
	 * Version 2: the digit 8 is no longer accepted inside octal literals, so
	 * token lists cached under version 1 must not be reused.
	 */
	const CACHE_VERSION = 2;

	/** Optional whitespace followed by the opening of a block comment. */
	const COMMENT_START_RE = '/\s*\/\*/A';

	/** A run of ASCII letters, digits and underscores (identifier or keyword). */
	const ID_SYMBOL_RE = '/[0-9A-Za-z_]+/A';

	/** Any operator; longer operators are listed before their own prefixes. */
	const OPERATOR_RE = '/(\!\=\=|\!\=|\!|\*\*|\*|\/|\+|\-|%|&|\||\^|\:\=|\?|\:|\<\=|\<|\>\=|\>|\=\=\=|\=\=|\=)/A';

	/**
	 * A numeric literal: group 1 is the digits (hex digits are accepted here and
	 * validated against the base afterwards), group 2 the optional base suffix.
	 */
	const RADIX_RE = '/([0-9A-Fa-f]+(?:\.\d*)?|\.\d+)([bxo])?/Au';

	/** Bytes skipped between tokens: tab, LF, VT, FF, CR and space. */
	const WHITESPACE = "\011\012\013\014\015\040";

	// Order is important. The punctuation-matching regex requires that
	//  ** comes before *, etc. They are sorted to make it easy to spot
	//  such errors.
	// Note: the methods of this class match operators with OPERATOR_RE and do
	//  not read this list themselves.
	public static $operators = array(
		'!==', '!=', '!',   // Inequality
		'**', '*',          // Multiplication/exponentiation
		'/', '+', '-', '%', // Other arithmetic
		'&', '|', '^',      // Logic
		':=',               // Setting
		'?', ':',           // Ternary
		'<=', '<',          // Less than
		'>=', '>',          // Greater than
		'===', '==', '=',   // Equality
	);

	/** @var array Single-character punctuation => token type */
	public static $punctuation = array(
		',' => AFPToken::TComma,
		'(' => AFPToken::TBrace,
		')' => AFPToken::TBrace,
		'[' => AFPToken::TSquareBracket,
		']' => AFPToken::TSquareBracket,
		';' => AFPToken::TStatementSeparator,
	);

	/** @var array Base suffix character => numeric base */
	public static $bases = array(
		'b' => 2,
		'x' => 16,
		'o' => 8
	);

	/** @var array Numeric base => regex that the digits of a literal must match */
	public static $baseCharsRe = array(
		2  => '/^[01]+$/',
		// Octal digits stop at 7; accepting 8 made base_convert() silently drop it.
		8  => '/^[0-7]+$/',
		16 => '/^[0-9A-Fa-f]+$/',
		10 => '/^[0-9.]+$/',
	);

	/** @var array Identifiers that are reported as TKeyword instead of TID */
	public static $keywords = array(
		'in', 'like', 'true', 'false', 'null', 'contains', 'matches',
		'rlike', 'irlike', 'regex', 'if', 'then', 'else', 'end',
	);

	/**
	 * Tokenize a whole rule, using the accelerator cache when possible.
	 *
	 * The result is keyed by each token's start offset; every entry is
	 * array( AFPToken, offset just past that token ). The list always ends
	 * with a TNone token.
	 *
	 * @param string $code
	 * @return array
	 * @throws AFPException
	 * @throws AFPUserVisibleException
	 */
	public static function tokenize( $code ) {
		static $tokenizerCache = null;
		static $stats = null;

		if ( !$tokenizerCache ) {
			$tokenizerCache = ObjectCache::newAccelerator( array(), 'hash' );
		}

		if ( !$stats ) {
			$stats = RequestContext::getMain()->getStats();
		}

		// Key on a full-width digest of the rule text. A 32-bit checksum can
		// collide, and a collision would hand back another rule's tokens.
		$cacheKey = wfGlobalCacheKey( __CLASS__, self::CACHE_VERSION, sha1( $code ) );

		$tokens = $tokenizerCache->get( $cacheKey );

		if ( $tokens ) {
			$stats->increment( 'AbuseFilter.tokenizerCache.hit' );
			return $tokens;
		}

		$stats->increment( 'AbuseFilter.tokenizerCache.miss' );
		$tokens = array();
		$curPos = 0;

		// nextToken() leaves the offset untouched only once the end of the
		// input has been reached, which is what terminates this loop.
		do {
			$prevPos = $curPos;
			$token = self::nextToken( $code, $curPos );
			$tokens[ $token->pos ] = array( $token, $curPos );
		} while ( $curPos !== $prevPos );

		$tokenizerCache->set( $cacheKey, $tokens, 60 * 60 * 24 );

		return $tokens;
	}

	/**
	 * Read the token that starts at $offset, skipping leading comments and
	 * whitespace, and advance $offset past it.
	 *
	 * @param string $code
	 * @param int &$offset Byte offset to read from; updated to the byte after the token
	 * @return AFPToken Token whose position is the value $offset had on entry
	 * @throws AFPException
	 * @throws AFPUserVisibleException On an unclosed comment or string, or
	 *  when no token can be recognised
	 */
	protected static function nextToken( $code, &$offset ) {
		$matches = array();
		$start = $offset;

		// Read past comments
		while ( preg_match( self::COMMENT_START_RE, $code, $matches, 0, $offset ) ) {
			$commentEnd = strpos( $code, '*/', $offset );
			if ( $commentEnd === false ) {
				// An unclosed comment must be rejected explicitly: adding 2 to
				// strpos()'s false would reset $offset to 2, which either loops
				// forever or tokenizes the comment text as code.
				throw new AFPUserVisibleException(
					'unrecognisedtoken', $start, array( substr( $code, $start ) ) );
			}
			$offset = $commentEnd + 2;
		}

		// Spaces
		$offset += strspn( $code, self::WHITESPACE, $offset );
		if ( $offset >= strlen( $code ) ) {
			return new AFPToken( AFPToken::TNone, '', $start );
		}

		$chr = $code[$offset];

		// Punctuation
		if ( isset( self::$punctuation[$chr] ) ) {
			$offset++;
			return new AFPToken( self::$punctuation[$chr], $chr, $start );
		}

		// String literal
		if ( $chr === '"' || $chr === "'" ) {
			return self::readStringLiteral( $code, $offset, $start );
		}

		$matches = array();

		// Operators
		if ( preg_match( self::OPERATOR_RE, $code, $matches, 0, $offset ) ) {
			$token = $matches[0];
			$offset += strlen( $token );
			return new AFPToken( AFPToken::TOp, $token, $start );
		}

		// Numbers
		if ( preg_match( self::RADIX_RE, $code, $matches, 0, $offset ) ) {
			$token = $matches[0];
			$input = $matches[1];
			$baseChar = isset( $matches[2] ) ? $matches[2] : null;
			// Sometimes the base char gets mixed in with the rest of it because
			// the regex targets hex, too.
			// This mostly happens with binary ('b' is also a hex digit).
			if ( !$baseChar ) {
				$lastChar = substr( $input, -1 );
				if ( isset( self::$bases[$lastChar] ) ) {
					$baseChar = $lastChar;
					$input = substr( $input, 0, -1 );
				}
			}

			$base = $baseChar ? self::$bases[$baseChar] : 10;

			// Check against the appropriate character class for input validation.
			// If the digits do not fit the base, this is not a number after all
			// and the text is retried as an identifier below.
			if ( preg_match( self::$baseCharsRe[$base], $input ) ) {
				$num = $base !== 10 ? base_convert( $input, $base, 10 ) : $input;
				$offset += strlen( $token );
				return ( strpos( $input, '.' ) !== false )
					? new AFPToken( AFPToken::TFloat, floatval( $num ), $start )
					: new AFPToken( AFPToken::TInt, intval( $num ), $start );
			}
		}

		// IDs / Keywords
		if ( preg_match( self::ID_SYMBOL_RE, $code, $matches, 0, $offset ) ) {
			$token = $matches[0];
			$offset += strlen( $token );
			$type = in_array( $token, self::$keywords, true )
				? AFPToken::TKeyword
				: AFPToken::TID;
			return new AFPToken( $type, $token, $start );
		}

		throw new AFPUserVisibleException(
			'unrecognisedtoken', $start, array( substr( $code, $start ) ) );
	}

	/**
	 * Read a quoted string literal whose opening quote is at $offset.
	 *
	 * Recognised escapes are \\, the escaped quote character, \n, \r, \t and
	 * \xHH. A \x that is not followed by two hex digits yields a plain "x";
	 * any other escape is kept verbatim, backslash included.
	 *
	 * @param string $code
	 * @param int &$offset Offset of the opening quote; updated to the byte
	 *  after the closing quote
	 * @param int $start Position recorded on the returned token
	 * @return AFPToken
	 * @throws AFPException
	 * @throws AFPUserVisibleException If the input ends before the closing quote
	 */
	protected static function readStringLiteral( $code, &$offset, $start ) {
		$quote = $code[$offset];
		$offset++;
		$length = strlen( $code );
		$token = '';
		while ( $offset < $length ) {
			if ( $code[$offset] === $quote ) {
				$offset++;
				return new AFPToken( AFPToken::TString, $token, $start );
			}

			// Performance: Use a PHP function (implemented in C)
			// to scan ahead.
			$addLength = strcspn( $code, $quote . "\\", $offset );
			if ( $addLength ) {
				$token .= substr( $code, $offset, $addLength );
				$offset += $addLength;
				continue;
			}

			// The current byte is neither the quote nor ordinary text, so it
			// is a backslash introducing an escape sequence.
			if ( $offset + 1 >= $length ) {
				// A lone backslash at the very end: the string cannot be
				// closed, and there is no following byte to read.
				$offset = $length;
				break;
			}

			$escaped = $code[$offset + 1];
			switch ( $escaped ) {
				case '\\':
					$token .= '\\';
					break;
				case $quote:
					$token .= $quote;
					break;
				case 'n':
					$token .= "\n";
					break;
				case 'r':
					$token .= "\r";
					break;
				case 't':
					$token .= "\t";
					break;
				case 'x':
					$hex = substr( $code, $offset + 2, 2 );

					if ( preg_match( '/^[0-9A-Fa-f]{2}$/', $hex ) ) {
						$token .= chr( (int)base_convert( $hex, 16, 10 ) );
						$offset += 2; # \xXX -- 2 done later
					} else {
						$token .= 'x';
					}
					break;
				default:
					$token .= "\\" . $escaped;
			}

			// Skip the backslash and the character that followed it
			$offset += 2;
		}
		throw new AFPUserVisibleException( 'unclosedstring', $offset, array() );
	}
}
Move rule tokenization to new AbuseFilterTokenizer class * Move AbuseFilterParser::nextToken() and the various AbuseFilterParser properties that accompanied it to a new class, AbuseFilterTokenizer. * Tokenize rules eagerly and cache the result in APC. Change-Id: I15f5b5b65e8c4ec4fba3000d7c9fd78b98967d1d 2015-08-25 19:57:23 +00:00			`<?php`
			`/**`
			`* Tokenizer for AbuseFilter rules.`
			`*/`
			`class AbuseFilterTokenizer {`

			`/ @var int Tokenizer cache version. Increment this when changing the syntax. /`
			`const CACHE_VERSION = 1;`
			`const COMMENT_START_RE = '/\s\/\/A';`
			`const ID_SYMBOL_RE = '/[0-9A-Za-z_]+/A';`
			`const OPERATOR_RE = '/(\!\=\=\|\!\=\|\!\|\\\|\*\|\/\|\+\|\-\|%\|&\|\\|\|\^\|\:\=\|\?\|\:\|\<\=\|\<\|\>\=\|\>\|\=\=\=\|\=\=\|\=)/A';`
			`const RADIX_RE = '/([0-9A-Fa-f]+(?:\.\d*)?\|\.\d+)([bxo])?/Au';`
			`const WHITESPACE = "\011\012\013\014\015\040";`

			`// Order is important. The punctuation-matching regex requires that`
			`// ** comes before *, etc. They are sorted to make it easy to spot`
			`// such errors.`
			`static $operators = array(`
			`'!==', '!=', '!', // Inequality`
			`'*', '', // Multiplication/exponentiation`
			`'/', '+', '-', '%', // Other arithmetic`
			`'&', '\|', '^', // Logic`
			`':=', // Setting`
			`'?', ':', // Ternery`
			`'<=', '<', // Less than`
			`'>=', '>', // Greater than`
			`'===', '==', '=', // Equality`
			`);`

			`static $punctuation = array(`
			`',' => AFPToken::TComma,`
			`'(' => AFPToken::TBrace,`
			`')' => AFPToken::TBrace,`
			`'[' => AFPToken::TSquareBracket,`
			`']' => AFPToken::TSquareBracket,`
			`';' => AFPToken::TStatementSeparator,`
			`);`

			`static $bases = array(`
			`'b' => 2,`
			`'x' => 16,`
			`'o' => 8`
			`);`

			`static $baseCharsRe = array(`
			`2 => '/^[01]+$/',`
			`8 => '/^[0-8]+$/',`
			`16 => '/^[0-9A-Fa-f]+$/',`
			`10 => '/^[0-9.]+$/',`
			`);`

			`static $keywords = array(`
			`'in', 'like', 'true', 'false', 'null', 'contains', 'matches',`
			`'rlike', 'irlike', 'regex', 'if', 'then', 'else', 'end',`
			`);`

			`/**`
Small doc fixes for AbuseFilterTokenizer Change-Id: Ic85fa1c772ff7023883ff84449471a310109ba07 2015-08-25 21:15:13 +00:00			`* @param string $code`
Move rule tokenization to new AbuseFilterTokenizer class * Move AbuseFilterParser::nextToken() and the various AbuseFilterParser properties that accompanied it to a new class, AbuseFilterTokenizer. * Tokenize rules eagerly and cache the result in APC. Change-Id: I15f5b5b65e8c4ec4fba3000d7c9fd78b98967d1d 2015-08-25 19:57:23 +00:00			`* @return array`
			`* @throws AFPException`
			`* @throws AFPUserVisibleException`
			`*/`
			`static function tokenize( $code ) {`
			`static $tokenizerCache = null;`

			`if ( !$tokenizerCache ) {`
			`$tokenizerCache = ObjectCache::newAccelerator( array(), 'hash' );`
			`}`

Track tokenizer cache hits / misses Change-Id: I65d4c6064c37e9957b6f0aca4d3032f26bdf9bde 2015-10-22 20:29:53 +00:00			`static $stats = null;`

			`if ( !$stats ) {`
			`$stats = RequestContext::getMain()->getStats();`
			`}`

Move rule tokenization to new AbuseFilterTokenizer class * Move AbuseFilterParser::nextToken() and the various AbuseFilterParser properties that accompanied it to a new class, AbuseFilterTokenizer. * Tokenize rules eagerly and cache the result in APC. Change-Id: I15f5b5b65e8c4ec4fba3000d7c9fd78b98967d1d 2015-08-25 19:57:23 +00:00			`$cacheKey = wfGlobalCacheKey( __CLASS__, self::CACHE_VERSION, crc32( $code ) );`
Track tokenizer cache hits / misses Change-Id: I65d4c6064c37e9957b6f0aca4d3032f26bdf9bde 2015-10-22 20:29:53 +00:00
Move rule tokenization to new AbuseFilterTokenizer class * Move AbuseFilterParser::nextToken() and the various AbuseFilterParser properties that accompanied it to a new class, AbuseFilterTokenizer. * Tokenize rules eagerly and cache the result in APC. Change-Id: I15f5b5b65e8c4ec4fba3000d7c9fd78b98967d1d 2015-08-25 19:57:23 +00:00			`$tokens = $tokenizerCache->get( $cacheKey );`

Track tokenizer cache hits / misses Change-Id: I65d4c6064c37e9957b6f0aca4d3032f26bdf9bde 2015-10-22 20:29:53 +00:00			`if ( $tokens ) {`
			`$stats->increment( 'AbuseFilter.tokenizerCache.hit' );`
			`return $tokens;`
			`}`
Move rule tokenization to new AbuseFilterTokenizer class * Move AbuseFilterParser::nextToken() and the various AbuseFilterParser properties that accompanied it to a new class, AbuseFilterTokenizer. * Tokenize rules eagerly and cache the result in APC. Change-Id: I15f5b5b65e8c4ec4fba3000d7c9fd78b98967d1d 2015-08-25 19:57:23 +00:00
Track tokenizer cache hits / misses Change-Id: I65d4c6064c37e9957b6f0aca4d3032f26bdf9bde 2015-10-22 20:29:53 +00:00			`$stats->increment( 'AbuseFilter.tokenizerCache.miss' );`
			`$tokens = array();`
			`$curPos = 0;`
Move rule tokenization to new AbuseFilterTokenizer class * Move AbuseFilterParser::nextToken() and the various AbuseFilterParser properties that accompanied it to a new class, AbuseFilterTokenizer. * Tokenize rules eagerly and cache the result in APC. Change-Id: I15f5b5b65e8c4ec4fba3000d7c9fd78b98967d1d 2015-08-25 19:57:23 +00:00
Track tokenizer cache hits / misses Change-Id: I65d4c6064c37e9957b6f0aca4d3032f26bdf9bde 2015-10-22 20:29:53 +00:00			`do {`
			`$prevPos = $curPos;`
			`$token = self::nextToken( $code, $curPos );`
			`$tokens[ $token->pos ] = array( $token, $curPos );`
			`} while ( $curPos !== $prevPos );`

Increase AbuseFilterTokenizer cache TTL from 10m to 1d AbuseFilterTokenizer is referentially transparent -- the mapping of input to outputs does not vary on nonlocal state. So the cache TTL can be much longer. Change-Id: I9e6ec4347dbb940c3d73538d550a0f045706264c 2015-10-24 17:27:14 +00:00			`$tokenizerCache->set( $cacheKey, $tokens, 60 * 60 * 24 );`
Move rule tokenization to new AbuseFilterTokenizer class * Move AbuseFilterParser::nextToken() and the various AbuseFilterParser properties that accompanied it to a new class, AbuseFilterTokenizer. * Tokenize rules eagerly and cache the result in APC. Change-Id: I15f5b5b65e8c4ec4fba3000d7c9fd78b98967d1d 2015-08-25 19:57:23 +00:00
			`return $tokens;`
			`}`

			`/**`
Small doc fixes for AbuseFilterTokenizer Change-Id: Ic85fa1c772ff7023883ff84449471a310109ba07 2015-08-25 21:15:13 +00:00			`* @param string $code`
			`* @param integer &$offset`
Move rule tokenization to new AbuseFilterTokenizer class * Move AbuseFilterParser::nextToken() and the various AbuseFilterParser properties that accompanied it to a new class, AbuseFilterTokenizer. * Tokenize rules eagerly and cache the result in APC. Change-Id: I15f5b5b65e8c4ec4fba3000d7c9fd78b98967d1d 2015-08-25 19:57:23 +00:00			`* @return AFPToken`
			`* @throws AFPException`
			`* @throws AFPUserVisibleException`
			`*/`
			`protected static function nextToken( $code, &$offset ) {`
			`$matches = array();`
			`$start = $offset;`

			`// Read past comments`
			`while ( preg_match( self::COMMENT_START_RE, $code, $matches, 0, $offset ) ) {`
			`$offset = strpos( $code, '*/', $offset ) + 2;`
			`}`

			`// Spaces`
			`$offset += strspn( $code, self::WHITESPACE, $offset );`
			`if ( $offset >= strlen( $code ) ) {`
			`return new AFPToken( AFPToken::TNone, '', $start );`
			`}`

			`$chr = $code[$offset];`

			`// Punctuation`
			`if ( isset( self::$punctuation[$chr] ) ) {`
			`$offset++;`
			`return new AFPToken( self::$punctuation[$chr], $chr, $start );`
			`}`

			`// String literal`
			`if ( $chr === '"' \|\| $chr === "'" ) {`
			`return self::readStringLiteral( $code, $offset, $start );`
			`}`

			`$matches = array();`

			`// Operators`
			`if ( preg_match( self::OPERATOR_RE, $code, $matches, 0, $offset ) ) {`
			`$token = $matches[0];`
			`$offset += strlen( $token );`
			`return new AFPToken( AFPToken::TOp, $token, $start );`
			`}`

			`// Numbers`
			`if ( preg_match( self::RADIX_RE, $code, $matches, 0, $offset ) ) {`
			`$token = $matches[0];`
			`$input = $matches[1];`
Use isset() to check array element exists rather than relying on @ operator Change-Id: I0ecdcdd1426b2e76a326bc50b6ea0ca1cbad3d22 2015-10-21 23:52:14 +00:00			`$baseChar = isset( $matches[2] ) ? $matches[2] : null;`
Move rule tokenization to new AbuseFilterTokenizer class * Move AbuseFilterParser::nextToken() and the various AbuseFilterParser properties that accompanied it to a new class, AbuseFilterTokenizer. * Tokenize rules eagerly and cache the result in APC. Change-Id: I15f5b5b65e8c4ec4fba3000d7c9fd78b98967d1d 2015-08-25 19:57:23 +00:00			`// Sometimes the base char gets mixed in with the rest of it because`
			`// the regex targets hex, too.`
			`// This mostly happens with binary`
			`if ( !$baseChar && !empty( self::$bases[ substr( $input, - 1 ) ] ) ) {`
			`$baseChar = substr( $input, - 1, 1 );`
			`$input = substr( $input, 0, - 1 );`
			`}`

			`$base = $baseChar ? self::$bases[$baseChar] : 10;`

			`// Check against the appropriate character class for input validation`

			`if ( preg_match( self::$baseCharsRe[$base], $input ) ) {`
			`$num = $base !== 10 ? base_convert( $input, $base, 10 ) : $input;`
			`$offset += strlen( $token );`
			`return ( strpos( $input, '.' ) !== false )`
			`? new AFPToken( AFPToken::TFloat, floatval( $num ), $start )`
			`: new AFPToken( AFPToken::TInt, intval( $num ), $start );`
			`}`
			`}`

			`// IDs / Keywords`

			`if ( preg_match( self::ID_SYMBOL_RE, $code, $matches, 0, $offset ) ) {`
			`$token = $matches[0];`
			`$offset += strlen( $token );`
			`$type = in_array( $token, self::$keywords )`
			`? AFPToken::TKeyword`
			`: AFPToken::TID;`
			`return new AFPToken( $type, $token, $start );`
			`}`

			`throw new AFPUserVisibleException(`
			`'unrecognisedtoken', $start, array( substr( $code, $start ) ) );`
			`}`

			`/**`
Small doc fixes for AbuseFilterTokenizer Change-Id: Ic85fa1c772ff7023883ff84449471a310109ba07 2015-08-25 21:15:13 +00:00			`* @param string $code`
			`* @param int &$offset`
			`* @param int $start`
			`* @return AFPToken`
Move rule tokenization to new AbuseFilterTokenizer class * Move AbuseFilterParser::nextToken() and the various AbuseFilterParser properties that accompanied it to a new class, AbuseFilterTokenizer. * Tokenize rules eagerly and cache the result in APC. Change-Id: I15f5b5b65e8c4ec4fba3000d7c9fd78b98967d1d 2015-08-25 19:57:23 +00:00			`* @throws AFPException`
			`* @throws AFPUserVisibleException`
			`*/`
			`protected static function readStringLiteral( $code, &$offset, $start ) {`
			`$type = $code[$offset];`
			`$offset++;`
			`$length = strlen( $code );`
			`$token = '';`
			`while ( $offset < $length ) {`
			`if ( $code[$offset] === $type ) {`
			`$offset++;`
			`return new AFPToken( AFPToken::TString, $token, $start );`
			`}`

			`// Performance: Use a PHP function (implemented in C)`
			`// to scan ahead.`
			`$addLength = strcspn( $code, $type . "\\", $offset );`
			`if ( $addLength ) {`
			`$token .= substr( $code, $offset, $addLength );`
			`$offset += $addLength;`
			`} elseif ( $code[$offset] == '\\' ) {`
			`switch( $code[$offset + 1] ) {`
			`case '\\':`
			`$token .= '\\';`
			`break;`
			`case $type:`
			`$token .= $type;`
			`break;`
			`case 'n';`
			`$token .= "\n";`
			`break;`
			`case 'r':`
			`$token .= "\r";`
			`break;`
			`case 't':`
			`$token .= "\t";`
			`break;`
			`case 'x':`
			`$chr = substr( $code, $offset + 2, 2 );`

			`if ( preg_match( '/^[0-9A-Fa-f]{2}$/', $chr ) ) {`
			`$chr = base_convert( $chr, 16, 10 );`
			`$token .= chr( $chr );`
			`$offset += 2; # \xXX -- 2 done later`
			`} else {`
			`$token .= 'x';`
			`}`
			`break;`
			`default:`
			`$token .= "\\" . $code[$offset + 1];`
			`}`

			`$offset += 2;`

			`} else {`
			`$token .= $code[$offset];`
			`$offset++;`
			`}`
			`}`
			`throw new AFPUserVisibleException( 'unclosedstring', $offset, array() );`
			`}`
			`}`