mirror of
https://gerrit.wikimedia.org/r/mediawiki/extensions/AbuseFilter.git
synced 2024-12-13 06:29:35 +00:00
93e8cb5ac5
As follow-up of I10b1fd2d9bdfe518089c053d77fef568170ecb65, use 'AbuseFilter' instead of 'AbuseFilterDeprecatedVars' as channel name. Raise level for null-title filtering. Since with a null title several things are likely to break, a warning is more appropriate here. Tweaked the message as well, to include the bug number and to avoid pointlessly including the title (which is null). Lower the level for stashedit hit/miss (as it's really spammy and not that useful right now). Use 'abusefilter' instead of 'AbuseFilter' for statsd so that everything has the same prefix. Also raise the level for parser exceptions and unrecognized consequences. Change-Id: I1f9988155e924232b201281795cd322636da8082
270 lines
6.6 KiB
PHP
270 lines
6.6 KiB
PHP
<?php
|
|
|
|
use MediaWiki\MediaWikiServices;
|
|
|
|
/**
 * Tokenizer for AbuseFilter rules.
 *
 * Splits the source text of a filter into AFPToken objects. Comments and
 * whitespace are skipped; everything else is classified as punctuation,
 * string literal, operator, number, keyword or identifier (tried in that
 * order, see nextToken()).
 */
class AbuseFilterTokenizer {
	/**
	 * @var int Tokenizer cache version. Increment this when changing the syntax.
	 *
	 * Version 2: octal literals containing the digit 8 are no longer accepted
	 * as numbers, so token lists cached by the previous version must not be reused.
	 */
	const CACHE_VERSION = 2;

	/** Matches optional whitespace followed by the opening of a block comment. */
	const COMMENT_START_RE = '/\s*\/\*/A';

	/** Matches an identifier or keyword at the current offset. */
	const ID_SYMBOL_RE = '/[0-9A-Za-z_]+/A';

	/** Matches any operator at the current offset; longer operators are listed first. */
	const OPERATOR_RE =
		'/(\!\=\=|\!\=|\!|\*\*|\*|\/|\+|\-|%|&|\||\^|\:\=|\?|\:|\<\=|\<|\>\=|\>|\=\=\=|\=\=|\=)/A';

	/**
	 * Matches a numeric literal: group 1 is the digits (hex digits are allowed
	 * so that hexadecimal literals match), group 2 the optional base suffix.
	 */
	const RADIX_RE = '/([0-9A-Fa-f]+(?:\.\d*)?|\.\d+)([bxo])?/Au';

	/** Characters skipped between tokens: tab, LF, VT, FF, CR and space. */
	const WHITESPACE = "\011\012\013\014\015\040";

	// Order is important. The punctuation-matching regex requires that
	// ** comes before *, etc. They are sorted to make it easy to spot
	// such errors.
	public static $operators = [
		// Inequality
		'!==', '!=', '!',
		// Multiplication/exponentiation
		'**', '*',
		// Other arithmetic
		'/', '+', '-', '%',
		// Logic
		'&', '|', '^',
		// Setting
		':=',
		// Ternary
		'?', ':',
		// Less than
		'<=', '<',
		// Greater than
		'>=', '>',
		// Equality
		'===', '==', '=',
	];

	/** @var array Single-character punctuation => AFPToken type constant */
	public static $punctuation = [
		',' => AFPToken::TCOMMA,
		'(' => AFPToken::TBRACE,
		')' => AFPToken::TBRACE,
		'[' => AFPToken::TSQUAREBRACKET,
		']' => AFPToken::TSQUAREBRACKET,
		';' => AFPToken::TSTATEMENTSEPARATOR,
	];

	/** @var int[] Base suffix character => numeric base */
	public static $bases = [
		'b' => 2,
		'x' => 16,
		'o' => 8
	];

	/**
	 * @var string[] Numeric base => regex that the digits of a literal in
	 *  that base must match before they are converted.
	 */
	public static $baseCharsRe = [
		2 => '/^[01]+$/',
		// Octal digits are 0-7; accepting 8 would hand an invalid digit to base_convert().
		8 => '/^[0-7]+$/',
		16 => '/^[0-9A-Fa-f]+$/',
		10 => '/^[0-9.]+$/',
	];

	/** @var string[] Identifiers that are reported as TKEYWORD instead of TID */
	public static $keywords = [
		'in', 'like', 'true', 'false', 'null', 'contains', 'matches',
		'rlike', 'irlike', 'regex', 'if', 'then', 'else', 'end',
	];

	/**
	 * Tokenize a whole filter, using a per-server cache keyed on the code.
	 *
	 * @param string $code
	 * @return array Map of token start position => [ AFPToken, int offset right after
	 *  the token ]. The last entry is always a TNONE token marking the end of input.
	 * @throws AFPException
	 * @throws AFPUserVisibleException
	 */
	public static function tokenize( $code ) {
		static $tokenizerCache = null;
		static $stats = null;

		if ( !$tokenizerCache ) {
			$tokenizerCache = ObjectCache::getLocalServerInstance( 'hash' );
		}
		if ( !$stats ) {
			$stats = MediaWikiServices::getInstance()->getStatsdDataFactory();
		}

		$cacheKey = wfGlobalCacheKey( __CLASS__, self::CACHE_VERSION, crc32( $code ) );
		$tokens = $tokenizerCache->get( $cacheKey );

		if ( $tokens ) {
			$stats->increment( 'abusefilter.tokenizerCache.hit' );
			return $tokens;
		}

		$stats->increment( 'abusefilter.tokenizerCache.miss' );
		$tokens = [];
		$curPos = 0;

		// nextToken() stops advancing the offset once the end of input has been
		// reached, which is what terminates this loop.
		do {
			$prevPos = $curPos;
			$token = self::nextToken( $code, $curPos );
			$tokens[ $token->pos ] = [ $token, $curPos ];
		} while ( $curPos !== $prevPos );

		// Keep the result for one day.
		$tokenizerCache->set( $cacheKey, $tokens, 60 * 60 * 24 );

		return $tokens;
	}

	/**
	 * Read the token starting at $offset and advance $offset past it.
	 *
	 * @param string $code
	 * @param int &$offset Where to start reading; updated to point right after the token
	 * @return AFPToken A TNONE token is returned when only comments/whitespace remain
	 * @throws AFPException
	 * @throws AFPUserVisibleException For an unclosed comment or string, or
	 *  when nothing recognisable starts at the current position
	 */
	protected static function nextToken( $code, &$offset ) {
		$matches = [];
		$start = $offset;

		// Read past comments
		while ( preg_match( self::COMMENT_START_RE, $code, $matches, 0, $offset ) ) {
			$commentEnd = strpos( $code, '*/', $offset );
			if ( $commentEnd === false ) {
				throw new AFPUserVisibleException(
					'unclosedcomment', $offset, [] );
			}
			$offset = $commentEnd + 2;
		}

		// Spaces
		$offset += strspn( $code, self::WHITESPACE, $offset );
		if ( $offset >= strlen( $code ) ) {
			return new AFPToken( AFPToken::TNONE, '', $start );
		}

		$chr = $code[$offset];

		// Punctuation
		if ( isset( self::$punctuation[$chr] ) ) {
			$offset++;
			return new AFPToken( self::$punctuation[$chr], $chr, $start );
		}

		// String literal
		if ( $chr === '"' || $chr === "'" ) {
			return self::readStringLiteral( $code, $offset, $start );
		}

		$matches = [];

		// Operators
		if ( preg_match( self::OPERATOR_RE, $code, $matches, 0, $offset ) ) {
			$token = $matches[0];
			$offset += strlen( $token );
			return new AFPToken( AFPToken::TOP, $token, $start );
		}

		// Numbers
		if ( preg_match( self::RADIX_RE, $code, $matches, 0, $offset ) ) {
			$token = $matches[0];
			$input = $matches[1];
			$baseChar = $matches[2] ?? null;
			// Sometimes the base char gets mixed in with the rest of it because
			// the regex targets hex, too.
			// This mostly happens with binary
			if ( !$baseChar && isset( self::$bases[ substr( $input, -1 ) ] ) ) {
				$baseChar = substr( $input, -1, 1 );
				$input = substr( $input, 0, -1 );
			}

			$base = $baseChar ? self::$bases[$baseChar] : 10;

			// Check against the appropriate character class for input validation.
			// If the digits are not valid for the base, this is not a number and
			// we fall through to identifier matching below.
			if ( preg_match( self::$baseCharsRe[$base], $input ) ) {
				$num = $base !== 10 ? base_convert( $input, $base, 10 ) : $input;
				$offset += strlen( $token );
				return ( strpos( $input, '.' ) !== false )
					? new AFPToken( AFPToken::TFLOAT, floatval( $num ), $start )
					: new AFPToken( AFPToken::TINT, intval( $num ), $start );
			}
		}

		// IDs / Keywords
		if ( preg_match( self::ID_SYMBOL_RE, $code, $matches, 0, $offset ) ) {
			$token = $matches[0];
			$offset += strlen( $token );
			$type = in_array( $token, self::$keywords, true )
				? AFPToken::TKEYWORD
				: AFPToken::TID;
			return new AFPToken( $type, $token, $start );
		}

		throw new AFPUserVisibleException(
			'unrecognisedtoken', $start, [ substr( $code, $start ) ] );
	}

	/**
	 * Read a quoted string literal whose opening quote is at $offset.
	 *
	 * Recognised escapes are \\, the escaped quote character, \n, \r, \t and
	 * \xXX (two hex digits). A "\x" not followed by two hex digits yields a
	 * plain "x"; any other escape is kept verbatim, backslash included.
	 *
	 * @param string $code
	 * @param int &$offset Position of the opening quote; updated to point right
	 *  after the closing quote
	 * @param int $start Position recorded in the returned token
	 * @return AFPToken
	 * @throws AFPException
	 * @throws AFPUserVisibleException If the input ends before the closing quote
	 */
	protected static function readStringLiteral( $code, &$offset, $start ) {
		$type = $code[$offset];
		$offset++;
		$length = strlen( $code );
		$token = '';
		while ( $offset < $length ) {
			if ( $code[$offset] === $type ) {
				$offset++;
				return new AFPToken( AFPToken::TSTRING, $token, $start );
			}

			// Performance: Use a PHP function (implemented in C)
			// to scan ahead.
			$addLength = strcspn( $code, $type . "\\", $offset );
			if ( $addLength ) {
				$token .= substr( $code, $offset, $addLength );
				$offset += $addLength;
			} elseif ( $code[$offset] === '\\' ) {
				if ( $offset + 1 >= $length ) {
					// A backslash is the very last character of the input: there is
					// nothing to escape and the string cannot be closed any more.
					// Bail out instead of reading past the end of $code.
					$offset = $length;
					break;
				}

				$escaped = $code[$offset + 1];
				switch ( $escaped ) {
					case '\\':
						$token .= '\\';
						break;
					case $type:
						$token .= $type;
						break;
					case 'n':
						$token .= "\n";
						break;
					case 'r':
						$token .= "\r";
						break;
					case 't':
						$token .= "\t";
						break;
					case 'x':
						$hex = substr( $code, $offset + 2, 2 );

						if ( preg_match( '/^[0-9A-Fa-f]{2}$/', $hex ) ) {
							$token .= chr( hexdec( $hex ) );
							// \xXX -- 2 done later
							$offset += 2;
						} else {
							$token .= 'x';
						}
						break;
					default:
						$token .= "\\" . $escaped;
				}

				// Skip the backslash and the character following it
				$offset += 2;
			} else {
				// Should never happen
				$token .= $code[$offset];
				$offset++;
			}
		}
		throw new AFPUserVisibleException( 'unclosedstring', $offset, [] );
	}
}
|