mediawiki-extensions-AbuseF.../includes/parser/AFPTreeParser.php
Daimona Eaytoy d515af0ae6 Add a new class for the CachingParser's AST
This allows a little bit more of abstraction: we can store other data in the
tree, without having to store it in a specific node (e.g. the variables map,
which is still unused). It also adds a few typehints, and specializes
the return value of eval'ing the AST: previously, it was the one of
evalNode, which wasn't guaranteed to be an AFPData. Now we have this
guarantee. Last but not least, we can now measure runtime metrics for
evalTree, which doesn't recurse.
Bonus: fix a check in the old parser, which used the wrong variable when
reporting outofbounds errors.

Change-Id: Iff806793b1d968e9bb6220f1459f3d0ac587c7da
2019-08-25 17:29:16 +00:00

698 lines
18 KiB
PHP

<?php
/**
* A version of the abuse filter parser that separates parsing the filter and
* evaluating it into different passes, allowing the parse tree to be cached.
*
* @file
*/
/**
* A parser that transforms the text of the filter into a parse tree.
*/
class AFPTreeParser {
// The tokenized representation of the filter parsed.
public $mTokens;
// Current token handled by the parser and its position.
public $mCur, $mPos;
const CACHE_VERSION = 2;
/**
* @var bool[] Custom variable names, set of [ name => true ]
*/
private $variablesNames;
/**
* Create a new instance
*/
public function __construct() {
$this->resetState();
}
/**
* Resets the state
*/
public function resetState() {
$this->mTokens = [];
$this->mPos = 0;
$this->variablesNames = [];
}
/**
* Advances the parser to the next token in the filter code.
*/
protected function move() {
list( $this->mCur, $this->mPos ) = $this->mTokens[$this->mPos];
}
/**
* Get the next token. This is similar to move() but doesn't change class members,
* allowing to look ahead without rolling back the state.
*
* @return AFPToken
*/
protected function getNextToken() {
return $this->mTokens[$this->mPos][0];
}
/**
* getState() function allows parser state to be rollbacked to several tokens
* back.
*
* @return AFPParserState
*/
protected function getState() {
return new AFPParserState( $this->mCur, $this->mPos );
}
/**
* setState() function allows parser state to be rollbacked to several tokens
* back.
*
* @param AFPParserState $state
*/
protected function setState( AFPParserState $state ) {
$this->mCur = $state->token;
$this->mPos = $state->pos;
}
/**
* Parse the supplied filter source code into a tree.
*
* @param string $code
* @throws AFPUserVisibleException
* @return AFPSyntaxTree
*/
public function parse( $code ) : AFPSyntaxTree {
$this->mTokens = AbuseFilterTokenizer::getTokens( $code );
$this->mPos = 0;
return $this->buildSyntaxTree();
}
/**
* @return AFPSyntaxTree
*/
public function buildSyntaxTree() : AFPSyntaxTree {
$root = $this->doLevelEntry();
$variables = array_keys( $this->variablesNames );
return new AFPSyntaxTree( $variables, $root );
}
/* Levels */
/**
* Handles unexpected characters after the expression.
* @return AFPTreeNode|null Null only if no statements
* @throws AFPUserVisibleException
*/
protected function doLevelEntry() {
$result = $this->doLevelSemicolon();
if ( $this->mCur->type !== AFPToken::TNONE ) {
throw new AFPUserVisibleException(
'unexpectedatend',
$this->mPos, [ $this->mCur->type ]
);
}
return $result;
}
/**
* Handles the semicolon operator.
*
* @return AFPTreeNode|null
*/
protected function doLevelSemicolon() {
$statements = [];
do {
$this->move();
$position = $this->mPos;
if (
$this->mCur->type === AFPToken::TNONE ||
( $this->mCur->type === AFPToken::TBRACE && $this->mCur->value == ')' )
) {
// Handle special cases which the other parser handled in doLevelAtom
break;
}
// Allow empty statements.
if ( $this->mCur->type === AFPToken::TSTATEMENTSEPARATOR ) {
continue;
}
$statements[] = $this->doLevelSet();
$position = $this->mPos;
} while ( $this->mCur->type === AFPToken::TSTATEMENTSEPARATOR );
// Flatten the tree if possible.
if ( count( $statements ) === 0 ) {
return null;
} elseif ( count( $statements ) === 1 ) {
return $statements[0];
} else {
return new AFPTreeNode( AFPTreeNode::SEMICOLON, $statements, $position );
}
}
/**
* Handles variable assignment.
*
* @return AFPTreeNode
* @throws AFPUserVisibleException
*/
protected function doLevelSet() {
if ( $this->mCur->type === AFPToken::TID ) {
$varname = $this->mCur->value;
// Speculatively parse the assignment statement assuming it can
// potentially be an assignment, but roll back if it isn't.
$initialState = $this->getState();
$this->move();
if ( $this->mCur->type === AFPToken::TOP && $this->mCur->value === ':=' ) {
$position = $this->mPos;
$this->move();
$value = $this->doLevelSet();
$this->variablesNames[ $varname ] = true;
return new AFPTreeNode( AFPTreeNode::ASSIGNMENT, [ $varname, $value ], $position );
}
if ( $this->mCur->type === AFPToken::TSQUAREBRACKET && $this->mCur->value === '[' ) {
$this->move();
if ( $this->mCur->type === AFPToken::TSQUAREBRACKET && $this->mCur->value === ']' ) {
$index = 'append';
} else {
// Parse index offset.
$this->setState( $initialState );
$this->move();
$index = $this->doLevelSemicolon();
if ( !( $this->mCur->type === AFPToken::TSQUAREBRACKET && $this->mCur->value === ']' ) ) {
throw new AFPUserVisibleException( 'expectednotfound', $this->mPos,
[ ']', $this->mCur->type, $this->mCur->value ] );
}
}
$this->move();
if ( $this->mCur->type === AFPToken::TOP && $this->mCur->value === ':=' ) {
$position = $this->mPos;
$this->move();
$value = $this->doLevelSet();
$this->variablesNames[ $varname ] = true;
if ( $index === 'append' ) {
return new AFPTreeNode(
AFPTreeNode::ARRAY_APPEND, [ $varname, $value ], $position );
} else {
return new AFPTreeNode(
AFPTreeNode::INDEX_ASSIGNMENT,
[ $varname, $index, $value ],
$position
);
}
}
}
// If we reached this point, we did not find an assignment. Roll back
// and assume this was just a literal.
$this->setState( $initialState );
}
return $this->doLevelConditions();
}
/**
* Handles ternary operator and if-then-else-end.
*
* @return AFPTreeNode
* @throws AFPUserVisibleException
*/
protected function doLevelConditions() {
if ( $this->mCur->type === AFPToken::TKEYWORD && $this->mCur->value === 'if' ) {
$position = $this->mPos;
$this->move();
$condition = $this->doLevelBoolOps();
if ( !( $this->mCur->type === AFPToken::TKEYWORD && $this->mCur->value === 'then' ) ) {
throw new AFPUserVisibleException( 'expectednotfound',
$this->mPos,
[
'then',
$this->mCur->type,
$this->mCur->value
]
);
}
$this->move();
$valueIfTrue = $this->doLevelConditions();
if ( $this->mCur->type === AFPToken::TKEYWORD && $this->mCur->value === 'else' ) {
$this->move();
$valueIfFalse = $this->doLevelConditions();
} else {
$valueIfFalse = null;
}
if ( !( $this->mCur->type === AFPToken::TKEYWORD && $this->mCur->value === 'end' ) ) {
throw new AFPUserVisibleException( 'expectednotfound',
$this->mPos,
[
'end',
$this->mCur->type,
$this->mCur->value
]
);
}
$this->move();
return new AFPTreeNode(
AFPTreeNode::CONDITIONAL,
[ $condition, $valueIfTrue, $valueIfFalse ],
$position
);
}
$condition = $this->doLevelBoolOps();
if ( $this->mCur->type === AFPToken::TOP && $this->mCur->value === '?' ) {
$position = $this->mPos;
$this->move();
$valueIfTrue = $this->doLevelConditions();
if ( !( $this->mCur->type === AFPToken::TOP && $this->mCur->value === ':' ) ) {
throw new AFPUserVisibleException( 'expectednotfound',
$this->mPos,
[
':',
$this->mCur->type,
$this->mCur->value
]
);
}
$this->move();
$valueIfFalse = $this->doLevelConditions();
return new AFPTreeNode(
AFPTreeNode::CONDITIONAL,
[ $condition, $valueIfTrue, $valueIfFalse ],
$position
);
}
return $condition;
}
/**
* Handles logic operators.
*
* @return AFPTreeNode
*/
protected function doLevelBoolOps() {
$leftOperand = $this->doLevelCompares();
$ops = [ '&', '|', '^' ];
while ( $this->mCur->type === AFPToken::TOP && in_array( $this->mCur->value, $ops ) ) {
$op = $this->mCur->value;
$position = $this->mPos;
$this->move();
$rightOperand = $this->doLevelCompares();
$leftOperand = new AFPTreeNode(
AFPTreeNode::LOGIC,
[ $op, $leftOperand, $rightOperand ],
$position
);
}
return $leftOperand;
}
/**
* Handles comparison operators.
*
* @return AFPTreeNode
*/
protected function doLevelCompares() {
$leftOperand = $this->doLevelSumRels();
$equalityOps = [ '==', '===', '!=', '!==', '=' ];
$orderOps = [ '<', '>', '<=', '>=' ];
// Only allow either a single operation, or a combination of a single equalityOps and a single
// orderOps. This resembles what PHP does, and allows `a < b == c` while rejecting `a < b < c`
$allowedOps = array_merge( $equalityOps, $orderOps );
while ( $this->mCur->type === AFPToken::TOP && in_array( $this->mCur->value, $allowedOps ) ) {
$op = $this->mCur->value;
$allowedOps = in_array( $op, $equalityOps ) ?
array_diff( $allowedOps, $equalityOps ) :
array_diff( $allowedOps, $orderOps );
$position = $this->mPos;
$this->move();
$rightOperand = $this->doLevelSumRels();
$leftOperand = new AFPTreeNode(
AFPTreeNode::COMPARE,
[ $op, $leftOperand, $rightOperand ],
$position
);
}
return $leftOperand;
}
/**
* Handle addition and subtraction.
*
* @return AFPTreeNode
*/
protected function doLevelSumRels() {
$leftOperand = $this->doLevelMulRels();
$ops = [ '+', '-' ];
while ( $this->mCur->type === AFPToken::TOP && in_array( $this->mCur->value, $ops ) ) {
$op = $this->mCur->value;
$position = $this->mPos;
$this->move();
$rightOperand = $this->doLevelMulRels();
$leftOperand = new AFPTreeNode(
AFPTreeNode::SUM_REL,
[ $op, $leftOperand, $rightOperand ],
$position
);
}
return $leftOperand;
}
/**
* Handles multiplication and division.
*
* @return AFPTreeNode
*/
protected function doLevelMulRels() {
$leftOperand = $this->doLevelPow();
$ops = [ '*', '/', '%' ];
while ( $this->mCur->type === AFPToken::TOP && in_array( $this->mCur->value, $ops ) ) {
$op = $this->mCur->value;
$position = $this->mPos;
$this->move();
$rightOperand = $this->doLevelPow();
$leftOperand = new AFPTreeNode(
AFPTreeNode::MUL_REL,
[ $op, $leftOperand, $rightOperand ],
$position
);
}
return $leftOperand;
}
/**
* Handles exponentiation.
*
* @return AFPTreeNode
*/
protected function doLevelPow() {
$base = $this->doLevelBoolInvert();
while ( $this->mCur->type === AFPToken::TOP && $this->mCur->value === '**' ) {
$position = $this->mPos;
$this->move();
$exponent = $this->doLevelBoolInvert();
$base = new AFPTreeNode( AFPTreeNode::POW, [ $base, $exponent ], $position );
}
return $base;
}
/**
* Handles boolean inversion.
*
* @return AFPTreeNode
*/
protected function doLevelBoolInvert() {
if ( $this->mCur->type === AFPToken::TOP && $this->mCur->value === '!' ) {
$position = $this->mPos;
$this->move();
$argument = $this->doLevelKeywordOperators();
return new AFPTreeNode( AFPTreeNode::BOOL_INVERT, [ $argument ], $position );
}
return $this->doLevelKeywordOperators();
}
/**
* Handles keyword operators.
*
* @return AFPTreeNode
*/
protected function doLevelKeywordOperators() {
$leftOperand = $this->doLevelUnarys();
$keyword = strtolower( $this->mCur->value );
if ( $this->mCur->type === AFPToken::TKEYWORD &&
isset( AbuseFilterParser::$mKeywords[$keyword] )
) {
$position = $this->mPos;
$this->move();
$rightOperand = $this->doLevelUnarys();
return new AFPTreeNode(
AFPTreeNode::KEYWORD_OPERATOR,
[ $keyword, $leftOperand, $rightOperand ],
$position
);
}
return $leftOperand;
}
/**
* Handles unary operators.
*
* @return AFPTreeNode
*/
protected function doLevelUnarys() {
$op = $this->mCur->value;
if ( $this->mCur->type === AFPToken::TOP && ( $op === "+" || $op === "-" ) ) {
$position = $this->mPos;
$this->move();
$argument = $this->doLevelArrayElements();
return new AFPTreeNode( AFPTreeNode::UNARY, [ $op, $argument ], $position );
}
return $this->doLevelArrayElements();
}
/**
* Handles accessing an array element by an offset.
*
* @return AFPTreeNode
* @throws AFPUserVisibleException
*/
protected function doLevelArrayElements() {
$array = $this->doLevelParenthesis();
while ( $this->mCur->type === AFPToken::TSQUAREBRACKET && $this->mCur->value === '[' ) {
$position = $this->mPos;
$index = $this->doLevelSemicolon();
$array = new AFPTreeNode( AFPTreeNode::ARRAY_INDEX, [ $array, $index ], $position );
if ( !( $this->mCur->type === AFPToken::TSQUAREBRACKET && $this->mCur->value === ']' ) ) {
throw new AFPUserVisibleException( 'expectednotfound', $this->mPos,
[ ']', $this->mCur->type, $this->mCur->value ] );
}
$this->move();
}
return $array;
}
/**
* Handles parenthesis.
*
* @return AFPTreeNode
* @throws AFPUserVisibleException
*/
protected function doLevelParenthesis() {
if ( $this->mCur->type === AFPToken::TBRACE && $this->mCur->value === '(' ) {
$result = $this->doLevelSemicolon();
if ( !( $this->mCur->type === AFPToken::TBRACE && $this->mCur->value === ')' ) ) {
throw new AFPUserVisibleException(
'expectednotfound',
$this->mPos,
[ ')', $this->mCur->type, $this->mCur->value ]
);
}
$this->move();
return $result;
}
return $this->doLevelFunction();
}
/**
* Handles function calls.
*
* @return AFPTreeNode
* @throws AFPUserVisibleException
*/
protected function doLevelFunction() {
if ( $this->mCur->type === AFPToken::TID &&
isset( AbuseFilterParser::$mFunctions[$this->mCur->value] )
) {
$func = $this->mCur->value;
$position = $this->mPos;
$this->move();
if ( $this->mCur->type !== AFPToken::TBRACE || $this->mCur->value !== '(' ) {
throw new AFPUserVisibleException( 'expectednotfound',
$this->mPos,
[
'(',
$this->mCur->type,
$this->mCur->value
]
);
}
if ( ( $func === 'set' || $func === 'set_var' ) ) {
$state = $this->getState();
$this->move();
$next = $this->getNextToken();
if (
$this->mCur->type !== AFPToken::TSTRING ||
(
$next->type !== AFPToken::TCOMMA &&
// Let this fail later, when checking parameters count
!( $next->type === AFPToken::TBRACE && $next->value === ')' )
)
) {
throw new AFPUserVisibleException( 'variablevariable', $this->mPos, [] );
} else {
$this->variablesNames[ $this->mCur->value ] = true;
$this->setState( $state );
}
}
$args = [];
$next = $this->getNextToken();
if ( $next->type !== AFPToken::TBRACE || $next->value !== ')' ) {
do {
$args[] = $this->doLevelSemicolon();
} while ( $this->mCur->type === AFPToken::TCOMMA );
} else {
$this->move();
}
if ( $this->mCur->type !== AFPToken::TBRACE || $this->mCur->value !== ')' ) {
throw new AFPUserVisibleException( 'expectednotfound',
$this->mPos,
[
')',
$this->mCur->type,
$this->mCur->value
]
);
}
// Giving too few arguments to a function is a pretty common error. If we check it here
// (as well as at runtime, for OCD), we can make checkSyntax only try to build the AST, as
// there would be way less runtime errors. Moreover, this check will also be performed inside
// skipped branches, e.g. the discarded if/else branch.
$this->checkEnoughArguments( $args, $func );
$this->move();
array_unshift( $args, $func );
return new AFPTreeNode( AFPTreeNode::FUNCTION_CALL, $args, $position );
}
return $this->doLevelAtom();
}
/**
* Handle literals.
* @return AFPTreeNode
* @throws AFPUserVisibleException
*/
protected function doLevelAtom() {
$tok = $this->mCur->value;
switch ( $this->mCur->type ) {
case AFPToken::TID:
case AFPToken::TSTRING:
case AFPToken::TFLOAT:
case AFPToken::TINT:
$result = new AFPTreeNode( AFPTreeNode::ATOM, $this->mCur, $this->mPos );
break;
case AFPToken::TKEYWORD:
if ( in_array( $tok, [ "true", "false", "null" ] ) ) {
$result = new AFPTreeNode( AFPTreeNode::ATOM, $this->mCur, $this->mPos );
break;
}
throw new AFPUserVisibleException(
'unrecognisedkeyword',
$this->mPos,
[ $tok ]
);
/** @noinspection PhpMissingBreakStatementInspection */
case AFPToken::TSQUAREBRACKET:
if ( $this->mCur->value === '[' ) {
$array = [];
while ( true ) {
$this->move();
if ( $this->mCur->type === AFPToken::TSQUAREBRACKET && $this->mCur->value === ']' ) {
break;
}
$array[] = $this->doLevelSet();
if ( $this->mCur->type === AFPToken::TSQUAREBRACKET && $this->mCur->value === ']' ) {
break;
}
if ( $this->mCur->type !== AFPToken::TCOMMA ) {
throw new AFPUserVisibleException(
'expectednotfound',
$this->mPos,
[ ', or ]', $this->mCur->type, $this->mCur->value ]
);
}
}
$result = new AFPTreeNode( AFPTreeNode::ARRAY_DEFINITION, $array, $this->mPos );
break;
}
// Fallthrough expected
default:
throw new AFPUserVisibleException(
'unexpectedtoken',
$this->mPos,
[
$this->mCur->type,
$this->mCur->value
]
);
}
$this->move();
return $result;
}
/**
* Check that a built-in function has been provided enough arguments
*
* @param array $args The arguments supplied to the function
* @param string $func The function name
* @throws AFPUserVisibleException
* @see AbuseFilterParser::checkEnoughArguments()
* @todo This is a duplicate of AbuseFilter::checkEnoughArguments, and such duplication
* should be avoided when merging the parsers.
*/
protected function checkEnoughArguments( $args, $func ) {
if ( !array_key_exists( $func, AbuseFilterParser::$minArgFunc ) ) {
throw new InvalidArgumentException( "$func is not a valid function." );
}
$count = AbuseFilterParser::$minArgFunc[ $func ];
if ( count( $args ) < $count ) {
throw new AFPUserVisibleException(
$count === 1 ? 'noparams' : 'notenoughargs',
$this->mCur->pos,
[ $func, $count, count( $args ) ]
);
}
}
}