<?php

namespace MediaWiki\Extension\AbuseFilter\Parser;

use InvalidArgumentException;
use LogicException;
use MediaWiki\Extension\AbuseFilter\KeywordsManager;
use MediaWiki\Extension\AbuseFilter\Parser\Exception\InternalException;
use MediaWiki\Extension\AbuseFilter\Parser\Exception\UserVisibleException;
use MediaWiki\Message\Message;

/**
 * SyntaxChecker statically analyzes the code without actually running it.
 * Currently, it only checks for
 *
 * - unbound variables
 * - unused variables: note that a := 1; a := 1; a
 *   is considered OK even though the first `a` seems unused
 *   because the pattern "a := null; if ... then (a := ...) end; ..."
 *   should not count first `a` as unused.
 * - assignment to built-in identifiers
 * - invalid function call (arity mismatch, non-valid function)
 * - first-order information of `set_var` and `set`
 *
 * Because it doesn't cover all checks that the current Check Syntax does,
 * it is currently complementary to the current Check Syntax.
 * In the future, it could subsume the current Check Syntax, and could be
 * extended to perform type checking or type inference.
 *
 * The analysis runs in two passes: desugar() rewrites the AST into a smaller
 * set of node types (building new nodes, leaving the input tree untouched),
 * and check() walks the desugared tree while tracking which user-defined
 * variables are bound and whether each of them has been read.
 */
class SyntaxChecker {
	/**
	 * @var AFPTreeNode|null Root of the AST to check
	 */
	private $treeRoot;

	/** @var KeywordsManager */
	private $keywordsManager;

	public const MCONSERVATIVE = 'MODE_CONSERVATIVE';
	public const MLIBERAL = 'MODE_LIBERAL';
	public const DUMMYPOS = 0;
	public const CACHE_VERSION = 1;

	/**
	 * @var string The mode of checking. The value should be either
	 *
	 *  - MLIBERAL: which guarantees that all user-defined variables
	 *    will be bound, but incompatible with what the evaluator currently
	 *    permits. E.g.,
	 *
	 *    if true then (a := 1) else null end; a
	 *
	 *    is rejected in this mode, even though `a` is in fact always bound.
	 *
	 *  - MCONSERVATIVE which is compatible with what the evaluator
	 *    currently permits, but could allow undefined variables to occur.
	 *    E.g.,
	 *
	 *    if false then (a := 1) else null end; a
	 *
	 *    is accepted in this mode, even though `a` is in fact always unbound.
	 */
	private $mode;

	/**
	 * @var bool Whether we want to check for unused variables
	 */
	private $checkUnusedVars;

	/**
	 * @param AFPSyntaxTree $tree
	 * @param KeywordsManager $keywordsManager
	 * @param string $mode One of self::MCONSERVATIVE or self::MLIBERAL
	 * @param bool $checkUnusedVars
	 */
	public function __construct(
		AFPSyntaxTree $tree,
		KeywordsManager $keywordsManager,
		string $mode = self::MCONSERVATIVE,
		bool $checkUnusedVars = false
	) {
		$this->treeRoot = $tree->getRoot();
		$this->keywordsManager = $keywordsManager;
		$this->mode = $mode;
		$this->checkUnusedVars = $checkUnusedVars;
	}

	/**
	 * Start the static analysis
	 *
	 * Does nothing when the tree has no root. Otherwise the tree is desugared and
	 * checked; if unused-variable checking is enabled and at least one variable was
	 * assigned but never read, an 'unusedvars' exception is thrown at DUMMYPOS.
	 *
	 * @throws UserVisibleException
	 */
	public function start(): void {
		if ( !$this->treeRoot ) {
			return;
		}

		$bound = $this->check( $this->desugar( $this->treeRoot ), [] );
		if ( !$this->checkUnusedVars ) {
			// Nothing else to report; don't bother collecting the unused names.
			return;
		}

		// $bound maps variable name => "was it read?"; keep the names never read.
		$unused = array_keys( array_filter( $bound, static function ( $v ) {
			return !$v;
		} ) );
		if ( $unused ) {
			throw new UserVisibleException(
				'unusedvars',
				self::DUMMYPOS,
				[ Message::listParam( $unused, 'comma' ) ]
			);
		}
	}

	/**
	 * Remove syntactic sugar so that we don't need to deal with
	 * too many cases.
	 *
	 * This could benefit the evaluator as well, but for now, this is
	 * only used for static analysis.
	 *
	 * The input node is never modified: every rewrite is applied to a copy of
	 * the children array and wrapped in a newly constructed node. ATOM nodes
	 * are returned as-is.
	 *
	 * Postcondition:
	 * - The tree will not contain nodes of
	 *   type ASSIGNMENT, LOGIC, COMPARE, SUM_REL, MUL_REL, POW,
	 *   KEYWORD_OPERATOR, and ARRAY_INDEX
	 * - The tree may additionally contain a node of type BINOP.
	 * - The tree should not have set_var function application.
	 * - Conditionals will have both branches.
	 *
	 * @param AFPTreeNode $node
	 * @return AFPTreeNode
	 * @throws InternalException
	 */
	private function desugar( AFPTreeNode $node ): AFPTreeNode {
		switch ( $node->type ) {
			case AFPTreeNode::ATOM:
				return $node;

			case AFPTreeNode::FUNCTION_CALL:
				// Normalize set_var to set on a copy of the children (PHP arrays
				// are copied by value), so the caller's tree is left untouched.
				$children = $node->children;
				if ( $children[0] === 'set_var' ) {
					$children[0] = 'set';
				}
				return $this->newNodeMapExceptFirst( $this->newNode( $node, $children ) );

			case AFPTreeNode::ARRAY_INDEX:
				return $this->newNodeNamedBinop( $node, '[]' );

			case AFPTreeNode::POW:
				return $this->newNodeNamedBinop( $node, '**' );

			case AFPTreeNode::UNARY:
			case AFPTreeNode::INDEX_ASSIGNMENT:
			case AFPTreeNode::ARRAY_APPEND:
				// The first child is an operator or a variable name, not a subtree.
				return $this->newNodeMapExceptFirst( $node );

			case AFPTreeNode::BOOL_INVERT:
				/*
				 * @todo this should really be combined with UNARY,
				 * but let's wait to change the meaning of UNARY across
				 * the codebase together
				 */
				return $this->newNodeMapAll( $node );

			case AFPTreeNode::KEYWORD_OPERATOR:
			case AFPTreeNode::MUL_REL:
			case AFPTreeNode::SUM_REL:
			case AFPTreeNode::COMPARE:
				return $this->newNodeBinop( $node );

			case AFPTreeNode::LOGIC:
				$result = $this->newNodeBinop( $node );
				[ $op, $left, $right ] = $result->children;
				if ( $op === '&' || $op === '|' ) {
					// Short-circuiting operators become nested conditionals.
					return $this->desugarAndOr( $op, $left, $right, $node->position );
				}
				// Any other logic operator stays a plain BINOP.
				return $result;

			case AFPTreeNode::ARRAY_DEFINITION:
			case AFPTreeNode::SEMICOLON:
				return $this->newNodeMapAll( $node );

			case AFPTreeNode::CONDITIONAL:
				// Give else-less conditionals an explicit `null` branch, again on
				// a copy so that the original node keeps its null third child.
				$children = $node->children;
				if ( $children[2] === null ) {
					$children[2] = new AFPTreeNode(
						AFPTreeNode::ATOM,
						new AFPToken(
							AFPToken::TKEYWORD,
							"null",
							$node->position
						),
						$node->position
					);
				}
				return $this->newNodeMapAll( $this->newNode( $node, $children ) );

			case AFPTreeNode::ASSIGNMENT:
				// `a := v` is rewritten as the function call set( 'a', v ).
				[ $varname, $value ] = $node->children;
				return new AFPTreeNode(
					AFPTreeNode::FUNCTION_CALL,
					[
						"set",
						new AFPTreeNode(
							AFPTreeNode::ATOM,
							new AFPToken(
								AFPToken::TSTRING,
								$varname,
								$node->position
							),
							$node->position
						),
						$this->desugar( $value )
					],
					$node->position
				);

			default:
				// @codeCoverageIgnoreStart
				throw new InternalException( "Unknown node type passed: {$node->type}" );
				// @codeCoverageIgnoreEnd
		}
	}

	/**
	 * Rewrite a (already desugared) `&` or `|` operation as nested conditionals.
	 *
	 * @param string $op Either '&' or '|'
	 * @param AFPTreeNode $left
	 * @param AFPTreeNode $right
	 * @param int $position Position assigned to every node created here
	 * @return AFPTreeNode
	 * @throws InternalException If $op is neither '&' nor '|'
	 */
	private function desugarAndOr(
		string $op,
		AFPTreeNode $left,
		AFPTreeNode $right,
		int $position
	): AFPTreeNode {
		$trueNode = new AFPTreeNode(
			AFPTreeNode::ATOM,
			new AFPToken( AFPToken::TKEYWORD, "true", $position ),
			$position
		);
		$falseNode = new AFPTreeNode(
			AFPTreeNode::ATOM,
			new AFPToken( AFPToken::TKEYWORD, "false", $position ),
			$position
		);
		// if <right> then true else false end
		$conditionalNode = new AFPTreeNode(
			AFPTreeNode::CONDITIONAL,
			[ $right, $trueNode, $falseNode ],
			$position
		);

		if ( $op === '&' ) {
			// <a> & <b> is supposed to be equivalent to
			// if <a> then (if <b> then true else false) else false end
			// See T237336 for why this is currently not the case.
			return new AFPTreeNode(
				AFPTreeNode::CONDITIONAL,
				[ $left, $conditionalNode, $falseNode ],
				$position
			);
		} elseif ( $op === '|' ) {
			// <a> | <b> is supposed to be equivalent to
			// if <a> then true else (if <b> then true else false) end
			// See T237336 for why this is currently not the case.
			return new AFPTreeNode(
				AFPTreeNode::CONDITIONAL,
				[ $left, $trueNode, $conditionalNode ],
				$position
			);
		} else {
			// @codeCoverageIgnoreStart
			throw new InternalException( "Unknown operator: {$op}" );
			// @codeCoverageIgnoreEnd
		}
	}

	/**
	 * Construct a new node with information based on the old node but
	 * with different children
	 *
	 * @param AFPTreeNode $node Source of the type and position
	 * @param AFPTreeNode[]|string[]|AFPToken $children
	 * @return AFPTreeNode
	 */
	private function newNode( AFPTreeNode $node, $children ): AFPTreeNode {
		return new AFPTreeNode( $node->type, $children, $node->position );
	}

	/**
	 * Construct a new node with information based on the old node but
	 * with different type
	 *
	 * @param AFPTreeNode $node Source of the children and position
	 * @param string $type
	 * @return AFPTreeNode
	 */
	private function newNodeReplaceType( AFPTreeNode $node, string $type ): AFPTreeNode {
		return new AFPTreeNode( $type, $node->children, $node->position );
	}

	/**
	 * Recursively desugar on all children
	 *
	 * @param AFPTreeNode $node
	 * @return AFPTreeNode A new node of the same type and position
	 * @throws LogicException If the children of $node are not an array
	 */
	private function newNodeMapAll( AFPTreeNode $node ): AFPTreeNode {
		$children = $node->children;
		if ( !is_array( $children ) ) {
			// @codeCoverageIgnoreStart
			throw new LogicException(
				"Unexpected non-array children of an AFPTreeNode of type " .
				"{$node->type} at position {$node->position}"
			);
			// @codeCoverageIgnoreEnd
		}
		return $this->newNode( $node, array_map( [ $this, 'desugar' ], $children ) );
	}

	/**
	 * Recursively desugar on all children except the first one
	 *
	 * The first child is copied verbatim into the new node.
	 *
	 * @param AFPTreeNode $node
	 * @return AFPTreeNode A new node of the same type and position
	 */
	private function newNodeMapExceptFirst( AFPTreeNode $node ): AFPTreeNode {
		$items = [ $node->children[0] ];
		$args = array_slice( $node->children, 1 );
		foreach ( $args as $el ) {
			$items[] = $this->desugar( $el );
		}
		return $this->newNode( $node, $items );
	}

	/**
	 * Convert a node with an operation into a BINOP
	 *
	 * The node's first child (the operation) is kept; the remaining children are desugared.
	 *
	 * @param AFPTreeNode $node
	 * @return AFPTreeNode
	 */
	private function newNodeBinop( AFPTreeNode $node ): AFPTreeNode {
		return $this->newNodeReplaceType(
			$this->newNodeMapExceptFirst( $node ),
			AFPTreeNode::BINOP
		);
	}

	/**
	 * Convert a node without an operation into a BINOP with the specified operation
	 *
	 * All children are desugared and $op is prepended as the first child.
	 *
	 * @param AFPTreeNode $node
	 * @param string $op
	 * @return AFPTreeNode
	 */
	private function newNodeNamedBinop( AFPTreeNode $node, string $op ): AFPTreeNode {
		$items = $this->newNodeMapAll( $node )->children;
		array_unshift( $items, $op );
		return $this->newNodeReplaceType(
			$this->newNode( $node, $items ),
			AFPTreeNode::BINOP
		);
	}

	/**
	 * - Statically compute what are bound after evaluating $node,
	 *   provided that variables in $bound are already bound.
	 * - Similarly compute for each bound variable after evaluating $node
	 *   whether it is used provided that we already have $bound
	 *   that contains necessary information.
	 * - Ensure function application's validity.
	 * - Ensure that the first argument of set is a literal string.
	 * - Ensure that all assignment is not done on built-in identifier.
	 *
	 * Precondition:
	 * - The tree $node should be desugared and normalized.
	 *
	 * Postcondition:
	 * - $node is guaranteed to have no unbound variables
	 *   provided that variables in $bound are already bound
	 *   (for the definition of unbound variable indicated by $this->mode)
	 * - All function applications should be valid and have correct arity.
	 * - The set function application's first argument should be
	 *   a literal string.
	 *
	 * @param AFPTreeNode $node
	 * @param bool[] $bound Map of [ variable_name => used ]
	 * @return bool[] Map of [ variable_name => used ]
	 * @throws UserVisibleException
	 * @throws InternalException
	 */
	private function check( AFPTreeNode $node, array $bound ): array {
		switch ( $node->type ) {
			// phpcs:ignore PSR2.ControlStructures.SwitchDeclaration.TerminatingComment
			case AFPTreeNode::ATOM:
				$tok = $node->children;
				switch ( $tok->type ) {
					case AFPToken::TID:
						// Reading an identifier: it must be known, and is marked as used.
						return $this->lookupVar( $tok->value, $tok->pos, $bound );
					case AFPToken::TSTRING:
					case AFPToken::TFLOAT:
					case AFPToken::TINT:
					case AFPToken::TKEYWORD:
						return $bound;
					default:
						// @codeCoverageIgnoreStart
						throw new InternalException( "Unknown token {$tok->type} provided in the ATOM node" );
						// @codeCoverageIgnoreEnd
				}

			case AFPTreeNode::ARRAY_DEFINITION:
				// @phan-suppress-next-line PhanTypeSuspiciousNonTraversableForeach children is array here
				foreach ( $node->children as $el ) {
					$bound = $this->check( $el, $bound );
				}
				return $bound;

			case AFPTreeNode::FUNCTION_CALL:
				$fname = $node->children[0];
				$args = array_slice( $node->children, 1 );
				if ( !array_key_exists( $fname, FilterEvaluator::FUNCTIONS ) ) {
					throw new UserVisibleException(
						'unknownfunction',
						$node->position,
						[ $fname ]
					);
				}
				$this->checkArgCount( $args, $fname, $node->position );

				if ( $fname === 'set' ) {
					// arity is checked, so we know $args[0] and $args[1] exist
					$tok = $args[0]->children;

					if ( !( $tok instanceof AFPToken ) || $tok->type !== AFPToken::TSTRING ) {
						throw new UserVisibleException(
							'variablevariable',
							$node->position,
							[]
						);
					}

					// The value is checked before the name is (re)bound.
					$bound = $this->check( $args[1], $bound );
					// set the variable as unused
					return $this->assignVar( $tok->value, $tok->pos, $bound );
				} else {
					foreach ( $args as $arg ) {
						$bound = $this->check( $arg, $bound );
					}
					return $bound;
				}

			case AFPTreeNode::BINOP:
				// Left operand first, then right, threading the bindings through.
				[ , $left, $right ] = $node->children;
				return $this->check( $right, $this->check( $left, $bound ) );

			case AFPTreeNode::UNARY:
				[ , $argument ] = $node->children;
				return $this->check( $argument, $bound );

			case AFPTreeNode::BOOL_INVERT:
				[ $argument ] = $node->children;
				return $this->check( $argument, $bound );

			// phpcs:ignore PSR2.ControlStructures.SwitchDeclaration.TerminatingComment
			case AFPTreeNode::CONDITIONAL:
				[ $condition, $exprIfTrue, $exprIfFalse ] = $node->children;
				$bound = $this->check( $condition, $bound );
				// Both branches start from the bindings available after the condition.
				$boundLeft = $this->check( $exprIfTrue, $bound );
				$boundRight = $this->check( $exprIfFalse, $bound );
				switch ( $this->mode ) {
					case self::MCONSERVATIVE:
						// Bound afterwards if bound in either branch.
						return $this->mapUnion( $boundLeft, $boundRight );
					case self::MLIBERAL:
						// Bound afterwards only if bound in both branches.
						return $this->mapIntersect( $boundLeft, $boundRight );
					default:
						// @codeCoverageIgnoreStart
						throw new LogicException( "Unknown mode: {$this->mode}" );
						// @codeCoverageIgnoreEnd
				}

			case AFPTreeNode::INDEX_ASSIGNMENT:
				[ $varName, $offset, $value ] = $node->children;

				// deal with unbound $varName
				$bound = $this->lookupVar( $varName, $node->position, $bound );
				$bound = $this->check( $offset, $bound );
				$bound = $this->check( $value, $bound );
				// deal with built-in $varName and set $varName as unused
				return $this->assignVar( $varName, $node->position, $bound );

			case AFPTreeNode::ARRAY_APPEND:
				[ $varName, $value ] = $node->children;

				// deal with unbound $varName
				$bound = $this->lookupVar( $varName, $node->position, $bound );
				$bound = $this->check( $value, $bound );
				// deal with built-in $varName and set $varName as unused
				return $this->assignVar( $varName, $node->position, $bound );

			case AFPTreeNode::SEMICOLON:
				// @phan-suppress-next-line PhanTypeSuspiciousNonTraversableForeach children is array here
				foreach ( $node->children as $statement ) {
					$bound = $this->check( $statement, $bound );
				}
				return $bound;

			default:
				// @codeCoverageIgnoreStart
				throw new LogicException( "Unknown type: {$node->type}" );
				// @codeCoverageIgnoreEnd
		}
	}

	/**
	 * Merge two [ variable_name => used ] maps, keeping every key of both.
	 * A variable present in both maps is used if it is used in either.
	 *
	 * @param array $left
	 * @param array $right
	 * @return array
	 */
	private function mapUnion( array $left, array $right ): array {
		foreach ( $right as $key => $val ) {
			if ( array_key_exists( $key, $left ) ) {
				$left[ $key ] = $left[ $key ] || $val;
			} else {
				$left[ $key ] = $val;
			}
		}
		return $left;
	}

	/**
	 * Merge two [ variable_name => used ] maps, keeping only the keys present
	 * in both. A kept variable is used if it is used in either map.
	 *
	 * @param array $left
	 * @param array $right
	 * @return array
	 */
	private function mapIntersect( array $left, array $right ): array {
		$result = [];
		foreach ( array_keys( array_intersect_key( $left, $right ) ) as $key ) {
			$result[ $key ] = $left[ $key ] || $right[ $key ];
		}
		return $result;
	}

	/**
	 * Record an assignment to a variable: the (lowercased) name becomes bound
	 * and is flagged as not yet used.
	 *
	 * @param string $var
	 * @param int $pos Position reported if the assignment is rejected
	 * @param array $bound
	 * @return array
	 * @throws UserVisibleException If $var is a reserved identifier
	 */
	private function assignVar( string $var, int $pos, array $bound ): array {
		$var = strtolower( $var );
		if ( $this->isReservedIdentifier( $var ) ) {
			throw new UserVisibleException(
				'overridebuiltin',
				$pos,
				[ $var ]
			);
		}
		$bound[ $var ] = false;
		return $bound;
	}

	/**
	 * Record a read of a variable: a user-defined variable is flagged as used,
	 * a non-disabled built-in variable is accepted without changing $bound, and
	 * anything else is rejected.
	 *
	 * @param string $var
	 * @param int $pos Position reported if the lookup is rejected
	 * @param array $bound
	 * @return array
	 * @throws UserVisibleException If $var is disabled, a reserved identifier
	 *   that is not a variable, or not recognised at all
	 */
	private function lookupVar( string $var, int $pos, array $bound ): array {
		$var = strtolower( $var );
		if ( array_key_exists( $var, $bound ) ) {
			// user-defined variable
			$bound[ $var ] = true;
			return $bound;
		} elseif ( $this->keywordsManager->isVarDisabled( $var ) ) {
			// disabled built-in variables
			throw new UserVisibleException(
				'disabledvar',
				$pos,
				[ $var ]
			);
		} elseif ( $this->keywordsManager->varExists( $var ) ) {
			// non-disabled built-in variables
			return $bound;
		} elseif ( $this->isReservedIdentifier( $var ) ) {
			// other built-in identifiers
			throw new UserVisibleException(
				'usebuiltin',
				$pos,
				[ $var ]
			);
		} else {
			// unbound variables
			throw new UserVisibleException(
				'unrecognisedvar',
				$pos,
				[ $var ]
			);
		}
	}

	/**
	 * Check that a built-in function has been provided the right amount of arguments
	 *
	 * @param array $args The arguments supplied to the function
	 * @param string $func The function name
	 * @param int $position
	 * @throws UserVisibleException If too few or too many arguments were given
	 * @throws InvalidArgumentException If $func has no entry in FilterEvaluator::FUNC_ARG_COUNT
	 */
	private function checkArgCount( array $args, string $func, int $position ): void {
		if ( !array_key_exists( $func, FilterEvaluator::FUNC_ARG_COUNT ) ) {
			// @codeCoverageIgnoreStart
			throw new InvalidArgumentException( "$func is not a valid function." );
			// @codeCoverageIgnoreEnd
		}
		[ $min, $max ] = FilterEvaluator::FUNC_ARG_COUNT[ $func ];
		$argCount = count( $args );
		if ( $argCount < $min ) {
			throw new UserVisibleException(
				$min === 1 ? 'noparams' : 'notenoughargs',
				$position,
				[ $func, $min, $argCount ]
			);
		} elseif ( $argCount > $max ) {
			throw new UserVisibleException(
				'toomanyargs',
				$position,
				[ $func, $max, $argCount ]
			);
		}
	}

	/**
	 * Check whether the given name is a reserved identifier, e.g. the name of a built-in variable,
	 * function, or keyword.
	 *
	 * @param string $name
	 * @return bool
	 */
	private function isReservedIdentifier( string $name ): bool {
		return $this->keywordsManager->varExists( $name ) ||
			array_key_exists( $name, FilterEvaluator::FUNCTIONS ) ||
			// We need to check for true, false, if/then/else etc. because, even if they have a different
			// AFPToken type, they may be used inside set/set_var()
			in_array( $name, AbuseFilterTokenizer::KEYWORDS, true );
	}
}