		$this->textExtractor = $textExtractor;
		$this->hookRunner = $hookRunner;
		$this->logger = $logger;
		$this->loadBalancer = $loadBalancer;
		$this->wanCache = $wanCache;
		$this->revisionLookup = $revisionLookup;
		$this->revisionStore = $revisionStore;
		$this->contentLanguage = $contentLanguage;
		$this->parser = $parser;
		$this->wikiID = $wikiID;
	}

	/**
	 * XXX: $getVarCB is a hack to hide the cyclic dependency with VariablesManager. See T261069
	 * for possible solutions. This might also be merged into VariablesManager, but it would bring
	 * a ton of dependencies.
	 * @todo Should we remove $vars parameter (check hooks)?
	 *
	 * @param LazyLoadedVariable $var
	 * @param VariableHolder $vars
	 * @param callable $getVarCB
	 * @phan-param callable(string $name):AFPData $getVarCB
	 * @return AFPData
	 * @throws AFPException
	 */
	public function compute(
		LazyLoadedVariable $var,
		VariableHolder $vars,
		callable $getVarCB
	) {
		$parameters = $var->getParameters();
		$varMethod = $var->getMethod();
		$result = null;

		if ( !$this->hookRunner->onAbuseFilterInterceptVariable(
			$varMethod,
			$vars,
			$parameters,
			$result
		) ) {
			return $result instanceof AFPData ? $result : AFPData::newFromPHPVar( $result );
		}

		switch ( $varMethod ) {
			case 'diff':
				$text1Var = $parameters['oldtext-var'];
				$text2Var = $parameters['newtext-var'];
				$text1 = $getVarCB( $text1Var )->toString();
				$text2 = $getVarCB( $text2Var )->toString();
				// T74329: if there's no text, don't return an array with the empty string
				$text1 = $text1 === '' ? [] : explode( "\n", $text1 );
				$text2 = $text2 === '' ? [] : explode( "\n", $text2 );
				$diffs = new Diff( $text1, $text2 );
				$format = new UnifiedDiffFormatter();
				$result = $format->format( $diffs );
				break;
			case 'diff-split':
				$diff = $getVarCB( $parameters['diff-var'] )->toString();
				$line_prefix = $parameters['line-prefix'];
				$diff_lines = explode( "\n", $diff );
				$result = [];
				foreach ( $diff_lines as $line ) {
					if ( substr( $line, 0, 1 ) === $line_prefix ) {
						$result[] = substr( $line, strlen( $line_prefix ) );
					}
				}
				break;
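			// The two link-related cases below are paired: 'links-from-wikitext' reuses the parse
			// of an ongoing edit when the page uses the wikitext content model, and otherwise falls
			// through to 'links-from-wikitext-or-database', which loads the stored external links
			// from the database (when computing variables for a filter run) or re-parses the
			// supplied text.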
			case 'links-from-wikitext':
				// This should ONLY be used when sharing a parse operation with the edit.
				/** @var WikiPage $article */
				$article = $parameters['article'];
				if ( $article->getContentModel() === CONTENT_MODEL_WIKITEXT ) {
					// Shared with the edit, don't count it in profiling
					$startTime = microtime( true );
					$textVar = $parameters['text-var'];
					$new_text = $getVarCB( $textVar )->toString();
					$content = ContentHandler::makeContent( $new_text, $article->getTitle() );
					$editInfo = $article->prepareContentForEdit( $content );
					$result = array_keys( $editInfo->output->getExternalLinks() );
					self::$profilingExtraTime += ( microtime( true ) - $startTime );
					break;
				}
				// Otherwise fall back to database
			case 'links-from-wikitext-or-database':
				// TODO: use Content object instead, if available!
				/** @var WikiPage $article */
				$article = $article ?? $parameters['article'];
				if ( $vars->forFilter ) {
					$links = $this->getLinksFromDB( $article );
					$this->logger->debug( 'Loading old links from DB' );
				} elseif ( $article->getContentModel() === CONTENT_MODEL_WIKITEXT ) {
					$this->logger->debug( 'Loading old links from Parser' );
					$textVar = $parameters['text-var'];
					$wikitext = $getVarCB( $textVar )->toString();
					$editInfo = $this->parseNonEditWikitext(
						$wikitext,
						$article,
						$parameters['contextUser']
					);
					$links = array_keys( $editInfo->output->getExternalLinks() );
				} else {
					// TODO: Get links from Content object. But we don't have the content object.
					// And for non-text content, $wikitext is usually not going to be a valid
					// serialization, but rather some dummy text for filtering.
					$links = [];
				}
				$result = $links;
				break;
			case 'link-diff-added':
			case 'link-diff-removed':
				$oldLinkVar = $parameters['oldlink-var'];
				$newLinkVar = $parameters['newlink-var'];
				$oldLinks = $getVarCB( $oldLinkVar )->toString();
				$newLinks = $getVarCB( $newLinkVar )->toString();
				$oldLinks = explode( "\n", $oldLinks );
				$newLinks = explode( "\n", $newLinks );
				if ( $varMethod === 'link-diff-added' ) {
					$result = array_diff( $newLinks, $oldLinks );
				}
				if ( $varMethod === 'link-diff-removed' ) {
					$result = array_diff( $oldLinks, $newLinks );
				}
				break;
			case 'parse-wikitext':
				// Should ONLY be used when sharing a parse operation with the edit.
				// TODO: use Content object instead, if available!
				/** @var WikiPage $article */
				$article = $parameters['article'];
				if ( $article->getContentModel() === CONTENT_MODEL_WIKITEXT ) {
					// Shared with the edit, don't count it in profiling
					$startTime = microtime( true );
					$textVar = $parameters['wikitext-var'];
					$new_text = $getVarCB( $textVar )->toString();
					$content = ContentHandler::makeContent( $new_text, $article->getTitle() );
					$editInfo = $article->prepareContentForEdit( $content );
					if ( isset( $parameters['pst'] ) && $parameters['pst'] ) {
						$result = $editInfo->pstContent->serialize( $editInfo->format );
					} else {
						$newHTML = $editInfo->output->getText();
						// Kill the PP limit comments. Ideally we'd just remove these by not setting the
						// parser option, but then we can't share a parse operation with the edit, which is bad.
						// @fixme No awfulness scale can measure how awful this hack is.
						$re = '/<!--\s*NewPP limit report[\s\S]*?-->\s*(?:<!--[\s\S]*?-->\s*)?(?:<\/div>\s*)?$/i';
						$result = preg_replace( $re, '', $newHTML );
					}
					self::$profilingExtraTime += ( microtime( true ) - $startTime );
					break;
				}
				// Otherwise fall through to the non-edit case
			case 'parse-wikitext-nonedit':
				/** @var WikiPage $article */
				$article = $article ?? $parameters['article'];
				$textVar = $parameters['wikitext-var'];
				if ( $article->getContentModel() === CONTENT_MODEL_WIKITEXT ) {
					if ( isset( $parameters['pst'] ) && $parameters['pst'] ) {
						// $textVar is already PSTed when it's not loaded from an ongoing edit.
						$result = $getVarCB( $textVar )->toString();
					} else {
						$text = $getVarCB( $textVar )->toString();
						$editInfo = $this->parseNonEditWikitext(
							$text,
							$article,
							$parameters['contextUser']
						);
						$result = $editInfo->output->getText();
					}
				} else {
					// TODO: Parser Output from Content object. But we don't have the content object.
					// And for non-text content, $wikitext is usually not going to be a valid
					// serialization, but rather some dummy text for filtering.
					$result = '';
				}
				break;
			case 'strip-html':
				$htmlVar = $parameters['html-var'];
				$html = $getVarCB( $htmlVar )->toString();
				$stripped = StringUtils::delimiterReplace( '<', '>', '', $html );
				// We strip extra spaces to the right because the stripping above
				// could leave a lot of whitespace.
				// @fixme Find a better way to do this.
				$result = TextContent::normalizeLineEndings( $stripped );
				break;
			case 'load-recent-authors':
				$result = $this->getLastPageAuthors( $parameters['title'] );
				break;
			case 'load-first-author':
				$revision = $this->revisionLookup->getFirstRevision( $parameters['title'] );
				if ( $revision ) {
					$user = $revision->getUser();
					$result = $user === null ? '' : $user->getName();
				} else {
					$result = '';
				}
				break;
			case 'get-page-restrictions':
				$action = $parameters['action'];
				/** @var Title $title */
				$title = $parameters['title'];
				$result = $title->getRestrictions( $action );
				break;
			case 'simple-user-accessor':
				$user = $parameters['user'];
				$method = $parameters['method'];
				$result = $user->$method();
				break;
			case 'user-block':
				// @todo Support partial blocks
				$user = $parameters['user'];
				$result = (bool)$user->getBlock();
				break;
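			// The two age cases below return a duration in seconds: the difference between the
			// 'asof' timestamp and, respectively, the user's registration time or the page's first
			// revision time. For example, a user registered at 20200101000000 and checked as of
			// 20200102000000 yields 86400.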
			case 'user-age':
				/** @var User $user */
				$user = $parameters['user'];
				$asOf = $parameters['asof'];
				if ( !$user->isRegistered() ) {
					$result = 0;
				} else {
					$registration = $user->getRegistration();
					// HACK: If there's no registration date, assume 2008-01-15, Wikipedia Day
					// in the year before the new user log was created. See T243469.
					if ( $registration === null ) {
						$registration = "20080115000000";
					}
					$result = (int)wfTimestamp( TS_UNIX, $asOf ) - (int)wfTimestamp( TS_UNIX, $registration );
				}
				break;
			case 'page-age':
				/** @var Title $title */
				$title = $parameters['title'];
				$firstRev = $this->revisionLookup->getFirstRevision( $title );
				$firstRevisionTime = $firstRev ? $firstRev->getTimestamp() : null;
				if ( !$firstRevisionTime ) {
					$result = 0;
					break;
				}
				$asOf = $parameters['asof'];
				$result = (int)wfTimestamp( TS_UNIX, $asOf ) - (int)wfTimestamp( TS_UNIX, $firstRevisionTime );
				break;
			case 'length':
				$s = $getVarCB( $parameters['length-var'] )->toString();
				$result = strlen( $s );
				break;
			case 'subtract-int':
				$v1 = $getVarCB( $parameters['val1-var'] )->toInt();
				$v2 = $getVarCB( $parameters['val2-var'] )->toInt();
				$result = $v1 - $v2;
				break;
			case 'revision-text-by-id':
				$revRec = $this->revisionLookup->getRevisionById( $parameters['revid'] );
				$result = $this->textExtractor->revisionToString( $revRec, $parameters['contextUser'] );
				break;
			case 'get-wiki-name':
				$result = $this->wikiID;
				break;
			case 'get-wiki-language':
				$result = $this->contentLanguage->getCode();
				break;
			default:
				if ( $this->hookRunner->onAbuseFilterComputeVariable(
					$varMethod,
					$vars,
					$parameters,
					$result
				) ) {
					throw new AFPException( 'Unknown variable compute type ' . $varMethod );
				}
		}

		return $result instanceof AFPData ? $result : AFPData::newFromPHPVar( $result );
	}

	/**
	 * @param WikiPage $article
	 * @return array
	 */
	private function getLinksFromDB( WikiPage $article ) {
		// Stolen from ConfirmEdit, SimpleCaptcha::getLinksFromTracker
		$id = $article->getId();
		if ( !$id ) {
			return [];
		}

		$dbr = $this->loadBalancer->getConnectionRef( DB_REPLICA );
		return $dbr->selectFieldValues(
			'externallinks',
			'el_to',
			[ 'el_from' => $id ],
			__METHOD__
		);
	}

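	// Note: getLastPageAuthors() below memoizes its result in the WAN cache under a key derived
	// from the page's latest revision ID, so any new edit changes the key and the cached author
	// list is effectively refreshed without an explicit purge.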
	/**
	 * @param Title $title
	 * @return string[] Usernames of the last 10 (unique) authors from $title
	 */
	private function getLastPageAuthors( Title $title ) {
		if ( !$title->exists() ) {
			return [];
		}

		$fname = __METHOD__;
		return $this->wanCache->getWithSetCallback(
			$this->wanCache->makeKey( 'last-10-authors', 'revision', $title->getLatestRevID() ),
			WANObjectCache::TTL_MINUTE,
			function ( $oldValue, &$ttl, array &$setOpts ) use ( $title, $fname ) {
				$dbr = $this->loadBalancer->getConnectionRef( DB_REPLICA );
				$setOpts += Database::getCacheSetOptions( $dbr );

				// Get the last 100 edit authors with a trivial query (avoid T116557)
				$revQuery = $this->revisionStore->getQueryInfo();
				$revAuthors = $dbr->selectFieldValues(
					$revQuery['tables'],
					$revQuery['fields']['rev_user_text'],
					[ 'rev_page' => $title->getArticleID() ],
					$fname,
					// Some pages have < 10 authors but many revisions (e.g. bot pages)
					[
						'ORDER BY' => 'rev_timestamp DESC, rev_id DESC',
						'LIMIT' => 100,
						// Force index per T116557
						'USE INDEX' => [ 'revision' => 'page_timestamp' ],
					],
					$revQuery['joins']
				);

				// Get the last 10 distinct authors within this set of edits
				$users = [];
				foreach ( $revAuthors as $author ) {
					$users[$author] = 1;
					if ( count( $users ) >= 10 ) {
						break;
					}
				}

				return array_keys( $users );
			}
		);
	}

	/**
	 * It's like Article::prepareContentForEdit, but not for editing (old wikitext usually)
	 *
	 * @param string $wikitext
	 * @param WikiPage $article
	 * @param User $user Context user
	 *
	 * @return stdClass
	 */
	private function parseNonEditWikitext( $wikitext, WikiPage $article, User $user ) {
		static $cache = [];

		$cacheKey = md5( $wikitext ) . ':' . $article->getTitle()->getPrefixedText();
		if ( isset( $cache[$cacheKey] ) ) {
			return $cache[$cacheKey];
		}

		$edit = (object)[];
		$options = ParserOptions::newFromUser( $user );
		$edit->output = $this->parser->parse( $wikitext, $article->getTitle(), $options );
		$cache[$cacheKey] = $edit;

		return $edit;
	}
}