<?php if ( !defined( 'MEDIAWIKI' ) ) { exit; } use \MediaWiki\MediaWikiServices; class SpamBlacklist extends BaseBlacklist { const STASH_TTL = 60; const STASH_AGE_DYING = 30; /** * Changes to external links, for logging purposes * @var array[] */ private $urlChangeLog = array(); /** * Returns the code for the blacklist implementation * * @return string */ protected function getBlacklistType() { return 'spam'; } /** * Apply some basic anti-spoofing to the links before they get filtered, * see @bug 12896 * * @param string $text * * @return string */ protected function antiSpoof( $text ) { $text = str_replace( '.', '.', $text ); return $text; } /** * @param string[] $links An array of links to check against the blacklist * @param Title $title The title of the page to which the filter shall be applied. * This is used to load the old links already on the page, so * the filter is only applied to links that got added. If not given, * the filter is applied to all $links. * @param boolean $preventLog Whether to prevent logging of hits. Set to true when * the action is testing the links rather than attempting to save them * (e.g. the API spamblacklist action) * @param string $mode Either 'check' or 'stash' * * @return string[]|bool Matched text(s) if the edit should not be allowed; false otherwise */ function filter( array $links, Title $title = null, $preventLog = false, $mode = 'check' ) { $statsd = MediaWikiServices::getInstance()->getStatsdDataFactory(); $cache = ObjectCache::getLocalClusterInstance(); sort( $links ); $key = $cache->makeKey( 'blacklist', $this->getBlacklistType(), 'pass', sha1( implode( "\n", $links ) ), (string)$title ); // Skip blacklist checks if nothing matched during edit stashing... $knownNonMatchAsOf = $cache->get( $key ); if ( $mode === 'check' ) { if ( $knownNonMatchAsOf ) { $statsd->increment( 'spamblacklist.check-stash.hit' ); return false; } else { $statsd->increment( 'spamblacklist.check-stash.miss' ); } } elseif ( $mode === 'stash' ) { if ( $knownNonMatchAsOf && ( time() - $knownNonMatchAsOf ) < self::STASH_AGE_DYING ) { return false; // OK; not about to expire soon } } $blacklists = $this->getBlacklists(); $whitelists = $this->getWhitelists(); if ( count( $blacklists ) ) { // poor man's anti-spoof, see bug 12896 $newLinks = array_map( array( $this, 'antiSpoof' ), $links ); $oldLinks = array(); if ( $title !== null ) { $oldLinks = $this->getCurrentLinks( $title ); $addedLinks = array_diff( $newLinks, $oldLinks ); } else { // can't load old links, so treat all links as added. $addedLinks = $newLinks; } wfDebugLog( 'SpamBlacklist', "Old URLs: " . implode( ', ', $oldLinks ) ); wfDebugLog( 'SpamBlacklist', "New URLs: " . implode( ', ', $newLinks ) ); wfDebugLog( 'SpamBlacklist', "Added URLs: " . implode( ', ', $addedLinks ) ); if ( !$preventLog ) { $this->logUrlChanges( $oldLinks, $newLinks, $addedLinks ); } $links = implode( "\n", $addedLinks ); # Strip whitelisted URLs from the match if( is_array( $whitelists ) ) { wfDebugLog( 'SpamBlacklist', "Excluding whitelisted URLs from " . count( $whitelists ) . " regexes: " . implode( ', ', $whitelists ) . "\n" ); foreach( $whitelists as $regex ) { wfSuppressWarnings(); $newLinks = preg_replace( $regex, '', $links ); wfRestoreWarnings(); if( is_string( $newLinks ) ) { // If there wasn't a regex error, strip the matching URLs $links = $newLinks; } } } # Do the match wfDebugLog( 'SpamBlacklist', "Checking text against " . count( $blacklists ) . " regexes: " . implode( ', ', $blacklists ) . "\n" ); $retVal = false; foreach( $blacklists as $regex ) { wfSuppressWarnings(); $matches = array(); $check = ( preg_match_all( $regex, $links, $matches ) > 0 ); wfRestoreWarnings(); if( $check ) { wfDebugLog( 'SpamBlacklist', "Match!\n" ); global $wgRequest; $ip = $wgRequest->getIP(); $fullUrls = array(); $fullLineRegex = substr( $regex, 0, strrpos( $regex, '/' ) ) . '.*/Sim'; preg_match_all( $fullLineRegex, $links, $fullUrls ); $imploded = implode( ' ', $fullUrls[0] ); wfDebugLog( 'SpamBlacklistHit', "$ip caught submitting spam: $imploded\n" ); if( !$preventLog ) { $this->logFilterHit( $title, $imploded ); // Log it } if( $retVal === false ){ $retVal = array(); } $retVal = array_merge( $retVal, $fullUrls[1] ); } } if ( is_array( $retVal ) ) { $retVal = array_unique( $retVal ); } } else { $retVal = false; } if ( $retVal === false ) { // Cache the typical negative results $cache->set( $key, time(), $cache::TTL_MINUTE ); if ( $mode === 'stash' ) { $statsd->increment( 'spamblacklist.check-stash.store' ); } } return $retVal; } private function isLoggingEnabled() { global $wgSpamBlacklistEventLogging; return $wgSpamBlacklistEventLogging && class_exists( 'EventLogging' ); } /** * Diff added/removed urls and generate events for them * * @param string[] $oldLinks * @param string[] $newLinks * @param string[] $addedLinks */ private function logUrlChanges( $oldLinks, $newLinks, $addedLinks ) { if ( !$this->isLoggingEnabled() ) { return; } $removedLinks = array_diff( $oldLinks, $newLinks ); foreach ( $addedLinks as $url ) { $this->logUrlChange( $url, 'insert' ); } foreach ( $removedLinks as $url ) { $this->logUrlChange( $url, 'remove' ); } } /** * Actually push the url change events post-save * * @param User $user * @param Title $title * @param Revision $rev */ public function doLogging( User $user, Title $title, Revision $rev ) { if ( !$this->isLoggingEnabled() ) { return; } $baseInfo = array( 'revId' => $rev->getId(), 'pageId' => $title->getArticleID(), 'pageNamespace' => $title->getNamespace(), 'userId' => $user->getId(), 'userText' => $user->getName(), ); $changes = $this->urlChangeLog; // Empty the changes queue in case this function gets called more than once $this->urlChangeLog = array(); DeferredUpdates::addCallableUpdate( function() use ( $changes, $baseInfo ) { foreach ( $changes as $change ) { EventLogging::logEvent( 'ExternalLinksChange', 15716074, $baseInfo + $change ); } } ); } /** * Queue log data about change for a url addition or removal * * @param string $url * @param string $action 'insert' or 'remove' */ private function logUrlChange( $url, $action ) { $parsed = wfParseUrl( $url ); if ( !isset( $parsed['host'] ) ) { wfDebugLog( 'SpamBlacklist', "Unable to parse $url" ); return; } $info = array( 'action' => $action, 'protocol' => $parsed['scheme'], 'domain' => $parsed['host'], 'path' => $parsed['path'] !== null ? $parsed['path'] : '', 'query' => $parsed['query'] !== null ? $parsed['query'] : '', 'fragment' => $parsed['fragment'] !== null ? $parsed['fragment'] : '', ); $this->urlChangeLog[] = $info; } /** * Look up the links currently in the article, so we can * ignore them on a second run. * * WARNING: I can add more *of the same link* with no problem here. * @param $title Title * @return array */ function getCurrentLinks( Title $title ) { $cache = ObjectCache::getMainWANInstance(); return $cache->getWithSetCallback( // Key is warmed via warmCachesForFilter() from ApiStashEdit $cache->makeKey( 'external-link-list', $title->getLatestRevID() ), $cache::TTL_MINUTE, function ( $oldValue, &$ttl, array &$setOpts ) use ( $title ) { $dbr = wfGetDB( DB_SLAVE ); $setOpts += Database::getCacheSetOptions( $dbr ); return $dbr->selectFieldValues( 'externallinks', 'el_to', array( 'el_from' => $title->getArticleID() ), // should be zero queries __METHOD__ ); } ); } public function warmCachesForFilter( Title $title, array $entries ) { $this->filter( $entries, $title, true /* no logging */, 'stash' ); } /** * Returns the start of the regex for matches * * @return string */ public function getRegexStart() { return '/(?:https?:)?\/\/+[a-z0-9_\-.]*('; } /** * Returns the end of the regex for matches * * @param $batchSize * @return string */ public function getRegexEnd( $batchSize ) { return ')' . parent::getRegexEnd( $batchSize ); } /** * Logs the filter hit to Special:Log if * $wgLogSpamBlacklistHits is enabled. * * @param Title $title * @param string $url URL that the user attempted to add */ public function logFilterHit( $title, $url ) { global $wgUser, $wgLogSpamBlacklistHits; if ( $wgLogSpamBlacklistHits ) { $logEntry = new ManualLogEntry( 'spamblacklist', 'hit' ); $logEntry->setPerformer( $wgUser ); $logEntry->setTarget( $title ); $logEntry->setParameters( array( '4::url' => $url, ) ); $logid = $logEntry->insert(); $logEntry->publish( $logid, "rc" ); } } }