2004-12-11 09:59:06 +00:00
|
|
|
|
<?php
|
|
|
|
|
|
2022-04-08 13:05:02 +00:00
|
|
|
|
namespace MediaWiki\Extension\SpamBlacklist;
|
|
|
|
|
|
|
|
|
|
use ExtensionRegistry;
|
|
|
|
|
use LogPage;
|
|
|
|
|
use ManualLogEntry;
|
|
|
|
|
use MediaWiki\CheckUser\Hooks as CUHooks;
|
2024-06-10 18:42:06 +00:00
|
|
|
|
use MediaWiki\Context\RequestContext;
|
2023-04-24 13:09:17 +00:00
|
|
|
|
use MediaWiki\ExternalLinks\ExternalLinksLookup;
|
2019-10-11 18:58:00 +00:00
|
|
|
|
use MediaWiki\MediaWikiServices;
|
2023-08-19 04:19:23 +00:00
|
|
|
|
use MediaWiki\Title\Title;
|
2024-01-05 18:28:12 +00:00
|
|
|
|
use MediaWiki\User\User;
|
2022-02-24 21:15:28 +00:00
|
|
|
|
use Wikimedia\AtEase\AtEase;
|
2018-02-25 01:25:04 +00:00
|
|
|
|
use Wikimedia\Rdbms\Database;
|
2016-06-30 21:26:36 +00:00
|
|
|
|
|
2012-01-17 06:13:46 +00:00
|
|
|
|
/**
 * Blacklist implementation that checks external links against the 'spam'
 * blacklist/whitelist regexes provided by BaseBlacklist.
 *
 * Negative results ("nothing matched") are cached for a short time so that a
 * check performed while an edit is being stashed can be reused when the edit
 * is actually saved.
 */
class SpamBlacklist extends BaseBlacklist {
	/** TTL passed to the cache when storing a "no match" result (compared in seconds below) */
	private const STASH_TTL = 180;

	/**
	 * Age, in seconds, after which a cached "no match" result is treated as
	 * about to expire; in 'stash' mode the check is then redone to refresh it.
	 */
	private const STASH_AGE_DYING = 150;

	/**
	 * Returns the code for the blacklist implementation
	 *
	 * @return string
	 */
	protected function getBlacklistType() {
		return 'spam';
	}

	/**
	 * Apply some basic anti-spoofing to the links before they get filtered,
	 * see @bug 12896
	 *
	 * Currently this only maps the fullwidth full stop (U+FF0E) to a plain
	 * ASCII "." so that such a link is matched by the same regexes as its
	 * ASCII spelling.
	 *
	 * @param string $text
	 *
	 * @return string
	 */
	protected function antiSpoof( $text ) {
		// The search string is written as a code point escape on purpose: a
		// literal fullwidth dot is easily normalised to "." by tooling, which
		// silently turns this call into a no-op.
		return str_replace( "\u{FF0E}", '.', $text );
	}

	/**
	 * Check a list of links against the blacklist.
	 *
	 * @param string[] $links An array of links to check against the blacklist
	 * @param ?Title $title The title of the page to which the filter shall be applied.
	 *               This is used to load the old links already on the page, so
	 *               the filter is only applied to links that got added. If not given,
	 *               the filter is applied to all $links.
	 * @param User $user Relevant user
	 * @param bool $preventLog Whether to prevent logging of hits. Set to true when
	 *               the action is testing the links rather than attempting to save them
	 *               (e.g. the API spamblacklist action)
	 * @param string $mode Either 'check' or 'stash'
	 *
	 * @return string[]|bool Matched text(s) if the edit should not be allowed; false otherwise
	 */
	public function filter(
		array $links,
		?Title $title,
		User $user,
		$preventLog = false,
		$mode = 'check'
	) {
		// Nothing to check; bail out before touching any service.
		if ( !$links ) {
			return false;
		}

		$services = MediaWikiServices::getInstance();
		$statsd = $services->getStatsdDataFactory();
		$cache = $services->getObjectCacheFactory()->getLocalClusterInstance();

		// Sort so that the same set of links always yields the same cache key.
		sort( $links );
		$key = $cache->makeKey(
			'blacklist',
			$this->getBlacklistType(),
			'pass',
			sha1( implode( "\n", $links ) ),
			md5( (string)$title )
		);
		// Skip blacklist checks if nothing matched during edit stashing...
		$knownNonMatchAsOf = $cache->get( $key );
		if ( $mode === 'check' ) {
			if ( $knownNonMatchAsOf ) {
				$statsd->increment( 'spamblacklist.check-stash.hit' );

				return false;
			} else {
				$statsd->increment( 'spamblacklist.check-stash.miss' );
			}
		} elseif ( $mode === 'stash' ) {
			if ( $knownNonMatchAsOf && ( time() - $knownNonMatchAsOf ) < self::STASH_AGE_DYING ) {
				// OK; not about to expire soon
				return false;
			}
		}

		$blacklists = $this->getBlacklists();
		$whitelists = $this->getWhitelists();

		if ( count( $blacklists ) ) {
			// poor man's anti-spoof, see bug 12896
			$newLinks = array_map( [ $this, 'antiSpoof' ], $links );

			$oldLinks = [];
			if ( $title !== null ) {
				$oldLinks = $this->getCurrentLinks( $title );
				$addedLinks = array_diff( $newLinks, $oldLinks );
			} else {
				// can't load old links, so treat all links as added.
				$addedLinks = $newLinks;
			}

			wfDebugLog( 'SpamBlacklist', "Old URLs: " . implode( ', ', $oldLinks ) );
			wfDebugLog( 'SpamBlacklist', "New URLs: " . implode( ', ', $newLinks ) );
			wfDebugLog( 'SpamBlacklist', "Added URLs: " . implode( ', ', $addedLinks ) );

			// From here on $links is a single newline-separated string that the
			// regexes are run against.
			$links = implode( "\n", $addedLinks );

			# Strip whitelisted URLs from the match
			if ( is_array( $whitelists ) ) {
				wfDebugLog( 'SpamBlacklist', "Excluding whitelisted URLs from " . count( $whitelists ) .
					" regexes: " . implode( ', ', $whitelists ) . "\n" );
				foreach ( $whitelists as $regex ) {
					AtEase::suppressWarnings();
					$strippedLinks = preg_replace( $regex, '', $links );
					AtEase::restoreWarnings();
					if ( is_string( $strippedLinks ) ) {
						// If there wasn't a regex error, strip the matching URLs
						$links = $strippedLinks;
					}
				}
			}

			# Do the match
			wfDebugLog( 'SpamBlacklist', "Checking text against " . count( $blacklists ) .
				" regexes: " . implode( ', ', $blacklists ) . "\n" );
			$retVal = false;
			foreach ( $blacklists as $regex ) {
				AtEase::suppressWarnings();
				$matches = [];
				$check = ( preg_match_all( $regex, $links, $matches ) > 0 );
				AtEase::restoreWarnings();
				if ( $check ) {
					wfDebugLog( 'SpamBlacklist', "Match!\n" );
					$ip = RequestContext::getMain()->getRequest()->getIP();
					$fullUrls = [];
					// Re-run the pattern with ".*" appended (and its modifiers
					// replaced by "Sim") so group 0 covers the rest of the line
					// for each hit, which is what gets logged.
					$fullLineRegex = substr( $regex, 0, strrpos( $regex, '/' ) ) . '.*/Sim';
					preg_match_all( $fullLineRegex, $links, $fullUrls );
					$imploded = implode( ' ', $fullUrls[0] );
					wfDebugLog( 'SpamBlacklistHit', "$ip caught submitting spam: $imploded\n" );
					if ( !$preventLog && $title ) {
						$this->logFilterHit( $user, $title, $imploded );
					}
					if ( $retVal === false ) {
						$retVal = [];
					}
					// Group 1 is the first capture group of the blacklist regex
					$retVal = array_merge( $retVal, $fullUrls[1] );
				}
			}
			if ( is_array( $retVal ) ) {
				$retVal = array_unique( $retVal );
			}
		} else {
			$retVal = false;
		}

		if ( $retVal === false ) {
			// Cache the typical negative results
			$cache->set( $key, time(), self::STASH_TTL );
			if ( $mode === 'stash' ) {
				$statsd->increment( 'spamblacklist.check-stash.store' );
			}
		}

		return $retVal;
	}

	/**
	 * Look up the links currently in the article, so we can
	 * ignore them on a second run.
	 *
	 * The result is cached in the main WAN cache under a key derived from the
	 * title's latest revision ID.
	 *
	 * WARNING: I can add more *of the same link* with no problem here.
	 * @param Title $title
	 * @return array
	 */
	public function getCurrentLinks( Title $title ) {
		$cache = MediaWikiServices::getInstance()->getMainWANObjectCache();
		// Captured here so the lookup is attributed to this method rather than
		// to the closure below.
		$fname = __METHOD__;
		return $cache->getWithSetCallback(
			// Key is warmed via warmCachesForFilter() from ApiStashEdit
			$cache->makeKey( 'external-link-list', $title->getLatestRevID() ),
			$cache::TTL_MINUTE,
			static function ( $oldValue, &$ttl, array &$setOpts ) use ( $title, $fname ) {
				$dbr = MediaWikiServices::getInstance()->getConnectionProvider()->getReplicaDatabase();
				$setOpts += Database::getCacheSetOptions( $dbr );
				return ExternalLinksLookup::getExternalLinksForPage(
					$title->getArticleID(),
					$dbr,
					$fname
				);
			}
		);
	}

	/**
	 * Run the filter in 'stash' mode with hit logging disabled, so that its
	 * cached results are available to a later 'check' run for the same links.
	 * The filter result itself is discarded.
	 *
	 * @param Title $title
	 * @param string[] $entries Links to check
	 * @param User $user
	 */
	public function warmCachesForFilter( Title $title, array $entries, User $user ) {
		$this->filter(
			$entries,
			$title,
			$user,
			// no logging
			true,
			'stash'
		);
	}

	/**
	 * Returns the start of the regex for matches
	 *
	 * The returned fragment ends with an opening parenthesis, which is closed
	 * again by getRegexEnd().
	 *
	 * @return string
	 */
	public function getRegexStart() {
		return '/(?:https?:)?\/\/+[a-z0-9_\-.]*(';
	}

	/**
	 * Returns the end of the regex for matches
	 *
	 * Closes the group opened by getRegexStart() and then appends the parent's
	 * regex end.
	 *
	 * @param int $batchSize
	 * @return string
	 */
	public function getRegexEnd( $batchSize ) {
		return ')' . parent::getRegexEnd( $batchSize );
	}

	/**
	 * Logs the filter hit to Special:Log if
	 * $wgLogSpamBlacklistHits is enabled.
	 *
	 * @param User $user
	 * @param Title $title
	 * @param string $url URL that the user attempted to add
	 */
	public function logFilterHit( User $user, $title, $url ) {
		global $wgLogSpamBlacklistHits;
		if ( $wgLogSpamBlacklistHits ) {
			$logEntry = new ManualLogEntry( 'spamblacklist', 'hit' );
			$logEntry->setPerformer( $user );
			$logEntry->setTarget( $title );
			$logEntry->setParameters( [
				'4::url' => $url,
			] );
			$logid = $logEntry->insert();
			$log = new LogPage( 'spamblacklist' );
			if ( $log->isRestricted() ) {
				// Make sure checkusers can see this action if the log is restricted
				// (which is the default)
				if ( ExtensionRegistry::getInstance()->isLoaded( 'CheckUser' ) ) {
					$rc = $logEntry->getRecentChange( $logid );
					CUHooks::updateCheckUserData( $rc );
				}
			} else {
				// If the log is unrestricted, publish normally to RC,
				// which will also update checkuser
				$logEntry->publish( $logid, "rc" );
			}
		}
	}
}
|