2004-12-11 09:59:06 +00:00
|
|
|
<?php
|
|
|
|
|
2007-11-12 07:44:17 +00:00
|
|
|
// Not a valid standalone entry point: stop immediately unless this file
// is being loaded from within MediaWiki (which defines MEDIAWIKI).
defined( 'MEDIAWIKI' ) or exit;
|
2004-12-11 09:59:06 +00:00
|
|
|
|
|
|
|
/**
 * Edit filter that rejects edits adding external links which match a set of
 * blacklist regular expressions. Blacklists are read from the local
 * 'spam-blacklist' message and from the sources listed in $files; matches can
 * be exempted through the local 'spam-whitelist' message.
 */
class SpamBlacklist {
	/** Combined blacklist regexes, or false while not loaded yet (see getBlacklists()) */
	var $regexes = false;

	/** Optional callable invoked by filter() before the blacklist check */
	var $previousFilter = false;

	/**
	 * Shared blacklist sources. Each entry is one of:
	 *  - "DB: <dbname> <page title>"  a page in a local MediaWiki database
	 *  - "http://..."                 a URL fetched over HTTP
	 *  - anything else                a path handed to file_get_contents()
	 */
	var $files = array( "http://meta.wikimedia.org/w/index.php?title=Spam_blacklist&action=raw&sb_ver=1" );

	/** Lifetime (passed to the cache as expiry) of the "warning" key in getHttpText() */
	var $warningTime = 600;

	/** Lifetime (passed to the cache as expiry) of cached regexes and cached HTTP text */
	var $expiryTime = 900;

	/** Upper bound for the mt_rand() draw deciding whether to refresh early in getHttpText() */
	var $warningChance = 100;

	/** Title, text and section of the edit most recently passed to filter() */
	var $title = null;
	var $text = null;
	var $section = null;

	/**
	 * Copy every entry of $settings onto the same-named property of this object.
	 *
	 * @param array $settings map of property name => value
	 */
	function __construct( $settings = array() ) {
		foreach ( $settings as $name => $value ) {
			$this->$name = $value;
		}
	}

	/**
	 * Old-style constructor, kept so that PHP versions which do not know
	 * __construct() (and any explicit callers) keep working.
	 *
	 * @param array $settings map of property name => value
	 */
	function SpamBlacklist( $settings = array() ) {
		$this->__construct( $settings );
	}

	/**
	 * Check if the given local page title is a spam regex source.
	 *
	 * A title counts as a source when it is MediaWiki:Spam-blacklist or
	 * MediaWiki:Spam-whitelist, when a "DB:" entry of $files names it on the
	 * current database, or when an entry of $files is its action=raw URL
	 * (optionally followed by further query parameters).
	 *
	 * @param Title $title
	 * @return bool
	 */
	function isLocalSource( $title ) {
		global $wgDBname;

		if ( $title->getNamespace() == NS_MEDIAWIKI ) {
			$sources = array(
				"Spam-blacklist",
				"Spam-whitelist" );
			if ( in_array( $title->getDBkey(), $sources ) ) {
				return true;
			}
		}

		$thisHttp = $title->getFullUrl( 'action=raw' );
		$thisHttpRegex = '/^' . preg_quote( $thisHttp, '/' ) . '(?:&.*)?$/';

		foreach ( $this->files as $fileName ) {
			// Same "DB:" syntax as buildSharedBlacklists(), so that database
			// names containing hyphens are recognised here as well.
			if ( preg_match( '/^DB: ([\w-]*) (.*)$/', $fileName, $matches ) ) {
				if ( $wgDBname == $matches[1]
					&& $matches[2] == $title->getPrefixedDBkey() )
				{
					// Local DB fetch of this page...
					return true;
				}
			} elseif ( preg_match( $thisHttpRegex, $fileName ) ) {
				// Raw view of this page
				return true;
			}
		}

		return false;
	}

	/**
	 * @deprecated back-compat alias of getBlacklists()
	 * @return array
	 */
	function getRegexes() {
		return $this->getBlacklists();
	}

	/**
	 * Fetch local and (possibly cached) remote blacklists.
	 * The merged result is kept in $this->regexes, so repeated calls on the
	 * same object do not rebuild it.
	 *
	 * @return array set of regular expressions, potentially empty.
	 */
	function getBlacklists() {
		if ( $this->regexes === false ) {
			$this->regexes = array_merge(
				$this->getLocalBlacklists(),
				$this->getSharedBlacklists() );
		}
		return $this->regexes;
	}

	/**
	 * Fetch (possibly cached) remote blacklists.
	 *
	 * Returns an empty array when $files is empty. Otherwise the regexes are
	 * taken from $wgMemc if an array is stored there, else rebuilt with
	 * buildSharedBlacklists() and stored for $expiryTime.
	 *
	 * @return array
	 */
	function getSharedBlacklists() {
		global $wgMemc, $wgDBname;
		$fname = 'SpamBlacklist::getRegex';
		wfProfileIn( $fname );

		wfDebugLog( 'SpamBlacklist', "Loading spam regex..." );

		if ( count( $this->files ) == 0 ) {
			# No lists
			wfDebugLog( 'SpamBlacklist', "no files specified\n" );
			wfProfileOut( $fname );
			return array();
		}

		// The key is prefixed with this wiki's database name, so wikis that
		// share a cache server but differ in configuration keep separate entries.
		$cacheKey = "$wgDBname:spam_blacklist_regexes";
		$cachedRegexes = $wgMemc->get( $cacheKey );
		if ( is_array( $cachedRegexes ) ) {
			wfDebugLog( 'SpamBlacklist', "Got shared spam regexes from cache\n" );
			wfProfileOut( $fname );
			return $cachedRegexes;
		}

		$regexes = $this->buildSharedBlacklists();
		$wgMemc->set( $cacheKey, $regexes, $this->expiryTime );

		// Close the profiling section on the cache-miss path too; every
		// wfProfileIn() above needs a matching wfProfileOut().
		wfProfileOut( $fname );
		return $regexes;
	}

	/**
	 * Remove the cached shared blacklist regexes of this wiki from $wgMemc.
	 */
	function clearCache() {
		global $wgMemc, $wgDBname;
		$wgMemc->delete( "$wgDBname:spam_blacklist_regexes" );
		wfDebugLog( 'SpamBlacklist', "Spam blacklist local cache cleared.\n" );
	}

	/**
	 * Load the text of every entry in $files and turn it into regexes.
	 *
	 * @return array of regular expressions, potentially empty
	 */
	function buildSharedBlacklists() {
		$regexes = array();
		# Load lists
		wfDebugLog( 'SpamBlacklist', "Constructing spam blacklist\n" );
		foreach ( $this->files as $fileName ) {
			if ( preg_match( '/^DB: ([\w-]*) (.*)$/', $fileName, $matches ) ) {
				$text = $this->getArticleText( $matches[1], $matches[2] );
			} elseif ( preg_match( '/^http:\/\//', $fileName ) ) {
				$text = $this->getHttpText( $fileName );
			} else {
				$text = file_get_contents( $fileName );
				wfDebugLog( 'SpamBlacklist', "got from file $fileName\n" );
			}

			// Build a separate batch of regexes from each source.
			// While in theory we could squeeze a little efficiency
			// out of combining multiple sources in one regex, if
			// there's a bad line in one of them we'll gain more
			// from only having to break that set into smaller pieces.
			$regexes = array_merge( $regexes,
				SpamRegexBatch::regexesFromText( $text, $fileName ) );
		}

		return $regexes;
	}

	/**
	 * Get the text behind an HTTP blacklist source, going through $messageMemc.
	 *
	 * @param string $fileName URL of the source
	 * @return mixed the cached or freshly fetched text; whatever getHTTP()
	 *   returned (false on error) when a fetch was attempted
	 */
	function getHttpText( $fileName ) {
		global $wgDBname, $messageMemc;

		# HTTP request
		# To keep requests to a minimum, we save results into $messageMemc, which is
		# similar to $wgMemc except almost certain to exist. By default, it is stored
		# in the database
		#
		# There are two keys, when the warning key expires, a random thread will refresh
		# the real key. This reduces the chance of multiple requests under high traffic
		# conditions.
		$key = "spam_blacklist_file:$fileName";
		$warningKey = "$wgDBname:spamfilewarning:$fileName";
		$httpText = $messageMemc->get( $key );
		$warning = $messageMemc->get( $warningKey );

		// Fetch when nothing usable is cached, or - once the warning key is
		// gone - when mt_rand() happens to draw 0 out of 0..$warningChance.
		if ( !is_string( $httpText ) || ( !$warning && !mt_rand( 0, $this->warningChance ) ) ) {
			wfDebugLog( 'SpamBlacklist', "Loading spam blacklist from $fileName\n" );
			$httpText = $this->getHTTP( $fileName );
			if ( $httpText === false ) {
				wfDebugLog( 'SpamBlacklist', "Error loading blacklist from $fileName\n" );
			}
			$messageMemc->set( $warningKey, 1, $this->warningTime );
			$messageMemc->set( $key, $httpText, $this->expiryTime );
		} else {
			wfDebugLog( 'SpamBlacklist', "Got spam blacklist from HTTP cache for $fileName\n" );
		}
		return $httpText;
	}

	/**
	 * Regexes built from the local 'spam-blacklist' message.
	 *
	 * @return array
	 */
	function getLocalBlacklists() {
		return SpamRegexBatch::regexesFromMessage( 'spam-blacklist' );
	}

	/**
	 * Regexes built from the local 'spam-whitelist' message.
	 *
	 * @return array
	 */
	function getWhitelists() {
		return SpamRegexBatch::regexesFromMessage( 'spam-whitelist' );
	}

	/**
	 * @param Title $title
	 * @param string $text Text of section, or entire text if $editPage!=false
	 * @param string $section Section number or name
	 * @param EditPage $editPage EditPage if EditFilterMerged was called, false otherwise
	 * @return bool True if the edit should not be allowed, false otherwise
	 *   If the return value is true, an error will have been sent to $wgOut
	 */
	function filter( &$title, $text, $section, $editPage = false ) {
		global $wgParser, $wgUser;

		$fname = 'wfSpamBlacklistFilter';
		wfProfileIn( $fname );

		# Call the rest of the hook chain first
		if ( $this->previousFilter ) {
			$f = $this->previousFilter;
			if ( $f( $title, $text, $section ) ) {
				wfProfileOut( $fname );
				return true;
			}
		}

		$this->title = $title;
		$this->text = $text;
		$this->section = $section;

		$blacklists = $this->getBlacklists();
		$whitelists = $this->getWhitelists();

		$retVal = false;
		if ( count( $blacklists ) ) {
			# Run parser to strip SGML comments and such out of the markup
			# This was being used to circumvent the filter (see bug 5185)
			if ( $editPage ) {
				$editInfo = $editPage->mArticle->prepareTextForEdit( $text );
				$out = $editInfo->output;
			} else {
				$options = new ParserOptions();
				$text = $wgParser->preSaveTransform( $text, $title, $wgUser, $options );
				$out = $wgParser->parse( $text, $title, $options );
			}
			// Only the external links found by the parser are checked,
			// one URL per line.
			$links = implode( "\n", array_keys( $out->getExternalLinks() ) );

			# Strip whitelisted URLs from the match
			if ( is_array( $whitelists ) ) {
				wfDebugLog( 'SpamBlacklist', "Excluding whitelisted URLs from " . count( $whitelists ) .
					" regexes: " . implode( ', ', $whitelists ) . "\n" );
				foreach ( $whitelists as $regex ) {
					wfSuppressWarnings();
					$newLinks = preg_replace( $regex, '', $links );
					wfRestoreWarnings();
					if ( is_string( $newLinks ) ) {
						// If there wasn't a regex error, strip the matching URLs
						$links = $newLinks;
					}
				}
			}

			# Do the match
			wfDebugLog( 'SpamBlacklist', "Checking text against " . count( $blacklists ) .
				" regexes: " . implode( ', ', $blacklists ) . "\n" );
			foreach ( $blacklists as $regex ) {
				wfSuppressWarnings();
				$check = preg_match( $regex, $links, $matches );
				wfRestoreWarnings();
				if ( $check ) {
					wfDebugLog( 'SpamBlacklist', "Match!\n" );
					if ( $editPage ) {
						$editPage->spamPage( $matches[0] );
					} else {
						EditPage::spamPage( $matches[0] );
					}
					$retVal = true;
					break;
				}
			}
		}

		wfProfileOut( $fname );
		return $retVal;
	}

	/**
	 * Fetch an article from this or another local MediaWiki database.
	 * This is probably *very* fragile, and shouldn't be used perhaps.
	 *
	 * The connection is switched to $db for the lookup and switched back to
	 * $wgDBname before returning.
	 *
	 * @param string $db
	 * @param string $article
	 * @return string article text, or '' if nothing was found
	 */
	function getArticleText( $db, $article ) {
		wfDebugLog( 'SpamBlacklist', "Fetching local spam blacklist from '$article' on '$db'...\n" );
		global $wgDBname;
		$dbr = wfGetDB( DB_READ );
		$dbr->selectDB( $db );
		$text = false;
		if ( $dbr->tableExists( 'page' ) ) {
			// 1.5 schema
			$dbw =& wfGetDB( DB_READ );
			$dbw->selectDB( $db );
			// Title::newFromText() yields no object for an unusable title;
			// skip the lookup then instead of failing on a non-object.
			$title = Title::newFromText( $article );
			$revision = $title ? Revision::newFromTitle( $title ) : null;
			if ( $revision ) {
				$text = $revision->getText();
			}
			$dbw->selectDB( $wgDBname );
		} else {
			// 1.4 schema
			$title = Title::newFromText( $article );
			if ( $title ) {
				$text = $dbr->selectField( 'cur', 'cur_text', array( 'cur_namespace' => $title->getNamespace(),
					'cur_title' => $title->getDBkey() ), 'SpamBlacklist::getArticleText' );
			}
		}
		$dbr->selectDB( $wgDBname );
		return strval( $text );
	}

	/**
	 * Fetch a URL, with warnings suppressed for the duration of the request.
	 *
	 * @param string $url
	 * @return mixed whatever wfGetHTTP() or file_get_contents() returned
	 */
	function getHTTP( $url ) {
		// Use wfGetHTTP from MW 1.5 if it is available
		global $IP;
		include_once( "$IP/includes/HttpFunctions.php" );
		wfSuppressWarnings();
		if ( function_exists( 'wfGetHTTP' ) ) {
			$text = wfGetHTTP( $url );
		} else {
			// Fall back to PHP's URL wrappers, putting the previous
			// allow_url_fopen value back afterwards.
			$url_fopen = ini_set( 'allow_url_fopen', 1 );
			$text = file_get_contents( $url );
			ini_set( 'allow_url_fopen', $url_fopen );
		}
		wfRestoreWarnings();
		return $text;
	}

	/**
	 * Confirm that a local blacklist page being saved is valid,
	 * and toss back a warning to the user if it isn't.
	 * This is an EditFilter hook.
	 *
	 * @param EditPage $editPage
	 * @param string $text text being saved
	 * @param string $section unused
	 * @param string $hookError receives the HTML error box when bad lines are found
	 * @return bool always true; problems are reported through $hookError only
	 */
	function validate( $editPage, $text, $section, &$hookError ) {
		$thisPageName = $editPage->mTitle->getPrefixedDBkey();

		if ( !$this->isLocalSource( $editPage->mTitle ) ) {
			wfDebugLog( 'SpamBlacklist', "Spam blacklist validator: [[$thisPageName]] not a local blacklist\n" );
			return true;
		}

		$lines = explode( "\n", $text );

		$badLines = SpamRegexBatch::getBadLines( $lines );
		if ( $badLines ) {
			wfDebugLog( 'SpamBlacklist', "Spam blacklist validator: [[$thisPageName]] given invalid input lines: " .
				implode( ', ', $badLines ) . "\n" );

			// One wikitext bullet per offending line, escaped so that the
			// line is shown literally.
			$badList = "*<tt>" .
				implode( "</tt>\n*<tt>",
					array_map( 'wfEscapeWikiText', $badLines ) ) .
				"</tt>\n";
			$hookError =
				"<div class='errorbox'>" .
				wfMsgExt( 'spam-invalid-lines', array( 'parsemag' ), count( $badLines ) ) .
				$badList .
				"</div>\n" .
				"<br clear='all' />\n";
			return true;
		} else {
			wfDebugLog( 'SpamBlacklist', "Spam blacklist validator: [[$thisPageName]] ok or empty blacklist\n" );
			return true;
		}
	}

	/**
	 * Article save handler: drop the cached shared regexes when the saved
	 * page is one of the blacklist sources.
	 *
	 * @return bool always true
	 */
	function onArticleSave( &$article, &$user, $text, $summary, $isminor, $iswatch, $section ) {
		if ( $this->isLocalSource( $article->getTitle() ) ) {
			$this->clearCache();
		}
		return true;
	}
}
|
|
|
|
|
2007-01-06 20:56:46 +00:00
|
|
|
|
2007-07-20 21:13:26 +00:00
|
|
|
/**
 * Static helpers that turn lists of blacklist/whitelist lines (regex
 * fragments) into complete URL-matching regular expressions.
 */
class SpamRegexBatch {
	/**
	 * Build a set of regular expressions matching URLs with the list of regex fragments.
	 * Returns an empty list if the input list is empty.
	 *
	 * Fragments are joined with '|' into one alternation until adding the next
	 * fragment would push the joined length past $batchSize; a new regex is
	 * started at that point.
	 *
	 * @param array $lines list of fragments which will match in URLs
	 * @param int $batchSize largest allowed batch regex;
	 *                       if 0, will produce one regex per line
	 * @return array
	 * @private
	 * @static
	 */
	static function buildRegexes( $lines, $batchSize=4096 ) {
		# Make regex
		# It's faster using the S modifier even though it will usually only be run once
		$regexes = array();
		$regexStart = '/https?:\/\/+[a-z0-9_\-.]*(';
		// The S modifier is only added for batched patterns.
		$regexEnd = ( $batchSize > 0 ) ? ')/Si' : ')/i';
		$build = false;
		foreach ( $lines as $line ) {
			// FIXME: not very robust size check, but should work. :)
			if ( $build === false ) {
				$build = $line;
			} elseif ( strlen( $build ) + strlen( $line ) > $batchSize ) {
				// Current batch is full: emit it and start the next one.
				$regexes[] = $regexStart . SpamRegexBatch::escapeSlashes( $build ) . $regexEnd;
				$build = $line;
			} else {
				$build .= '|';
				$build .= $line;
			}
		}
		if ( $build !== false ) {
			$regexes[] = $regexStart . SpamRegexBatch::escapeSlashes( $build ) . $regexEnd;
		}
		return $regexes;
	}

	/**
	 * Make a fragment safe for use between '/' delimiters: any run of
	 * backslashes directly in front of a slash is removed first, then every
	 * slash is escaped exactly once.
	 *
	 * @param string $fragment
	 * @return string
	 * @private
	 * @static
	 */
	static function escapeSlashes( $fragment ) {
		return str_replace( '/', '\/', preg_replace( '|\\\*/|', '/', $fragment ) );
	}

	/**
	 * Confirm that a set of regexes is either empty or valid.
	 * Each regex is run once against an empty string with warnings suppressed.
	 *
	 * @param array $regexes set of regexes
	 * @return bool true if ok, false if contains invalid lines
	 * @private
	 * @static
	 */
	static function validateRegexes( $regexes ) {
		foreach ( $regexes as $regex ) {
			wfSuppressWarnings();
			$ok = preg_match( $regex, '' );
			wfRestoreWarnings();

			// preg_match() returns false (not 0) when the pattern is broken.
			if ( $ok === false ) {
				return false;
			}
		}
		return true;
	}

	/**
	 * Strip comments and whitespace, then remove blanks.
	 * Array keys of the remaining lines are preserved.
	 *
	 * @param array $lines
	 * @return array
	 * @private
	 * @static
	 */
	static function stripLines( $lines ) {
		$trimmed = array_map( 'trim',
			preg_replace( '/#.*$/', '', $lines ) );
		$kept = array();
		foreach ( $trimmed as $index => $line ) {
			// Compare against '' explicitly: a plain truthiness test would
			// also throw away a line consisting of "0".
			if ( $line !== '' ) {
				$kept[$index] = $line;
			}
		}
		return $kept;
	}

	/**
	 * Do a sanity check on the batch regex.
	 *
	 * @param array $lines unsanitized input lines
	 * @param string $fileName optional for debug reporting
	 * @return array of regexes
	 * @private
	 * @static
	 */
	static function buildSafeRegexes( $lines, $fileName=false ) {
		$lines = SpamRegexBatch::stripLines( $lines );
		$regexes = SpamRegexBatch::buildRegexes( $lines );
		if ( SpamRegexBatch::validateRegexes( $regexes ) ) {
			return $regexes;
		} else {
			// _Something_ broke... rebuild line-by-line; it'll be
			// slower if there's a lot of blacklist lines, but one
			// broken line won't take out hundreds of its brothers.
			if ( $fileName ) {
				wfDebugLog( 'SpamBlacklist', "Spam blacklist warning: bogus line in $fileName\n" );
			}
			return SpamRegexBatch::buildRegexes( $lines, 0 );
		}
	}

	/**
	 * Find the input lines that do not compile into a valid regex.
	 *
	 * @param array $lines
	 * @return array of input lines which produce invalid input, or empty array if no problems
	 * @static
	 */
	static function getBadLines( $lines ) {
		$lines = SpamRegexBatch::stripLines( $lines );
		$regexes = SpamRegexBatch::buildRegexes( $lines );
		if ( SpamRegexBatch::validateRegexes( $regexes ) ) {
			// No problems!
			return array();
		}

		// The batch as a whole is broken; test every line on its own to
		// find the culprits.
		$badLines = array();
		foreach ( $lines as $line ) {
			$regexes = SpamRegexBatch::buildRegexes( array( $line ) );
			if ( !SpamRegexBatch::validateRegexes( $regexes ) ) {
				$badLines[] = $line;
			}
		}
		return $badLines;
	}

	/**
	 * Build a set of regular expressions from the given multiline input text,
	 * with empty lines and comments stripped.
	 *
	 * @param string $source
	 * @param string $fileName optional, for reporting of bad files
	 * @return array of regular expressions, potentially empty
	 * @static
	 */
	static function regexesFromText( $source, $fileName=false ) {
		$lines = explode( "\n", $source );
		return SpamRegexBatch::buildSafeRegexes( $lines, $fileName );
	}

	/**
	 * Build a set of regular expressions from a MediaWiki message.
	 * Will be correctly empty if the message isn't present.
	 *
	 * @param string $message name of the message
	 * @return array of regular expressions, potentially empty
	 * @static
	 */
	static function regexesFromMessage( $message ) {
		$source = wfMsgForContent( $message );
		if ( $source && !wfEmptyMsg( $message, $source ) ) {
			return SpamRegexBatch::regexesFromText( $source );
		} else {
			return array();
		}
	}
}
|
|
|
|
|