<?php

if ( defined( 'MEDIAWIKI' ) ) {

/**
 * Edit filter that rejects text containing URLs matched by a blacklist.
 *
 * The blacklist is assembled from one or more sources (see $files), compiled
 * into a single regular expression and cached in $wgMemc.
 */
class SpamBlacklist {
	/**
	 * Compiled blacklist regex. False until getRegex() has loaded it; getRegex()
	 * also leaves it false when the blacklist is empty or the cached value is
	 * not a well-formed regex.
	 */
	var $regex = false;

	/** Name of a filter function to run before this one, or false for none. */
	var $previousFilter = false;

	/**
	 * Blacklist sources. Each entry is either "DB: <dbname> <title>", an
	 * http:// URL, or anything else, which is handed to file().
	 */
	var $files = array();

	/** Expiry passed to $messageMemc->set() for the per-URL warning key. */
	var $warningTime = 600;

	/** Expiry passed to $messageMemc->set() for the cached HTTP text. */
	var $expiryTime = 900;

	/** Upper bound for mt_rand( 0, N ); an early refresh happens when it yields 0. */
	var $warningChance = 100;

	/**
	 * Set the default source list, then override any member from $settings.
	 *
	 * Declared before the legacy class-named method so that PHP 5+ picks this
	 * one as the constructor; PHP 8 no longer treats class-named methods as
	 * constructors at all.
	 *
	 * @param array $settings Map of member name => value
	 */
	function __construct( $settings = array() ) {
		$this->files = array( "http://meta.wikimedia.org/w/index.php?title=Spam_blacklist&action=raw&sb_ver=1" );
		foreach ( $settings as $name => $value ) {
			$this->$name = $value;
		}
	}

	/**
	 * Legacy class-named constructor, kept so that old runtimes and explicit
	 * calls to this method keep working.
	 *
	 * @param array $settings Map of member name => value
	 */
	function SpamBlacklist( $settings = array() ) {
		$this->__construct( $settings );
	}

	/**
	 * Load (or build) the blacklist regex.
	 *
	 * When $title names one of the "DB:" sources on this wiki, the edit being
	 * filtered is the blacklist itself: the shared cache is bypassed and $text
	 * is used in place of the stored article text.
	 *
	 * @param object|null $title Title being saved; must provide getPrefixedDBkey()
	 * @param string|null $text  New text of that title
	 * @return string|false Regex of the form /.../Si, or false if there is none
	 */
	function &getRegex( $title = null, $text = null ) {
		global $wgMemc, $wgDBname, $messageMemc;
		$fname = 'SpamBlacklist::getRegex';
		wfProfileIn( $fname );

		if ( $this->regex !== false ) {
			# Already loaded by an earlier call on this object
			wfProfileOut( $fname );
			return $this->regex;
		}

		wfDebug( "Loading spam regex..." );

		if ( !is_array( $this->files ) ) {
			$this->files = array( $this->files );
		}
		if ( count( $this->files ) == 0 ) {
			# No lists
			wfDebug( "no files specified\n" );
			wfProfileOut( $fname );
			# $this->regex is still false here; return the member because
			# this function returns by reference
			return $this->regex;
		}

		# Refresh cache if we are saving the blacklist
		$savedKey = is_object( $title ) ? $title->getPrefixedDBkey() : false;
		$recache = false;
		if ( $savedKey !== false ) {
			foreach ( $this->files as $fileName ) {
				if ( preg_match( '/^DB: (\w*) (.*)$/', $fileName, $matches )
					&& $wgDBname == $matches[1] && $savedKey == $matches[2] )
				{
					$recache = true;
					break;
				}
			}
		}

		if ( !$recache ) {
			$this->regex = $wgMemc->get( "spam_blacklist_regex" );
		}

		if ( !$this->regex ) {
			# Load lists
			$lines = array();
			wfDebug( "Constructing spam blacklist\n" );
			foreach ( $this->files as $fileName ) {
				if ( preg_match( '/^DB: (\w*) (.*)$/', $fileName, $matches ) ) {
					if ( $savedKey !== false && $wgDBname == $matches[1] && $savedKey == $matches[2] ) {
						# This is the page being saved: use the incoming text
						$lines = array_merge( $lines, explode( "\n", $text ) );
					} else {
						$lines = array_merge( $lines, $this->getArticleLines( $matches[1], $matches[2] ) );
					}
					wfDebug( "got from DB\n" );
				} elseif ( preg_match( '/^http:\/\//', $fileName ) ) {
					# HTTP request
					# To keep requests to a minimum, we save results into $messageMemc, which is
					# similar to $wgMemc except almost certain to exist. By default, it is stored
					# in the database
					#
					# There are two keys, when the warning key expires, a random thread will refresh
					# the real key. This reduces the chance of multiple requests under high traffic
					# conditions.
					$key = "spam_blacklist_file:$fileName";
					$warningKey = "$wgDBname:spamfilewarning:$fileName";
					$httpText = $messageMemc->get( $key );
					$warning = $messageMemc->get( $warningKey );

					if ( !is_string( $httpText ) || ( !$warning && !mt_rand( 0, $this->warningChance ) ) ) {
						wfDebug( "Loading spam blacklist from $fileName\n" );
						$httpText = $this->getHTTP( $fileName );
						$messageMemc->set( $warningKey, 1, $this->warningTime );
						$messageMemc->set( $key, $httpText, $this->expiryTime );
					} else {
						wfDebug( "got from HTTP cache\n" );
					}
					$lines = array_merge( $lines, explode( "\n", $httpText ) );
				} else {
					$fileLines = file( $fileName );
					if ( $fileLines === false ) {
						# file() failed; merging false would destroy $lines
						wfDebug( "Unable to read spam blacklist file $fileName\n" );
					} else {
						$lines = array_merge( $lines, $fileLines );
						wfDebug( "got from file\n" );
					}
				}
			}

			# Strip comments and whitespace, then remove blanks
			$lines = array_filter( array_map( 'trim', preg_replace( '/#.*$/', '', $lines ) ) );

			# No lines, don't make a regex which will match everything
			if ( count( $lines ) == 0 ) {
				wfDebug( "No lines\n" );
				$this->regex = true;
			} else {
				# Make regex
				# It's faster using the S modifier even though it will usually only be run once
				$this->regex = 'http://[a-z0-9_\-.]*(' . implode( '|', $lines ) . ')';
				# Collapse any backslashes before a slash, then escape every slash once
				$this->regex = '/' . str_replace( '/', '\/', preg_replace('|\\\*/|', '/', $this->regex) ) . '/Si';
			}
			$wgMemc->set( "spam_blacklist_regex", $this->regex, 3600 );
		} else {
			wfDebug( "got from cache\n" );
		}

		if ( $this->regex === true ) {
			# "Empty blacklist" marker (freshly built or cached): nothing to match
			$this->regex = false;
		} elseif ( !is_string( $this->regex )
			|| substr( $this->regex, 0, 1 ) != '/'
			|| substr( $this->regex, -3 ) != '/Si' )
		{
			// Corrupt regex
			wfDebug( "Corrupt regex\n" );
			$this->regex = false;
		}

		wfProfileOut( $fname );
		return $this->regex;
	}

	/**
	 * Edit filter entry point.
	 *
	 * Runs $previousFilter first (if set), then matches $text against the
	 * blacklist regex. On a match, EditPage::spamPage() is called with the
	 * matched text.
	 *
	 * @param object $title   Title being edited
	 * @param string $text    Text being saved
	 * @param mixed  $section Passed through to $previousFilter unchanged
	 * @return bool True if the edit was rejected by either filter
	 */
	function filter( &$title, $text, $section ) {
		$fname = 'wfSpamBlacklistFilter';
		wfProfileIn( $fname );

		# Call the rest of the hook chain first
		if ( $this->previousFilter ) {
			$f = $this->previousFilter;
			if ( $f( $title, $text, $section ) ) {
				wfProfileOut( $fname );
				return true;
			}
		}

		# Pass the edit along so that saving the blacklist page refreshes the regex
		$regex = $this->getRegex( $title, $text );

		$retVal = false;
		if ( is_string( $regex ) && $regex != '' && $regex[0] == '/' ) {
			# Do the match
			wfDebug( "Checking text against regex: $regex\n" );
			if ( preg_match( $regex, $text, $matches ) ) {
				wfDebug( "Match!\n" );
				EditPage::spamPage( $matches[0] );
				$retVal = true;
			}
		}

		wfProfileOut( $fname );
		return $retVal;
	}

	/**
	 * Fetch the text of a main-namespace article from the cur table of the
	 * given database and split it into lines.
	 *
	 * @param string $db      Database name
	 * @param string $article Value to match against cur_title
	 * @return array Lines of the article, or an empty array if no row was found
	 */
	function getArticleLines( $db, $article ) {
		$fname = 'SpamBlacklist::getArticleLines';
		$dbr = wfGetDB( DB_READ );
		$cur = $dbr->tableName( 'cur' );
		# Escape the title so that quotes in it cannot break the query
		$encArticle = $dbr->strencode( $article );
		$res = $dbr->query( "SELECT cur_text FROM $db.$cur WHERE cur_namespace=0 AND cur_title='$encArticle'", $fname );
		$row = $dbr->fetchObject( $res );
		$dbr->freeResult( $res );

		if ( !$row ) {
			return array();
		}
		return explode( "\n", $row->cur_text );
	}

	/**
	 * Fetch a URL and return its body.
	 *
	 * @param string $url
	 * @return mixed Whatever wfGetHTTP() or file_get_contents() returned
	 */
	function getHTTP( $url ) {
		// Use wfGetHTTP from MW 1.5 if it is available
		include_once( 'HttpFunctions.php' );
		if ( function_exists( 'wfGetHTTP' ) ) {
			return wfGetHTTP( $url );
		}

		$url_fopen = ini_set( 'allow_url_fopen', 1 );
		$text = file_get_contents( $url );
		if ( $url_fopen !== false ) {
			// Only restore the old value if ini_set() actually changed it
			ini_set( 'allow_url_fopen', $url_fopen );
		}
		return $text;
	}
}

} # End invocation guard

?>