<?php

if ( !defined( 'MEDIAWIKI' ) ) {
	exit;
}

class SpamBlacklist extends BaseBlacklist {
	var $files = array( "http://meta.wikimedia.org/w/index.php?title=Spam_blacklist&action=raw&sb_ver=1" );

	var $ignoreEditSummary = false;

	/**
	 * Returns the code for the blacklist implementation
	 *
	 * @return string
	 */
	protected function getBlacklistType() {
		return 'spam';
	}
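
	/*
	 * Added note: 'spam' is the type key for this blacklist. Under the extension's
	 * BaseBlacklist conventions this key presumably ties the class to its per-type
	 * configuration and instance lookup (e.g. something along the lines of
	 * BaseBlacklist::getInstance( 'spam' ); the exact API is an assumption).
	 */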

	/**
	 * @param Title $title
	 * @param string $text Text of section, or entire text if $editPage!=false
	 * @param string $section Section number or name
	 * @param string $editsummary Edit summary if one exists; some people use URLs there too
	 * @param EditPage $editPage EditPage if EditFilterMerged was called, null otherwise
	 * @return array|bool Matched text(s) if the edit should not be allowed, false otherwise
	 */
	function filter( &$title, $text, $section, $editsummary = '', EditPage &$editPage = null ) {
		/**
		 * @var $wgParser Parser
		 */
		global $wgParser, $wgUser;

		$fname = 'wfSpamBlacklistFilter';
		wfProfileIn( $fname );

		# These don't do anything, commenting out...
		#$this->title = $title;
		#$this->text = $text;
		#$this->section = $section;

		# Convert the fullwidth stop "．" to a plain "." so it cannot be used to evade the filter
		$text = str_replace( '．', '.', $text ); //@bug 12896

		$blacklists = $this->getBlacklists();
		$whitelists = $this->getWhitelists();

		if ( count( $blacklists ) ) {
			# Run parser to strip SGML comments and such out of the markup
			# This was being used to circumvent the filter (see bug 5185)
			if ( $editPage ) {
				$editInfo = $editPage->mArticle->prepareTextForEdit( $text );
				$out = $editInfo->output;
			} else {
				$options = new ParserOptions();
				$text = $wgParser->preSaveTransform( $text, $title, $wgUser, $options );
				$out = $wgParser->parse( $text, $title, $options );
			}

			$newLinks = array_keys( $out->getExternalLinks() );
			$oldLinks = $this->getCurrentLinks( $title );
			$addedLinks = array_diff( $newLinks, $oldLinks );
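
			// Added note: only links introduced by this edit are checked below, so
			// URLs already recorded for the page do not block unrelated edits.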

			// Also check the edit summary, since some people put URLs there too
			if ( !$this->ignoreEditSummary && !empty( $editsummary ) ) {
				$addedLinks[] = $editsummary;
			}

			wfDebugLog( 'SpamBlacklist', "Old URLs: " . implode( ', ', $oldLinks ) );
			wfDebugLog( 'SpamBlacklist', "New URLs: " . implode( ', ', $newLinks ) );
			wfDebugLog( 'SpamBlacklist', "Added URLs: " . implode( ', ', $addedLinks ) );

			$links = implode( "\n", $addedLinks );
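
			// Added note: the remaining candidates are joined into one newline-separated
			// string so that each blacklist regex only has to be run once over all of them.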

			# Strip whitelisted URLs from the match
			if ( is_array( $whitelists ) ) {
				wfDebugLog( 'SpamBlacklist', "Excluding whitelisted URLs from " . count( $whitelists ) .
					" regexes: " . implode( ', ', $whitelists ) . "\n" );
				foreach ( $whitelists as $regex ) {
					wfSuppressWarnings();
					$newLinks = preg_replace( $regex, '', $links );
					wfRestoreWarnings();
					if ( is_string( $newLinks ) ) {
						// If there wasn't a regex error, strip the matching URLs
						$links = $newLinks;
					}
				}
			}

			# Do the match
			wfDebugLog( 'SpamBlacklist', "Checking text against " . count( $blacklists ) .
				" regexes: " . implode( ', ', $blacklists ) . "\n" );
			$retVal = false;
			foreach ( $blacklists as $regex ) {
				wfSuppressWarnings();
				$matches = array();
				$check = ( preg_match_all( $regex, $links, $matches ) > 0 );
				wfRestoreWarnings();
				if ( $check ) {
					wfDebugLog( 'SpamBlacklist', "Match!\n" );
					global $wgRequest;
					$ip = $wgRequest->getIP();
					$imploded = implode( ' ', $matches[0] );
					wfDebugLog( 'SpamBlacklistHit', "$ip caught submitting spam: $imploded\n" );
					if ( $retVal === false ) {
						$retVal = array();
					}
					$retVal = array_merge( $retVal, $matches[0] );
				}
			}
		} else {
			$retVal = false;
		}
		wfProfileOut( $fname );
		return $retVal;
	}
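
	/*
	 * Illustrative sketch (added; not part of the original file): how a caller,
	 * e.g. an edit-filter hook, might use filter(). Obtaining the instance via
	 * BaseBlacklist::getInstance() is an assumption; only the handling of the
	 * return value follows the documentation above.
	 *
	 *     $matches = BaseBlacklist::getInstance( 'spam' )
	 *         ->filter( $title, $text, $section, $summary, $editPage );
	 *     if ( $matches !== false ) {
	 *         // $matches is an array of blacklisted URLs: refuse the save and
	 *         // report them to the user.
	 *     }
	 */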

	/**
	 * Look up the links currently in the article, so we can
	 * ignore them on a second run.
	 *
	 * WARNING: a link that is already on the page can be added again
	 * without being caught here.
	 *
	 * @param Title $title
	 * @return array
	 */
	function getCurrentLinks( $title ) {
		$dbr = wfGetDB( DB_SLAVE );
		$id = $title->getArticleID(); // should be zero queries
		$res = $dbr->select( 'externallinks', array( 'el_to' ),
			array( 'el_from' => $id ), __METHOD__ );
		$links = array();
		foreach ( $res as $row ) {
			$links[] = $row->el_to;
		}
		return $links;
	}
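
	/*
	 * The select above is roughly equivalent to
	 *
	 *     SELECT el_to FROM externallinks WHERE el_from = <page id>;
	 *
	 * i.e. every external URL currently recorded for the page, which filter()
	 * then subtracts from the links found in the submitted text.
	 */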

	/**
	 * Returns the start of the regex for matches
	 *
	 * @return string
	 */
	public function getRegexStart() {
		return '/(?:https?:)?\/\/+[a-z0-9_\-.]*(';
	}
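
	// Added note: this prefix matches an optional http/https scheme, the "//" of a
	// protocol-relative URL and any leading host characters, then opens the group
	// into which the blacklist entries are inserted.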

	/**
	 * Returns the end of the regex for matches
	 *
	 * @param $batchSize
	 * @return string
	 */
	public function getRegexEnd( $batchSize ) {
		return ')' . parent::getRegexEnd( $batchSize );
	}
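
	/*
	 * Illustrative sketch (an assumption about how BaseBlacklist composes the
	 * pieces): a batch of blacklist lines is joined between getRegexStart() and
	 * getRegexEnd(), giving a pattern roughly of the form
	 *
	 *     /(?:https?:)?\/\/+[a-z0-9_\-.]*(entry1|entry2|...)
	 *
	 * followed by whatever parent::getRegexEnd() appends (presumably the closing
	 * delimiter and any flags).
	 */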
}