2006-01-19 17:17:03 +00:00
|
|
|
<?php
|
|
|
|
|
|
|
|
/**
|
|
|
|
* An aggressive spam cleanup script.
|
|
|
|
* Searches the database for matching pages, and reverts them to the last non-spammed revision.
|
|
|
|
* If all revisions contain spam, blanks the page (deleting it was judged too risky)
|
|
|
|
*/
|
|
|
|
|
|
|
|
require_once( '../../maintenance/commandLine.inc' );
|
|
|
|
require_once( 'SpamBlacklist_body.php' );
|
|
|
|
|
2007-01-06 20:56:46 +00:00
|
|
|
/**
 * Revert a page to its most recent revision that matches none of the
 * blacklist regexes. When every revision in the history matches, the page
 * is saved with empty text instead.
 *
 * @param Revision $rev Revision to start checking from; earlier revisions are tried in turn
 * @param array $regexes Blacklist patterns, each handed to preg_match()
 * @param string $match The matched link text, quoted in the edit summary
 */
function cleanupArticle( Revision $rev, $regexes, $match ) {
	$title = $rev->getTitle();
	$revId = $rev->getId();

	// Step backwards through the history until a clean revision is found
	// or there are no older revisions left.
	while ( $rev ) {
		$isSpam = false;
		foreach ( $regexes as $regex ) {
			if ( preg_match( $regex, $rev->getText() ) ) {
				$isSpam = true;
				break;
			}
		}
		if ( !$isSpam ) {
			// This revision is clean, keep it as the revert target
			break;
		}

		# Revision::getPrevious can't be used in this way before MW 1.6 (Revision.php 1.26),
		# so the previous revision is looked up through the title instead.
		$revId = $title->getPreviousRevisionID( $revId );
		$rev = $revId ? Revision::newFromTitle( $title, $revId ) : false;
	}

	if ( $rev ) {
		// Restore the text of the clean revision
		$text = $rev->getText();
		$comment = "Cleaning up links to $match";
	} else {
		// No clean revision exists. Deleting the page was considered too
		// scary, so it is blanked instead.
		print "All revisions are spam, blanking...\n";
		$text = '';
		$comment = "All revisions matched the spam blacklist ($match), blanking";
	}

	$wikiPage = new WikiPage( $title );
	$wikiPage->doEditContent( ContentHandler::makeContent( $text, $title ), $comment );
}
|
|
|
|
|
|
|
|
//------------------------------------------------------------------------------
|
|
|
|
|
2006-04-02 03:50:06 +00:00
|
|
|
// All edits made by this script are attributed to a dedicated account.
$username = 'Spam cleanup script';
if ( !method_exists( 'User', 'newSystemUser' ) ) {
	// User::newSystemUser() is not available here: look the account up by
	// name and create it the first time the script runs.
	$wgUser = User::newFromName( $username );
	if ( $wgUser->idForName() == 0 ) {
		// Create the user
		$status = $wgUser->addToDatabase();
		// Proceed on either a null result or an OK status
		if ( $status === null || $status->isOK() ) {
			// Store a placeholder password value for the new account
			$dbw = wfGetDB( DB_MASTER );
			$dbw->update(
				'user',
				array( 'user_password' => 'nologin' ),
				array( 'user_name' => $username ),
				$username
			);
		}
	}
} else {
	$wgUser = User::newSystemUser( $username, array( 'steal' => true ) );
}
|
|
|
|
|
2006-01-21 23:27:39 +00:00
|
|
|
// The "n" command line option turns this into a dry run: matches are
// reported but no page is edited.
$dryRun = isset( $options['n'] );

// Build the blacklist, optionally overriding its source files, and fetch
// the regexes used for both the scan and the per-revision checks.
$sb = new SpamBlacklist( $wgSpamBlacklistSettings );
if ( $wgSpamBlacklistFiles ) {
	$sb->files = $wgSpamBlacklistFiles;
}

$regexes = $sb->getBlacklists();
if ( !$regexes ) {
	// Nothing usable to match against, so there is nothing to clean up
	print "Invalid regex, can't clean up spam\n";
	exit( 1 );
}
|
|
|
|
|
2010-02-13 23:03:40 +00:00
|
|
|
// Scan every page ID up to the current maximum and check the revision
// loaded for it against the blacklist regexes.
$dbr = wfGetDB( DB_SLAVE );
// Normalise to an integer: with an empty page table MAX(page_id) has no
// usable value, and the percentage maths below must not divide by it.
$maxID = (int)$dbr->selectField( 'page', 'MAX(page_id)' );
$reportingInterval = 100;

// The regexes are strings, so their size in bytes is strlen(); count()
// does not measure strings and throws a TypeError on PHP 8.
print "Regexes are " . implode( ', ', array_map( 'strlen', $regexes ) ) . " bytes\n";
print "Searching for spam in $maxID pages...\n";
if ( $dryRun ) {
	print "Dry run only\n";
}

for ( $id = 1; $id <= $maxID; $id++ ) {
	// Progress indicator, rewritten in place thanks to the trailing \r
	if ( $id % $reportingInterval == 0 ) {
		printf( "%-8d %-5.2f%%\r", $id, $id / $maxID * 100 );
	}

	$revision = Revision::loadFromPageId( $dbr, $id );
	if ( !$revision ) {
		// No revision could be loaded for this page ID
		continue;
	}

	$text = $revision->getText();
	if ( !$text ) {
		// Nothing to match against
		continue;
	}

	foreach ( $regexes as $regex ) {
		if ( !preg_match( $regex, $text, $matches ) ) {
			continue;
		}

		$title = $revision->getTitle();
		$titleText = $title->getPrefixedText();
		if ( $dryRun ) {
			print "\nFound spam in [[$titleText]]\n";
		} else {
			print "\nCleaning up links to {$matches[0]} in [[$titleText]]\n";
			$match = str_replace( 'http://', '', $matches[0] );
			cleanupArticle( $revision, $regexes, $match );
		}

		// One hit is enough: cleanupArticle() checks every regex itself, so
		// continuing would only repeat the report and the cleanup for the
		// same, now outdated, revision.
		break;
	}
}

// Just for satisfaction
$scanned = $id - 1;
printf( "%-8d %-5.2f%%\n", $scanned, $maxID > 0 ? $scanned / $maxID * 100 : 100 );
|
2006-01-19 17:17:03 +00:00
|
|
|
|