mediawiki-extensions-SpamBl.../cleanup.php
Tim Starling c9d346125c * Revert r66878, completely misses the point of factoring out doEdit() in the first place, which was to separate the UI layer from the backend layer, to support callers with alternate UIs or no UIs.
* Reverted followup 66880.
* Reverted dependent changes r67752, r68606, r68608, r68609. The point of deprecating insertArticle()/updateArticle() was to allow the UI code to be moved to EditPage. If you move that exact EditPage-specific functionality back into Article::doEdit(), and call it from all sorts of non-EditPage places, then we'll hit the same sorts of bugs we had before r14834.
2010-08-09 08:33:42 +00:00

145 lines
3.9 KiB
PHP

<?php
/**
* An aggressive spam cleanup script.
* Searches the database for matching pages, and reverts them to the last non-spammed revision.
* If all revisions contain spam, blanks the page (deletion was judged too risky and is disabled)
*/
require_once( '../../maintenance/commandLine.inc' );
require_once( 'SpamBlacklist_body.php' );
/**
 * Revert a spammed page to its most recent revision that matches none of the
 * blacklist regexes. When every revision in the page history matches, the page
 * is blanked instead.
 *
 * The edit is wrapped in an explicit transaction on the master database and
 * followed by a run of the deferred updates.
 *
 * @param $rev Revision to start from; the history is walked backwards from it
 * @param $regexes Array of PCRE patterns, each one handed to preg_match()
 * @param $match String describing the matched link, used in the edit summary
 */
function cleanupArticle( $rev, $regexes, $match ) {
	$title = $rev->getTitle();
	$currentId = $rev->getId();

	// Walk back through the history until a revision matches no regex.
	while ( $rev ) {
		$isSpam = false;
		foreach ( $regexes as $pattern ) {
			if ( preg_match( $pattern, $rev->getText() ) ) {
				$isSpam = true;
				break;
			}
		}
		if ( !$isSpam ) {
			// This revision is clean, stop here
			break;
		}

		// Revision::getPrevious can't be used in this way before MW 1.6
		// (Revision.php 1.26), so step back through the title instead.
		$currentId = $title->getPreviousRevisionID( $currentId );
		$rev = $currentId ? Revision::newFromTitle( $title, $currentId ) : false;
	}

	$dbw = wfGetDB( DB_MASTER );
	$dbw->begin();
	if ( $rev ) {
		// Revert to the clean revision
		$newText = $rev->getText();
		$summary = "Cleaning up links to $match";
	} else {
		// No clean revision exists. Deleting the page outright
		// (Article::doDeleteArticle) was considered too scary, so blank it.
		print "All revisions are spam, blanking...\n";
		$newText = '';
		$summary = "All revisions matched the spam blacklist ($match), blanking";
	}
	$article = new Article( $title );
	$article->updateArticle( $newText, $summary, false, false );
	$dbw->commit();
	wfDoUpdates();
}
/**
 * Fallback definition of wfDoUpdates(), declared only when no function of
 * that name already exists.
 *
 * Runs every queued deferred update, then every queued post-commit update,
 * and finally empties both global queues.
 *
 * TODO: This could be in Wiki.php if that class made any sense at all
 */
if ( !function_exists( 'wfDoUpdates' ) ) {
	function wfDoUpdates() {
		global $wgPostCommitUpdateList, $wgDeferredUpdateList;

		// Deferred updates go first...
		foreach ( $wgDeferredUpdateList as $deferred ) {
			$deferred->doUpdate();
		}
		// ...then the post-commit ones.
		foreach ( $wgPostCommitUpdateList as $postCommit ) {
			$postCommit->doUpdate();
		}

		// Clear both queues so nothing is run twice
		$wgPostCommitUpdateList = $wgDeferredUpdateList = array();
	}
}
//------------------------------------------------------------------------------
// Main script body.
//
// Scans the latest revision of every page ID from 1 up to MAX(page_id) and,
// for each page whose text matches a blacklist regex, either reports it
// (dry run, option "n") or hands it to cleanupArticle().

// All edits are attributed to a dedicated account, created on first use.
$username = 'Spam cleanup script';
$wgUser = User::newFromName( $username );
if ( $wgUser->idForName() == 0 ) {
	// Create the user
	$wgUser->addToDatabase();
	$dbw = wfGetDB( DB_MASTER );
	// NOTE(review): 'nologin' is presumably an unusable password value that
	// keeps anyone from logging in as this account - confirm.
	$dbw->update( 'user', array( 'user_password' => 'nologin' ),
		array( 'user_name' => $username ), $username );
}

// With the "n" option, only report what would be cleaned up
$dryRun = isset( $options['n'] );

$sb = new SpamBlacklist( $wgSpamBlacklistSettings );
if ( $wgSpamBlacklistFiles ) {
	$sb->files = $wgSpamBlacklistFiles;
}
$regexes = $sb->getregexes();
if ( !$regexes ) {
	print "Invalid regex, can't clean up spam\n";
	exit(1);
}

$dbr = wfGetDB( DB_SLAVE );
// Cast so that an empty page table yields 0 rather than a non-numeric value
$maxID = (int)$dbr->selectField( 'page', 'MAX(page_id)' );
$reportingInterval = 100;

// Each regex is a pattern string, so its size in bytes is strlen(), not count()
print "Regexes are " . implode( ', ', array_map( 'strlen', $regexes ) ) . " bytes\n";
print "Searching for spam in $maxID pages...\n";
if ( $dryRun ) {
	print "Dry run only\n";
}

for ( $id = 1; $id <= $maxID; $id++ ) {
	if ( $id % $reportingInterval == 0 ) {
		// "\r" keeps the progress display on a single terminal line
		printf( "%-8d %-5.2f%%\r", $id, $id / $maxID * 100 );
	}
	$revision = Revision::loadFromPageId( $dbr, $id );
	if ( !$revision ) {
		// No revision could be loaded for this page ID
		continue;
	}
	$text = $revision->getText();
	if ( !$text ) {
		continue;
	}
	foreach ( $regexes as $regex ) {
		if ( !preg_match( $regex, $text, $matches ) ) {
			continue;
		}
		$title = $revision->getTitle();
		$titleText = $title->getPrefixedText();
		if ( $dryRun ) {
			print "\nFound spam in [[$titleText]]\n";
		} else {
			print "\nCleaning up links to {$matches[0]} in [[$titleText]]\n";
			// NOTE(review): the scheme is presumably stripped so that the
			// edit summary does not itself contain a blacklisted link - confirm.
			$match = str_replace('http://', '', $matches[0] );
			cleanupArticle( $revision, $regexes, $match );
		}
		// One report/cleanup per page is enough: cleanupArticle() already
		// tests every regex, and calling it again for the next matching
		// regex would repeat the edit using the now-stale $revision.
		break;
	}
}

// Just for satisfaction
$lastId = $id - 1;
// Avoid dividing by zero when the page table is empty
$percentDone = $maxID > 0 ? $lastId / $maxID * 100 : 0;
printf( "%-8d %-5.2f%%\n", $lastId, $percentDone );