getTitle();
	$revId = $rev->getId();
	while ( $rev ) {
		$matches = false;
		foreach ( $regexes as $regex ) {
			$matches = $matches || preg_match( $regex, ContentHandler::getContentText( $rev->getContent() ) );
		}
		if ( !$matches ) {
			// Didn't find any spam
			break;
		}
		# Revision::getPrevious can't be used in this way before MW 1.6 (Revision.php 1.26)
		# $rev = $rev->getPrevious();
		$revId = $title->getPreviousRevisionID( $revId );
		if ( $revId ) {
			$rev = Revision::newFromTitle( $title, $revId );
		} else {
			$rev = false;
		}
	}
	if ( !$rev ) {
		// Didn't find a non-spammy revision, delete the page
		/*
		print "All revisions are spam, deleting...\n";
		$article = new Article( $title );
		$article->doDeleteArticle( "All revisions matched the spam blacklist" );
		*/
		// Too scary, blank instead
		print "All revisions are spam, blanking...\n";
		$text = '';
		$comment = "All revisions matched the spam blacklist ($match), blanking";
	} else {
		// Revert to this revision
		$text = ContentHandler::getContentText( $rev->getContent() );
		$comment = "Cleaning up links to $match";
	}
	$wikiPage = new WikiPage( $title );
	$wikiPage->doEditContent( ContentHandler::makeContent( $text, $title ), $comment );
}

// ------------------------------------------------------------------------------
// Script body: walk every page ID, test the text of the revision loaded for it
// against the spam blacklist regexes and, unless this is a dry run, hand
// matching pages to cleanupArticle().

// Account that is installed as $wgUser while the script runs.
$username = 'Spam cleanup script';
if ( method_exists( 'User', 'newSystemUser' ) ) {
	$wgUser = User::newSystemUser( $username, [ 'steal' => true ] );
} else {
	// User::newSystemUser is not available: look the account up by name and
	// create it if it does not exist yet.
	$wgUser = User::newFromName( $username );
	if ( $wgUser->idForName() == 0 ) {
		// Create the user
		$status = $wgUser->addToDatabase();
		if ( $status === null || $status->isOK() ) {
			// Store a placeholder in user_password (presumably so that nobody
			// can log in as this account -- confirm against the User schema).
			$dbw = wfGetDB( DB_MASTER );
			$dbw->update(
				'user',
				[ 'user_password' => 'nologin' ],
				[ 'user_name' => $username ],
				$username
			);
		}
	}
}

// The "n" option turns this into a dry run: matches are reported, nothing is edited.
$dryRun = isset( $options['n'] );

$sb = new SpamBlacklist( $wgSpamBlacklistSettings );
if ( $wgSpamBlacklistFiles ) {
	$sb->files = $wgSpamBlacklistFiles;
}
$regexes = $sb->getBlacklists();
if ( !$regexes ) {
	print "Invalid regex, can't clean up spam\n";
	exit( 1 );
}

$dbr = wfGetDB( DB_REPLICA );
// Cast so that an empty page table (no MAX value) becomes 0 instead of a
// non-numeric value that would later be used as a divisor.
$maxID = (int)$dbr->selectField( 'page', 'MAX(page_id)' );
$reportingInterval = 100;

// The regexes are strings, so report their length with strlen(); count() is
// only defined for arrays/Countable and throws a TypeError on PHP 8.
print "Regexes are " . implode( ', ', array_map( 'strlen', $regexes ) ) . " bytes\n";
print "Searching for spam in $maxID pages...\n";
if ( $dryRun ) {
	print "Dry run only\n";
}

for ( $id = 1; $id <= $maxID; $id++ ) {
	if ( $id % $reportingInterval == 0 ) {
		// Progress line; "\r" rewinds so the next report overwrites it.
		printf( "%-8d %-5.2f%%\r", $id, $id / $maxID * 100 );
	}

	$revision = Revision::loadFromPageId( $dbr, $id );
	if ( !$revision ) {
		// Nothing could be loaded for this page ID
		continue;
	}

	$text = ContentHandler::getContentText( $revision->getContent() );
	if ( !$text ) {
		continue;
	}

	foreach ( $regexes as $regex ) {
		if ( !preg_match( $regex, $text, $matches ) ) {
			continue;
		}

		$title = $revision->getTitle();
		$titleText = $title->getPrefixedText();
		if ( $dryRun ) {
			print "\nFound spam in [[$titleText]]\n";
		} else {
			print "\nCleaning up links to {$matches[0]} in [[$titleText]]\n";
			$match = str_replace( 'http://', '', $matches[0] );
			cleanupArticle( $revision, $regexes, $match );
		}

		// One hit is enough: cleanupArticle() checks each revision against
		// every regex, so the page is fully handled by a single call. Carrying
		// on would only re-match the stale $text and edit (or report) the same
		// page again.
		break;
	}
}

// Just for satisfaction
$scanned = $id - 1;
// Guard the percentage against a wiki with no pages ($maxID === 0).
printf( "%-8d %-5.2f%%\n", $scanned, $maxID > 0 ? $scanned / $maxID * 100 : 100 );