<?php

/**
 * An aggressive spam cleanup script.
 * Searches the database for matching pages, and reverts them to
 * the last non-spammed revision.
 * If all revisions contain spam, blanks the page
 */
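
// Typical invocation (illustrative; assumes this script lives at
// extensions/SpamBlacklist/maintenance/cleanup.php under the MediaWiki root):
//
//   php extensions/SpamBlacklist/maintenance/cleanup.php --dry-run
//
// Omit --dry-run to actually revert or blank the pages that match.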

use MediaWiki\MediaWikiServices;
use MediaWiki\Revision\RevisionLookup;
use MediaWiki\Revision\RevisionRecord;
use MediaWiki\Revision\SlotRecord;

$IP = getenv( 'MW_INSTALL_PATH' );
if ( $IP === false ) {
	$IP = __DIR__ . '/../../..';
}
require_once "$IP/maintenance/Maintenance.php";

class Cleanup extends Maintenance {
	/** @var RevisionLookup */
	private $revisionLookup;

	/** @var TitleFormatter */
	private $titleFormatter;

	public function __construct() {
		parent::__construct();
		$this->revisionLookup = MediaWikiServices::getInstance()->getRevisionLookup();
		$this->titleFormatter = MediaWikiServices::getInstance()->getTitleFormatter();

		$this->requireExtension( 'SpamBlacklist' );
		$this->addOption( 'dry-run', 'Only do a dry run' );
	}

	public function execute() {
		$user = User::newSystemUser( 'Spam cleanup script', [ 'steal' => true ] );

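		// Fetch the spam blacklist regexes; abort if they could not be built.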
		$sb = BaseBlacklist::getSpamBlacklist();
		$regexes = $sb->getBlacklists();
		if ( !$regexes ) {
			$this->fatalError( "Invalid regex, can't clean up spam" );
		}
		$dryRun = $this->hasOption( 'dry-run' );

		$dbr = wfGetDB( DB_REPLICA );
		$maxID = (int)$dbr->selectField( 'page', 'MAX(page_id)' );
		$reportingInterval = 100;

$this->output( "Regexes are " . implode( ', ', array_map( 'count', $regexes ) ) . " bytes\n" );
		$this->output( "Searching for spam in $maxID pages...\n" );
		if ( $dryRun ) {
			$this->output( "Dry run only\n" );
		}

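		// Scan every possible page ID up to the current maximum; IDs with no
		// existing page return no revision and are simply skipped.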
		for ( $id = 1; $id <= $maxID; $id++ ) {
			if ( $id % $reportingInterval == 0 ) {
				printf( "%-8d %-5.2f%%\r", $id, $id / $maxID * 100 );
			}
			$revision = $this->revisionLookup->getRevisionByPageId( $id );
			if ( $revision ) {
				$text = ContentHandler::getContentText( $revision->getContent( SlotRecord::MAIN ) );
				if ( $text ) {
					foreach ( $regexes as $regex ) {
						if ( preg_match( $regex, $text, $matches ) ) {
							$titleText = $this->titleFormatter->getPrefixedText( $revision->getPageAsLinkTarget() );
							if ( $dryRun ) {
								$this->output( "Found spam in [[$titleText]]\n" );
							} else {
								$this->output( "Cleaning up links to {$matches[0]} in [[$titleText]]\n" );
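								// Strip the protocol so the edit summary names the bare link.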
								$match = str_replace( 'http://', '', $matches[0] );
								$this->cleanupArticle( $revision, $regexes, $match, $user );
							}
						}
					}
				}
			}
		}
		// Just for satisfaction
		printf( "%-8d %-5.2f%%\n", $id - 1, ( $id - 1 ) / $maxID * 100 );
	}

	/**
	 * Find the latest revision of the article that does not contain spam and revert to it
	 * @param RevisionRecord $rev
	 * @param array $regexes
	 * @param string $match
	 * @param User $user
	 */
	private function cleanupArticle( RevisionRecord $rev, $regexes, $match, User $user ) {
		$title = Title::newFromLinkTarget( $rev->getPageAsLinkTarget() );
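		// Walk back through the page history until we reach a revision whose
		// main slot text matches none of the blacklist regexes.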
		while ( $rev ) {
			$matches = false;
			foreach ( $regexes as $regex ) {
				$matches = $matches
					|| preg_match(
						$regex,
						ContentHandler::getContentText( $rev->getContent( SlotRecord::MAIN ) )
					);
			}
			if ( !$matches ) {
				// Didn't find any spam
				break;
			}

			$rev = $this->revisionLookup->getPreviousRevision( $rev );
		}
		if ( !$rev ) {
			// Didn't find a non-spammy revision, blank the page
			$this->output( "All revisions are spam, blanking...\n" );
			$content = ContentHandler::makeContent( '', $title );
			$comment = "All revisions matched the spam blacklist ($match), blanking";
		} else {
			// Revert to this revision
			$content = $rev->getContent( SlotRecord::MAIN ) ?:
				ContentHandler::makeContent( '', $title );
			$comment = "Cleaning up links to $match";
		}
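		// Save the reverted (or blanked) content as the maintenance system user.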
		$wikiPage = new WikiPage( $title );
		$wikiPage->doEditContent(
			$content, $comment,
			0, false, $user
		);
	}
}

$maintClass = Cleanup::class;
require_once RUN_MAINTENANCE_IF_MAIN;