mediawiki-extensions-AbuseF.../maintenance/updateVarDumps.php
Daimona Eaytoy 55ba083b13 Introduce a KeywordsManager service
This will decouple a bit the huge and chaotic tangle of AF classes. Some
boilerplate code for AbuseFilter services is also added with this patch.

Note that this requires injecting a KeywordsManager in
AbuseFilterVariableHolder, or unit tests would fail. This is still
incomplete, and the Manager is only injected in tests, because
VariableHolder still has to be refactored.

The test for the UpdateVarDumps script had to be updated, because
serializing VHs in there was a bad choice. As pointed out in a comment,
the test is likely going to break again once we remove the BC code, but
I hope that we'll be able to remove the test at that point.

Change-Id: I12a656a310adb8c5f75cab63f6db9e121e109717
2020-09-28 23:03:52 +00:00

701 lines
21 KiB
PHP

<?php
use MediaWiki\Extension\AbuseFilter\AbuseFilterServices;
use MediaWiki\Extension\AbuseFilter\KeywordsManager;
use MediaWiki\MediaWikiServices;
use Wikimedia\AtEase\AtEase;
use Wikimedia\Rdbms\Database;
use Wikimedia\Rdbms\IResultWrapper;
/**
* Performs several tasks aiming to update the stored var dumps for filter hits.
* See T213006 for a list.
*
* @ingroup Maintenance
*/
// Resolve the MediaWiki installation path: prefer the MW_INSTALL_PATH environment
// variable, otherwise assume the extension lives in <mediawiki>/extensions/<name>/.
$IP = getenv( 'MW_INSTALL_PATH' ) ?: __DIR__ . '/../../..';
// Pull in the Maintenance base classes which the script class below extends.
require_once "$IP/maintenance/Maintenance.php";
/**
 * Maintenance script which normalizes the variable dumps stored for AbuseFilter log entries.
 *
 * The work is split in four consecutive steps, each of which walks the abuse_filter_log table
 * in batches of afl_id ranges:
 *  1 - fixMissingDumps(): delete or rebuild rows with an empty afl_var_dump
 *  2 - moveToText(): move serialized data found in afl_var_dump to the text table
 *  3 - updateText(): re-store serialized dumps in the text table as JSON
 *  4 - updateAflVarDump(): replace the 'stored-text:' prefix with 'tt:'
 *
 * With --dry-run (or --dry-run-verbose) nothing is written and the update is not logged
 * as completed.
 */
class UpdateVarDumps extends LoggedUpdateMaintenance {
	/** @var Database A connection to replica */
	private $dbr;
	/** @var Database A connection to the master */
	private $dbw;
	/** @var bool Whether we're performing a dry run */
	private $dryRun = false;
	/**
	 * @var int Highest afl_id found in the abuse_filter_log table when the script started.
	 *  Used as the upper bound for batching (it is not an actual count of rows).
	 */
	private $allRowsCount;
	/** @var bool Whether to print progress markers */
	private $progressMarkers;
	/** @var string|null File where the URLs of orphaned ExternalStore records are appended */
	private $printOrphanedFile;
	/** @var int|null How many seconds to sleep after each batch. */
	private $sleep;
	/** @var KeywordsManager|null Lazy-loaded, always access it via getKeywordsManager() */
	private $keywordsManager;

	/**
	 * @inheritDoc
	 */
	public function __construct() {
		parent::__construct();
		$this->addDescription( 'Update AbuseFilter var dumps - T213006' );
		$this->addOption( 'dry-run-verbose', 'Perform a verbose dry run' );
		$this->addOption( 'dry-run', 'Perform a dry run' );
		$this->addOption( 'progress-markers', 'Print progress markers every 10 batches' );
		$this->addOption(
			'print-orphaned-records-to',
			'Print ExternalStore urls of orphaned ExternalStore records (if any) ' .
			'to the given file. Can use stdout, but it\'s not recommended for big databases.',
			false,
			true
		);
		$this->addOption( 'sleep', 'Sleep this many seconds after each batch', false, true );
		$this->requireExtension( 'Abuse Filter' );
		$this->setBatchSize( 500 );
		// Do NOT fetch services here: when the script is run directly, it is instantiated
		// before MediaWiki's setup has run and before extension classes can be autoloaded.
		// See getKeywordsManager().
	}

	/**
	 * Get the KeywordsManager, fetching it from the service container on first use.
	 * A manager which was already assigned to the property (e.g. injected) is used as-is.
	 *
	 * @return KeywordsManager
	 */
	private function getKeywordsManager() : KeywordsManager {
		if ( $this->keywordsManager === null ) {
			$this->keywordsManager = AbuseFilterServices::getKeywordsManager();
		}
		return $this->keywordsManager;
	}

	/**
	 * @inheritDoc
	 */
	public function getUpdateKey() {
		return __CLASS__;
	}

	/**
	 * Read the options, open the DB connections and run the four update steps in order.
	 *
	 * @inheritDoc
	 * @return bool False for dry runs, so that the update isn't logged as done; true otherwise
	 */
	public function doDBUpdates() {
		if ( $this->hasOption( 'dry-run-verbose' ) || $this->hasOption( 'dry-run' ) ) {
			// This way the script can be called with dry-run-verbose only and we can check for dry-run
			$this->dryRun = true;
		}
		$this->progressMarkers = $this->hasOption( 'progress-markers' );
		$this->printOrphanedFile = $this->getOption( 'print-orphaned-records-to' );
		// Option values are strings: normalize once, so that sleep() always receives an integer
		$sleep = $this->getOption( 'sleep' );
		$this->sleep = $sleep === null ? null : (int)$sleep;

		// Faulty rows aren't inserted anymore, hence we can query the replica and update the master.
		$this->dbr = wfGetDB( DB_REPLICA );
		$this->dbw = wfGetDB( DB_MASTER );

		// Control batching with the primary key to keep the queries performant and allow gaps
		$this->allRowsCount = (int)$this->dbr->selectField(
			'abuse_filter_log',
			'MAX(afl_id)',
			[],
			__METHOD__
		);

		if ( $this->allRowsCount === 0 ) {
			$this->output( "...the abuse_filter_log table is empty.\n" );
			return !$this->dryRun;
		}

		// Do the actual work. Note that several actions are superfluous (e.g. in fixMissingDumps
		// we use "stored-text" but then we replace it in updateAflVarDump), but that's because of SRP.

		// First, ensure that afl_var_dump isn't empty
		$this->fixMissingDumps();
		// Then, ensure that abuse_filter_log.afl_var_dump only contains "stored-text:xxxx"
		$this->moveToText();
		// Then update the storage format in the text table
		$this->updateText();
		// Finally, replace "stored-text:xxxx" with "tt:xxxx" for all rows
		$this->updateAflVarDump();

		return !$this->dryRun;
	}

	/**
	 * Handle empty afl_var_dump. gerrit/16527 fixed a bug which caused an extra abuse_filter_log
	 * row to be inserted without the var dump for a given action. If we find a row identical to
	 * the current one but with a valid dump, just delete the current one. Otherwise, store a
	 * very basic var dump for sanity.
	 * This handles point 7. of T213006.
	 */
	private function fixMissingDumps() {
		$this->output( "...Checking for missing dumps (1/4)\n" );
		$batchSize = $this->getBatchSize();

		// Each batch covers the afl_id range ( $prevID, $curID ]
		$prevID = 0;
		$curID = $batchSize;
		$deleted = $rebuilt = 0;
		do {
			$this->maybePrintProgress( $prevID );
			$brokenRows = $this->dbr->select(
				'abuse_filter_log',
				'*',
				[
					'afl_var_dump' => '',
					"afl_id > $prevID",
					"afl_id <= $curID"
				],
				__METHOD__,
				[ 'ORDER BY' => 'afl_id ASC' ]
			);
			$prevID = $curID;
			$curID += $batchSize;

			$res = $this->doFixMissingDumps( $brokenRows );
			$deleted += $res['deleted'];
			$rebuilt += $res['rebuilt'];
			MediaWikiServices::getInstance()->getDBLoadBalancerFactory()->waitForReplication();
			$this->maybeSleep();
		} while ( $prevID <= $this->allRowsCount );

		if ( $this->dryRun ) {
			$this->output(
				"...found $deleted rows with blank afl_var_dump to delete, and " .
				"$rebuilt rows to rebuild.\n"
			);
		} else {
			$this->output(
				"...deleted $deleted rows with blank afl_var_dump, and rebuilt " .
				"$rebuilt rows.\n"
			);
		}
	}

	/**
	 * Fix a single batch of rows having an empty afl_var_dump: rows which have a later
	 * duplicate are deleted, the other ones get a minimal dump built from the row's own columns.
	 * In dry-run mode the rows are only counted.
	 *
	 * @param IResultWrapper $brokenRows
	 * @return int[] With keys 'rebuilt' and 'deleted', holding the amount of rows in each group
	 */
	private function doFixMissingDumps( IResultWrapper $brokenRows ) {
		$deleted = 0;
		foreach ( $brokenRows as $row ) {
			if ( $row->afl_var_dump === '' ) {
				// Look for another row where every column but the ID and the dump is the same
				$findRow = array_diff_key(
					get_object_vars( $row ),
					[ 'afl_var_dump' => true, 'afl_id' => true ]
				);
				// This is the case where we may have a duplicate row. The wrong insertion happened
				// right before the correct one, so their afl_id should only differ by 1, but let's
				// play safe and only assume it's greater. Note that the two entries are guaranteed
				// to have the same timestamp.
				$findRow[] = 'afl_id > ' . $this->dbr->addQuotes( $row->afl_id );
				$saneDuplicate = $this->dbr->selectRow(
					'abuse_filter_log',
					'1',
					$findRow,
					__METHOD__
				);

				if ( $saneDuplicate ) {
					// Just delete the row!
					$deleted++;
					if ( !$this->dryRun ) {
						$this->dbw->delete(
							'abuse_filter_log',
							[ 'afl_id' => $row->afl_id ],
							__METHOD__
						);
					}
					continue;
				}
			}
			if ( $this->dryRun ) {
				continue;
			}
			// Build a VariableHolder with the only values we can be sure of
			$vars = AbuseFilterVariableHolder::newFromArray( [
				'timestamp' => wfTimestamp( TS_UNIX, $row->afl_timestamp ),
				'action' => $row->afl_action
			] );
			// Add some action-specific variables
			if ( strpos( $row->afl_action, 'createaccount' ) !== false ) {
				$vars->setVar( 'accountname', $row->afl_user_text );
			} else {
				$vars->setVar( 'user_name', $row->afl_user_text );
				$title = Title::makeTitle( $row->afl_namespace, $row->afl_title );
				if ( $row->afl_action !== 'move' ) {
					$vars->setVar( 'page_title', $title->getText() );
					$vars->setVar( 'page_prefixedtitle', $title->getPrefixedText() );
				} else {
					$vars->setVar( 'moved_from_title', $title->getText() );
					$vars->setVar( 'moved_from_prefixedtitle', $title->getPrefixedText() );
				}
			}

			$storedID = AbuseFilter::storeVarDump( $vars );
			$this->dbw->update(
				'abuse_filter_log',
				[ 'afl_var_dump' => "tt:$storedID" ],
				[ 'afl_id' => $row->afl_id ],
				__METHOD__
			);
		}
		// Every row which wasn't deleted has been (or, for dry runs, would be) rebuilt
		$rebuilt = $brokenRows->numRows() - $deleted;
		return [ 'rebuilt' => $rebuilt, 'deleted' => $deleted ];
	}

	/**
	 * If afl_var_dump contains serialized data, move the dump to the text table.
	 * This handles point 1. of T213006.
	 */
	private function moveToText() {
		$this->output( "...Moving serialized data away from the abuse_filter_log table (2/4).\n" );
		$batchSize = $this->getBatchSize();

		// Each batch covers the afl_id range ( $prevID, $curID ]
		$prevID = 0;
		$curID = $batchSize;
		$changeRows = $truncatedDumps = 0;
		do {
			$this->maybePrintProgress( $prevID );
			// Anything which isn't a pointer to the text table
			$res = $this->dbr->select(
				'abuse_filter_log',
				[ 'afl_id', 'afl_var_dump' ],
				[
					'afl_var_dump NOT ' . $this->dbr->buildLike(
						'stored-text:',
						$this->dbr->anyString()
					),
					'afl_var_dump NOT ' . $this->dbr->buildLike(
						'tt:',
						$this->dbr->anyString()
					),
					"afl_id > $prevID",
					"afl_id <= $curID"
				],
				__METHOD__,
				[ 'ORDER BY' => 'afl_id ASC' ]
			);

			$prevID = $curID;
			$curID += $batchSize;

			$result = $this->doMoveToText( $res );
			$changeRows += $result['change'];
			$truncatedDumps += $result['truncated'];
			MediaWikiServices::getInstance()->getDBLoadBalancerFactory()->waitForReplication();
			$this->maybeSleep();
		} while ( $prevID <= $this->allRowsCount );

		$msg = $this->dryRun ?
			"...found $changeRows abuse_filter_log rows with serialized data and $truncatedDumps " .
				"truncated dumps to rebuild.\n" :
			"...moved $changeRows abuse_filter_log rows and rebuilt $truncatedDumps " .
				"truncated dumps.\n";

		$this->output( $msg );
	}

	/**
	 * Move a single batch of serialized dumps from abuse_filter_log to the text table.
	 * Exits with a fatal error if a dump is neither a serialized array nor a serialized
	 * AbuseFilterVariableHolder, or if it's a truncated object.
	 *
	 * @param IResultWrapper $rows
	 * @return int[] With keys 'change' (rows to move) and 'truncated' (truncated dumps found)
	 */
	private function doMoveToText( IResultWrapper $rows ) {
		$changeRows = $truncatedDumps = 0;
		foreach ( $rows as $row ) {
			// Sanity: perform a very raw check to confirm that the dump is indeed a serialized value
			$re = '/^(a:\d+:{|O:25:"[Aa]buse[Ff]ilter[Vv]ariable[Hh]older":\d+:{)/';
			if ( !preg_match( $re, $row->afl_var_dump ) ) {
				$this->fatalError(
					"...found a value in afl_var_dump for afl_id {$row->afl_id} which is " .
					"neither a reference to the text table or a serialized value: {$row->afl_var_dump}.\n"
				);
			}

			AtEase::suppressWarnings();
			$stored = unserialize( $row->afl_var_dump );
			AtEase::restoreWarnings();
			// unserialize() returns false on failure. The comparison must be strict: a valid
			// but empty array ("a:0:{}") is falsy as well, and it's not a truncated dump.
			if ( $stored === false ) {
				$re = '/^O:25:"[Aa]buse[Ff]ilter[Vv]ariable[Hh]older":\d+:{/';
				if ( preg_match( $re, $row->afl_var_dump ) ) {
					// Only truncated arrays can be restored
					$this->fatalError(
						"...found a corrupted afl_var_dump for afl_id {$row->afl_id} containing " .
						"a truncated object: {$row->afl_var_dump}.\n"
					);
				}
				$stored = $this->restoreTruncatedDump( $row->afl_var_dump );
				$truncatedDumps++;
			}
			if ( !is_array( $stored ) && !( $stored instanceof AbuseFilterVariableHolder ) ) {
				$this->fatalError(
					'...found unexpected data type ( ' . gettype( $stored ) . ' ) in ' .
					"afl_var_dump for afl_id {$row->afl_id}.\n"
				);
			}
			$changeRows++;

			if ( !$this->dryRun ) {
				$holder = is_array( $stored )
					? AbuseFilterVariableHolder::newFromArray( $stored )
					: $stored;
				// Note: this will upgrade to the new JSON format, so we use tt:
				$newDump = AbuseFilter::storeVarDump( $holder );
				$this->dbw->update(
					'abuse_filter_log',
					[ 'afl_var_dump' => "tt:$newDump" ],
					[ 'afl_id' => $row->afl_id ],
					__METHOD__
				);
			}
		}
		return [ 'change' => $changeRows, 'truncated' => $truncatedDumps ];
	}

	/**
	 * Try to restore a truncated dump. This could happen for very old rows, where afl_var_dump
	 * was a blob instead of a longblob, and we tried to insert very long strings there.
	 * This handles point 9. of T214193.
	 *
	 * The serialized string is consumed token by token; parsing stops at the first token
	 * which is incomplete or of an unsupported type, and whatever was parsed so far is returned.
	 *
	 * @param string $dump The broken serialized dump
	 * @return array With everything that we can restore from $dump on success
	 * @throws UnexpectedValueException If an integer, bool or null is found in key position
	 */
	private function restoreTruncatedDump( $dump ) {
		// This method makes various assumptions:
		// 1 - Everything is wrapped inside an array
		// 2 - Array elements can only be strings, integers, bools or null
		// 3 - Array keys can only be strings
		// As this is what a serialized dump should look like.
		$string = preg_replace( '/^a:\d+:{/', '', $dump );

		$ret = [];
		// The key of the element being parsed, or null if the next token must be a key
		$key = null;

		while ( strlen( $string ) > 2 || $string === 'N;' ) {
			$type = substr( $string, 0, 2 );
			switch ( $type ) {
				case 's:':
					// Quotes aren't escaped, so we need to figure out how many characters to include
					$matches = [];
					if ( !preg_match( '/^s:(\d+):"/', $string, $matches ) ) {
						break 2;
					}
					$len = (int)$matches[1];
					$val = substr( $string, strlen( $matches[0] ), $len );
					if ( strlen( $val ) === $len ) {
						if ( $key === null ) {
							// It's an array key
							$key = $val;
						} else {
							$ret[$key] = $val;
							$key = null;
						}
						// The trailing 2 accounts for the closing quote and the semicolon
						$offset = strlen( $matches[0] ) + $len + 2;
						break;
					} else {
						// The truncation happened in the middle of the string
						break 2;
					}
				case 'i:':
					if ( preg_match( '/^i:(-?\d+);/', $string, $matches ) ) {
						if ( $key === null ) {
							throw new UnexpectedValueException( "Unexpected integer key: $string" );
						}
						$ret[$key] = intval( $matches[1] );
						$key = null;
						$offset = strlen( $matches[0] );
						break;
					} else {
						break 2;
					}
				case 'b:':
					if ( preg_match( '/^b:([01]);/', $string, $matches ) ) {
						if ( $key === null ) {
							throw new UnexpectedValueException( "Unexpected bool key: $string" );
						}
						$ret[$key] = (bool)$matches[1];
						$key = null;
						// Length of "b:0;" or "b:1;"
						$offset = 4;
						break;
					} else {
						break 2;
					}
				case 'N;':
					if ( $key === null ) {
						throw new UnexpectedValueException( "Unexpected null key: $string" );
					}
					$ret[$key] = null;
					$key = null;
					$offset = 2;
					break;
				default:
					break 2;
			}

			// Remove the value we have just parsed
			// @phan-suppress-next-next-line PhanPossiblyUndeclaredVariable
			// @phan-suppress-next-line PhanTypeMismatchArgumentNullableInternal
			$string = substr( $string, $offset );
		}

		if ( $this->hasOption( 'dry-run-verbose' ) ) {
			$this->output(
				"...converted the following corrupted dump:\n\n$dump\n\n to this:\n\n" .
				var_export( $ret, true ) . "\n\n"
			);
		}

		return $ret;
	}

	/**
	 * If the text table (or the External Storage) contains a serialized AbuseFilterVariableHolder
	 * or array, re-store it as a JSON-encoded array. This assumes that afl_var_dump rows starting
	 * with 'tt:' already point to JSON dumps, and afl_var_dump rows starting with 'stored-text:'
	 * only point to serialized dumps.
	 * This handles point 2. and 6. of T213006.
	 */
	private function updateText() {
		$this->output(
			"...Re-storing serialized dumps as JSON-encoded arrays for all rows (3/4).\n"
		);
		if ( $this->printOrphanedFile !== null && !$this->dryRun ) {
			$this->output( "Printing orphaned records to $this->printOrphanedFile.\n" );
			file_put_contents(
				$this->printOrphanedFile,
				"Records orphaned by AbuseFilter's updateVarDumps sccript\n",
				FILE_APPEND
			);
		}

		$batchSize = $this->getBatchSize();
		// Each batch covers the afl_id range ( $prevID, $curID ]
		$prevID = 0;
		$curID = $batchSize;
		$count = 0;

		// SQL expression for the text ID, i.e. afl_var_dump without the prefix, cast to integer
		$idSQL = $this->dbr->buildIntegerCast( $this->dbr->strreplace(
			'afl_var_dump',
			$this->dbr->addQuotes( 'stored-text:' ),
			$this->dbr->addQuotes( '' )
		) );

		$dumpLike = $this->dbr->buildLike( 'stored-text:', $this->dbr->anyString() );
		$esAccess = MediaWikiServices::getInstance()->getExternalStoreAccess();
		do {
			$this->maybePrintProgress( $prevID );
			$res = $this->dbr->select(
				[ 'text', 'abuse_filter_log' ],
				[ 'old_id', 'old_text', 'old_flags' ],
				[
					"afl_var_dump $dumpLike",
					"afl_id > $prevID",
					"afl_id <= $curID"
				],
				__METHOD__,
				[ 'DISTINCT', 'ORDER BY' => 'old_id ASC' ],
				[ 'abuse_filter_log' => [ 'JOIN', "old_id = $idSQL" ] ]
			);

			$prevID = $curID;
			$curID += $batchSize;
			$count += $res->numRows();

			if ( !$this->dryRun ) {
				$this->doUpdateText( $res, $esAccess );
				MediaWikiServices::getInstance()->getDBLoadBalancerFactory()->waitForReplication();
			}
			$this->maybeSleep();
		} while ( $prevID <= $this->allRowsCount );

		$msg = $this->dryRun
			? "...found $count text rows to update.\n"
			: "...updated $count text rows.\n";
		$this->output( $msg );
	}

	/**
	 * Convert a single batch of text rows from serialized PHP to JSON. Rows which already
	 * contain JSON are skipped. The text is kept compressed and/or in the External Storage
	 * if the old one was; the URLs of the replaced ExternalStore records are appended to the
	 * file given via --print-orphaned-records-to, if any.
	 *
	 * @param IResultWrapper $res text rows
	 * @param ExternalStoreAccess $esAccess
	 */
	private function doUpdateText( IResultWrapper $res, ExternalStoreAccess $esAccess ) {
		$orphaned = [];
		foreach ( $res as $row ) {
			// This is copied from AbuseFilter::loadVarDump
			$oldFlags = explode( ',', $row->old_flags );
			$text = $row->old_text;
			if ( in_array( 'external', $oldFlags ) ) {
				$text = $esAccess->fetchFromURL( $row->old_text );
			}
			if ( in_array( 'gzip', $oldFlags ) ) {
				$text = gzinflate( $text );
			}

			if ( FormatJson::decode( $text ) !== null ) {
				// Already in the new format, apparently.
				if (
					!in_array( 'utf-8', $oldFlags, true ) ||
					in_array( 'nativeDataArray', $oldFlags, true )
				) {
					// Sanity
					$this->fatalError( "Row {$row->old_id} is JSON-encoded with wrong flags: {$row->old_flags}" );
				}
				continue;
			}

			$obj = unserialize( $text );

			$varArray = $obj instanceof AbuseFilterVariableHolder
				? $obj->dumpAllVars( [ 'old_wikitext', 'new_wikitext' ] )
				: $obj;
			if ( !is_array( $varArray ) ) {
				// E.g. unserialize() failed and returned false. Bail out with a message which
				// identifies the row, rather than passing a non-array to updateVariables().
				$this->fatalError(
					"Row {$row->old_id} contains neither JSON nor a serialized array or variable holder."
				);
			}
			$varArray = $this->updateVariables( $varArray );
			// Recreating flags will also ensure that we don't add 'nativeDataArray'
			$newFlags = [ 'utf-8' ];
			// This is copied from AbuseFilter::storeVarDump
			$toStore = FormatJson::encode( $varArray );
			if ( in_array( 'gzip', $oldFlags ) && function_exists( 'gzdeflate' ) ) {
				$toStore = gzdeflate( $toStore );
				$newFlags[] = 'gzip';
			}
			if ( in_array( 'external', $oldFlags ) ) {
				// A new record is inserted, hence nothing will point to the old one anymore
				$orphaned[] = $row->old_text;
				$toStore = $esAccess->insert( $toStore );
				$newFlags[] = 'external';
			}

			$this->dbw->update(
				'text',
				[
					'old_text' => $toStore,
					'old_flags' => implode( ',', $newFlags )
				],
				[ 'old_id' => $row->old_id ],
				__METHOD__
			);
		}
		if ( $this->printOrphanedFile !== null && $orphaned ) {
			file_put_contents( $this->printOrphanedFile, implode( ', ', $orphaned ) . "\n", FILE_APPEND );
		}
	}

	/**
	 * Given a stored object, removes some disabled variables and update deprecated ones.
	 * Also ensure that core variables are lowercase.
	 * Handles points 4., 5. and 8. of T213006.
	 *
	 * @param array $vars The stored vars.
	 * @return array
	 */
	private function updateVariables( array $vars ) {
		// Remove all variables used in the past to store metadata
		unset( $vars['context'], $vars['logged_local_ids'], $vars['logged_global_ids'] );

		$builtinVars = $this->getBuiltinVarNames();
		// Map of deprecated name => current name. It doesn't depend on the variable being
		// processed, so only fetch it once.
		$deprecatedVars = $this->getKeywordsManager()->getDeprecatedVariables();
		$newVars = [];
		foreach ( $vars as $oldName => $value ) {
			$lowerName = strtolower( $oldName );
			if ( $lowerName !== $oldName && array_key_exists( $lowerName, $builtinVars ) ) {
				// Only builtin variables are lowercased, anything else is left untouched
				$oldName = $lowerName;
			}
			$newName = $deprecatedVars[$oldName] ?? $oldName;
			$newVars[$newName] = $value;
		}
		return $newVars;
	}

	/**
	 * Get a set of builtin variable names. Copied from AbuseFilterVariableHolder::dumpAllVars.
	 * The set is built on the first call and reused afterwards.
	 *
	 * @return array [ varname => true ] for instantaneous search. All names are lowercase
	 */
	private function getBuiltinVarNames() {
		global $wgRestrictionTypes;

		static $coreVariables = null;

		if ( $coreVariables ) {
			return $coreVariables;
		}

		$keywordsManager = $this->getKeywordsManager();
		$activeVariables = array_keys( $keywordsManager->getVarsMappings() );
		$deprecatedVariables = array_keys( $keywordsManager->getDeprecatedVariables() );
		$disabledVariables = array_keys( $keywordsManager->getDisabledVariables() );
		$coreVariables = array_merge( $activeVariables, $deprecatedVariables, $disabledVariables );

		// Add the per-restriction-type variables for every supported prefix
		$prefixes = [ 'moved_from', 'moved_to', 'page' ];
		foreach ( $wgRestrictionTypes as $action ) {
			foreach ( $prefixes as $prefix ) {
				$coreVariables[] = "{$prefix}_restrictions_$action";
			}
		}

		$coreVariables = array_fill_keys( $coreVariables, true );
		$coreVariables = array_change_key_case( $coreVariables );

		return $coreVariables;
	}

	/**
	 * Replace 'stored-text:' with 'tt:' in afl_var_dump. Handles point 3. of T213006.
	 */
	private function updateAflVarDump() {
		$this->output(
			"...Replacing the 'stored-text:' prefix with 'tt:' (4/4).\n"
		);

		$batchSize = $this->getBatchSize();

		// Use native SQL functions so that we can update all rows at the same time.
		$newIdSQL = $this->dbw->strreplace(
			'afl_var_dump',
			$this->dbr->addQuotes( 'stored-text:' ),
			$this->dbr->addQuotes( 'tt:' )
		);

		// Each batch covers the afl_id range ( $prevID, $curID ]
		$prevID = 0;
		$curID = $batchSize;
		$numRows = 0;
		do {
			$this->maybePrintProgress( $prevID );
			// The same arguments are used for the actual update and for the dry-run count
			$args = [
				'abuse_filter_log',
				[ "afl_var_dump = $newIdSQL" ],
				[
					"afl_id > $prevID",
					"afl_id <= $curID",
					'afl_var_dump ' . $this->dbr->buildLike( 'stored-text:', $this->dbr->anyString() )
				],
				__METHOD__,
				[ 'ORDER BY' => 'afl_id ASC' ]
			];
			if ( $this->dryRun ) {
				$numRows += $this->dbr->selectRowCount( ...$args );
			} else {
				$this->dbw->update( ...$args );
				$numRows += $this->dbw->affectedRows();
				MediaWikiServices::getInstance()->getDBLoadBalancerFactory()->waitForReplication();
			}

			$prevID = $curID;
			$curID += $batchSize;
			$this->maybeSleep();
		} while ( $prevID <= $this->allRowsCount );

		if ( $this->dryRun ) {
			$this->output( "...would change afl_var_dump for $numRows rows.\n" );
		} else {
			$this->output( "...updated afl_var_dump prefix for $numRows rows.\n" );
		}
	}

	/**
	 * Print a progress marker if the respective option is enabled.
	 * A marker is only printed when $start is a multiple of 10 batches.
	 *
	 * @param int $start The lower bound of the afl_id range which is about to be processed
	 */
	private function maybePrintProgress( int $start ) : void {
		if ( $this->progressMarkers && $start % ( 10 * $this->getBatchSize() ) === 0 ) {
			$end = $start + $this->getBatchSize();
			$this->output( "...Doing range $start - $end\n" );
		}
	}

	/**
	 * Sleep for a while, if required. Note: checking the value is several
	 * orders of magnitude faster than calling sleep(0).
	 */
	private function maybeSleep() : void {
		if ( $this->sleep ) {
			sleep( $this->sleep );
		}
	}
}
// Tell the maintenance runner which class to instantiate...
$maintClass = UpdateVarDumps::class;
// ...and execute it, but only when this file is the script being run.
require_once RUN_MAINTENANCE_IF_MAIN;