mirror of
https://gerrit.wikimedia.org/r/mediawiki/extensions/PageImages
synced 2024-12-18 11:02:15 +00:00
03e14d0c86
Trying to run this script in the cluster fatals out due to memory problems somewhat regularly. The --start option helps to restart it where it fell down, but when running against hundreds of wikis that is a one-off solution that makes it a pain to ensure everything is actually visited. To try to isolate errors, add an option to push the parsing into the job queue. There is still the possibility of missing pages, but job queue retries should take care of us for the most part. The script attempts to keep load down on the databases by making sure no more than a specified number of jobs are queued/processing at a given time. Bug: T152155 Change-Id: I3a4e3a415b2f03de0bb36ac0515241e950130fde
108 lines
3.2 KiB
PHP
108 lines
3.2 KiB
PHP
<?php
|
|
|
|
// Resolve the MediaWiki root: honour MW_INSTALL_PATH when it is set, otherwise
// fall back to the directory three levels above this script.
$IP = getenv( 'MW_INSTALL_PATH' );
$IP = $IP === false ? __DIR__ . '/../../..' : $IP;

// Pull in the Maintenance base class that the script below extends.
require_once $IP . '/maintenance/Maintenance.php';
|
|
|
|
use MediaWiki\MediaWikiServices;
|
|
use PageImages\Job\InitImageDataJob;
|
|
|
|
/**
 * Maintenance script that initializes PageImages data for non-redirect pages
 * that have at least one row in the imagelinks table.
 *
 * Pages are visited in ascending page_id order, one batch at a time. Each batch
 * is wrapped in an InitImageDataJob which is either run in-process or, when
 * --queue-pressure is given, pushed to the job queue while keeping the number
 * of queued + running jobs below the requested maximum.
 *
 * @license WTFPL 2.0
 * @author Max Semenik
 */
class InitImageData extends Maintenance {
	/**
	 * Registers the command line options and the default batch size.
	 */
	public function __construct() {
		parent::__construct();
		$this->mDescription = 'Initializes PageImages data';
		$this->addOption( 'namespaces',
			'Comma-separated list of namespace(s) to refresh', false, true );
		$this->addOption( 'earlier-than',
			'Run only on pages earlier than this timestamp', false, true );
		$this->addOption( 'start', 'Starting page ID', false, true );
		$this->addOption( 'queue-pressure',
			'Maximum number of jobs to enqueue at a time. '
				. 'If not provided or 0 will be run in-process.',
			false, true );
		$this->addOption( 'quiet', "Don't report on job queue pressure" );
		$this->setBatchSize( 100 );
	}

	/**
	 * Walks the matching pages in batches and hands each batch to an
	 * InitImageDataJob. The last page ID of every processed batch is printed so
	 * an interrupted run can be resumed with --start (only pages with an ID
	 * strictly greater than --start are selected).
	 */
	public function execute() {
		global $wgPageImagesNamespaces;

		// Command line values arrive as strings; normalize the numeric ones once.
		$lastId = (int)$this->getOption( 'start', 0 );
		$isQuiet = $this->getOption( 'quiet', false );
		$maxPressure = (int)$this->getOption( 'queue-pressure', 0 );
		// A null queue means "run every job in this process".
		$queue = $maxPressure > 0 ? JobQueueGroup::singleton() : null;

		// Everything below except the page_id lower bound is identical for every
		// batch, so it is built once outside the loop.
		$dbr = wfGetDB( DB_SLAVE );
		$tables = [ 'page', 'imagelinks' ];
		$fields = [ 'page_id' ];
		$joinConds = [ 'imagelinks' => [
			'LEFT JOIN', 'page_id = il_from',
		] ];
		$options = [
			'LIMIT' => $this->mBatchSize,
			// Paging by "page_id > $lastId" is only correct when rows come back
			// in page_id order, so the ordering must be explicit.
			'ORDER BY' => 'page_id',
			'GROUP BY' => 'page_id',
		];

		$baseConds = [
			'il_from IS NOT NULL',
			'page_is_redirect' => 0,
		];
		if ( $this->hasOption( 'namespaces' ) ) {
			$baseConds['page_namespace'] = explode( ',', $this->getOption( 'namespaces' ) );
		} else {
			$baseConds['page_namespace'] = $wgPageImagesNamespaces;
		}
		if ( $this->hasOption( 'earlier-than' ) ) {
			$baseConds[] = 'page_touched < '
				. $dbr->addQuotes( $this->getOption( 'earlier-than' ) );
		}

		while ( true ) {
			$conds = $baseConds;
			$conds[] = 'page_id > ' . $lastId;

			$res = $dbr->select( $tables, $fields, $conds, __METHOD__, $options, $joinConds );

			// Start from an empty list for every batch so a job only ever
			// receives the pages selected by this iteration's query.
			$pageIds = [];
			foreach ( $res as $row ) {
				$pageIds[] = $row->page_id;
			}
			if ( !$pageIds ) {
				// Nothing left to visit; do not create a job for an empty batch.
				break;
			}

			$job = new InitImageDataJob( Title::newMainPage(), [ 'page_ids' => $pageIds ] );
			if ( $queue === null ) {
				$job->run();
			} else {
				$queue->push( $job );
				$this->waitForMaxPressure( $queue, $maxPressure, $isQuiet );
			}

			$lastId = (int)end( $pageIds );
			$this->output( "$lastId\n" );
		}
		$this->output( "done\n" );
	}

	/**
	 * Blocks until the number of queued + running InitImageDataJob jobs is
	 * below $maxPressure. The queue is polled once per second, and the first
	 * poll happens only after an initial one second sleep.
	 *
	 * @param JobQueueGroup $queue The job queue to fetch pressure from
	 * @param int $maxPressure The maximum number of queued + active
	 *  jobs that can exist when returning
	 * @param bool $isQuiet When false report on job queue pressure every 10s
	 */
	private function waitForMaxPressure( JobQueueGroup $queue, $maxPressure, $isQuiet ) {
		$group = $queue->get( 'InitImageDataJob' );
		$i = 0;
		do {
			sleep( 1 );
			$queued = $group->getSize();
			$running = $group->getAcquiredCount();
			// One report per ten polls, i.e. roughly every ten seconds.
			if ( !$isQuiet && ++$i % 10 === 0 ) {
				$now = date( 'Y-m-d H:i:s T' );
				$this->output( "[$now] Queued: $queued Running: $running Max: $maxPressure\n" );
			}
		} while ( $queued + $running >= $maxPressure );
	}
}
|
|
|
|
// Name the class the maintenance runner should instantiate, then hand control
// over to the runner script referenced by DO_MAINTENANCE.
$maintClass = 'InitImageData';
require_once DO_MAINTENANCE;
|