/';

	// Services injected through the constructor.
	protected Config $config;
	private RepoGroup $repoGroup;
	private WANObjectCache $mainWANObjectCache;
	private HttpRequestFactory $httpRequestFactory;
	private IConnectionProvider $connectionProvider;
	private TitleFactory $titleFactory;
	private LinksMigration $linksMigration;

	/**
	 * Stores the injected services on the instance; performs no other work.
	 *
	 * @param Config $config
	 * @param RepoGroup $repoGroup
	 * @param WANObjectCache $mainWANObjectCache
	 * @param HttpRequestFactory $httpRequestFactory
	 * @param IConnectionProvider $connectionProvider
	 * @param TitleFactory $titleFactory
	 * @param LinksMigration $linksMigration
	 */
	public function __construct(
		Config $config,
		RepoGroup $repoGroup,
		WANObjectCache $mainWANObjectCache,
		HttpRequestFactory $httpRequestFactory,
		IConnectionProvider $connectionProvider,
		TitleFactory $titleFactory,
		LinksMigration $linksMigration
	) {
		$this->config = $config;
		$this->repoGroup = $repoGroup;
		$this->mainWANObjectCache = $mainWANObjectCache;
		$this->httpRequestFactory = $httpRequestFactory;
		$this->connectionProvider = $connectionProvider;
		$this->titleFactory = $titleFactory;
		$this->linksMigration = $linksMigration;
	}

	/**
	 * Supplies PageImages settings for parser tests.
	 *
	 * The array union (+=) only adds keys that are not already present in
	 * $globals, so values set by the caller are kept.
	 *
	 * @param array &$globals
	 */
	public function onParserTestGlobals( &$globals ) {
		$globals += [
			'wgPageImagesScores' => [
				'width' => [ 200 => 10, 1000 => 20 ],
				'position' => [],
				'ratio' => [],
				'galleryImageWidth' => []
			],
			'wgPageImagesLeadSectionOnly' => true
		];
	}

	/**
	 * ParserModifyImageHTML hook. Save candidate images, and mark them with a
	 * comment so that we can later tell if they were in the lead section.
	 *
	 * Does nothing when the parser has no page or when the page's namespace is
	 * rejected by processThisTitle().
	 *
	 * @param Parser $parser
	 * @param File $file
	 * @param array $params
	 * @param string &$html
	 */
	public function onParserModifyImageHTML(
		Parser $parser,
		File $file,
		array $params,
		string &$html
	): void {
		$page = $parser->getPage();
		if ( !$page || !$this->processThisTitle( $page ) ) {
			return;
		}

		// Fill in $params['handler']['width'] when the wikitext did not give one
		$this->calcWidth( $params, $file );

		// $index is the position of this candidate in the 'pageImages' extension data
		$index = $this->addPageImageCandidateToParserOutput(
			PageImageCandidate::newFromFileAndParams( $file, $params ),
			$parser->getOutput()
		);
		// NOTE(review): as written this appends an empty string and $index is never
		// used. The marker comment described in the docblock (presumably embedding
		// $index in the form matched by self::CANDIDATE_REGEX) appears to be missing
		// from this literal - confirm and restore.
		$html .= "";
	}

	/**
	 * ParserAfterTidy hook handler. Remove candidate images which were not in
	 * the lead section.
*
	 * Reads the candidates stored under the 'pageImages' extension data, strips
	 * the candidate marker comments from $text, picks the best image(s) and
	 * stores them as page properties.
	 *
	 * @param Parser $parser
	 * @param string &$text Parsed HTML; candidate markers are removed in place
	 */
	public function onParserAfterTidy( $parser, &$text ) {
		$parserOutput = $parser->getOutput();
		$allImages = $parserOutput->getExtensionData( 'pageImages' );
		if ( !$allImages ) {
			return;
		}

		// Find and remove our special comments
		// NOTE(review): the statements from the strpos() needle down to the
		// findBestImages() call were truncated in the reviewed text and have been
		// reconstructed. Confirm the needle ('<mw:editsection'), the assumption that
		// capture group 1 of CANDIDATE_REGEX is the candidate index, and the
		// PageImageCandidate::newFromArray() factory against the canonical source.
		$images = [];
		if ( $this->config->get( 'PageImagesLeadSectionOnly' ) ) {
			// The lead section ends where the first section edit link starts
			$leadEndPos = strpos( $text, '<mw:editsection' );
		} else {
			$leadEndPos = false;
		}
		$text = preg_replace_callback(
			self::CANDIDATE_REGEX,
			static function ( $m ) use ( $allImages, &$images, $leadEndPos ) {
				// With PREG_OFFSET_CAPTURE every group is a [ text, byteOffset ] pair
				$offset = $m[0][1];
				$id = intval( $m[1][0] );
				$inLeadSection = $leadEndPos === false || $offset < $leadEndPos;
				if ( $inLeadSection && isset( $allImages[$id] ) ) {
					$images[] = PageImageCandidate::newFromArray( $allImages[$id] );
				}
				// Always drop the marker itself from the output
				return '';
			},
			$text, -1, $count, PREG_OFFSET_CAPTURE
		);

		[ $bestImageName, $freeImageName ] = $this->findBestImages( $images );

		if ( $freeImageName ) {
			$parserOutput->setPageProperty( PageImages::getPropName( true ), $freeImageName );
		}

		// Only store the image if it's not free. Free image (if any) has already been stored above.
		if ( $bestImageName && $bestImageName !== $freeImageName ) {
			$parserOutput->setPageProperty( PageImages::getPropName( false ), $bestImageName );
		}

		// Strip comments from indicators (T298930)
		foreach ( $parserOutput->getIndicators() as $id => $value ) {
			$stripped = preg_replace( self::CANDIDATE_REGEX, '', $value );
			if ( $stripped !== $value ) {
				$parserOutput->setIndicator( $id, $stripped );
			}
		}
		// We may have comments in TOC data - Parser::cleanupTocLine strips them for us.
	}

	/**
	 * Find the best images out of an array of candidates
	 *
	 * Each candidate is scored with getScore(); when a file occurs several times
	 * its highest score is kept. Only files with a score above zero can win.
	 *
	 * @param PageImageCandidate[] $images
	 * @return array{string|false,string|false} The best image, and the best free image
	 */
	private function findBestImages( array $images ) {
		if ( !$images ) {
			return [ false, false ];
		}

		// Determine the image scores
		$scores = [];
		$counter = 0;
		foreach ( $images as $image ) {
			// $counter is the candidate's position among the candidates passed in
			$score = $this->getScore( $image, $counter++ );
			$fileName = $image->getFileName();
			$scores[$fileName] = max( $scores[$fileName] ?? -1, $score );
		}

		$bestImageName = false;
		$freeImageName = false;

		foreach ( $scores as $name => $score ) {
			if ( $score > 0 ) {
				if ( !$bestImageName || $score > $scores[$bestImageName] ) {
					$bestImageName = $name;
				}
				// isImageFree() is evaluated last so it only runs for files that
				// would actually replace the current free candidate
				if ( ( !$freeImageName || $score > $scores[$freeImageName] ) && $this->isImageFree( $name ) ) {
					$freeImageName = $name;
				}
			}
		}

		return [ $bestImageName, $freeImageName ];
	}

	/**
	 * Adds $image to $parserOutput extension data.
	 *
	 * The candidate is appended in its jsonSerialize() form to the list stored
	 * under the 'pageImages' key.
	 *
	 * @param PageImageCandidate $image
	 * @param ParserOutput $parserOutput
	 * @return int Index of the appended candidate within the stored list
	 */
	private function addPageImageCandidateToParserOutput(
		PageImageCandidate $image,
		ParserOutput $parserOutput
	) {
		$images = $parserOutput->getExtensionData( 'pageImages' ) ?: [];
		$images[] = $image->jsonSerialize();
		$parserOutput->setExtensionData( 'pageImages', $images );
		return count( $images ) - 1;
	}

	/**
	 * Returns true if data for this title should be saved
	 *
	 * A page qualifies when its namespace is listed in the PageImagesNamespaces
	 * setting.
	 *
	 * @param PageReference $pageReference
	 *
	 * @return bool
	 */
	private function processThisTitle( PageReference $pageReference ) {
		// Built on first use and kept in a function-level static, so the setting
		// is read once and the lookup is reused by later calls
		static $flipped = null;
		$flipped ??= array_flip( $this->config->get( 'PageImagesNamespaces' ) );

		return isset( $flipped[$pageReference->getNamespace()] );
	}

	/**
	 * Estimates image size as displayed if not explicitly provided. We don't follow the core size
	 * calculation algorithm precisely because it's not required and editor's intentions are more
	 * important than the precise number.
*
	 * @param array[] &$params Image parameters; $params['handler']['width'] is filled in when unset
	 * @param File $file File whose intrinsic dimensions are used for the estimate
	 */
	private function calcWidth( array &$params, File $file ) {
		// An explicitly given width is left untouched
		if ( isset( $params['handler']['width'] ) ) {
			return;
		}

		$frame = $params['frame'] ?? [];

		if ( isset( $params['handler']['height'] ) && $file->getHeight() > 0 ) {
			// Only a height was given: scale the file's width by the same factor
			$estimate = $file->getWidth() * ( $params['handler']['height'] / $file->getHeight() );
		} elseif (
			isset( $frame['thumbnail'] ) ||
			isset( $frame['thumb'] ) ||
			isset( $frame['frameless'] )
		) {
			// Thumbnail-like frames: use the thumb limit selected by the default user option
			$limits = $this->config->get( MainConfigNames::ThumbLimits );
			$userDefaults = $this->config->get( MainConfigNames::DefaultUserOptions );
			$estimate = $limits[$userDefaults['thumbsize']] ?? 250;
		} else {
			// Nothing to go on: assume the file is shown at its own width
			$estimate = $file->getWidth();
		}

		$params['handler']['width'] = $estimate;
	}

	/**
	 * Computes the score of a candidate image; higher is better, and a score
	 * below zero means the image must not be used at all.
	 *
	 * @param PageImageCandidate $image Candidate to score
	 * @param int $position Image order on page
	 *
	 * @return float
	 */
	protected function getScore( PageImageCandidate $image, $position ) {
		// Images marked with class="notpageimage" are ruled out immediately
		$hasOptOutClass = preg_match( '/(?:^|\s)notpageimage(?=\s|$)/', $image->getFrameClass() );
		if ( $hasOptOutClass ) {
			return -1000;
		}

		$tables = $this->config->get( 'PageImagesScores' );

		// A handler width means a standalone image; otherwise the image comes from a gallery
		$total = $image->getHandlerWidth()
			? $this->scoreFromTable( $image->getHandlerWidth(), $tables['width'] )
			: $this->scoreFromTable( $image->getFullWidth(), $tables['galleryImageWidth'] );

		// Optional adjustment for the image's position on the page
		$total += $tables['position'][$position] ?? 0;

		// The ratio table is keyed by width/height multiplied by ten
		$ratioKey = intval( $this->getRatio( $image ) * 10 );
		$total += $this->scoreFromTable( $ratioKey, $tables['ratio'] );

		// Denylisted files lose regardless of what they scored above
		$denied = $this->getDenylist();
		return isset( $denied[$image->getFileName()] ) ? -1000 : $total;
	}

	/**
	 * Returns score based on table of ranges
	 *
	 * @param int $value The
*  number that the various bounds are compared against
	 *  to calculate the score
	 * @param float[] $scores Table of scores for different ranges of $value
	 *
	 * @return float Score of the lowest upper bound that is >= $value; the score of
	 *  the highest bound when $value exceeds them all; 0.0 for an empty table
	 */
	protected function scoreFromTable( $value, array $scores ) {
		// The lookup below takes the first bound that fits, so the bounds have to
		// be visited in ascending order.
		ksort( $scores, SORT_NUMERIC );

		$picked = 0;
		foreach ( $scores as $bound => $boundScore ) {
			$picked = $boundScore;
			if ( $bound >= $value ) {
				break;
			}
		}

		if ( !is_numeric( $picked ) ) {
			wfLogWarning( 'The PageImagesScores setting must only contain numeric values!' );
		}

		return (float)$picked;
	}

	/**
	 * Tells whether an image's copyright status permits free use.
	 *
	 * @param string $fileName Name of the image file
	 * @return bool True when the file cannot be found, or when its 'NonFree'
	 *  metadata value is empty ('0' or unset)
	 */
	protected function isImageFree( $fileName ) {
		$file = $this->repoGroup->findFile( $fileName );
		if ( !$file ) {
			return true;
		}

		// Copyright metadata comes from CommonsMetadata, if present.
		$metadata = $this->fetchFileMetadata( $file );
		return empty( $metadata['NonFree']['value'] );
	}

	/**
	 * Retrieves the extended metadata of a file.
	 *
	 * @param File $file File to fetch metadata from
	 * @return array
	 */
	protected function fetchFileMetadata( $file ) {
		$formatter = new FormatMetadata();
		$englishContext = new DerivativeContext( $formatter->getContext() );

		// The language is irrelevant here; a single language is slightly faster
		$formatter->setSingleLanguage( true );
		// Pin the language to English so the cache is not split per language
		$englishContext->setLanguage( 'en' );
		$formatter->setContext( $englishContext );

		return $formatter->fetchExtendedMetadata( $file );
	}

	/**
	 * Computes the width/height ratio from the image's full dimensions.
	 *
	 * @param PageImageCandidate $image
	 *
	 * @return float|int The ratio, or 0 when either dimension is not positive
	 */
	protected function getRatio( PageImageCandidate $image ) {
		$fullWidth = $image->getFullWidth();
		$fullHeight = $image->getFullHeight();

		if ( $fullWidth > 0 && $fullHeight > 0 ) {
			return $fullWidth / $fullHeight;
		}

		return 0;
	}

	/**
	 * Returns the images that are denylisted from influencing this extension's output.
	 *
	 * The list is read through the WAN cache (key 'pageimages-denylist', expiry taken
	 * from PageImagesDenylistExpiry) and rebuilt from the PageImagesDenylist sources
	 * on a cache miss.
	 *
	 * @return int[] Flipped associative array in format "image BDB key" => int
	 * @throws RuntimeException When a configured source has an unknown type
	 */
	protected function getDenylist() {
		$cache = $this->mainWANObjectCache;

		$buildDenylist = function () {
			$names = [];
			foreach ( $this->config->get( 'PageImagesDenylist' ) as $source ) {
				switch ( $source['type'] ) {
					case 'db':
						$entries = $this->getDbDenylist( $source['db'], $source['page'] );
						break;
					case 'url':
						$entries = $this->getUrlDenylist( $source['url'] );
						break;
					default:
						throw new RuntimeException(
							"unrecognized image denylist type '{$source['type']}'"
						);
				}
				$names = array_merge( $names, $entries );
			}

			// Flip so that callers can test membership with isset()
			return array_flip( $names );
		};

		return $cache->getWithSetCallback(
			$cache->makeKey( 'pageimages-denylist' ),
			$this->config->get( 'PageImagesDenylistExpiry' ),
			$buildDenylist
		);
	}

	/**
	 * Collects the files linked from a denylist page stored in a wiki database.
	 *
	 * @param string|false $dbName Database name or false for current database
	 * @param string $page Title text of the denylist page
	 *
	 * @return string[] Titles of the linked File-namespace pages; empty when the
	 *  title is invalid, cannot exist, or has no page row
	 */
	private function getDbDenylist( $dbName, $page ) {
		$denylistTitle = $this->titleFactory->newFromText( $page );
		if ( !$denylistTitle || !$denylistTitle->canExist() ) {
			return [];
		}

		$replica = $this->connectionProvider->getReplicaDatabase( $dbName );

		// Resolve the denylist page to its page ID
		$pageConds = [
			'page_namespace' => $denylistTitle->getNamespace(),
			'page_title' => $denylistTitle->getDBkey()
		];
		$pageId = $replica->newSelectQueryBuilder()
			->select( 'page_id' )
			->from( 'page' )
			->where( $pageConds )
			->caller( __METHOD__ )
			->fetchField();
		if ( !$pageId ) {
			return [];
		}

		// Field and table names for pagelinks are supplied by LinksMigration
		[ $nsField, $titleField ] = $this->linksMigration->getTitleFields( 'pagelinks' );
		$linksQuery = $this->linksMigration->getQueryInfo( 'pagelinks' );

		$fileLinkConds = [ 'pl_from' => (int)$pageId, $nsField => NS_FILE ];

		return $replica->newSelectQueryBuilder()
			->select( $titleField )
			->tables( $linksQuery['tables'] )
			->joinConds( $linksQuery['joins'] )
			->where( $fileLinkConds )
			->caller( __METHOD__ )
			->fetchFieldValues();
	}

	/**
	 * Returns list of images on given remote denylist page.
* Not quite 100% bulletproof due to localised namespaces and so on.
	 * Though if you beat people if they add bad entries to the list... :)
	 *
	 * The page is fetched over HTTP with a 3 second timeout and scanned for
	 * "[[:Name.ext" links whose extension is one of the configured FileExtensions
	 * (matched case-insensitively).
	 *
	 * @param string $url URL of the remote denylist page
	 *
	 * @return string[] DB keys of the linked files; empty when the fetch yields
	 *  no text or nothing matches
	 */
	private function getUrlDenylist( $url ) {
		$text = $this->httpRequestFactory->get( $url, [ 'timeout' => 3 ], __METHOD__ );
		if ( !$text ) {
			return [];
		}

		// Extensions come from configuration, so escape them before they become
		// part of the pattern: an extension containing a regex metacharacter or
		// the "/" delimiter would otherwise break or alter the expression.
		$quotedExtensions = array_map(
			static function ( $extension ) {
				return preg_quote( $extension, '/' );
			},
			$this->config->get( 'FileExtensions' )
		);
		$regex = '/\[\[:([^|\#]*?\.(?:' . implode( '|', $quotedExtensions ) . '))/i';

		if ( !preg_match_all( $regex, $text, $matches ) ) {
			return [];
		}

		$list = [];
		foreach ( $matches[1] as $s ) {
			// Entries that do not form a valid File title are skipped
			$t = $this->titleFactory->makeTitleSafe( NS_FILE, $s );
			if ( $t ) {
				$list[] = $t->getDBkey();
			}
		}

		return $list;
	}
}