2015-11-16 15:40:07 +00:00
|
|
|
<?php
|
|
|
|
|
|
|
|
namespace PageImages\Hooks;
|
|
|
|
|
2016-03-07 08:47:51 +00:00
|
|
|
use DerivativeContext;
|
2015-11-16 15:40:07 +00:00
|
|
|
use Exception;
|
2016-03-07 08:47:51 +00:00
|
|
|
use File;
|
|
|
|
use FormatMetadata;
|
2015-11-16 15:40:07 +00:00
|
|
|
use Http;
|
|
|
|
use LinksUpdate;
|
|
|
|
use PageImages;
|
|
|
|
use Title;
|
2018-09-07 09:26:57 +00:00
|
|
|
use Revision;
|
2015-11-16 15:40:07 +00:00
|
|
|
|
|
|
|
/**
|
|
|
|
* Handler for the "LinksUpdate" hook.
|
|
|
|
*
|
2018-05-25 04:42:29 +00:00
|
|
|
* @license WTFPL
|
2015-11-16 15:40:07 +00:00
|
|
|
* @author Max Semenik
|
2017-11-24 07:33:49 +00:00
|
|
|
* @author Thiemo Kreuz
|
2015-11-16 15:40:07 +00:00
|
|
|
*/
|
|
|
|
class LinksUpdateHookHandler {
|
|
|
|
|
|
|
|
/**
 * Static entry point for the "LinksUpdate" hook. Creates a handler instance and hands the
 * update over to doLinksUpdate(), which sets at most 2 page properties depending on the
 * images on the page.
 *
 * @see https://www.mediawiki.org/wiki/Manual:Hooks/LinksUpdate
 *
 * @param LinksUpdate $linksUpdate the LinksUpdate object that this hook is parsing
 */
public static function onLinksUpdate( LinksUpdate $linksUpdate ) {
	( new self() )->doLinksUpdate( $linksUpdate );
}
|
|
|
|
|
2016-11-28 23:33:23 +00:00
|
|
|
/**
 * Returns the list of page image candidates that the scoring algorithm should consider.
 *
 * When $wgPageImagesLeadSectionOnly is enabled, only section 0 of the revision's content is
 * parsed and inspected. Otherwise the parser output attached to the LinksUpdate is used.
 *
 * @param LinksUpdate $linksUpdate LinksUpdate object used to determine what page
 * to get page images for
 * @return array[]|null Whatever is stored as "pageImages" extension data on the parser
 * output (expected to be a list of associative arrays, each describing one image), or an
 * empty array if there is no parser output to look at
 */
public function getPageImageCandidates( LinksUpdate $linksUpdate ) {
	global $wgPageImagesLeadSectionOnly;

	if ( !$wgPageImagesLeadSectionOnly ) {
		$parserOutput = $linksUpdate->getParserOutput();

		return $parserOutput ? $parserOutput->getExtensionData( 'pageImages' ) : [];
	}

	$revision = $linksUpdate->getRevision();
	if ( !$revision ) {
		// Use READ_LATEST (T221763)
		$revision = Revision::newFromTitle(
			$linksUpdate->getTitle(),
			0,
			Revision::READ_LATEST
		);
	}
	if ( !$revision ) {
		return [];
	}

	$content = $revision->getContent();
	if ( !$content ) {
		return [];
	}

	$leadSection = $content->getSection( 0 );
	// Certain content types e.g. AbstractContent return null if sections do not apply
	if ( !$leadSection ) {
		return [];
	}

	$parserOutput = $leadSection->getParserOutput( $linksUpdate->getTitle() );

	return $parserOutput ? $parserOutput->getExtensionData( 'pageImages' ) : [];
}
|
|
|
|
|
2015-11-16 15:40:07 +00:00
|
|
|
/**
 * Scores all page image candidates and records the winners as page properties on the
 * given LinksUpdate: the best scoring free image (if any), and the best scoring image
 * overall if that is a different one.
 *
 * @param LinksUpdate $linksUpdate the LinksUpdate object that was passed to the handler
 */
public function doLinksUpdate( LinksUpdate $linksUpdate ) {
	$candidates = $this->getPageImageCandidates( $linksUpdate );

	if ( $candidates === null ) {
		return;
	}

	// The same file can show up more than once, only its best score counts. Scores never
	// drop below -1 in this table.
	$bestScores = [];
	$position = 0;
	foreach ( $candidates as $candidate ) {
		$fileName = $candidate['filename'];
		$previous = isset( $bestScores[$fileName] ) ? $bestScores[$fileName] : -1;
		$bestScores[$fileName] = max( $previous, $this->getScore( $candidate, $position++ ) );
	}

	$bestImage = false;
	$bestFreeImage = false;
	foreach ( $bestScores as $fileName => $fileScore ) {
		// Only images with a positive score qualify
		if ( $fileScore <= 0 ) {
			continue;
		}

		if ( !$bestImage || $fileScore > $bestScores[$bestImage] ) {
			$bestImage = $fileName;
		}

		// The (potentially expensive) license check is done last, and only when the image
		// would actually replace the current best free image
		$beatsFreeImage = !$bestFreeImage || $fileScore > $bestScores[$bestFreeImage];
		if ( $beatsFreeImage && $this->isImageFree( $fileName ) ) {
			$bestFreeImage = $fileName;
		}
	}

	if ( $bestFreeImage ) {
		$linksUpdate->mProperties[PageImages::getPropName( true )] = $bestFreeImage;
	}

	// Only store the image if it's not free. Free image (if any) has already been stored above.
	if ( $bestImage && $bestImage !== $bestFreeImage ) {
		$linksUpdate->mProperties[PageImages::getPropName( false )] = $bestImage;
	}
}
|
|
|
|
|
|
|
|
/**
 * Calculates the score of an image, the more the better. If it is less than zero, the
 * image shouldn't be used for anything.
 *
 * The score is the sum of a width score, an optional bonus for the position on the page,
 * and an aspect ratio score, all looked up in $wgPageImagesScores. Blacklisted images
 * always get -1000.
 *
 * @param array $image Associative array describing an image
 * @param int $position Image order on page
 *
 * @return float
 */
protected function getScore( array $image, $position ) {
	global $wgPageImagesScores;

	// Standalone images carry a "handler" entry, images from a gallery don't
	$total = isset( $image['handler'] )
		? $this->scoreFromTable( $image['handler']['width'], $wgPageImagesScores['width'] )
		: $this->scoreFromTable( $image['fullwidth'], $wgPageImagesScores['galleryImageWidth'] );

	if ( isset( $wgPageImagesScores['position'][$position] ) ) {
		$total += $wgPageImagesScores['position'][$position];
	}

	// The ratio table is keyed by the ratio multiplied by 10, truncated to an integer
	$scaledRatio = intval( $this->getRatio( $image ) * 10 );
	$total += $this->scoreFromTable( $scaledRatio, $wgPageImagesScores['ratio'] );

	$blacklist = $this->getBlacklist();

	return isset( $blacklist[$image['filename']] ) ? -1000 : $total;
}
|
|
|
|
|
|
|
|
/**
 * Looks up a score in a table of ranges.
 *
 * The table maps upper boundaries to scores. The score of the smallest boundary that is
 * greater than or equal to $value wins. Values beyond the largest boundary get the score
 * of the largest boundary. An empty table results in a score of 0.
 *
 * @param int $value The number that the various bounds are compared against
 * to calculate the score
 * @param float[] $scores Table of scores for different ranges of $value
 *
 * @return float
 */
protected function scoreFromTable( $value, array $scores ) {
	// The lookup below picks the *first* matching boundary, which only works when the
	// boundaries are visited in increasing order.
	ksort( $scores, SORT_NUMERIC );

	// Fall back to the score of the largest boundary when no boundary matches
	$selected = $scores ? end( $scores ) : 0;

	foreach ( $scores as $upperBoundary => $candidate ) {
		if ( $value <= $upperBoundary ) {
			$selected = $candidate;
			break;
		}
	}

	if ( !is_numeric( $selected ) ) {
		wfLogWarning( 'The PageImagesScores setting must only contain numeric values!' );
	}

	return (float)$selected;
}
|
|
|
|
|
2016-01-25 05:29:29 +00:00
|
|
|
/**
 * Check whether image's copyright allows it to be used freely.
 *
 * Files that can not be found are reported as free.
 *
 * @param string $fileName Name of the image file
 * @return bool
 */
protected function isImageFree( $fileName ) {
	$file = wfFindFile( $fileName );

	if ( !$file ) {
		return true;
	}

	// Process copyright metadata from CommonsMetadata, if present.
	// Image is considered free if the value is '0' or unset.
	$metadata = $this->fetchFileMetadata( $file );

	return empty( $metadata['NonFree']['value'] );
}
|
|
|
|
|
|
|
|
/**
 * Fetch the extended metadata of a file via FormatMetadata, using a derived context that
 * is pinned to English.
 *
 * @param File $file File to fetch metadata from
 * @return array
 */
protected function fetchFileMetadata( $file ) {
	$formatter = new FormatMetadata();
	$englishContext = new DerivativeContext( $formatter->getContext() );

	// we don't care about the language, and specifying singleLanguage is slightly faster
	$formatter->setSingleLanguage( true );

	// we don't care about the language, so avoid splitting the cache by selecting English
	$englishContext->setLanguage( 'en' );
	$formatter->setContext( $englishContext );

	return $formatter->fetchExtendedMetadata( $file );
}
|
|
|
|
|
2015-11-16 15:40:07 +00:00
|
|
|
/**
 * Returns width/height ratio of an image as displayed, or 0 if not available.
 *
 * @param array $image Array representing the image to get the aspect ratio from; the
 * "fullwidth" and "fullheight" elements are used
 *
 * @return float|int
 */
protected function getRatio( array $image ) {
	$fullWidth = $image['fullwidth'];
	$fullHeight = $image['fullheight'];

	// A missing (falsy) dimension means the ratio is not available
	return ( $fullWidth && $fullHeight ) ? $fullWidth / $fullHeight : 0;
}
|
|
|
|
|
|
|
|
/**
 * Returns a list of images blacklisted from influencing this extension's output.
 *
 * The list is assembled from all sources configured in $wgPageImagesBlacklist. It is
 * cached in $wgMemc for $wgPageImagesBlacklistExpiry, and additionally kept in a static
 * variable for the rest of the current process.
 *
 * @throws Exception If a configured blacklist source has an unrecognized type
 * @return int[] Flipped associative array in format "image DB key" => int
 */
protected function getBlacklist() {
	global $wgPageImagesBlacklist, $wgPageImagesBlacklistExpiry, $wgMemc;
	static $list = false;

	if ( $list !== false ) {
		return $list;
	}

	$key = wfMemcKey( 'pageimages', 'blacklist' );
	$cached = $wgMemc->get( $key );
	if ( $cached !== false ) {
		$list = $cached;
		return $list;
	}

	wfDebug( __METHOD__ . "(): cache miss\n" );

	// Collect into a local variable. The static cache must only ever hold a complete,
	// flipped list, otherwise an exception thrown below would leave a partial list behind
	// that later calls in the same process would happily return.
	$names = [];
	foreach ( $wgPageImagesBlacklist as $source ) {
		switch ( $source['type'] ) {
			case 'db':
				$names = array_merge(
					$names,
					$this->getDbBlacklist( $source['db'], $source['page'] )
				);
				break;
			case 'url':
				$names = array_merge( $names, $this->getUrlBlacklist( $source['url'] ) );
				break;
			default:
				throw new Exception(
					__METHOD__ . "(): unrecognized image blacklist type '{$source['type']}'" );
		}
	}

	// Flip the list, so callers can do cheap isset() lookups by image name
	$list = array_flip( $names );
	$wgMemc->set( $key, $list, $wgPageImagesBlacklistExpiry );

	return $list;
}
|
|
|
|
|
|
|
|
/**
 * Returns list of images linked by the given blacklist page.
 *
 * An invalid page title is logged as a warning, and results in an empty list. The same
 * is returned when the page can not be found in the database.
 *
 * @param string|bool $dbName Database name or false for current database
 * @param string $page Title of the blacklist page
 *
 * @return string[] Values of the pl_title field of all file links on the page
 */
private function getDbBlacklist( $dbName, $page ) {
	$title = Title::newFromText( $page );
	if ( !$title ) {
		// Title::newFromText() returns null for invalid titles. Bail out here instead of
		// running into a fatal error when calling methods on null below.
		wfLogWarning( __METHOD__ . "(): invalid image blacklist page title '$page'" );
		return [];
	}

	$dbr = wfGetDB( DB_REPLICA, [], $dbName );
	$list = [];

	$id = $dbr->selectField(
		'page',
		'page_id',
		[ 'page_namespace' => $title->getNamespace(), 'page_title' => $title->getDBkey() ],
		__METHOD__
	);

	if ( $id ) {
		// Every link to the file namespace on the blacklist page is a blacklisted image
		$res = $dbr->select( 'pagelinks',
			'pl_title',
			[ 'pl_from' => $id, 'pl_namespace' => NS_FILE ],
			__METHOD__
		);
		foreach ( $res as $row ) {
			$list[] = $row->pl_title;
		}
	}

	return $list;
}
|
|
|
|
|
|
|
|
/**
 * Returns list of images on given remote blacklist page.
 * Not quite 100% bulletproof due to localised namespaces and so on.
 * Though if you beat people if they add bad entries to the list... :)
 *
 * The page is fetched with a timeout of 3 seconds. Everything in it that looks like a
 * "[[:" link ending in one of the $wgFileExtensions is taken as a file name.
 *
 * @param string $url
 *
 * @return string[] DB keys of the blacklisted files, empty if nothing could be fetched
 */
private function getUrlBlacklist( $url ) {
	global $wgFileExtensions;

	$text = Http::get( $url, [ 'timeout' => 3 ], __METHOD__ );
	if ( !$text ) {
		return [];
	}

	// File extensions are configuration values, not regular expressions. Quote them so
	// special characters (including the delimiter) can't break or change the pattern.
	$extensions = array_map(
		static function ( $extension ) {
			return preg_quote( $extension, '/' );
		},
		$wgFileExtensions
	);
	$regex = '/\[\[:([^|\#]*?\.(?:' . implode( '|', $extensions ) . '))/i';

	$list = [];
	if ( preg_match_all( $regex, $text, $matches ) ) {
		// @phan-suppress-next-line PhanTypeArraySuspiciousNullable From capture group in regex
		foreach ( $matches[1] as $s ) {
			$t = Title::makeTitleSafe( NS_FILE, $s );

			// Skip matches that do not make a valid file title
			if ( $t ) {
				$list[] = $t->getDBkey();
			}
		}
	}

	return $list;
}
|
|
|
|
|
|
|
|
}
|