mirror of
https://gerrit.wikimedia.org/r/mediawiki/extensions/PdfHandler
synced 2024-11-24 00:07:10 +00:00
Use the new metadata splitting facility to improve PDF performance
* Migrate to the new metadata system: override getSizeAndMetadata().
* Use getHandlerState() instead of a custom property on the File object.
* Opt in to metadata splitting. Avoid loading the text item unless it is really needed.
* In getDimensionInfo(), use getHandlerState() instead of the WANObjectCache process cache (pcTTL). This is just a micro-optimisation, informed by profiling, which showed 90 calls to this function during an image page view.

Depends-On: I876ea5c9d3a1881e278f689d2f8a3ae20240c703
Change-Id: I30d0b0009fcb11c14d14663bd1f2c2a3dfac55d6
This commit is contained in:
parent
5073f572cb
commit
989b42b8eb
|
@ -12,7 +12,6 @@ use PoolCounterWorkViaCallback;
|
||||||
use ResourceLoader;
|
use ResourceLoader;
|
||||||
use ThumbnailImage;
|
use ThumbnailImage;
|
||||||
use TransformParameterError;
|
use TransformParameterError;
|
||||||
use Wikimedia\AtEase\AtEase;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Copyright © 2007 Martin Seidel (Xarax) <jodeldi@gmx.de>
|
* Copyright © 2007 Martin Seidel (Xarax) <jodeldi@gmx.de>
|
||||||
|
@ -49,6 +48,16 @@ class PdfHandler extends ImageHandler {
|
||||||
*/
|
*/
|
||||||
private const LARGE_FILE = 1e7;
|
private const LARGE_FILE = 1e7;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Key for getHandlerState for value of type PdfImage
|
||||||
|
*/
|
||||||
|
private const STATE_PDF_IMAGE = 'pdfImage';
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Key for getHandlerState for dimension info
|
||||||
|
*/
|
||||||
|
private const STATE_DIMENSION_INFO = 'pdfDimensionInfo';
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @return bool
|
* @return bool
|
||||||
*/
|
*/
|
||||||
|
@ -259,65 +268,32 @@ class PdfHandler extends ImageHandler {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @param File $image
|
* @param \MediaHandlerState $state
|
||||||
* @param string $path
|
* @param string $path
|
||||||
* @return PdfImage
|
* @return PdfImage
|
||||||
* @suppress PhanUndeclaredProperty
|
|
||||||
*/
|
*/
|
||||||
private function getPdfImage( $image, $path ) {
|
private function getPdfImage( $state, $path ) {
|
||||||
if ( !$image ) {
|
$pdfimg = $state->getHandlerState( self::STATE_PDF_IMAGE );
|
||||||
|
if ( !$pdfimg ) {
|
||||||
$pdfimg = new PdfImage( $path );
|
$pdfimg = new PdfImage( $path );
|
||||||
} elseif ( !isset( $image->pdfImage ) ) {
|
$state->setHandlerState( self::STATE_PDF_IMAGE, $pdfimg );
|
||||||
$pdfimg = $image->pdfImage = new PdfImage( $path );
|
|
||||||
} else {
|
|
||||||
$pdfimg = $image->pdfImage;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return $pdfimg;
|
return $pdfimg;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @param File $image
|
* @param \MediaHandlerState $state
|
||||||
* @return bool|array
|
|
||||||
*/
|
|
||||||
private function getMetaArray( $image ) {
|
|
||||||
if ( isset( $image->pdfMetaArray ) ) {
|
|
||||||
return $image->pdfMetaArray;
|
|
||||||
}
|
|
||||||
|
|
||||||
$metadata = $image->getMetadata();
|
|
||||||
|
|
||||||
if ( !$this->isMetadataValid( $image, $metadata ) ) {
|
|
||||||
wfDebug( "Pdf metadata is invalid or missing, should have been fixed in upgradeRow\n" );
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
$work = new PoolCounterWorkViaCallback(
|
|
||||||
'PdfHandler-unserialize-metadata',
|
|
||||||
$image->getName(),
|
|
||||||
[
|
|
||||||
/**
|
|
||||||
* @suppress PhanUndeclaredProperty
|
|
||||||
*/
|
|
||||||
'doWork' => static function () use ( $image, $metadata ) {
|
|
||||||
AtEase::suppressWarnings();
|
|
||||||
$image->pdfMetaArray = unserialize( $metadata );
|
|
||||||
AtEase::restoreWarnings();
|
|
||||||
},
|
|
||||||
]
|
|
||||||
);
|
|
||||||
$work->execute();
|
|
||||||
|
|
||||||
return $image->pdfMetaArray;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @param File $image
|
|
||||||
* @param string $path
|
* @param string $path
|
||||||
* @return array|bool
|
* @return array|bool
|
||||||
*/
|
*/
|
||||||
public function getImageSize( $image, $path ) {
|
public function getSizeAndMetadata( $state, $path ) {
|
||||||
return $this->getPdfImage( $image, $path )->getImageSize();
|
$metadata = $this->getPdfImage( $state, $path )->retrieveMetaData();
|
||||||
|
$sizes = PdfImage::getPageSize( $metadata, 1 );
|
||||||
|
if ( $sizes ) {
|
||||||
|
return $sizes + [ 'metadata' => $metadata ];
|
||||||
|
} else {
|
||||||
|
return [ 'metadata' => $metadata ];
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -338,23 +314,14 @@ class PdfHandler extends ImageHandler {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @param File $image
|
* @param File $file
|
||||||
* @param string $path
|
|
||||||
* @return string
|
|
||||||
*/
|
|
||||||
public function getMetadata( $image, $path ) {
|
|
||||||
return serialize( $this->getPdfImage( $image, $path )->retrieveMetaData() );
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @param File $image
|
|
||||||
* @param string $metadata
|
|
||||||
* @return bool
|
* @return bool
|
||||||
*/
|
*/
|
||||||
public function isMetadataValid( $image, $metadata ) {
|
public function isFileMetadataValid( $file ) {
|
||||||
if ( !$metadata || $metadata === serialize( [] ) ) {
|
$data = $file->getMetadataItems( [ 'mergedMetadata', 'pages' ] );
|
||||||
|
if ( !isset( $data['pages'] ) ) {
|
||||||
return self::METADATA_BAD;
|
return self::METADATA_BAD;
|
||||||
} elseif ( strpos( $metadata, 'mergedMetadata' ) === false ) {
|
} elseif ( !isset( $data['mergedMetadata'] ) ) {
|
||||||
return self::METADATA_COMPATIBLE;
|
return self::METADATA_COMPATIBLE;
|
||||||
}
|
}
|
||||||
return self::METADATA_GOOD;
|
return self::METADATA_GOOD;
|
||||||
|
@ -366,24 +333,14 @@ class PdfHandler extends ImageHandler {
|
||||||
* @return bool|array
|
* @return bool|array
|
||||||
*/
|
*/
|
||||||
public function formatMetadata( $image, $context = false ) {
|
public function formatMetadata( $image, $context = false ) {
|
||||||
$meta = $image->getMetadata();
|
$mergedMetadata = $image->getMetadataItem( 'mergedMetadata' );
|
||||||
|
|
||||||
if ( !$meta ) {
|
if ( !is_array( $mergedMetadata ) || !count( $mergedMetadata ) ) {
|
||||||
return false;
|
|
||||||
}
|
|
||||||
AtEase::suppressWarnings();
|
|
||||||
$meta = unserialize( $meta );
|
|
||||||
AtEase::restoreWarnings();
|
|
||||||
|
|
||||||
if ( !isset( $meta['mergedMetadata'] )
|
|
||||||
|| !is_array( $meta['mergedMetadata'] )
|
|
||||||
|| count( $meta['mergedMetadata'] ) < 1
|
|
||||||
) {
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Inherited from MediaHandler.
|
// Inherited from MediaHandler.
|
||||||
return $this->formatMetadataHelper( $meta['mergedMetadata'], $context );
|
return $this->formatMetadataHelper( $mergedMetadata, $context );
|
||||||
}
|
}
|
||||||
|
|
||||||
/** @inheritDoc */
|
/** @inheritDoc */
|
||||||
|
@ -437,29 +394,30 @@ class PdfHandler extends ImageHandler {
|
||||||
}
|
}
|
||||||
|
|
||||||
protected function getDimensionInfo( File $file ) {
|
protected function getDimensionInfo( File $file ) {
|
||||||
$cache = MediaWikiServices::getInstance()->getMainWANObjectCache();
|
$info = $file->getHandlerState( self::STATE_DIMENSION_INFO );
|
||||||
return $cache->getWithSetCallback(
|
if ( !$info ) {
|
||||||
$cache->makeKey( 'file-pdf', 'dimensions', $file->getSha1() ),
|
$cache = MediaWikiServices::getInstance()->getMainWANObjectCache();
|
||||||
$cache::TTL_INDEFINITE,
|
$info = $cache->getWithSetCallback(
|
||||||
function () use ( $file ) {
|
$cache->makeKey( 'file-pdf', 'dimensions', $file->getSha1() ),
|
||||||
$data = $this->getMetaArray( $file );
|
$cache::TTL_INDEFINITE,
|
||||||
if ( !$data || !isset( $data['Pages'] ) ) {
|
static function () use ( $file ) {
|
||||||
return false;
|
$data = $file->getMetadataItems( PdfImage::ITEMS_FOR_PAGE_SIZE );
|
||||||
|
if ( !$data || !isset( $data['Pages'] ) ) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
$dimsByPage = [];
|
||||||
|
$count = intval( $data['Pages'] );
|
||||||
|
for ( $i = 1; $i <= $count; $i++ ) {
|
||||||
|
$dimsByPage[$i] = PdfImage::getPageSize( $data, $i );
|
||||||
|
}
|
||||||
|
|
||||||
|
return [ 'pageCount' => $count, 'dimensionsByPage' => $dimsByPage ];
|
||||||
}
|
}
|
||||||
|
);
|
||||||
// lower peak RAM
|
}
|
||||||
unset( $data['text'] );
|
$file->setHandlerState( self::STATE_DIMENSION_INFO, $info );
|
||||||
|
return $info;
|
||||||
$dimsByPage = [];
|
|
||||||
$count = intval( $data['Pages'] );
|
|
||||||
for ( $i = 1; $i <= $count; $i++ ) {
|
|
||||||
$dimsByPage[$i] = PdfImage::getPageSize( $data, $i );
|
|
||||||
}
|
|
||||||
|
|
||||||
return [ 'pageCount' => $count, 'dimensionsByPage' => $dimsByPage ];
|
|
||||||
},
|
|
||||||
[ 'pcTTL' => $cache::TTL_INDEFINITE ]
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -468,11 +426,11 @@ class PdfHandler extends ImageHandler {
|
||||||
* @return bool
|
* @return bool
|
||||||
*/
|
*/
|
||||||
public function getPageText( File $image, $page ) {
|
public function getPageText( File $image, $page ) {
|
||||||
$data = $this->getMetaArray( $image );
|
$pageTexts = $image->getMetadataItem( 'text' );
|
||||||
if ( !$data || !isset( $data['text'] ) || !isset( $data['text'][$page - 1] ) ) {
|
if ( !is_array( $pageTexts ) || !isset( $pageTexts[$page - 1] ) ) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
return $data['text'][$page - 1];
|
return $pageTexts[$page - 1];
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -498,4 +456,8 @@ class PdfHandler extends ImageHandler {
|
||||||
'messages' => array_values( self::MESSAGES ),
|
'messages' => array_values( self::MESSAGES ),
|
||||||
] );
|
] );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public function useSplitMetadata() {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -39,6 +39,8 @@ class PdfImage {
|
||||||
*/
|
*/
|
||||||
private $mFilename;
|
private $mFilename;
|
||||||
|
|
||||||
|
public const ITEMS_FOR_PAGE_SIZE = [ 'Pages', 'pages', 'Page size', 'Page rot' ];
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @param string $filename
|
* @param string $filename
|
||||||
*/
|
*/
|
||||||
|
@ -53,22 +55,6 @@ class PdfImage {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* @return array|bool
|
|
||||||
*/
|
|
||||||
public function getImageSize() {
|
|
||||||
$data = $this->retrieveMetadata();
|
|
||||||
$size = self::getPageSize( $data, 1 );
|
|
||||||
|
|
||||||
if ( $size ) {
|
|
||||||
$width = $size['width'];
|
|
||||||
$height = $size['height'];
|
|
||||||
return [ $width, $height, 'Pdf',
|
|
||||||
"width=\"$width\" height=\"$height\"" ];
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @param array $data
|
* @param array $data
|
||||||
* @param int $page
|
* @param int $page
|
||||||
|
@ -117,9 +103,9 @@ class PdfImage {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @return array|bool|null
|
* @return array
|
||||||
*/
|
*/
|
||||||
public function retrieveMetaData() {
|
public function retrieveMetaData(): array {
|
||||||
global $wgPdfInfo, $wgPdftoText;
|
global $wgPdfInfo, $wgPdftoText;
|
||||||
|
|
||||||
if ( $wgPdfInfo ) {
|
if ( $wgPdfInfo ) {
|
||||||
|
@ -151,7 +137,7 @@ class PdfImage {
|
||||||
$dump = $resultMeta->getStdout() . $resultPages->getStdout();
|
$dump = $resultMeta->getStdout() . $resultPages->getStdout();
|
||||||
$data = $this->convertDumpToArray( $dump );
|
$data = $this->convertDumpToArray( $dump );
|
||||||
} else {
|
} else {
|
||||||
$data = null;
|
$data = [];
|
||||||
}
|
}
|
||||||
|
|
||||||
// Read text layer
|
// Read text layer
|
||||||
|
@ -177,11 +163,11 @@ class PdfImage {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @param string $dump
|
* @param string $dump
|
||||||
* @return array|bool
|
* @return array
|
||||||
*/
|
*/
|
||||||
protected function convertDumpToArray( $dump ) {
|
protected function convertDumpToArray( $dump ): array {
|
||||||
if ( strval( $dump ) == '' ) {
|
if ( strval( $dump ) == '' ) {
|
||||||
return false;
|
return [];
|
||||||
}
|
}
|
||||||
|
|
||||||
$lines = explode( "\n", $dump );
|
$lines = explode( "\n", $dump );
|
||||||
|
|
Loading…
Reference in a new issue