mirror of
https://gerrit.wikimedia.org/r/mediawiki/extensions/PdfHandler
synced 2024-11-27 17:50:47 +00:00
Use the new metadata splitting facility to improve PDF performance

* Migrate to the new metadata system: override getSizeAndMetadata()
* Use getHandlerState() instead of a custom property on the File object.
* Opt in to metadata splitting. Avoid loading the text item unless it is
  really needed.
* In getDimensionInfo(), use getHandlerState() instead of the
  WANObjectCache process cache (pcTTL). This is just a micro-optimisation,
  informed by profiling, which showed 90 calls to this function during an
  image page view.

Depends-On: I876ea5c9d3a1881e278f689d2f8a3ae20240c703
Change-Id: I30d0b0009fcb11c14d14663bd1f2c2a3dfac55d6
This commit is contained in:
parent
5073f572cb
commit
989b42b8eb
|
@ -12,7 +12,6 @@ use PoolCounterWorkViaCallback;
|
|||
use ResourceLoader;
|
||||
use ThumbnailImage;
|
||||
use TransformParameterError;
|
||||
use Wikimedia\AtEase\AtEase;
|
||||
|
||||
/**
|
||||
* Copyright © 2007 Martin Seidel (Xarax) <jodeldi@gmx.de>
|
||||
|
@ -49,6 +48,16 @@ class PdfHandler extends ImageHandler {
|
|||
*/
|
||||
private const LARGE_FILE = 1e7;
|
||||
|
||||
/**
|
||||
* Key for getHandlerState for value of type PdfImage
|
||||
*/
|
||||
private const STATE_PDF_IMAGE = 'pdfImage';
|
||||
|
||||
/**
|
||||
* Key for getHandlerState for dimension info
|
||||
*/
|
||||
private const STATE_DIMENSION_INFO = 'pdfDimensionInfo';
|
||||
|
||||
/**
|
||||
* @return bool
|
||||
*/
|
||||
|
@ -259,65 +268,32 @@ class PdfHandler extends ImageHandler {
|
|||
}
|
||||
|
||||
/**
|
||||
* @param File $image
|
||||
* @param \MediaHandlerState $state
|
||||
* @param string $path
|
||||
* @return PdfImage
|
||||
* @suppress PhanUndeclaredProperty
|
||||
*/
|
||||
private function getPdfImage( $image, $path ) {
|
||||
if ( !$image ) {
|
||||
private function getPdfImage( $state, $path ) {
|
||||
$pdfimg = $state->getHandlerState( self::STATE_PDF_IMAGE );
|
||||
if ( !$pdfimg ) {
|
||||
$pdfimg = new PdfImage( $path );
|
||||
} elseif ( !isset( $image->pdfImage ) ) {
|
||||
$pdfimg = $image->pdfImage = new PdfImage( $path );
|
||||
} else {
|
||||
$pdfimg = $image->pdfImage;
|
||||
$state->setHandlerState( self::STATE_PDF_IMAGE, $pdfimg );
|
||||
}
|
||||
|
||||
return $pdfimg;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param File $image
|
||||
* @return bool|array
|
||||
*/
|
||||
private function getMetaArray( $image ) {
|
||||
if ( isset( $image->pdfMetaArray ) ) {
|
||||
return $image->pdfMetaArray;
|
||||
}
|
||||
|
||||
$metadata = $image->getMetadata();
|
||||
|
||||
if ( !$this->isMetadataValid( $image, $metadata ) ) {
|
||||
wfDebug( "Pdf metadata is invalid or missing, should have been fixed in upgradeRow\n" );
|
||||
return false;
|
||||
}
|
||||
|
||||
$work = new PoolCounterWorkViaCallback(
|
||||
'PdfHandler-unserialize-metadata',
|
||||
$image->getName(),
|
||||
[
|
||||
/**
|
||||
* @suppress PhanUndeclaredProperty
|
||||
*/
|
||||
'doWork' => static function () use ( $image, $metadata ) {
|
||||
AtEase::suppressWarnings();
|
||||
$image->pdfMetaArray = unserialize( $metadata );
|
||||
AtEase::restoreWarnings();
|
||||
},
|
||||
]
|
||||
);
|
||||
$work->execute();
|
||||
|
||||
return $image->pdfMetaArray;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param File $image
|
||||
* @param \MediaHandlerState $state
|
||||
* @param string $path
|
||||
* @return array|bool
|
||||
*/
|
||||
public function getImageSize( $image, $path ) {
|
||||
return $this->getPdfImage( $image, $path )->getImageSize();
|
||||
public function getSizeAndMetadata( $state, $path ) {
|
||||
$metadata = $this->getPdfImage( $state, $path )->retrieveMetaData();
|
||||
$sizes = PdfImage::getPageSize( $metadata, 1 );
|
||||
if ( $sizes ) {
|
||||
return $sizes + [ 'metadata' => $metadata ];
|
||||
} else {
|
||||
return [ 'metadata' => $metadata ];
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -338,23 +314,14 @@ class PdfHandler extends ImageHandler {
|
|||
}
|
||||
|
||||
/**
|
||||
* @param File $image
|
||||
* @param string $path
|
||||
* @return string
|
||||
*/
|
||||
public function getMetadata( $image, $path ) {
|
||||
return serialize( $this->getPdfImage( $image, $path )->retrieveMetaData() );
|
||||
}
|
||||
|
||||
/**
|
||||
* @param File $image
|
||||
* @param string $metadata
|
||||
* @param File $file
|
||||
* @return bool
|
||||
*/
|
||||
public function isMetadataValid( $image, $metadata ) {
|
||||
if ( !$metadata || $metadata === serialize( [] ) ) {
|
||||
public function isFileMetadataValid( $file ) {
|
||||
$data = $file->getMetadataItems( [ 'mergedMetadata', 'pages' ] );
|
||||
if ( !isset( $data['pages'] ) ) {
|
||||
return self::METADATA_BAD;
|
||||
} elseif ( strpos( $metadata, 'mergedMetadata' ) === false ) {
|
||||
} elseif ( !isset( $data['mergedMetadata'] ) ) {
|
||||
return self::METADATA_COMPATIBLE;
|
||||
}
|
||||
return self::METADATA_GOOD;
|
||||
|
@ -366,24 +333,14 @@ class PdfHandler extends ImageHandler {
|
|||
* @return bool|array
|
||||
*/
|
||||
public function formatMetadata( $image, $context = false ) {
|
||||
$meta = $image->getMetadata();
|
||||
$mergedMetadata = $image->getMetadataItem( 'mergedMetadata' );
|
||||
|
||||
if ( !$meta ) {
|
||||
return false;
|
||||
}
|
||||
AtEase::suppressWarnings();
|
||||
$meta = unserialize( $meta );
|
||||
AtEase::restoreWarnings();
|
||||
|
||||
if ( !isset( $meta['mergedMetadata'] )
|
||||
|| !is_array( $meta['mergedMetadata'] )
|
||||
|| count( $meta['mergedMetadata'] ) < 1
|
||||
) {
|
||||
if ( !is_array( $mergedMetadata ) || !count( $mergedMetadata ) ) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Inherited from MediaHandler.
|
||||
return $this->formatMetadataHelper( $meta['mergedMetadata'], $context );
|
||||
return $this->formatMetadataHelper( $mergedMetadata, $context );
|
||||
}
|
||||
|
||||
/** @inheritDoc */
|
||||
|
@ -437,19 +394,18 @@ class PdfHandler extends ImageHandler {
|
|||
}
|
||||
|
||||
protected function getDimensionInfo( File $file ) {
|
||||
$info = $file->getHandlerState( self::STATE_DIMENSION_INFO );
|
||||
if ( !$info ) {
|
||||
$cache = MediaWikiServices::getInstance()->getMainWANObjectCache();
|
||||
return $cache->getWithSetCallback(
|
||||
$info = $cache->getWithSetCallback(
|
||||
$cache->makeKey( 'file-pdf', 'dimensions', $file->getSha1() ),
|
||||
$cache::TTL_INDEFINITE,
|
||||
function () use ( $file ) {
|
||||
$data = $this->getMetaArray( $file );
|
||||
static function () use ( $file ) {
|
||||
$data = $file->getMetadataItems( PdfImage::ITEMS_FOR_PAGE_SIZE );
|
||||
if ( !$data || !isset( $data['Pages'] ) ) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// lower peak RAM
|
||||
unset( $data['text'] );
|
||||
|
||||
$dimsByPage = [];
|
||||
$count = intval( $data['Pages'] );
|
||||
for ( $i = 1; $i <= $count; $i++ ) {
|
||||
|
@ -457,10 +413,12 @@ class PdfHandler extends ImageHandler {
|
|||
}
|
||||
|
||||
return [ 'pageCount' => $count, 'dimensionsByPage' => $dimsByPage ];
|
||||
},
|
||||
[ 'pcTTL' => $cache::TTL_INDEFINITE ]
|
||||
}
|
||||
);
|
||||
}
|
||||
$file->setHandlerState( self::STATE_DIMENSION_INFO, $info );
|
||||
return $info;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param File $image
|
||||
|
@ -468,11 +426,11 @@ class PdfHandler extends ImageHandler {
|
|||
* @return bool
|
||||
*/
|
||||
public function getPageText( File $image, $page ) {
|
||||
$data = $this->getMetaArray( $image );
|
||||
if ( !$data || !isset( $data['text'] ) || !isset( $data['text'][$page - 1] ) ) {
|
||||
$pageTexts = $image->getMetadataItem( 'text' );
|
||||
if ( !is_array( $pageTexts ) || !isset( $pageTexts[$page - 1] ) ) {
|
||||
return false;
|
||||
}
|
||||
return $data['text'][$page - 1];
|
||||
return $pageTexts[$page - 1];
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -498,4 +456,8 @@ class PdfHandler extends ImageHandler {
|
|||
'messages' => array_values( self::MESSAGES ),
|
||||
] );
|
||||
}
|
||||
|
||||
public function useSplitMetadata() {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -39,6 +39,8 @@ class PdfImage {
|
|||
*/
|
||||
private $mFilename;
|
||||
|
||||
public const ITEMS_FOR_PAGE_SIZE = [ 'Pages', 'pages', 'Page size', 'Page rot' ];
|
||||
|
||||
/**
|
||||
* @param string $filename
|
||||
*/
|
||||
|
@ -53,22 +55,6 @@ class PdfImage {
|
|||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return array|bool
|
||||
*/
|
||||
public function getImageSize() {
|
||||
$data = $this->retrieveMetadata();
|
||||
$size = self::getPageSize( $data, 1 );
|
||||
|
||||
if ( $size ) {
|
||||
$width = $size['width'];
|
||||
$height = $size['height'];
|
||||
return [ $width, $height, 'Pdf',
|
||||
"width=\"$width\" height=\"$height\"" ];
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param array $data
|
||||
* @param int $page
|
||||
|
@ -117,9 +103,9 @@ class PdfImage {
|
|||
}
|
||||
|
||||
/**
|
||||
* @return array|bool|null
|
||||
* @return array
|
||||
*/
|
||||
public function retrieveMetaData() {
|
||||
public function retrieveMetaData(): array {
|
||||
global $wgPdfInfo, $wgPdftoText;
|
||||
|
||||
if ( $wgPdfInfo ) {
|
||||
|
@ -151,7 +137,7 @@ class PdfImage {
|
|||
$dump = $resultMeta->getStdout() . $resultPages->getStdout();
|
||||
$data = $this->convertDumpToArray( $dump );
|
||||
} else {
|
||||
$data = null;
|
||||
$data = [];
|
||||
}
|
||||
|
||||
// Read text layer
|
||||
|
@ -177,11 +163,11 @@ class PdfImage {
|
|||
|
||||
/**
|
||||
* @param string $dump
|
||||
* @return array|bool
|
||||
* @return array
|
||||
*/
|
||||
protected function convertDumpToArray( $dump ) {
|
||||
protected function convertDumpToArray( $dump ): array {
|
||||
if ( strval( $dump ) == '' ) {
|
||||
return false;
|
||||
return [];
|
||||
}
|
||||
|
||||
$lines = explode( "\n", $dump );
|
||||
|
|
Loading…
Reference in a new issue