Use the new metadata splitting facility to improve PDF performance

* Migrate to the new metadata system: override getSizeAndMetadata() and
  isFileMetadataValid() instead of getImageSize(), getMetadata() and
  isMetadataValid().
* Use getHandlerState() instead of a custom property on the File object.
* Opt in to metadata splitting. Avoid loading the text item unless it is
  really needed.
* In getDimensionInfo(), use getHandlerState() instead of the
  WANObjectCache process cache (pcTTL). This is just a
  micro-optimisation, informed by profiling, which showed 90 calls to
  this function during an image page view.

Depends-On: I876ea5c9d3a1881e278f689d2f8a3ae20240c703
Change-Id: I30d0b0009fcb11c14d14663bd1f2c2a3dfac55d6
This commit is contained in:
Tim Starling 2021-06-07 13:47:53 +10:00
parent 5073f572cb
commit 989b42b8eb
2 changed files with 70 additions and 122 deletions

View file

@ -12,7 +12,6 @@ use PoolCounterWorkViaCallback;
use ResourceLoader;
use ThumbnailImage;
use TransformParameterError;
use Wikimedia\AtEase\AtEase;
/**
* Copyright © 2007 Martin Seidel (Xarax) <jodeldi@gmx.de>
@ -49,6 +48,16 @@ class PdfHandler extends ImageHandler {
*/
private const LARGE_FILE = 1e7;
/**
* Key for getHandlerState for value of type PdfImage
*/
private const STATE_PDF_IMAGE = 'pdfImage';
/**
* Key for getHandlerState for dimension info
*/
private const STATE_DIMENSION_INFO = 'pdfDimensionInfo';
/**
* @return bool
*/
@ -259,65 +268,32 @@ class PdfHandler extends ImageHandler {
}
/**
* @param File $image
* @param \MediaHandlerState $state
* @param string $path
* @return PdfImage
* @suppress PhanUndeclaredProperty
*/
// NOTE(review): this is a rendered diff hunk without +/- markers, so lines from the
// ( $image, $path ) version and the ( $state, $path ) version appear interleaved below.
private function getPdfImage( $image, $path ) {
if ( !$image ) {
private function getPdfImage( $state, $path ) {
// Look up a PdfImage previously stored under the STATE_PDF_IMAGE key.
$pdfimg = $state->getHandlerState( self::STATE_PDF_IMAGE );
if ( !$pdfimg ) {
$pdfimg = new PdfImage( $path );
} elseif ( !isset( $image->pdfImage ) ) {
$pdfimg = $image->pdfImage = new PdfImage( $path );
} else {
$pdfimg = $image->pdfImage;
// Store the PdfImage under STATE_PDF_IMAGE so later lookups on this state find it.
$state->setHandlerState( self::STATE_PDF_IMAGE, $pdfimg );
}
return $pdfimg;
}
/**
 * Return the unserialized metadata array for a file, caching it on the File object
 * as the ad-hoc property pdfMetaArray.
 *
 * @param File $image
 * @return bool|array Whatever unserialize() produced for the stored metadata, or false
 *  when the metadata fails isMetadataValid() or the pool counter work did not populate
 *  the cached property
 */
private function getMetaArray( $image ) {
	// Reuse the array already decoded for this File object, if any.
	if ( isset( $image->pdfMetaArray ) ) {
		return $image->pdfMetaArray;
	}

	$metadata = $image->getMetadata();
	if ( !$this->isMetadataValid( $image, $metadata ) ) {
		wfDebug( "Pdf metadata is invalid or missing, should have been fixed in upgradeRow\n" );
		return false;
	}

	// The unserialize step is run as pool counter work keyed by the file name.
	$work = new PoolCounterWorkViaCallback(
		'PdfHandler-unserialize-metadata',
		$image->getName(),
		[
			/**
			 * @suppress PhanUndeclaredProperty
			 */
			'doWork' => static function () use ( $image, $metadata ) {
				AtEase::suppressWarnings();
				$image->pdfMetaArray = unserialize( $metadata );
				AtEase::restoreWarnings();
			},
		]
	);
	$work->execute();

	// Only the doWork callback assigns pdfMetaArray. If execute() returned without
	// running it, the property is still undefined; report that as false rather than
	// reading an undefined property.
	return $image->pdfMetaArray ?? false;
}
/**
* @param File $image
* @param \MediaHandlerState $state
* @param string $path
* @return array|bool
*/
// NOTE(review): rendered diff hunk without +/- markers — the getImageSize() lines and the
// getSizeAndMetadata() lines appear together below.
public function getImageSize( $image, $path ) {
return $this->getPdfImage( $image, $path )->getImageSize();
public function getSizeAndMetadata( $state, $path ) {
$metadata = $this->getPdfImage( $state, $path )->retrieveMetaData();
// The reported size is the size of page 1.
$sizes = PdfImage::getPageSize( $metadata, 1 );
if ( $sizes ) {
// Array union: the entries returned by getPageSize() plus a 'metadata' entry.
return $sizes + [ 'metadata' => $metadata ];
} else {
// No page size was obtained; return the metadata on its own.
return [ 'metadata' => $metadata ];
}
}
/**
@ -338,23 +314,14 @@ class PdfHandler extends ImageHandler {
}
/**
 * Extract the metadata of the PDF at $path and return it in serialized form.
 *
 * @param File $image
 * @param string $path
 * @return string Output of serialize() applied to PdfImage::retrieveMetaData()
 */
public function getMetadata( $image, $path ) {
	$pdfImage = $this->getPdfImage( $image, $path );
	$extracted = $pdfImage->retrieveMetaData();

	return serialize( $extracted );
}
/**
* @param File $image
* @param string $metadata
* @param File $file
* @return bool
*/
public function isMetadataValid( $image, $metadata ) {
if ( !$metadata || $metadata === serialize( [] ) ) {
public function isFileMetadataValid( $file ) {
$data = $file->getMetadataItems( [ 'mergedMetadata', 'pages' ] );
if ( !isset( $data['pages'] ) ) {
return self::METADATA_BAD;
} elseif ( strpos( $metadata, 'mergedMetadata' ) === false ) {
} elseif ( !isset( $data['mergedMetadata'] ) ) {
return self::METADATA_COMPATIBLE;
}
return self::METADATA_GOOD;
@ -366,24 +333,14 @@ class PdfHandler extends ImageHandler {
* @return bool|array
*/
// NOTE(review): rendered diff hunk without +/- markers — the unserialize()-based lines and
// the getMetadataItem()-based lines are interleaved below.
public function formatMetadata( $image, $context = false ) {
$meta = $image->getMetadata();
// Fetch only the 'mergedMetadata' item from the file.
$mergedMetadata = $image->getMetadataItem( 'mergedMetadata' );
if ( !$meta ) {
return false;
}
AtEase::suppressWarnings();
$meta = unserialize( $meta );
AtEase::restoreWarnings();
if ( !isset( $meta['mergedMetadata'] )
|| !is_array( $meta['mergedMetadata'] )
|| count( $meta['mergedMetadata'] ) < 1
) {
// Nothing to format unless mergedMetadata is a non-empty array.
if ( !is_array( $mergedMetadata ) || !count( $mergedMetadata ) ) {
return false;
}
// Inherited from MediaHandler.
return $this->formatMetadataHelper( $meta['mergedMetadata'], $context );
return $this->formatMetadataHelper( $mergedMetadata, $context );
}
/** @inheritDoc */
@ -437,29 +394,30 @@ class PdfHandler extends ImageHandler {
}
// NOTE(review): rendered diff hunk without +/- markers — the version that returns the
// WAN cache result directly (with the pcTTL option) and the version that consults handler
// state first are interleaved below.
protected function getDimensionInfo( File $file ) {
$cache = MediaWikiServices::getInstance()->getMainWANObjectCache();
return $cache->getWithSetCallback(
$cache->makeKey( 'file-pdf', 'dimensions', $file->getSha1() ),
$cache::TTL_INDEFINITE,
function () use ( $file ) {
$data = $this->getMetaArray( $file );
if ( !$data || !isset( $data['Pages'] ) ) {
return false;
// First look for a value already stored on the File under STATE_DIMENSION_INFO.
$info = $file->getHandlerState( self::STATE_DIMENSION_INFO );
if ( !$info ) {
$cache = MediaWikiServices::getInstance()->getMainWANObjectCache();
// The cache key is built from the file's SHA-1.
$info = $cache->getWithSetCallback(
$cache->makeKey( 'file-pdf', 'dimensions', $file->getSha1() ),
$cache::TTL_INDEFINITE,
static function () use ( $file ) {
// Request only the items named in PdfImage::ITEMS_FOR_PAGE_SIZE.
$data = $file->getMetadataItems( PdfImage::ITEMS_FOR_PAGE_SIZE );
if ( !$data || !isset( $data['Pages'] ) ) {
return false;
}
$dimsByPage = [];
$count = intval( $data['Pages'] );
// Page numbers run from 1 to $count inclusive.
for ( $i = 1; $i <= $count; $i++ ) {
$dimsByPage[$i] = PdfImage::getPageSize( $data, $i );
}
return [ 'pageCount' => $count, 'dimensionsByPage' => $dimsByPage ];
}
// lower peak RAM
unset( $data['text'] );
$dimsByPage = [];
$count = intval( $data['Pages'] );
for ( $i = 1; $i <= $count; $i++ ) {
$dimsByPage[$i] = PdfImage::getPageSize( $data, $i );
}
return [ 'pageCount' => $count, 'dimensionsByPage' => $dimsByPage ];
},
[ 'pcTTL' => $cache::TTL_INDEFINITE ]
);
);
}
// Record the result in handler state. NOTE(review): this appears to sit after the
// closing brace of the `if ( !$info )` block, so it would run on every call — confirm
// against the unrendered file.
$file->setHandlerState( self::STATE_DIMENSION_INFO, $info );
return $info;
}
/**
@ -468,11 +426,11 @@ class PdfHandler extends ImageHandler {
* @return string|bool Text stored for the requested page, or false when there is none
*/
// NOTE(review): rendered diff hunk without +/- markers — the getMetaArray()-based lines and
// the getMetadataItem()-based lines are interleaved below.
public function getPageText( File $image, $page ) {
$data = $this->getMetaArray( $image );
if ( !$data || !isset( $data['text'] ) || !isset( $data['text'][$page - 1] ) ) {
// Fetch only the 'text' metadata item.
$pageTexts = $image->getMetadataItem( 'text' );
// The entry for page $page is looked up at index $page - 1.
if ( !is_array( $pageTexts ) || !isset( $pageTexts[$page - 1] ) ) {
return false;
}
return $data['text'][$page - 1];
return $pageTexts[$page - 1];
}
/**
@ -498,4 +456,8 @@ class PdfHandler extends ImageHandler {
'messages' => array_values( self::MESSAGES ),
] );
}
/**
 * Opt in to split metadata for this handler; unconditionally returns true.
 *
 * NOTE(review): presumably overrides a base-class method consulted by the metadata
 * storage layer — confirm against the parent class.
 *
 * @return bool
 */
public function useSplitMetadata() {
return true;
}
}

View file

@ -39,6 +39,8 @@ class PdfImage {
*/
private $mFilename;
public const ITEMS_FOR_PAGE_SIZE = [ 'Pages', 'pages', 'Page size', 'Page rot' ];
/**
* @param string $filename
*/
@ -53,22 +55,6 @@ class PdfImage {
return true;
}
/**
 * Report the size of the first page in a four-element list: width, height, the
 * literal 'Pdf', and an HTML width/height attribute string.
 *
 * @return array|bool The list described above, or false when getPageSize() yields
 *  nothing usable for page 1
 */
public function getImageSize() {
	$size = self::getPageSize( $this->retrieveMetadata(), 1 );

	// Guard clause: no size for page 1 means there is nothing to report.
	if ( !$size ) {
		return false;
	}

	$width = $size['width'];
	$height = $size['height'];
	$attribs = "width=\"$width\" height=\"$height\"";

	return [ $width, $height, 'Pdf', $attribs ];
}
/**
* @param array $data
* @param int $page
@ -117,9 +103,9 @@ class PdfImage {
}
/**
* @return array|bool|null
* @return array
*/
public function retrieveMetaData() {
public function retrieveMetaData(): array {
global $wgPdfInfo, $wgPdftoText;
if ( $wgPdfInfo ) {
@ -151,7 +137,7 @@ class PdfImage {
$dump = $resultMeta->getStdout() . $resultPages->getStdout();
$data = $this->convertDumpToArray( $dump );
} else {
$data = null;
$data = [];
}
// Read text layer
@ -177,11 +163,11 @@ class PdfImage {
/**
* @param string $dump
* @return array|bool
* @return array
*/
protected function convertDumpToArray( $dump ) {
protected function convertDumpToArray( $dump ): array {
if ( strval( $dump ) == '' ) {
return false;
return [];
}
$lines = explode( "\n", $dump );