Use the new metadata splitting facility to improve PDF performance

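* Migrate to the new metadata system: override getSizeAndMetadata()
  in place of getImageSize() and getMetadata(), and
  isFileMetadataValid() in place of isMetadataValid().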
* Use getHandlerState() instead of a custom property on the File object.
* Opt in to metadata splitting. Avoid loading the text item unless it is
  really needed.
* In getDimensionInfo(), memoise the result with getHandlerState()
  instead of the WANObjectCache process cache (pcTTL); the WAN cache
  lookup itself is kept. This is just a micro-optimisation, informed by
  profiling, which showed 90 calls to getDimensionInfo() during an
  image page view.

Depends-On: I876ea5c9d3a1881e278f689d2f8a3ae20240c703
Change-Id: I30d0b0009fcb11c14d14663bd1f2c2a3dfac55d6
This commit is contained in:
Tim Starling 2021-06-07 13:47:53 +10:00
parent 5073f572cb
commit 989b42b8eb
2 changed files with 70 additions and 122 deletions

View file

@ -12,7 +12,6 @@ use PoolCounterWorkViaCallback;
use ResourceLoader; use ResourceLoader;
use ThumbnailImage; use ThumbnailImage;
use TransformParameterError; use TransformParameterError;
use Wikimedia\AtEase\AtEase;
/** /**
* Copyright © 2007 Martin Seidel (Xarax) <jodeldi@gmx.de> * Copyright © 2007 Martin Seidel (Xarax) <jodeldi@gmx.de>
@ -49,6 +48,16 @@ class PdfHandler extends ImageHandler {
*/ */
private const LARGE_FILE = 1e7; private const LARGE_FILE = 1e7;
/**
* Key for getHandlerState for value of type PdfImage
*/
private const STATE_PDF_IMAGE = 'pdfImage';
/**
* Key for getHandlerState for dimension info
*/
private const STATE_DIMENSION_INFO = 'pdfDimensionInfo';
/** /**
* @return bool * @return bool
*/ */
@ -259,65 +268,32 @@ class PdfHandler extends ImageHandler {
} }
/** /**
* @param File $image * @param \MediaHandlerState $state
* @param string $path * @param string $path
* @return PdfImage * @return PdfImage
* @suppress PhanUndeclaredProperty
*/ */
private function getPdfImage( $image, $path ) { private function getPdfImage( $state, $path ) {
if ( !$image ) { $pdfimg = $state->getHandlerState( self::STATE_PDF_IMAGE );
if ( !$pdfimg ) {
$pdfimg = new PdfImage( $path ); $pdfimg = new PdfImage( $path );
} elseif ( !isset( $image->pdfImage ) ) { $state->setHandlerState( self::STATE_PDF_IMAGE, $pdfimg );
$pdfimg = $image->pdfImage = new PdfImage( $path );
} else {
$pdfimg = $image->pdfImage;
} }
return $pdfimg; return $pdfimg;
} }
/** /**
* @param File $image * @param \MediaHandlerState $state
* @return bool|array
*/
private function getMetaArray( $image ) {
if ( isset( $image->pdfMetaArray ) ) {
return $image->pdfMetaArray;
}
$metadata = $image->getMetadata();
if ( !$this->isMetadataValid( $image, $metadata ) ) {
wfDebug( "Pdf metadata is invalid or missing, should have been fixed in upgradeRow\n" );
return false;
}
$work = new PoolCounterWorkViaCallback(
'PdfHandler-unserialize-metadata',
$image->getName(),
[
/**
* @suppress PhanUndeclaredProperty
*/
'doWork' => static function () use ( $image, $metadata ) {
AtEase::suppressWarnings();
$image->pdfMetaArray = unserialize( $metadata );
AtEase::restoreWarnings();
},
]
);
$work->execute();
return $image->pdfMetaArray;
}
/**
* @param File $image
* @param string $path * @param string $path
* @return array|bool * @return array|bool
*/ */
public function getImageSize( $image, $path ) { public function getSizeAndMetadata( $state, $path ) {
return $this->getPdfImage( $image, $path )->getImageSize(); $metadata = $this->getPdfImage( $state, $path )->retrieveMetaData();
$sizes = PdfImage::getPageSize( $metadata, 1 );
if ( $sizes ) {
return $sizes + [ 'metadata' => $metadata ];
} else {
return [ 'metadata' => $metadata ];
}
} }
/** /**
@ -338,23 +314,14 @@ class PdfHandler extends ImageHandler {
} }
/** /**
* @param File $image * @param File $file
* @param string $path
* @return string
*/
public function getMetadata( $image, $path ) {
return serialize( $this->getPdfImage( $image, $path )->retrieveMetaData() );
}
/**
* @param File $image
* @param string $metadata
* @return bool * @return bool
*/ */
public function isMetadataValid( $image, $metadata ) { public function isFileMetadataValid( $file ) {
if ( !$metadata || $metadata === serialize( [] ) ) { $data = $file->getMetadataItems( [ 'mergedMetadata', 'pages' ] );
if ( !isset( $data['pages'] ) ) {
return self::METADATA_BAD; return self::METADATA_BAD;
} elseif ( strpos( $metadata, 'mergedMetadata' ) === false ) { } elseif ( !isset( $data['mergedMetadata'] ) ) {
return self::METADATA_COMPATIBLE; return self::METADATA_COMPATIBLE;
} }
return self::METADATA_GOOD; return self::METADATA_GOOD;
@ -366,24 +333,14 @@ class PdfHandler extends ImageHandler {
* @return bool|array * @return bool|array
*/ */
public function formatMetadata( $image, $context = false ) { public function formatMetadata( $image, $context = false ) {
$meta = $image->getMetadata(); $mergedMetadata = $image->getMetadataItem( 'mergedMetadata' );
if ( !$meta ) { if ( !is_array( $mergedMetadata ) || !count( $mergedMetadata ) ) {
return false;
}
AtEase::suppressWarnings();
$meta = unserialize( $meta );
AtEase::restoreWarnings();
if ( !isset( $meta['mergedMetadata'] )
|| !is_array( $meta['mergedMetadata'] )
|| count( $meta['mergedMetadata'] ) < 1
) {
return false; return false;
} }
// Inherited from MediaHandler. // Inherited from MediaHandler.
return $this->formatMetadataHelper( $meta['mergedMetadata'], $context ); return $this->formatMetadataHelper( $mergedMetadata, $context );
} }
/** @inheritDoc */ /** @inheritDoc */
@ -437,29 +394,30 @@ class PdfHandler extends ImageHandler {
} }
protected function getDimensionInfo( File $file ) { protected function getDimensionInfo( File $file ) {
$cache = MediaWikiServices::getInstance()->getMainWANObjectCache(); $info = $file->getHandlerState( self::STATE_DIMENSION_INFO );
return $cache->getWithSetCallback( if ( !$info ) {
$cache->makeKey( 'file-pdf', 'dimensions', $file->getSha1() ), $cache = MediaWikiServices::getInstance()->getMainWANObjectCache();
$cache::TTL_INDEFINITE, $info = $cache->getWithSetCallback(
function () use ( $file ) { $cache->makeKey( 'file-pdf', 'dimensions', $file->getSha1() ),
$data = $this->getMetaArray( $file ); $cache::TTL_INDEFINITE,
if ( !$data || !isset( $data['Pages'] ) ) { static function () use ( $file ) {
return false; $data = $file->getMetadataItems( PdfImage::ITEMS_FOR_PAGE_SIZE );
if ( !$data || !isset( $data['Pages'] ) ) {
return false;
}
$dimsByPage = [];
$count = intval( $data['Pages'] );
for ( $i = 1; $i <= $count; $i++ ) {
$dimsByPage[$i] = PdfImage::getPageSize( $data, $i );
}
return [ 'pageCount' => $count, 'dimensionsByPage' => $dimsByPage ];
} }
);
// lower peak RAM }
unset( $data['text'] ); $file->setHandlerState( self::STATE_DIMENSION_INFO, $info );
return $info;
$dimsByPage = [];
$count = intval( $data['Pages'] );
for ( $i = 1; $i <= $count; $i++ ) {
$dimsByPage[$i] = PdfImage::getPageSize( $data, $i );
}
return [ 'pageCount' => $count, 'dimensionsByPage' => $dimsByPage ];
},
[ 'pcTTL' => $cache::TTL_INDEFINITE ]
);
} }
/** /**
@ -468,11 +426,11 @@ class PdfHandler extends ImageHandler {
* @return bool * @return bool
*/ */
public function getPageText( File $image, $page ) { public function getPageText( File $image, $page ) {
$data = $this->getMetaArray( $image ); $pageTexts = $image->getMetadataItem( 'text' );
if ( !$data || !isset( $data['text'] ) || !isset( $data['text'][$page - 1] ) ) { if ( !is_array( $pageTexts ) || !isset( $pageTexts[$page - 1] ) ) {
return false; return false;
} }
return $data['text'][$page - 1]; return $pageTexts[$page - 1];
} }
/** /**
@ -498,4 +456,8 @@ class PdfHandler extends ImageHandler {
'messages' => array_values( self::MESSAGES ), 'messages' => array_values( self::MESSAGES ),
] ); ] );
} }
public function useSplitMetadata() {
return true;
}
} }

View file

@ -39,6 +39,8 @@ class PdfImage {
*/ */
private $mFilename; private $mFilename;
public const ITEMS_FOR_PAGE_SIZE = [ 'Pages', 'pages', 'Page size', 'Page rot' ];
/** /**
* @param string $filename * @param string $filename
*/ */
@ -53,22 +55,6 @@ class PdfImage {
return true; return true;
} }
/**
* @return array|bool
*/
public function getImageSize() {
$data = $this->retrieveMetadata();
$size = self::getPageSize( $data, 1 );
if ( $size ) {
$width = $size['width'];
$height = $size['height'];
return [ $width, $height, 'Pdf',
"width=\"$width\" height=\"$height\"" ];
}
return false;
}
/** /**
* @param array $data * @param array $data
* @param int $page * @param int $page
@ -117,9 +103,9 @@ class PdfImage {
} }
/** /**
* @return array|bool|null * @return array
*/ */
public function retrieveMetaData() { public function retrieveMetaData(): array {
global $wgPdfInfo, $wgPdftoText; global $wgPdfInfo, $wgPdftoText;
if ( $wgPdfInfo ) { if ( $wgPdfInfo ) {
@ -151,7 +137,7 @@ class PdfImage {
$dump = $resultMeta->getStdout() . $resultPages->getStdout(); $dump = $resultMeta->getStdout() . $resultPages->getStdout();
$data = $this->convertDumpToArray( $dump ); $data = $this->convertDumpToArray( $dump );
} else { } else {
$data = null; $data = [];
} }
// Read text layer // Read text layer
@ -177,11 +163,11 @@ class PdfImage {
/** /**
* @param string $dump * @param string $dump
* @return array|bool * @return array
*/ */
protected function convertDumpToArray( $dump ) { protected function convertDumpToArray( $dump ): array {
if ( strval( $dump ) == '' ) { if ( strval( $dump ) == '' ) {
return false; return [];
} }
$lines = explode( "\n", $dump ); $lines = explode( "\n", $dump );