mirror of
https://gerrit.wikimedia.org/r/mediawiki/extensions/PdfHandler
synced 2024-09-23 10:23:03 +00:00
Port retrieveMetaData to BoxedCommand
Combine all 3 shellouts into one script, retrieveMetaData.sh. The script is executed by /bin/sh by default; this can be changed for Windows users by setting $wgPdfHandlerShell. pdftotext is a bit special since its behavior varies based on the program's exit code, so save that in a file so we can check it independently of the overall exit status. Bug: T289228 Change-Id: I29750bcc282bd5f9b8e2f79aa340869738ea5f5b
This commit is contained in:
parent
7d6a851e1d
commit
b253dc04c4
|
@ -9,7 +9,7 @@
|
|||
"license-name": "GPL-2.0-or-later",
|
||||
"type": "media",
|
||||
"requires": {
|
||||
"MediaWiki": ">= 1.34.0"
|
||||
"MediaWiki": ">= 1.36.0"
|
||||
},
|
||||
"config": {
|
||||
"PdfOutputExtension": {
|
||||
|
@ -32,6 +32,9 @@
|
|||
},
|
||||
"PdfInfo": {
|
||||
"value": "/usr/bin/pdfinfo"
|
||||
},
|
||||
"PdfHandlerShell": {
|
||||
"value": "/bin/sh"
|
||||
}
|
||||
},
|
||||
"MessagesDirs": {
|
||||
|
|
|
@ -23,7 +23,7 @@ namespace MediaWiki\Extension\PdfHandler;
|
|||
|
||||
use BitmapMetadataHandler;
|
||||
use MediaWiki\Logger\LoggerFactory;
|
||||
use MediaWiki\Shell\Shell;
|
||||
use MediaWiki\MediaWikiServices;
|
||||
use UtfNormal\Validator;
|
||||
use Wikimedia\XMPReader\Reader as XMPReader;
|
||||
|
||||
|
@ -106,59 +106,61 @@ class PdfImage {
|
|||
* @return array
|
||||
*/
|
||||
public function retrieveMetaData(): array {
|
||||
global $wgPdfInfo, $wgPdftoText;
|
||||
global $wgPdfInfo, $wgPdftoText, $wgPdfHandlerShell;
|
||||
|
||||
if ( $wgPdfInfo ) {
|
||||
// Note in poppler 0.26 the -meta and page data options worked together,
|
||||
// but as of poppler 0.48 they must be queried separately.
|
||||
// https://bugs.freedesktop.org/show_bug.cgi?id=96801
|
||||
$cmdMeta = [
|
||||
$wgPdfInfo,
|
||||
# Report metadata as UTF-8 text...
|
||||
'-enc', 'UTF-8',
|
||||
# Report XMP metadata
|
||||
'-meta',
|
||||
$this->mFilename,
|
||||
];
|
||||
$resultMeta = Shell::command( $cmdMeta )
|
||||
->execute();
|
||||
$command = MediaWikiServices::getInstance()->getShellCommandFactory()
|
||||
->createBoxed( 'pdfhandler' )
|
||||
->disableNetwork()
|
||||
->firejailDefaultSeccomp()
|
||||
->routeName( 'pdfhandler-metadata' );
|
||||
|
||||
$cmdPages = [
|
||||
$wgPdfInfo,
|
||||
# Report metadata as UTF-8 text...
|
||||
'-enc', 'UTF-8',
|
||||
# Report page sizes for all pages
|
||||
'-l', '9999999',
|
||||
$this->mFilename,
|
||||
];
|
||||
$resultPages = Shell::command( $cmdPages )
|
||||
->execute();
|
||||
$result = $command
|
||||
->params( $wgPdfHandlerShell, 'scripts/retrieveMetaData.sh' )
|
||||
->inputFileFromFile(
|
||||
'scripts/retrieveMetaData.sh',
|
||||
__DIR__ . '/../scripts/retrieveMetaData.sh' )
|
||||
->inputFileFromFile( 'file.pdf', $this->mFilename )
|
||||
->outputFileToString( 'meta' )
|
||||
->outputFileToString( 'pages' )
|
||||
->outputFileToString( 'text' )
|
||||
->outputFileToString( 'text_exit_code' )
|
||||
->environment( [
|
||||
'PDFHANDLER_INFO' => $wgPdfInfo,
|
||||
'PDFHANDLER_TOTEXT' => $wgPdftoText,
|
||||
] )
|
||||
->execute();
|
||||
|
||||
// Record in statsd
|
||||
MediaWikiServices::getInstance()->getStatsdDataFactory()
|
||||
->increment( 'pdfhandler.shell.retrieve_meta_data' );
|
||||
|
||||
$resultMeta = $result->getFileContents( 'meta' );
|
||||
$resultPages = $result->getFileContents( 'pages' );
|
||||
if ( $resultMeta !== null || $resultPages !== null ) {
|
||||
$data = $this->convertDumpToArray(
|
||||
$resultMeta->getStdout(),
|
||||
$resultPages->getStdout()
|
||||
$resultMeta ?? '',
|
||||
$resultPages ?? ''
|
||||
);
|
||||
} else {
|
||||
$data = [];
|
||||
}
|
||||
|
||||
// Read text layer
|
||||
if ( isset( $wgPdftoText ) ) {
|
||||
$cmd = [ $wgPdftoText, $this->mFilename, '-' ];
|
||||
$result = Shell::command( $cmd )
|
||||
->execute();
|
||||
|
||||
if ( $result->getExitCode() === 0 ) {
|
||||
$txt = str_replace( "\r\n", "\n", $result->getStdout() );
|
||||
$pages = explode( "\f", $txt );
|
||||
foreach ( $pages as $page => $pageText ) {
|
||||
// Get rid of invalid UTF-8, strip control characters
|
||||
// Note we need to do this per page, as \f page feed would be stripped.
|
||||
$pages[$page] = Validator::cleanUp( $pageText );
|
||||
}
|
||||
$data['text'] = $pages;
|
||||
$retval = $result->wasReceived( 'text_exit_code' )
|
||||
? (int)trim( $result->getFileContents( 'text_exit_code' ) )
|
||||
: 1;
|
||||
$txt = $result->getFileContents( 'text' );
|
||||
if ( $retval == 0 && strlen( $txt ) ) {
|
||||
$txt = str_replace( "\r\n", "\n", $txt );
|
||||
$pages = explode( "\f", $txt );
|
||||
foreach ( $pages as $page => $pageText ) {
|
||||
// Get rid of invalid UTF-8, strip control characters
|
||||
// Note we need to do this per page, as \f page feed would be stripped.
|
||||
$pages[$page] = Validator::cleanUp( $pageText );
|
||||
}
|
||||
$data['text'] = $pages;
|
||||
}
|
||||
|
||||
return $data;
|
||||
}
|
||||
|
||||
|
|
39
scripts/retrieveMetaData.sh
Normal file
39
scripts/retrieveMetaData.sh
Normal file
|
@ -0,0 +1,39 @@
|
|||
#!/bin/sh
|
||||
|
||||
# Tool locations come from the environment. When a variable is unset or
# empty, fall back to the bare command name, then export both so child
# processes see the resolved values.
: "${PDFHANDLER_INFO:=pdfinfo}"
: "${PDFHANDLER_TOTEXT:=pdftotext}"
export PDFHANDLER_INFO PDFHANDLER_TOTEXT
|
||||
|
||||
# Run $PDFHANDLER_INFO twice against ./file.pdf, writing the XMP metadata
# report to ./meta and the per-page report to ./pages.
runInfo() {
	# poppler 0.26 accepted -meta together with the page data options, but
	# as of poppler 0.48 each has to be requested in a separate invocation.
	# https://bugs.freedesktop.org/show_bug.cgi?id=96801

	# First pass: XMP metadata, reported as UTF-8 text
	"$PDFHANDLER_INFO" -enc UTF-8 -meta file.pdf >meta

	# Second pass: page sizes for all pages, reported as UTF-8 text
	"$PDFHANDLER_INFO" -enc UTF-8 -l 9999999 file.pdf >pages
}
|
||||
|
||||
# Run $PDFHANDLER_TOTEXT against ./file.pdf, capturing its stdout in ./text
# and its exit status in ./text_exit_code.
runToText() {
	"$PDFHANDLER_TOTEXT" file.pdf - >text
	# Persist the exit status in its own file so it can be inspected later,
	# separately from the exit status of the script as a whole
	printf '%s\n' "$?" >text_exit_code
}
|
||||
|
||||
# Run each tool only when it is actually available.
#
# `command -v` succeeds both for a path to an executable file and for a bare
# command name that resolves through PATH. A plain `[ -x ... ]` test only
# works for paths, so the bare-name defaults ("pdfinfo", "pdftotext") would
# be skipped unless a file of that name happened to sit in the working
# directory.
if command -v "$PDFHANDLER_INFO" >/dev/null 2>&1; then
	runInfo
fi

if command -v "$PDFHANDLER_TOTEXT" >/dev/null 2>&1; then
	runToText
fi
|
Loading…
Reference in a new issue