mirror of
https://gerrit.wikimedia.org/r/mediawiki/extensions/PdfHandler
synced 2024-11-24 00:07:10 +00:00
Fix broken PDF XMP extraction
XMP extraction does not work for me with libpoppler 0.86: when the output of the two commands is concatenated, there is no "Metadata:" prefix introducing the XMP. As a result, the parser ends up splitting every line of the XML on the colon characters in attribute names, spamming lots of little bogus properties into the final result. I can confirm that it is also broken in production. So, pass the output of pdfinfo -meta separately and just treat it as plain XML.
Change-Id: Ia3df17daed0f27e95294b5d97872ec064c79965c
This commit is contained in:
parent
989b42b8eb
commit
86df1cd6c6
|
@ -134,8 +134,10 @@ class PdfImage {
|
|||
$resultPages = Shell::command( $cmdPages )
|
||||
->execute();
|
||||
|
||||
$dump = $resultMeta->getStdout() . $resultPages->getStdout();
|
||||
$data = $this->convertDumpToArray( $dump );
|
||||
$data = $this->convertDumpToArray(
|
||||
$resultMeta->getStdout(),
|
||||
$resultPages->getStdout()
|
||||
);
|
||||
} else {
|
||||
$data = [];
|
||||
}
|
||||
|
@ -162,15 +164,16 @@ class PdfImage {
|
|||
}
|
||||
|
||||
/**
|
||||
* @param string $dump
|
||||
* @param string $metaDump
|
||||
* @param string $infoDump
|
||||
* @return array
|
||||
*/
|
||||
protected function convertDumpToArray( $dump ): array {
|
||||
if ( strval( $dump ) == '' ) {
|
||||
protected function convertDumpToArray( $metaDump, $infoDump ): array {
|
||||
if ( strval( $infoDump ) == '' ) {
|
||||
return [];
|
||||
}
|
||||
|
||||
$lines = explode( "\n", $dump );
|
||||
$lines = explode( "\n", $infoDump );
|
||||
$data = [];
|
||||
|
||||
// Metadata is always the last item, and spans multiple lines.
|
||||
|
@ -204,6 +207,10 @@ class PdfImage {
|
|||
}
|
||||
}
|
||||
}
|
||||
$metaDump = trim( $metaDump );
|
||||
if ( $metaDump !== '' ) {
|
||||
$data['xmp'] = $metaDump;
|
||||
}
|
||||
$data = $this->postProcessDump( $data );
|
||||
return $data;
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue