Fix broken PDF XMP extraction

XMP extraction does not work for me with libpoppler 0.86, because when
the output of the two commands is concatenated, there is no "Metadata:"
prefix introducing the XMP. As a result, convertDumpToArray() ends up
splitting every line of the XML on the colon characters in attribute
names, spamming lots of little properties into the final result.

I can confirm that it's also broken in production.

So, just treat the output of pdfinfo -meta as plain XML: pass it to
convertDumpToArray() separately and store it directly as the 'xmp' value.

Change-Id: Ia3df17daed0f27e95294b5d97872ec064c79965c
This commit is contained in:
Tim Starling 2021-06-07 15:30:03 +10:00
parent 989b42b8eb
commit 86df1cd6c6

View file

@ -134,8 +134,10 @@ class PdfImage {
$resultPages = Shell::command( $cmdPages )
->execute();
$dump = $resultMeta->getStdout() . $resultPages->getStdout();
$data = $this->convertDumpToArray( $dump );
$data = $this->convertDumpToArray(
$resultMeta->getStdout(),
$resultPages->getStdout()
);
} else {
$data = [];
}
@ -162,15 +164,16 @@ class PdfImage {
}
/**
* @param string $dump
* @param string $metaDump
* @param string $infoDump
* @return array
*/
protected function convertDumpToArray( $dump ): array {
if ( strval( $dump ) == '' ) {
protected function convertDumpToArray( $metaDump, $infoDump ): array {
if ( strval( $infoDump ) == '' ) {
return [];
}
$lines = explode( "\n", $dump );
$lines = explode( "\n", $infoDump );
$data = [];
// Metadata is always the last item, and spans multiple lines.
@ -204,6 +207,10 @@ class PdfImage {
}
}
}
$metaDump = trim( $metaDump );
if ( $metaDump !== '' ) {
$data['xmp'] = $metaDump;
}
$data = $this->postProcessDump( $data );
return $data;
}