Fix broken PDF XMP extraction

XMP extraction does not work for me with libpoppler 0.86, because when the output of the two commands is concatenated, there is no "Metadata:" prefix introducing the XMP. As a result, the dump parser ends up splitting every line of the XML on the colon characters in attribute names, spamming lots of little properties into the final result. I can confirm that it's also broken in production. So, just treat the output of pdfinfo -meta as plain XML.

Change-Id: Ia3df17daed0f27e95294b5d97872ec064c79965c
2024-11-24 00:07:10 +00:00 · 2021-06-07 15:30:03 +10:00 · 2021-06-07 15:30:03 +10:00 · 86df1cd6c6
parent 989b42b8eb
commit 86df1cd6c6
1 changed files with 13 additions and 6 deletions
--- a/includes/PdfImage.php
+++ b/includes/PdfImage.php
@@ -134,8 +134,10 @@ class PdfImage {
 			$resultPages = Shell::command( $cmdPages )
 				->execute();

-			$dump = $resultMeta->getStdout() . $resultPages->getStdout();
-			$data = $this->convertDumpToArray( $dump );
+			$data = $this->convertDumpToArray(
+				$resultMeta->getStdout(),
+				$resultPages->getStdout()
+			);
 		} else {
 			$data = [];
 		}
@@ -162,15 +164,16 @@ class PdfImage {
 	}

 	/**
-	 * @param string $dump
+	 * @param string $metaDump
+	 * @param string $infoDump
 	 * @return array
 	 */
-	protected function convertDumpToArray( $dump ): array {
-		if ( strval( $dump ) == '' ) {
+	protected function convertDumpToArray( $metaDump, $infoDump ): array {
+		if ( strval( $infoDump ) == '' ) {
 			return [];
 		}

-		$lines = explode( "\n", $dump );
+		$lines = explode( "\n", $infoDump );
 		$data = [];

 		// Metadata is always the last item, and spans multiple lines.
@@ -204,6 +207,10 @@ class PdfImage {
 				}
 			}
 		}
+		$metaDump = trim( $metaDump );
+		if ( $metaDump !== '' ) {
+			$data['xmp'] = $metaDump;
+		}
 		$data = $this->postProcessDump( $data );
 		return $data;
 	}