From b846970ae2db6c22ed9ce3e433b2f7c813608459 Mon Sep 17 00:00:00 2001 From: Derk-Jan Hartman Date: Tue, 14 Jun 2022 22:48:11 +0200 Subject: [PATCH] Use the PDF cropbox for rendering By default the mediabox is used. This is the full potential area of a page, as also used by PDF editors, and it can contain areas outside of the visible page. The cropbox is also the size that pdfinfo reports as the page size. Bug: T167420 Change-Id: I92267a9dbe81b6e0e471b8eae1e4c2ba4e5d84e9 --- includes/PdfHandler.php | 2 ++ scripts/retrieveMetaData.sh | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/includes/PdfHandler.php b/includes/PdfHandler.php index aed8e77..0ce0faf 100644 --- a/includes/PdfHandler.php +++ b/includes/PdfHandler.php @@ -217,6 +217,8 @@ class PdfHandler extends ImageHandler { "-dLastPage={$page}", "-dSAFER", "-r{$wgPdfHandlerDpi}", + // CropBox defines the region that the PDF viewer application is expected to display or print. + "-dUseCropBox", "-dBATCH", "-dNOPAUSE", "-q", diff --git a/scripts/retrieveMetaData.sh b/scripts/retrieveMetaData.sh index 3c3f440..dbd0aee 100644 --- a/scripts/retrieveMetaData.sh +++ b/scripts/retrieveMetaData.sh @@ -24,6 +24,10 @@ runInfo() { } runToText() { + # CropBox defines the region that the PDF viewer application is expected to display or print. + # pdftotext's -cropbox was only introduced in poppler 21.03.0 + # It also only works with -bbox so we cannot use it. + # Some text that is not visible in the PDF might thus be included in the output "$PDFHANDLER_TOTEXT" \ file.pdf - > text # Store exit code so we can use it later