mediawiki-extensions-PdfHan.../scripts/retrieveMetaData.sh
Derk-Jan Hartman b846970ae2 Use the PDF cropbox for rendering
By default the mediabox is used. This is the full potential area of
a page, as also used by PDF editors, and it can contain areas outside
of the page.
The cropbox is also the size that is reported by pdfinfo as the
pagesize.

Bug: T167420
Change-Id: I92267a9dbe81b6e0e471b8eae1e4c2ba4e5d84e9
2022-06-15 18:39:35 +00:00

44 lines
1.1 KiB
Bash

#!/bin/sh
# Tool locations are taken from the environment. When a variable is unset
# or empty it falls back to the bare command name, and both variables are
# exported either way.
: "${PDFHANDLER_INFO:=pdfinfo}"
: "${PDFHANDLER_TOTEXT:=pdftotext}"
export PDFHANDLER_INFO PDFHANDLER_TOTEXT
# runInfo
# Runs "$PDFHANDLER_INFO" twice against ./file.pdf and stores the two
# reports in the files ./meta and ./pages in the current directory.
# Returns the exit status of the second (page data) invocation.
runInfo() {
	# poppler 0.26 accepted -meta together with the page data options, but
	# as of poppler 0.48 the two have to be requested in separate runs:
	# https://bugs.freedesktop.org/show_bug.cgi?id=96801

	# First run: XMP metadata, reported as UTF-8 text.
	"$PDFHANDLER_INFO" -enc 'UTF-8' -meta file.pdf >meta

	# Second run: page sizes for all pages (the huge -l value is there so
	# that no page is left out), again reported as UTF-8 text.
	"$PDFHANDLER_INFO" -enc 'UTF-8' -l 9999999 file.pdf >pages
}
# runToText
# Extracts the text of ./file.pdf with "$PDFHANDLER_TOTEXT" into the file
# ./text and records the extractor's exit status in ./text_exit_code.
runToText() {
	# The CropBox is the region a PDF viewer is expected to display or
	# print. pdftotext only gained -cropbox in poppler 21.03.0, and that
	# option only works together with -bbox, so it cannot be used here.
	# As a consequence the output may include text that is not visible
	# in the PDF.
	"$PDFHANDLER_TOTEXT" file.pdf - >text

	# Persist the exit status of the extraction for later use.
	printf '%s\n' "$?" >text_exit_code
}
# isRunnable NAME
# Succeeds when NAME can be executed the way this script invokes it:
#   - a value containing a slash must be an executable file at that path;
#   - a bare command name must be resolvable by the shell (e.g. via PATH).
# An empty NAME is never runnable.
isRunnable() {
	case "$1" in
		'') return 1 ;;
		*/*) [ -x "$1" ] ;;
		*) command -v "$1" >/dev/null 2>&1 ;;
	esac
}

# Run each step only if its tool is available. A plain `[ -x ... ]` test
# treats the value as a file path, so the bare default names (pdfinfo,
# pdftotext) would only pass if a file of that name existed in the current
# directory, and the defaults would never be run.
if isRunnable "$PDFHANDLER_INFO"; then
	runInfo
fi
if isRunnable "$PDFHANDLER_TOTEXT"; then
	runToText
fi