mirror of
https://gerrit.wikimedia.org/r/mediawiki/extensions/PdfHandler
synced 2024-12-04 12:49:23 +00:00
b846970ae2
By default the MediaBox is used. This is the full potential area of a page, as also used by PDF editors, and it can contain areas outside of the page. The CropBox is also the size that pdfinfo reports as the page size. Bug: T167420 Change-Id: I92267a9dbe81b6e0e471b8eae1e4c2ba4e5d84e9
44 lines
1.1 KiB
Bash
44 lines
1.1 KiB
Bash
#!/bin/sh

# Get parameters from environment.
# Each variable holds the command used to invoke the corresponding tool.
# When a variable is unset or empty, the bare tool name is used instead.
# Both are exported so child processes see the resolved values.
export PDFHANDLER_INFO="${PDFHANDLER_INFO:-pdfinfo}"
export PDFHANDLER_TOTEXT="${PDFHANDLER_TOTEXT:-pdftotext}"

#######################################
# Query file.pdf (in the current directory) twice with the pdfinfo command.
# Globals:   PDFHANDLER_INFO (read) - command used to invoke pdfinfo
# Arguments: none
# Outputs:   writes the metadata report to ./meta and the page report
#            to ./pages
# Returns:   status of the second invocation (the page report)
#######################################
runInfo() {
  # Note in poppler 0.26 the -meta and page data options worked together,
  # but as of poppler 0.48 they must be queried separately.
  # https://bugs.freedesktop.org/show_bug.cgi?id=96801

  # Report metadata as UTF-8 text...and report XMP metadata
  "$PDFHANDLER_INFO" \
    -enc 'UTF-8' \
    -meta \
    file.pdf > meta

  # Report metadata as UTF-8 text...and report page sizes for all pages
  "$PDFHANDLER_INFO" \
    -enc 'UTF-8' \
    -l 9999999 \
    file.pdf > pages
}

#######################################
# Extract text from file.pdf (in the current directory) with the pdftotext
# command and record that command's exit status.
# Globals:   PDFHANDLER_TOTEXT (read) - command used to invoke pdftotext
# Arguments: none
# Outputs:   writes the extracted text to ./text and the tool's exit
#            status (one decimal number) to ./text_exit_code
# Returns:   status of writing ./text_exit_code, not of the tool itself
#######################################
runToText() {
  # CropBox defines the region that the PDF viewer application is expected to display or print.
  # pdftotext's -cropbox was only introduced in poppler 21.03.0
  # It also only works with -bbox so we cannot use it.
  # Some text that is not visible in the PDF might thus be included in the output
  "$PDFHANDLER_TOTEXT" \
    file.pdf - > text
  # Store exit code so we can use it later.
  # This must directly follow the command above so that $? still refers to it.
  printf '%s\n' "$?" > text_exit_code
}

# Run each tool only when its configured command is an executable file.
# NOTE(review): `[ -x ]` tests a file path and does not search PATH, so a
# bare command name passes only if an executable of that name exists in the
# current directory. Confirm callers pass full paths, or that skipping the
# tool in that case is intended.
if [ -x "$PDFHANDLER_INFO" ]; then
  runInfo
fi

if [ -x "$PDFHANDLER_TOTEXT" ]; then
  runToText
fi