mirror of
https://gerrit.wikimedia.org/r/mediawiki/extensions/PdfHandler
synced 2024-09-24 02:39:50 +00:00
Cleanup for r56413 - PDF text extraction support:
* use UtfNormal::cleanUp() for UTF-8 and control char cleanup instead of iconv() and a manual strip
* remove the htmlspecialchars() which looks like it shouldn't be here; this is for internal data storage, not HTML output
This commit is contained in:
parent
a62a02638d
commit
e4a25d9ccd
|
@@ -101,14 +101,14 @@ class PdfImage {
|
|||
wfDebug( __METHOD__.": $cmd\n" );
|
||||
$txt = wfShellExec( $cmd, $retval );
|
||||
wfProfileOut( 'pdftotext' );
|
||||
if( $retval == 0) {
|
||||
# Get rid of invalid UTF-8, strip control characters
|
||||
wfSuppressWarnings();
|
||||
$txt = iconv( "UTF-8","UTF-8//IGNORE", $txt );
|
||||
wfRestoreWarnings();
|
||||
$txt = preg_replace( "/[\013\035\037]/", "", $txt );
|
||||
$txt = htmlspecialchars($txt);
|
||||
$pages = preg_split("/\f/s", $txt );
|
||||
if( $retval == 0 ) {
|
||||
$txt = str_replace( "\r\n", "\n", $txt );
|
||||
$pages = explode( "\f", $txt );
|
||||
foreach( $pages as $page => $pageText ) {
|
||||
# Get rid of invalid UTF-8, strip control characters
|
||||
# Note we need to do this per page, as \f page feed would be stripped.
|
||||
$pages[$page] = UtfNormal::cleanUp( $pageText );
|
||||
}
|
||||
$data['text'] = $pages;
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue