From e4a25d9ccd91423bfc67ad5b8424ee62171d524d Mon Sep 17 00:00:00 2001
From: Brion Vibber
Date: Thu, 1 Oct 2009 23:29:05 +0000
Subject: [PATCH] Cleanup for r56413 - PDF text extraction support:
 * use UtfNormal::cleanUp() for UTF-8 and control char cleanup instead of
 iconv() and a manual strip
 * remove the htmlspecialchars() which looks like it shouldn't be here; this
 is for internal data storage not HTML output

---
 PdfHandler.image.php | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/PdfHandler.image.php b/PdfHandler.image.php
index 0586cf4..7f9ebc8 100644
--- a/PdfHandler.image.php
+++ b/PdfHandler.image.php
@@ -101,14 +101,14 @@ class PdfImage {
 			wfDebug( __METHOD__.": $cmd\n" );
 			$txt = wfShellExec( $cmd, $retval );
 			wfProfileOut( 'pdftotext' );
-			if( $retval == 0) {
-				# Get rid of invalid UTF-8, strip control characters
-				wfSuppressWarnings();
-				$txt = iconv( "UTF-8","UTF-8//IGNORE", $txt );
-				wfRestoreWarnings();
-				$txt = preg_replace( "/[\013\035\037]/", "", $txt );
-				$txt = htmlspecialchars($txt);
-				$pages = preg_split("/\f/s", $txt );
+			if( $retval == 0 ) {
+				$txt = str_replace( "\r\n", "\n", $txt );
+				$pages = explode( "\f", $txt );
+				foreach( $pages as $page => $pageText ) {
+					# Get rid of invalid UTF-8, strip control characters
+					# Note we need to do this per page, as \f page feed would be stripped.
+					$pages[$page] = UtfNormal::cleanUp( $pageText );
+				}
 				$data['text'] = $pages;
 			}
 		}