From e4a25d9ccd91423bfc67ad5b8424ee62171d524d Mon Sep 17 00:00:00 2001
From: Brion Vibber <brion@users.mediawiki.org>
Date: Thu, 1 Oct 2009 23:29:05 +0000
Subject: [PATCH] Cleanup for r56413 - PDF text extraction support: * use
 UtfNormal::cleanUp() for UTF-8 and control char cleanup instead of iconv()
 and a manual strip * remove the htmlspecialchars() which looks like it
 shouldn't be here; this is for internal data storage not HTML output

---
 PdfHandler.image.php | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/PdfHandler.image.php b/PdfHandler.image.php
index 0586cf4..7f9ebc8 100644
--- a/PdfHandler.image.php
+++ b/PdfHandler.image.php
@@ -101,14 +101,14 @@ class PdfImage {
 			wfDebug( __METHOD__.": $cmd\n" );
 			$txt = wfShellExec( $cmd, $retval );
 			wfProfileOut( 'pdftotext' );
-			if( $retval == 0) {
-				# Get rid of invalid UTF-8, strip control characters
-				wfSuppressWarnings();
-				$txt = iconv( "UTF-8","UTF-8//IGNORE", $txt );
-				wfRestoreWarnings();
-				$txt = preg_replace( "/[\013\035\037]/", "", $txt );
-				$txt = htmlspecialchars($txt);
-				$pages = preg_split("/\f/s", $txt  );
+			if( $retval == 0 ) {
+				$txt = str_replace( "\r\n", "\n", $txt );
+				$pages = explode( "\f", $txt );
+				foreach( $pages as $page => $pageText ) {
+					# Get rid of invalid UTF-8, strip control characters
+					# Note: this must be done per page, because cleaning the whole text at once would strip the \f page-feed separators.
+					$pages[$page] = UtfNormal::cleanUp( $pageText );
+				}
 				$data['text'] = $pages;
 			}
 		}