From 72392d0edd3b41c70a51509b97b842cc370e3e08 Mon Sep 17 00:00:00 2001 From: Sebastian Marzjan Date: Tue, 5 Jan 2016 08:45:16 +0100 Subject: [PATCH] Normalize whitespace when processing infobox contents --- services/Sanitizers/NodeSanitizer.php | 15 +++++++++++++-- tests/sanitizers/NodeImageSanitizerTest.php | 15 +++++++++++++++ 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/services/Sanitizers/NodeSanitizer.php b/services/Sanitizers/NodeSanitizer.php index 13f45c0..5a57a04 100644 --- a/services/Sanitizers/NodeSanitizer.php +++ b/services/Sanitizers/NodeSanitizer.php @@ -59,7 +59,7 @@ abstract class NodeSanitizer implements NodeTypeSanitizerInterface { $this->removeNodesBySelector( $xpath, $this->selectorsForFullRemoval ); $nodes = $this->extractNeededNodes( $xpath ); - return $this->generateHTML( $nodes, $dom ); + return $this->normalizeWhitespace( $this->generateHTML( $nodes, $dom ) ); } /** @@ -78,7 +78,18 @@ abstract class NodeSanitizer implements NodeTypeSanitizerInterface { */ $result[] = ( $node->nodeName === '#text' ) ? htmlspecialchars( $dom->saveHTML( $node ), ENT_QUOTES ) : $dom->saveHTML( $node ); } - return implode( '', $result ); + return implode( ' ', $result ); + } + + /** + * Replaces multiple whitespaces with single ones. + * Transparent from non-preformatted HTML point of view + * + * @param $text string + * @return string + */ + protected function normalizeWhitespace( $text ) { + return mbereg_replace( "\s+", " ", $text ); } /** diff --git a/tests/sanitizers/NodeImageSanitizerTest.php b/tests/sanitizers/NodeImageSanitizerTest.php index 5b9cf44..77df8fd 100644 --- a/tests/sanitizers/NodeImageSanitizerTest.php +++ b/tests/sanitizers/NodeImageSanitizerTest.php @@ -43,6 +43,21 @@ class NodeImageSanitizerTest extends WikiaBaseTest { [ [ 'caption' => '' ], [ 'caption' => '' ] + ], + [ + [ 'caption' => '' ], + [ 'caption' => '1 2 3' ] + ], + [ + [ 'caption' => '
  1. 1 +
  2. 2 +
    1. 2.1 +
    +
' ], + [ 'caption' => '1 2 2.1' ] ] ]; }