From 8de415c4fda9cb7845627c45bea61bcc5ca9afd2 Mon Sep 17 00:00:00 2001 From: Thiemo Kreuz Date: Tue, 19 Mar 2019 18:28:58 +0100 Subject: [PATCH] Fix truncate code potentially removing whitespace from extract MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit By turning the (?:…) into (?=…) they become lookaheads and are not part of the returned string in $tail any more. This is exactly what we want here. All we want is to *know* if the dot, question or exclamation mark is followed by a space. But we don't need the space captured. Change-Id: I4be715c4c084165e5ab25da77609f12ffce4d385 --- includes/TextTruncator.php | 10 ++++++---- tests/phpunit/TextTruncatorTest.php | 6 ++++++ 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/includes/TextTruncator.php b/includes/TextTruncator.php index acc942f..770ee25 100644 --- a/includes/TextTruncator.php +++ b/includes/TextTruncator.php @@ -40,12 +40,14 @@ class TextTruncator { // Based on code from OpenSearchXml by Brion Vibber $endchars = [ // regular ASCII - '\P{Lu}\.(?:[ \n]|$)', - '[!?](?:[ \n]|$)', + '\P{Lu}\.(?=[ \n]|$)', + '[!?](?=[ \n]|$)', // full-width ideographic full-stop '。', // double-width roman forms - '.', '!', '?', + '.', + '!', + '?', // half-width ideographic full stop '。', ]; @@ -62,7 +64,7 @@ class TextTruncator { $index = min( $requestedSentenceCount, $res ) - 1; list( $tail, $length ) = $matches[0][$index]; // PCRE returns raw offsets, so using substr() instead of mb_substr() - $text = substr( $text, 0, $length ) . trim( $tail ); + $text = substr( $text, 0, $length ) . $tail; return $this->tidy( $text ); } diff --git a/tests/phpunit/TextTruncatorTest.php b/tests/phpunit/TextTruncatorTest.php index f9fba9f..1980e75 100644 --- a/tests/phpunit/TextTruncatorTest.php +++ b/tests/phpunit/TextTruncatorTest.php @@ -107,6 +107,12 @@ class TextTruncatorTest extends \PHPUnit\Framework\TestCase { 65536, trim( str_repeat( 'Sentence. ', 65536 ) ), ], + + 'Preserve whitespace before end character' => [ + 'Aa . Bb', + 1, + 'Aa .', + ], ]; }