Fix truncate code potentially removing whitespace from extract

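Turning the (?:…) groups into (?=…) makes them lookaheads, so the whitespace they match is no longer part of the string returned in $tail. This is exactly what we want here: we only need to *know* whether the dot, question mark or exclamation mark is followed by a space, a newline or the end of the text; we don't need that whitespace captured. Because $tail no longer carries trailing whitespace, the trim() call on it is dropped, which also stops it from stripping whitespace that precedes the end character (e.g. "Aa ." no longer becomes "Aa.").

Change-Id: I4be715c4c084165e5ab25da77609f12ffce4d385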
2024-11-23 15:56:52 +00:00 · 2019-03-19 18:28:58 +01:00 · 2019-03-19 18:28:58 +01:00 · 8de415c4fd
parent 81fd92685a
commit 8de415c4fd
2 changed files with 12 additions and 4 deletions
--- a/includes/TextTruncator.php
+++ b/includes/TextTruncator.php
@@ -40,12 +40,14 @@ class TextTruncator {
 		// Based on code from OpenSearchXml by Brion Vibber
 		$endchars = [
 			// regular ASCII
-			'\P{Lu}\.(?:[ \n]|$)',
+			'\P{Lu}\.(?=[ \n]|$)',
-			'[!?](?:[ \n]|$)',
+			'[!?](?=[ \n]|$)',
 			// full-width ideographic full-stop
 			'。',
 			// double-width roman forms
-			'．', '！', '？',
+			'．',
+			'！',
+			'？',
 			// half-width ideographic full stop
 			'｡',
 		];
@@ -62,7 +64,7 @@ class TextTruncator {
 		$index = min( $requestedSentenceCount, $res ) - 1;
 		list( $tail, $length ) = $matches[0][$index];
 		// PCRE returns raw offsets, so using substr() instead of mb_substr()
-		$text = substr( $text, 0, $length ) . trim( $tail );
+		$text = substr( $text, 0, $length ) . $tail;
 		return $this->tidy( $text );
 	}
--- a/tests/phpunit/TextTruncatorTest.php
+++ b/tests/phpunit/TextTruncatorTest.php
@@ -107,6 +107,12 @@ class TextTruncatorTest extends \PHPUnit\Framework\TestCase {
 				65536,
 				trim( str_repeat( 'Sentence. ', 65536 ) ),
 			],
+			'Preserve whitespace before end character' => [
+				'Aa . Bb',
+				1,
+				'Aa .',
+			],
 		];
 	}