Fix truncate code potentially removing whitespace from extract

By turning the (?:…) groups into (?=…), they become lookaheads and are
no longer part of the string returned in $tail. This is exactly what we
want here. All we want is to *know* whether the dot, question mark, or
exclamation mark is followed by a space or newline (or by the end of the
text); we don't need that whitespace to be included in the match.

Change-Id: I4be715c4c084165e5ab25da77609f12ffce4d385
This commit is contained in:
Thiemo Kreuz 2019-03-19 18:28:58 +01:00
parent 81fd92685a
commit 8de415c4fd
2 changed files with 12 additions and 4 deletions

View file

@ -40,12 +40,14 @@ class TextTruncator {
// Based on code from OpenSearchXml by Brion Vibber // Based on code from OpenSearchXml by Brion Vibber
$endchars = [ $endchars = [
// regular ASCII // regular ASCII
'\P{Lu}\.(?:[ \n]|$)', '\P{Lu}\.(?=[ \n]|$)',
'[!?](?:[ \n]|$)', '[!?](?=[ \n]|$)',
// full-width ideographic full-stop // full-width ideographic full-stop
'。', '。',
// double-width roman forms // double-width roman forms
'．', '！', '？', '．',
'！',
'？',
// half-width ideographic full stop // half-width ideographic full stop
'｡', '｡',
]; ];
@ -62,7 +64,7 @@ class TextTruncator {
$index = min( $requestedSentenceCount, $res ) - 1; $index = min( $requestedSentenceCount, $res ) - 1;
list( $tail, $length ) = $matches[0][$index]; list( $tail, $length ) = $matches[0][$index];
// PCRE returns raw offsets, so using substr() instead of mb_substr() // PCRE returns raw offsets, so using substr() instead of mb_substr()
$text = substr( $text, 0, $length ) . trim( $tail ); $text = substr( $text, 0, $length ) . $tail;
return $this->tidy( $text ); return $this->tidy( $text );
} }

View file

@ -107,6 +107,12 @@ class TextTruncatorTest extends \PHPUnit\Framework\TestCase {
65536, 65536,
trim( str_repeat( 'Sentence. ', 65536 ) ), trim( str_repeat( 'Sentence. ', 65536 ) ),
], ],
'Preserve whitespace before end character' => [
'Aa . Bb',
1,
'Aa .',
],
]; ];
} }