mirror of
https://gerrit.wikimedia.org/r/mediawiki/extensions/TextExtracts
synced 2024-11-23 15:56:52 +00:00
Fix truncate code potentially removing whitespace from extract
Turning the non-capturing groups (?:…) into (?=…) makes them lookaheads, so whatever they match is no longer part of the string returned in $tail. This is exactly what we want here: all we need is to *know* whether the dot, question mark or exclamation mark is followed by a space (or the end of the text); we do not need that space captured.

Change-Id: I4be715c4c084165e5ab25da77609f12ffce4d385
This commit is contained in:
parent
81fd92685a
commit
8de415c4fd
|
@ -40,12 +40,14 @@ class TextTruncator {
|
|||
// Based on code from OpenSearchXml by Brion Vibber
|
||||
$endchars = [
|
||||
// regular ASCII
|
||||
'\P{Lu}\.(?:[ \n]|$)',
|
||||
'[!?](?:[ \n]|$)',
|
||||
'\P{Lu}\.(?=[ \n]|$)',
|
||||
'[!?](?=[ \n]|$)',
|
||||
// full-width ideographic full-stop
|
||||
'。',
|
||||
// double-width roman forms
|
||||
'.', '!', '?',
|
||||
'.',
|
||||
'!',
|
||||
'?',
|
||||
// half-width ideographic full stop
|
||||
'。',
|
||||
];
|
||||
|
@ -62,7 +64,7 @@ class TextTruncator {
|
|||
$index = min( $requestedSentenceCount, $res ) - 1;
|
||||
list( $tail, $length ) = $matches[0][$index];
|
||||
// PCRE returns raw offsets, so using substr() instead of mb_substr()
|
||||
$text = substr( $text, 0, $length ) . trim( $tail );
|
||||
$text = substr( $text, 0, $length ) . $tail;
|
||||
|
||||
return $this->tidy( $text );
|
||||
}
|
||||
|
|
|
@ -107,6 +107,12 @@ class TextTruncatorTest extends \PHPUnit\Framework\TestCase {
|
|||
65536,
|
||||
trim( str_repeat( 'Sentence. ', 65536 ) ),
|
||||
],
|
||||
|
||||
'Preserve whitespace before end character' => [
|
||||
'Aa . Bb',
|
||||
1,
|
||||
'Aa .',
|
||||
],
|
||||
];
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in a new issue