mirror of
https://gerrit.wikimedia.org/r/mediawiki/extensions/TextExtracts
synced 2024-11-23 15:56:52 +00:00
Fix truncate code potentially removing whitespace from extract
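By turning the non-capturing groups (?:…) into lookaheads (?=…), the trailing whitespace is no longer part of the matched string returned in $tail. This is exactly what we want here: all we need is to *know* whether the dot, question mark, or exclamation mark is followed by a space, a newline, or the end of the text; we don't need that whitespace captured. This also makes the trim() call on $tail unnecessary, which previously stripped whitespace in front of the end character as well. Change-Id: I4be715c4c084165e5ab25da77609f12ffce4d385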
This commit is contained in:
parent
81fd92685a
commit
8de415c4fd
|
@ -40,12 +40,14 @@ class TextTruncator {
|
||||||
// Based on code from OpenSearchXml by Brion Vibber
|
// Based on code from OpenSearchXml by Brion Vibber
|
||||||
$endchars = [
|
$endchars = [
|
||||||
// regular ASCII
|
// regular ASCII
|
||||||
'\P{Lu}\.(?:[ \n]|$)',
|
'\P{Lu}\.(?=[ \n]|$)',
|
||||||
'[!?](?:[ \n]|$)',
|
'[!?](?=[ \n]|$)',
|
||||||
// full-width ideographic full-stop
|
// full-width ideographic full-stop
|
||||||
'。',
|
'。',
|
||||||
// double-width roman forms
|
// double-width roman forms
|
||||||
'.', '!', '?',
|
'.',
|
||||||
|
'!',
|
||||||
|
'?',
|
||||||
// half-width ideographic full stop
|
// half-width ideographic full stop
|
||||||
'。',
|
'。',
|
||||||
];
|
];
|
||||||
|
@ -62,7 +64,7 @@ class TextTruncator {
|
||||||
$index = min( $requestedSentenceCount, $res ) - 1;
|
$index = min( $requestedSentenceCount, $res ) - 1;
|
||||||
list( $tail, $length ) = $matches[0][$index];
|
list( $tail, $length ) = $matches[0][$index];
|
||||||
// PCRE returns raw offsets, so using substr() instead of mb_substr()
|
// PCRE returns raw offsets, so using substr() instead of mb_substr()
|
||||||
$text = substr( $text, 0, $length ) . trim( $tail );
|
$text = substr( $text, 0, $length ) . $tail;
|
||||||
|
|
||||||
return $this->tidy( $text );
|
return $this->tidy( $text );
|
||||||
}
|
}
|
||||||
|
|
|
@ -107,6 +107,12 @@ class TextTruncatorTest extends \PHPUnit\Framework\TestCase {
|
||||||
65536,
|
65536,
|
||||||
trim( str_repeat( 'Sentence. ', 65536 ) ),
|
trim( str_repeat( 'Sentence. ', 65536 ) ),
|
||||||
],
|
],
|
||||||
|
|
||||||
|
'Preserve whitespace before end character' => [
|
||||||
|
'Aa . Bb',
|
||||||
|
1,
|
||||||
|
'Aa .',
|
||||||
|
],
|
||||||
];
|
];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue