mirror of
https://gerrit.wikimedia.org/r/mediawiki/extensions/TextExtracts
synced 2024-11-15 03:35:20 +00:00
Merge "Fix separation of text into sentences."
This commit is contained in:
commit
647c39734a
|
@@ -80,7 +80,7 @@ class ExtractFormatter extends HtmlFormatter {
|
|||
public static function getFirstSentences( $text, $requestedSentenceCount ) {
|
||||
// Based on code from OpenSearchXml by Brion Vibber
|
||||
$endchars = array(
|
||||
-'[^\p{Lu}]\.\s', '\!\s', '\?\s', // regular ASCII
|
||||
+'[^\p{Lu}]\.[ \n]', '\![ \n]', '\?[ \n]', // regular ASCII
|
||||
'。', // full-width ideographic full-stop
|
||||
'.', '!', '?', // double-width roman forms
|
||||
'。', // half-width ideographic full stop
|
||||
|
|
|
@@ -115,6 +115,12 @@ class ExtractFormatterTest extends MediaWikiTestCase {
|
|||
1,
|
||||
'P.J. Harvey is a singer.',
|
||||
),
|
||||
// Bug T115817 - Non-breaking space is not a delimiter
|
||||
array(
|
||||
html_entity_decode( 'Pigeons (lat.&nbsp;Columbidae) are birds. They primarily feed on seeds.' ),
|
||||
1,
|
||||
html_entity_decode( 'Pigeons (lat.&nbsp;Columbidae) are birds.' ),
|
||||
),
|
||||
);
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in a new issue