mirror of
https://gerrit.wikimedia.org/r/mediawiki/extensions/TextExtracts
synced 2024-11-15 03:35:20 +00:00
Merge "Fix separation of text into sentences."
This commit is contained in:
commit
647c39734a
|
@@ -80,7 +80,7 @@ class ExtractFormatter extends HtmlFormatter {
|
||||||
public static function getFirstSentences( $text, $requestedSentenceCount ) {
|
public static function getFirstSentences( $text, $requestedSentenceCount ) {
|
||||||
// Based on code from OpenSearchXml by Brion Vibber
|
// Based on code from OpenSearchXml by Brion Vibber
|
||||||
$endchars = array(
|
$endchars = array(
|
||||||
'[^\p{Lu}]\.\s', '\!\s', '\?\s', // regular ASCII
|
'[^\p{Lu}]\.[ \n]', '\![ \n]', '\?[ \n]', // regular ASCII
|
||||||
'。', // full-width ideographic full-stop
|
'。', // full-width ideographic full-stop
|
||||||
'.', '!', '?', // double-width roman forms
|
'.', '!', '?', // double-width roman forms
|
||||||
'。', // half-width ideographic full stop
|
'。', // half-width ideographic full stop
|
||||||
|
|
|
@@ -115,6 +115,12 @@ class ExtractFormatterTest extends MediaWikiTestCase {
|
||||||
1,
|
1,
|
||||||
'P.J. Harvey is a singer.',
|
'P.J. Harvey is a singer.',
|
||||||
),
|
),
|
||||||
|
// Bug T115817 - Non-breaking space is not a delimiter
|
||||||
|
array(
|
||||||
|
html_entity_decode( 'Pigeons (lat.&nbsp;Columbidae) are birds. They primarily feed on seeds.' ),
|
||||||
|
1,
|
||||||
|
html_entity_decode( 'Pigeons (lat.&nbsp;Columbidae) are birds.' ),
|
||||||
|
),
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue