diff --git a/includes/ExtractFormatter.php b/includes/ExtractFormatter.php index eb20524..4dd7e92 100644 --- a/includes/ExtractFormatter.php +++ b/includes/ExtractFormatter.php @@ -80,6 +80,10 @@ class ExtractFormatter extends HtmlFormatter { * @return string */ public static function getFirstSentences( $text, $requestedSentenceCount ) { + if ( $requestedSentenceCount <= 0 ) { + return ''; + } + // Based on code from OpenSearchXml by Brion Vibber $endchars = [ '[^\p{Lu}]\.(?:[ \n]|$)', '[\!\?](?:[ \n]|$)', // regular ASCII @@ -89,20 +93,19 @@ class ExtractFormatter extends HtmlFormatter { ]; $endgroup = implode( '|', $endchars ); - $end = "(?:$endgroup)"; - $sentence = ".+?$end+"; - $requestedSentenceCount = intval( $requestedSentenceCount ); - $regexp = "/^($sentence){1,{$requestedSentenceCount}}/u"; + $regexp = "/($endgroup)+/u"; + $matches = []; - $res = preg_match( $regexp, $text, $matches ); + $res = preg_match_all( $regexp, $text, $matches, PREG_OFFSET_CAPTURE ); + if ( $res ) { - $text = trim( $matches[0] ); + $index = min( $requestedSentenceCount, $res ) - 1; + list( $tail, $length ) = $matches[0][ $index ]; + // PCRE returns raw offsets, so using substr() instead of mb_substr() + $text = substr( $text, 0, $length ) . trim( $tail ); } else { - if ( $res === false ) { - throw new Exception( __METHOD__ . "() error compiling regular expression $regexp" ); - } // Just return the first line - $lines = explode( "\n", $text ); + $lines = explode( "\n", $text, 2 ); $text = trim( $lines[0] ); } return $text; diff --git a/tests/phpunit/ExtractFormatterTest.php b/tests/phpunit/ExtractFormatterTest.php index 299b01d..064871d 100644 --- a/tests/phpunit/ExtractFormatterTest.php +++ b/tests/phpunit/ExtractFormatterTest.php @@ -63,6 +63,7 @@ class ExtractFormatterTest extends MediaWikiTestCase { } public function provideGetFirstSentences() { + $longLine = str_repeat( 'word ', 1000000 ); return [ [ 'Foo is a bar. Such a smart boy. But completely useless.', @@ -106,6 +107,12 @@ class ExtractFormatterTest extends MediaWikiTestCase { 1, "Acid phosphatase (EC 3.1.3.2) is a chemical you don't want to mess with.", ], + // No clear sentences + [ + "foo\nbar\nbaz", + 2, + 'foo', + ], // Bug T118621 [ 'Foo was born in 1977. He enjoys listening to Siouxsie and the Banshees.', @@ -124,6 +131,17 @@ class ExtractFormatterTest extends MediaWikiTestCase { 1, html_entity_decode( 'Pigeons (lat. Columbidae) are birds.' ), ], + // Bug T145231 - various problems with regexes + [ + $longLine, + 3, + trim( $longLine ), + ], + [ + str_repeat( 'Sentence. ', 70000 ), + 65536, + trim( str_repeat( 'Sentence. ', 65536 ) ), + ], ]; }