getFirstSentences(): don't use crazy regexes

Bug: T145231
Change-Id: I820fb152e86b273ddeba1617658a13e3a3f0bae3
This commit is contained in:
Max Semenik 2017-01-11 14:13:45 -08:00
parent effb70aadd
commit 21ef48483f
2 changed files with 31 additions and 10 deletions

View file

@ -80,6 +80,10 @@ class ExtractFormatter extends HtmlFormatter {
* @return string
*/
public static function getFirstSentences( $text, $requestedSentenceCount ) {
if ( $requestedSentenceCount <= 0 ) {
return '';
}
// Based on code from OpenSearchXml by Brion Vibber
$endchars = [
'[^\p{Lu}]\.(?:[ \n]|$)', '[\!\?](?:[ \n]|$)', // regular ASCII
@ -89,20 +93,19 @@ class ExtractFormatter extends HtmlFormatter {
];
$endgroup = implode( '|', $endchars );
$end = "(?:$endgroup)";
$sentence = ".+?$end+";
$requestedSentenceCount = intval( $requestedSentenceCount );
$regexp = "/^($sentence){1,{$requestedSentenceCount}}/u";
$regexp = "/($endgroup)+/u";
$matches = [];
$res = preg_match( $regexp, $text, $matches );
$res = preg_match_all( $regexp, $text, $matches, PREG_OFFSET_CAPTURE );
if ( $res ) {
$text = trim( $matches[0] );
$index = min( $requestedSentenceCount, $res ) - 1;
list( $tail, $length ) = $matches[0][ $index ];
// PREG_OFFSET_CAPTURE offsets are byte offsets, not character offsets, so use substr() rather than mb_substr()
$text = substr( $text, 0, $length ) . trim( $tail );
} else {
if ( $res === false ) {
throw new Exception( __METHOD__ . "() error compiling regular expression $regexp" );
}
// Just return the first line
$lines = explode( "\n", $text );
$lines = explode( "\n", $text, 2 );
$text = trim( $lines[0] );
}
return $text;

View file

@ -63,6 +63,7 @@ class ExtractFormatterTest extends MediaWikiTestCase {
}
public function provideGetFirstSentences() {
$longLine = str_repeat( 'word ', 1000000 );
return [
[
'Foo is a bar. Such a smart boy. But completely useless.',
@ -106,6 +107,12 @@ class ExtractFormatterTest extends MediaWikiTestCase {
1,
"Acid phosphatase (EC 3.1.3.2) is a chemical you don't want to mess with.",
],
// No clear sentences
[
"foo\nbar\nbaz",
2,
'foo',
],
// Bug T118621
[
'Foo was born in 1977. He enjoys listening to Siouxsie and the Banshees.',
@ -124,6 +131,17 @@ class ExtractFormatterTest extends MediaWikiTestCase {
1,
html_entity_decode( 'Pigeons (lat.&nbsp;Columbidae) are birds.' ),
],
// Bug T145231 - regex problems: a very long line with no sentence end, and a sentence count above 65535
[
$longLine,
3,
trim( $longLine ),
],
[
str_repeat( 'Sentence. ', 70000 ),
65536,
trim( str_repeat( 'Sentence. ', 65536 ) ),
],
];
}