getFirstSentences(): don't use crazy regexes

Bug: T145231
Change-Id: I820fb152e86b273ddeba1617658a13e3a3f0bae3
2024-11-24 00:04:35 +00:00 · 2017-01-11 14:13:45 -08:00 · 2017-01-11 14:13:45 -08:00 · 21ef48483f
parent effb70aadd
commit 21ef48483f
2 changed files with 31 additions and 10 deletions
--- a/includes/ExtractFormatter.php
+++ b/includes/ExtractFormatter.php
@@ -80,6 +80,10 @@ class ExtractFormatter extends HtmlFormatter {
 	 * @return string
 	 */
 	public static function getFirstSentences( $text, $requestedSentenceCount ) {
+		if ( $requestedSentenceCount <= 0 ) {
+			return '';
+		}
+
 		// Based on code from OpenSearchXml by Brion Vibber
 		$endchars = [
 			'[^\p{Lu}]\.(?:[ \n]|$)', '[\!\?](?:[ \n]|$)', // regular ASCII
@@ -89,20 +93,19 @@ class ExtractFormatter extends HtmlFormatter {
 			];

 		$endgroup = implode( '|', $endchars );
-		$end = "(?:$endgroup)";
-		$sentence = ".+?$end+";
-		$requestedSentenceCount = intval( $requestedSentenceCount );
-		$regexp = "/^($sentence){1,{$requestedSentenceCount}}/u";
+		$regexp = "/($endgroup)+/u";
+
 		$matches = [];
-		$res = preg_match( $regexp, $text, $matches );
+		$res = preg_match_all( $regexp, $text, $matches, PREG_OFFSET_CAPTURE );
+
 		if ( $res ) {
-			$text = trim( $matches[0] );
+			$index = min( $requestedSentenceCount, $res ) - 1;
+			list( $tail, $length ) = $matches[0][ $index ];
+			// PCRE returns raw offsets, so using substr() instead of mb_substr()
+			$text = substr( $text, 0, $length ) . trim( $tail );
 		} else {
-			if ( $res === false ) {
-				throw new Exception( __METHOD__ . "() error compiling regular expression $regexp" );
-			}
 			// Just return the first line
-			$lines = explode( "\n", $text );
+			$lines = explode( "\n", $text, 2 );
 			$text = trim( $lines[0] );
 		}
 		return $text;
--- a/tests/phpunit/ExtractFormatterTest.php
+++ b/tests/phpunit/ExtractFormatterTest.php
@@ -63,6 +63,7 @@ class ExtractFormatterTest extends MediaWikiTestCase {
 	}

 	public function provideGetFirstSentences() {
+		$longLine = str_repeat( 'word ', 1000000 );
 		return [
 			[
 				'Foo is a bar. Such a smart boy. But completely useless.',
@@ -106,6 +107,12 @@ class ExtractFormatterTest extends MediaWikiTestCase {
 				1,
 				"Acid phosphatase (EC 3.1.3.2) is a chemical you don't want to mess with.",
 			],
+			// No clear sentences
+			[
+				"foo\nbar\nbaz",
+				2,
+				'foo',
+			],
 			// Bug T118621
 			[
 				'Foo was born in 1977. He enjoys listening to Siouxsie and the Banshees.',
@@ -124,6 +131,17 @@ class ExtractFormatterTest extends MediaWikiTestCase {
 				1,
 				html_entity_decode( 'Pigeons (lat.&nbsp;Columbidae) are birds.' ),
 			],
+			// Bug T145231 - various problems with regexes
+			[
+				$longLine,
+				3,
+				trim( $longLine ),
+			],
+			[
+				str_repeat( 'Sentence. ', 70000 ),
+				65536,
+				trim( str_repeat( 'Sentence. ', 65536 ) ),
+			],
 		];
 	}