mirror of
https://gerrit.wikimedia.org/r/mediawiki/extensions/TextExtracts
synced 2024-11-24 00:04:35 +00:00
getFirstSentences(): don't use crazy regexes
Bug: T145231
Change-Id: I820fb152e86b273ddeba1617658a13e3a3f0bae3
This commit is contained in:
parent
effb70aadd
commit
21ef48483f
|
@ -80,6 +80,10 @@ class ExtractFormatter extends HtmlFormatter {
|
|||
* @return string
|
||||
*/
|
||||
public static function getFirstSentences( $text, $requestedSentenceCount ) {
|
||||
if ( $requestedSentenceCount <= 0 ) {
|
||||
return '';
|
||||
}
|
||||
|
||||
// Based on code from OpenSearchXml by Brion Vibber
|
||||
$endchars = [
|
||||
'[^\p{Lu}]\.(?:[ \n]|$)', '[\!\?](?:[ \n]|$)', // regular ASCII
|
||||
|
@ -89,20 +93,19 @@ class ExtractFormatter extends HtmlFormatter {
|
|||
];
|
||||
|
||||
$endgroup = implode( '|', $endchars );
|
||||
$end = "(?:$endgroup)";
|
||||
$sentence = ".+?$end+";
|
||||
$requestedSentenceCount = intval( $requestedSentenceCount );
|
||||
$regexp = "/^($sentence){1,{$requestedSentenceCount}}/u";
|
||||
$regexp = "/($endgroup)+/u";
|
||||
|
||||
$matches = [];
|
||||
$res = preg_match( $regexp, $text, $matches );
|
||||
$res = preg_match_all( $regexp, $text, $matches, PREG_OFFSET_CAPTURE );
|
||||
|
||||
if ( $res ) {
|
||||
$text = trim( $matches[0] );
|
||||
$index = min( $requestedSentenceCount, $res ) - 1;
|
||||
list( $tail, $length ) = $matches[0][ $index ];
|
||||
// PCRE returns raw offsets, so using substr() instead of mb_substr()
|
||||
$text = substr( $text, 0, $length ) . trim( $tail );
|
||||
} else {
|
||||
if ( $res === false ) {
|
||||
throw new Exception( __METHOD__ . "() error compiling regular expression $regexp" );
|
||||
}
|
||||
// Just return the first line
|
||||
$lines = explode( "\n", $text );
|
||||
$lines = explode( "\n", $text, 2 );
|
||||
$text = trim( $lines[0] );
|
||||
}
|
||||
return $text;
|
||||
|
|
|
@ -63,6 +63,7 @@ class ExtractFormatterTest extends MediaWikiTestCase {
|
|||
}
|
||||
|
||||
public function provideGetFirstSentences() {
|
||||
$longLine = str_repeat( 'word ', 1000000 );
|
||||
return [
|
||||
[
|
||||
'Foo is a bar. Such a smart boy. But completely useless.',
|
||||
|
@ -106,6 +107,12 @@ class ExtractFormatterTest extends MediaWikiTestCase {
|
|||
1,
|
||||
"Acid phosphatase (EC 3.1.3.2) is a chemical you don't want to mess with.",
|
||||
],
|
||||
// No clear sentences
|
||||
[
|
||||
"foo\nbar\nbaz",
|
||||
2,
|
||||
'foo',
|
||||
],
|
||||
// Bug T118621
|
||||
[
|
||||
'Foo was born in 1977. He enjoys listening to Siouxsie and the Banshees.',
|
||||
|
@ -124,6 +131,17 @@ class ExtractFormatterTest extends MediaWikiTestCase {
|
|||
1,
|
||||
html_entity_decode( 'Pigeons (lat. Columbidae) are birds.' ),
|
||||
],
|
||||
// Bug T145231 - various problems with regexes
|
||||
[
|
||||
$longLine,
|
||||
3,
|
||||
trim( $longLine ),
|
||||
],
|
||||
[
|
||||
str_repeat( 'Sentence. ', 70000 ),
|
||||
65536,
|
||||
trim( str_repeat( 'Sentence. ', 65536 ) ),
|
||||
],
|
||||
];
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in a new issue