Extract unrelated static code from ExtractFormatter

This is a straightforward baseline patch that does nothing but move
existing code around, without touching it. I'm not even trying to
remove the "static" keyword. The actual refactoring will be done in
the next patch. I hope this makes the changes I make in the refactoring
more visible and much easier to review.

Change-Id: Idba859ec0c24f3622ea8fb8d7a9b11843d1e3827
This commit is contained in:
Thiemo Kreuz 2019-03-19 12:43:25 +01:00 committed by Thiemo Kreuz (WMDE)
parent b6250d3791
commit a0d37fcb51
5 changed files with 223 additions and 197 deletions

View file

@ -329,7 +329,7 @@ class ApiQueryExtracts extends ApiQueryBase {
* @return string
*/
private function getFirstChars( $text, $requestedLength ) {
$text = ExtractFormatter::getFirstChars( $text, $requestedLength );
$text = TextTruncator::getFirstChars( $text, $requestedLength );
// Fix possibly unclosed tags
$text = $this->tidy( $text );
$text .= wfMessage( 'ellipsis' )->inContentLanguage()->text();
@ -342,7 +342,7 @@ class ApiQueryExtracts extends ApiQueryBase {
* @return string
*/
private function getFirstSentences( $text, $requestedSentenceCount ) {
$text = ExtractFormatter::getFirstSentences( $text, $requestedSentenceCount );
$text = TextTruncator::getFirstSentences( $text, $requestedSentenceCount );
$text = $this->tidy( $text );
return $text;
}

View file

@ -83,69 +83,6 @@ class ExtractFormatter extends HtmlFormatter {
return $html;
}
/**
 * Returns no more than the given number of sentences
 *
 * A sentence end is either an ASCII ".", "!" or "?" followed by a space, a newline or
 * the end of the text (a "." directly after an uppercase letter does not count), or one
 * of the CJK / full-width terminators listed below. When no terminator is found, or the
 * regular expression fails, the trimmed first line of the text is returned instead.
 *
 * @param string $text Source text to extract from
 * @param int $requestedSentenceCount Maximum number of sentences to extract
 * @return string
 */
public static function getFirstSentences( $text, $requestedSentenceCount ) {
	if ( $requestedSentenceCount <= 0 ) {
		return '';
	}

	// Based on code from OpenSearchXml by Brion Vibber
	// The non-ASCII terminators are written as UTF-8 byte escapes so they cannot be lost
	// or normalized away; an empty alternative would make the pattern match everywhere.
	$endchars = [
		// regular ASCII
		'[^\p{Lu}]\.(?:[ \n]|$)', '[\!\?](?:[ \n]|$)',
		// full-width ideographic full-stop (U+3002)
		"\xE3\x80\x82",
		// double-width roman forms: full stop (U+FF0E), exclamation mark (U+FF01),
		// question mark (U+FF1F)
		"\xEF\xBC\x8E", "\xEF\xBC\x81", "\xEF\xBC\x9F",
		// half-width ideographic full stop (U+FF61)
		"\xEF\xBD\xA1",
	];

	$endgroup = implode( '|', $endchars );
	$regexp = "/($endgroup)+/u";

	$matches = [];
	$res = preg_match_all( $regexp, $text, $matches, PREG_OFFSET_CAPTURE );

	if ( $res ) {
		// Use the last available terminator when fewer sentences exist than requested
		$index = min( $requestedSentenceCount, $res ) - 1;
		list( $tail, $offset ) = $matches[0][ $index ];
		// PCRE returns raw offsets, so using substr() instead of mb_substr()
		$text = substr( $text, 0, $offset ) . trim( $tail );
	} else {
		// Just return the first line
		$lines = explode( "\n", $text, 2 );
		$text = trim( $lines[0] );
	}

	return $text;
}
/**
 * Returns no more than a requested number of characters, preserving words
 *
 * The text is cut after $requestedLength characters and then extended up to the end of
 * the word (or the closing ">" of a tag) the cut landed in, so the result can be longer
 * than requested.
 *
 * @param string $text Source text to extract from
 * @param int $requestedLength Maximum number of characters to return
 * @return string
 */
public static function getFirstChars( $text, $requestedLength ) {
	if ( $requestedLength <= 0 ) {
		return '';
	}

	$length = mb_strlen( $text );
	if ( $length <= $requestedLength ) {
		return $text;
	}

	// Word characters and slashes directly after the cut, plus an optional closing ">"
	$pattern = "#^[\\w/]*>?#su";
	$m = [];
	preg_match( $pattern, mb_substr( $text, $requestedLength ), $m );

	// The pattern can match an empty string, so $m[0] is only missing when PCRE itself
	// fails (e.g. malformed UTF-8 with the "u" modifier); append nothing in that case.
	$remainder = isset( $m[0] ) ? $m[0] : '';

	return mb_substr( $text, 0, $requestedLength ) . $remainder;
}
/**
* Removes content we've chosen to remove then removes class and style
* attributes from the remaining span elements.

View file

@ -0,0 +1,73 @@
<?php
namespace TextExtracts;
/**
 * Static helpers that shorten a text to a number of sentences or characters.
 *
 * @license GPL-2.0-or-later
 */
class TextTruncator {

	/**
	 * Returns no more than the given number of sentences
	 *
	 * A sentence end is either an ASCII ".", "!" or "?" followed by a space, a newline or
	 * the end of the text (a "." directly after an uppercase letter does not count), or one
	 * of the CJK / full-width terminators listed below. When no terminator is found, or the
	 * regular expression fails, the trimmed first line of the text is returned instead.
	 *
	 * @param string $text Source text to extract from
	 * @param int $requestedSentenceCount Maximum number of sentences to extract
	 * @return string
	 */
	public static function getFirstSentences( $text, $requestedSentenceCount ) {
		if ( $requestedSentenceCount <= 0 ) {
			return '';
		}

		// Based on code from OpenSearchXml by Brion Vibber
		// The non-ASCII terminators are written as UTF-8 byte escapes so they cannot be lost
		// or normalized away; an empty alternative would make the pattern match everywhere.
		$endchars = [
			// regular ASCII
			'[^\p{Lu}]\.(?:[ \n]|$)', '[\!\?](?:[ \n]|$)',
			// full-width ideographic full-stop (U+3002)
			"\xE3\x80\x82",
			// double-width roman forms: full stop (U+FF0E), exclamation mark (U+FF01),
			// question mark (U+FF1F)
			"\xEF\xBC\x8E", "\xEF\xBC\x81", "\xEF\xBC\x9F",
			// half-width ideographic full stop (U+FF61)
			"\xEF\xBD\xA1",
		];

		$endgroup = implode( '|', $endchars );
		$regexp = "/($endgroup)+/u";

		$matches = [];
		$res = preg_match_all( $regexp, $text, $matches, PREG_OFFSET_CAPTURE );

		if ( $res ) {
			// Use the last available terminator when fewer sentences exist than requested
			$index = min( $requestedSentenceCount, $res ) - 1;
			list( $tail, $offset ) = $matches[0][ $index ];
			// PCRE returns raw offsets, so using substr() instead of mb_substr()
			$text = substr( $text, 0, $offset ) . trim( $tail );
		} else {
			// Just return the first line
			$lines = explode( "\n", $text, 2 );
			$text = trim( $lines[0] );
		}

		return $text;
	}

	/**
	 * Returns no more than a requested number of characters, preserving words
	 *
	 * The text is cut after $requestedLength characters and then extended up to the end of
	 * the word (or the closing ">" of a tag) the cut landed in, so the result can be longer
	 * than requested.
	 *
	 * @param string $text Source text to extract from
	 * @param int $requestedLength Maximum number of characters to return
	 * @return string
	 */
	public static function getFirstChars( $text, $requestedLength ) {
		if ( $requestedLength <= 0 ) {
			return '';
		}

		$length = mb_strlen( $text );
		if ( $length <= $requestedLength ) {
			return $text;
		}

		// Word characters and slashes directly after the cut, plus an optional closing ">"
		$pattern = "#^[\\w/]*>?#su";
		$m = [];
		preg_match( $pattern, mb_substr( $text, $requestedLength ), $m );

		// The pattern can match an empty string, so $m[0] is only missing when PCRE itself
		// fails (e.g. malformed UTF-8 with the "u" modifier); append nothing in that case.
		$remainder = isset( $m[0] ) ? $m[0] : '';

		return mb_substr( $text, 0, $requestedLength ) . $remainder;
	}

}

View file

@ -77,136 +77,4 @@ class ExtractFormatterTest extends MediaWikiTestCase {
];
}
/**
 * Checks that truncating to a number of sentences yields the expected text.
 *
 * @dataProvider provideGetFirstSentences
 * @param string $text Text to truncate
 * @param int $sentences Maximum number of sentences to keep
 * @param string $expected Expected truncated text
 */
public function testGetFirstSentences( $text, $sentences, $expected ) {
	$actual = ExtractFormatter::getFirstSentences( $text, $sentences );

	$this->assertSame( $expected, $actual );
}
/**
 * Data sets for testGetFirstSentences, each [ text, sentence count, expected result ].
 *
 * @return array[]
 */
public function provideGetFirstSentences() {
	$threeSentences = 'Foo is a bar. Such a smart boy. But completely useless.';
	$twoSentences = 'Foo is a bar. Such a smart boy.';
	$oneSentence = 'Foo is a bar.';
	$unbrokenLine = str_repeat( 'word ', 1000000 );

	$cases = [];

	$cases[] = [ $threeSentences, 2, $twoSentences ];
	$cases[] = [ $threeSentences, 1, $oneSentence ];
	$cases[] = [ $twoSentences, 2, $twoSentences ];
	$cases[] = [ $oneSentence, 1, $oneSentence ];
	$cases[] = [ $oneSentence, 2, $oneSentence ];
	$cases[] = [ '', 1, '' ];

	// Exclamation marks end a sentence as well
	$cases[] = [
		'Foo is a bar! Such a smart boy! But completely useless!',
		1,
		'Foo is a bar!',
	];

	// A tricky one: the dots inside "3.1.3.2" are not sentence ends
	$cases[] = [
		"Acid phosphatase (EC 3.1.3.2) is a chemical you don't want to mess with. " .
			"Polyvinyl acetate, however, is another story.",
		1,
		"Acid phosphatase (EC 3.1.3.2) is a chemical you don't want to mess with.",
	];

	// No clear sentences, so only the first line is expected
	$cases[] = [ "foo\nbar\nbaz", 2, 'foo' ];

	// Bug T118621
	$cases[] = [
		'Foo was born in 1977. He enjoys listening to Siouxsie and the Banshees.',
		1,
		'Foo was born in 1977.',
	];

	// Bug T115795 - Test no cropping after initials
	$cases[] = [
		'P.J. Harvey is a singer. She is awesome!',
		1,
		'P.J. Harvey is a singer.',
	];

	// Bug T115817 - Non-breaking space is not a delimiter
	$cases[] = [
		html_entity_decode( 'Pigeons (lat.&nbsp;Columbidae) are birds. ' .
			'They primarily feed on seeds.' ),
		1,
		html_entity_decode( 'Pigeons (lat.&nbsp;Columbidae) are birds.' ),
	];

	// Bug T145231 - various problems with regexes
	$cases[] = [ $unbrokenLine, 3, trim( $unbrokenLine ) ];
	$cases[] = [
		str_repeat( 'Sentence. ', 70000 ),
		65536,
		trim( str_repeat( 'Sentence. ', 65536 ) ),
	];

	return $cases;
}
/**
 * Checks that truncating to a number of characters yields the expected text.
 *
 * @dataProvider provideGetFirstChars
 * @param string $text Text to truncate
 * @param int $chars Maximum number of characters to keep
 * @param string $expected Expected truncated text
 */
public function testGetFirstChars( $text, $chars, $expected ) {
	$actual = ExtractFormatter::getFirstChars( $text, $chars );

	$this->assertSame( $expected, $actual );
}
/**
 * Data sets for testGetFirstChars, each [ text, character count, expected result ].
 *
 * @return array[]
 */
public function provideGetFirstChars() {
	$plain = 'Lullzy lulz are lullzy!';
	$markup = 'foo<tag>bar</tag>';
	$manyWords = str_repeat( 'тест ', 50000 );
	$manyWordsExpected = trim( str_repeat( 'тест ', 13108 ) );

	$cases = [
		[ $plain, -8, '' ],
		[ $plain, 0, '' ],
		[ $plain, 100, $plain ],
		[ $plain, 1, 'Lullzy' ],
		[ $plain, 6, 'Lullzy' ],
		// Disabled case: [ $plain, 7, 'Lullzy' ]
		[ $plain, 8, 'Lullzy lulz' ],
		// HTML processing
		[ $markup, 1, 'foo' ],
		// let HTML sanitizer clean it up later
		[ $markup, 4, 'foo<tag>' ],
	];

	// Cutting anywhere in or after the closing tag is expected to return the whole markup
	foreach ( [ 12, 13, 16, 17 ] as $cut ) {
		$cases[] = [ $markup, $cut, 'foo<tag>bar</tag>' ];
	}

	// T143178 - previously, characters were extracted using regexps which failed when
	// requesting 64K chars or more.
	$cases[] = [ $manyWords, 65536, $manyWordsExpected ];

	return $cases;
}
}

View file

@ -0,0 +1,148 @@
<?php
namespace TextExtracts\Test;
use TextExtracts\TextTruncator;
/**
 * Unit tests for the static truncation helpers in TextTruncator.
 *
 * @covers \TextExtracts\TextTruncator
 * @group TextExtracts
 *
 * @license GPL-2.0-or-later
 */
class TextTruncatorTest extends \PHPUnit\Framework\TestCase {

	/**
	 * Checks that truncating to a number of sentences yields the expected text.
	 *
	 * @dataProvider provideGetFirstSentences
	 * @param string $text Text to truncate
	 * @param int $sentences Maximum number of sentences to keep
	 * @param string $expected Expected truncated text
	 */
	public function testGetFirstSentences( $text, $sentences, $expected ) {
		$actual = TextTruncator::getFirstSentences( $text, $sentences );

		$this->assertSame( $expected, $actual );
	}

	/**
	 * Data sets for testGetFirstSentences, each [ text, sentence count, expected result ].
	 *
	 * @return array[]
	 */
	public function provideGetFirstSentences() {
		$threeSentences = 'Foo is a bar. Such a smart boy. But completely useless.';
		$twoSentences = 'Foo is a bar. Such a smart boy.';
		$oneSentence = 'Foo is a bar.';
		$unbrokenLine = str_repeat( 'word ', 1000000 );

		$cases = [];

		$cases[] = [ $threeSentences, 2, $twoSentences ];
		$cases[] = [ $threeSentences, 1, $oneSentence ];
		$cases[] = [ $twoSentences, 2, $twoSentences ];
		$cases[] = [ $oneSentence, 1, $oneSentence ];
		$cases[] = [ $oneSentence, 2, $oneSentence ];
		$cases[] = [ '', 1, '' ];

		// Exclamation marks end a sentence as well
		$cases[] = [
			'Foo is a bar! Such a smart boy! But completely useless!',
			1,
			'Foo is a bar!',
		];

		// A tricky one: the dots inside "3.1.3.2" are not sentence ends
		$cases[] = [
			"Acid phosphatase (EC 3.1.3.2) is a chemical you don't want to mess with. " .
				"Polyvinyl acetate, however, is another story.",
			1,
			"Acid phosphatase (EC 3.1.3.2) is a chemical you don't want to mess with.",
		];

		// No clear sentences, so only the first line is expected
		$cases[] = [ "foo\nbar\nbaz", 2, 'foo' ];

		// Bug T118621
		$cases[] = [
			'Foo was born in 1977. He enjoys listening to Siouxsie and the Banshees.',
			1,
			'Foo was born in 1977.',
		];

		// Bug T115795 - Test no cropping after initials
		$cases[] = [
			'P.J. Harvey is a singer. She is awesome!',
			1,
			'P.J. Harvey is a singer.',
		];

		// Bug T115817 - Non-breaking space is not a delimiter
		$cases[] = [
			html_entity_decode( 'Pigeons (lat.&nbsp;Columbidae) are birds. ' .
				'They primarily feed on seeds.' ),
			1,
			html_entity_decode( 'Pigeons (lat.&nbsp;Columbidae) are birds.' ),
		];

		// Bug T145231 - various problems with regexes
		$cases[] = [ $unbrokenLine, 3, trim( $unbrokenLine ) ];
		$cases[] = [
			str_repeat( 'Sentence. ', 70000 ),
			65536,
			trim( str_repeat( 'Sentence. ', 65536 ) ),
		];

		return $cases;
	}

	/**
	 * Checks that truncating to a number of characters yields the expected text.
	 *
	 * @dataProvider provideGetFirstChars
	 * @param string $text Text to truncate
	 * @param int $chars Maximum number of characters to keep
	 * @param string $expected Expected truncated text
	 */
	public function testGetFirstChars( $text, $chars, $expected ) {
		$actual = TextTruncator::getFirstChars( $text, $chars );

		$this->assertSame( $expected, $actual );
	}

	/**
	 * Data sets for testGetFirstChars, each [ text, character count, expected result ].
	 *
	 * @return array[]
	 */
	public function provideGetFirstChars() {
		$plain = 'Lullzy lulz are lullzy!';
		$markup = 'foo<tag>bar</tag>';
		$manyWords = str_repeat( 'тест ', 50000 );
		$manyWordsExpected = trim( str_repeat( 'тест ', 13108 ) );

		$cases = [
			[ $plain, -8, '' ],
			[ $plain, 0, '' ],
			[ $plain, 100, $plain ],
			[ $plain, 1, 'Lullzy' ],
			[ $plain, 6, 'Lullzy' ],
			// Disabled case: [ $plain, 7, 'Lullzy' ]
			[ $plain, 8, 'Lullzy lulz' ],
			// HTML processing
			[ $markup, 1, 'foo' ],
			// let HTML sanitizer clean it up later
			[ $markup, 4, 'foo<tag>' ],
		];

		// Cutting anywhere in or after the closing tag is expected to return the whole markup
		foreach ( [ 12, 13, 16, 17 ] as $cut ) {
			$cases[] = [ $markup, $cut, 'foo<tag>bar</tag>' ];
		}

		// T143178 - previously, characters were extracted using regexps which failed when
		// requesting 64K chars or more.
		$cases[] = [ $manyWords, 65536, $manyWordsExpected ];

		return $cases;
	}

}