mirror of
https://gerrit.wikimedia.org/r/mediawiki/extensions/TextExtracts
synced 2024-11-15 03:35:20 +00:00
Extract unrelated static code from ExtractFormatter
This is a straightforward baseline patch that does nothing but move existing code around, without otherwise touching it. I'm not even trying to remove the "static" keyword. The actual refactoring will be done in the next patch. I hope this makes the changes I make in the refactoring more visible and much easier to review. Change-Id: Idba859ec0c24f3622ea8fb8d7a9b11843d1e3827
This commit is contained in:
parent
b6250d3791
commit
a0d37fcb51
|
@ -329,7 +329,7 @@ class ApiQueryExtracts extends ApiQueryBase {
|
|||
* @return string
|
||||
*/
|
||||
private function getFirstChars( $text, $requestedLength ) {
|
||||
$text = ExtractFormatter::getFirstChars( $text, $requestedLength );
|
||||
$text = TextTruncator::getFirstChars( $text, $requestedLength );
|
||||
// Fix possibly unclosed tags
|
||||
$text = $this->tidy( $text );
|
||||
$text .= wfMessage( 'ellipsis' )->inContentLanguage()->text();
|
||||
|
@ -342,7 +342,7 @@ class ApiQueryExtracts extends ApiQueryBase {
|
|||
* @return string
|
||||
*/
|
||||
private function getFirstSentences( $text, $requestedSentenceCount ) {
	$text = ExtractFormatter::getFirstSentences( $text, $requestedSentenceCount );
	$text = TextTruncator::getFirstSentences( $text, $requestedSentenceCount );

	// Run the shortened text through tidy() before handing it back
	return $this->tidy( $text );
}
|
||||
|
|
|
@ -83,69 +83,6 @@ class ExtractFormatter extends HtmlFormatter {
|
|||
return $html;
|
||||
}
|
||||
|
||||
/**
 * Returns no more than the given number of sentences
 *
 * Sentence ends are located with a regular expression. When the text contains no
 * recognizable sentence end, the trimmed first line of the text is returned instead.
 *
 * @param string $text Source text to extract from
 * @param int $requestedSentenceCount Maximum number of sentences to extract
 * @return string Empty string when $requestedSentenceCount is zero or negative
 */
public static function getFirstSentences( $text, $requestedSentenceCount ) {
	if ( $requestedSentenceCount <= 0 ) {
		return '';
	}

	// Based on code from OpenSearchXml by Brion Vibber.
	// Non-ASCII terminators are spelled as PCRE \x{...} escapes: written as plain
	// ASCII ".", "!" and "?" they would be regex metacharacters and break the pattern.
	$endchars = [
		// regular ASCII: a period preceded by anything but an uppercase letter (so
		// initials such as "P.J." are not split), or "!" / "?"; each must be followed
		// by a space, a newline or the end of the text
		'[^\p{Lu}]\.(?:[ \n]|$)', '[\!\?](?:[ \n]|$)',
		// full-width ideographic full-stop (U+3002)
		'\x{3002}',
		// double-width roman forms (U+FF0E, U+FF01, U+FF1F)
		'\x{FF0E}', '\x{FF01}', '\x{FF1F}',
		// half-width ideographic full stop (U+FF61)
		'\x{FF61}',
	];

	// Only the whole match ($matches[0]) is used below, so the group need not capture
	$regexp = '/(?:' . implode( '|', $endchars ) . ')+/u';

	$matches = [];
	$res = preg_match_all( $regexp, $text, $matches, PREG_OFFSET_CAPTURE );

	if ( !$res ) {
		// No sentence end found (or the match failed): just return the first line
		$lines = explode( "\n", $text, 2 );
		return trim( $lines[0] );
	}

	// Pick the terminator that closes the last requested (or last available) sentence
	$index = min( $requestedSentenceCount, $res ) - 1;
	list( $tail, $offset ) = $matches[0][ $index ];
	// PCRE returns raw byte offsets, so using substr() instead of mb_substr()
	return substr( $text, 0, $offset ) . trim( $tail );
}
|
||||
|
||||
/**
 * Returns roughly the requested number of characters, preserving words
 *
 * The text is cut after $requestedLength characters. The cut is then extended by any
 * word characters or slashes that directly follow it, plus one optional ">", so a word
 * or a tag name is not chopped in half. The result can therefore be somewhat longer
 * than $requestedLength.
 *
 * @param string $text Source text to extract from
 * @param int $requestedLength Number of characters after which the text is cut
 * @return string Empty string when $requestedLength is zero or negative; the unchanged
 *  text when it is not longer than $requestedLength
 */
public static function getFirstChars( $text, $requestedLength ) {
	if ( $requestedLength <= 0 ) {
		return '';
	}

	if ( mb_strlen( $text ) <= $requestedLength ) {
		return $text;
	}

	// Remainder of the word/tag that the cut position falls into
	$pattern = "#^[\\w/]*>?#su";
	$m = [];
	$wordRest = '';
	// preg_match() returns false on failure and leaves $m empty; only read $m[0]
	// when the match actually succeeded.
	if ( preg_match( $pattern, mb_substr( $text, $requestedLength ), $m ) ) {
		$wordRest = $m[0];
	}

	return mb_substr( $text, 0, $requestedLength ) . $wordRest;
}
|
||||
|
||||
/**
|
||||
* Removes content we've chosen to remove then removes class and style
|
||||
* attributes from the remaining span elements.
|
||||
|
|
73
includes/TextTruncator.php
Normal file
73
includes/TextTruncator.php
Normal file
|
@ -0,0 +1,73 @@
|
|||
<?php
|
||||
|
||||
namespace TextExtracts;
|
||||
|
||||
/**
 * Static helpers that cut a text down to a number of sentences or characters.
 *
 * @license GPL-2.0-or-later
 */
class TextTruncator {

	/**
	 * Returns no more than the given number of sentences
	 *
	 * Sentence ends are located with a regular expression. When the text contains no
	 * recognizable sentence end, the trimmed first line of the text is returned instead.
	 *
	 * @param string $text Source text to extract from
	 * @param int $requestedSentenceCount Maximum number of sentences to extract
	 * @return string Empty string when $requestedSentenceCount is zero or negative
	 */
	public static function getFirstSentences( $text, $requestedSentenceCount ) {
		if ( $requestedSentenceCount <= 0 ) {
			return '';
		}

		// Based on code from OpenSearchXml by Brion Vibber.
		// Non-ASCII terminators are spelled as PCRE \x{...} escapes: written as plain
		// ASCII ".", "!" and "?" they would be regex metacharacters and break the pattern.
		$endchars = [
			// regular ASCII: a period preceded by anything but an uppercase letter (so
			// initials such as "P.J." are not split), or "!" / "?"; each must be followed
			// by a space, a newline or the end of the text
			'[^\p{Lu}]\.(?:[ \n]|$)', '[\!\?](?:[ \n]|$)',
			// full-width ideographic full-stop (U+3002)
			'\x{3002}',
			// double-width roman forms (U+FF0E, U+FF01, U+FF1F)
			'\x{FF0E}', '\x{FF01}', '\x{FF1F}',
			// half-width ideographic full stop (U+FF61)
			'\x{FF61}',
		];

		// Only the whole match ($matches[0]) is used below, so the group need not capture
		$regexp = '/(?:' . implode( '|', $endchars ) . ')+/u';

		$matches = [];
		$res = preg_match_all( $regexp, $text, $matches, PREG_OFFSET_CAPTURE );

		if ( !$res ) {
			// No sentence end found (or the match failed): just return the first line
			$lines = explode( "\n", $text, 2 );
			return trim( $lines[0] );
		}

		// Pick the terminator that closes the last requested (or last available) sentence
		$index = min( $requestedSentenceCount, $res ) - 1;
		list( $tail, $offset ) = $matches[0][ $index ];
		// PCRE returns raw byte offsets, so using substr() instead of mb_substr()
		return substr( $text, 0, $offset ) . trim( $tail );
	}

	/**
	 * Returns roughly the requested number of characters, preserving words
	 *
	 * The text is cut after $requestedLength characters. The cut is then extended by any
	 * word characters or slashes that directly follow it, plus one optional ">", so a word
	 * or a tag name is not chopped in half. The result can therefore be somewhat longer
	 * than $requestedLength.
	 *
	 * @param string $text Source text to extract from
	 * @param int $requestedLength Number of characters after which the text is cut
	 * @return string Empty string when $requestedLength is zero or negative; the unchanged
	 *  text when it is not longer than $requestedLength
	 */
	public static function getFirstChars( $text, $requestedLength ) {
		if ( $requestedLength <= 0 ) {
			return '';
		}

		if ( mb_strlen( $text ) <= $requestedLength ) {
			return $text;
		}

		// Remainder of the word/tag that the cut position falls into
		$pattern = "#^[\\w/]*>?#su";
		$m = [];
		$wordRest = '';
		// preg_match() returns false on failure and leaves $m empty; only read $m[0]
		// when the match actually succeeded.
		if ( preg_match( $pattern, mb_substr( $text, $requestedLength ), $m ) ) {
			$wordRest = $m[0];
		}

		return mb_substr( $text, 0, $requestedLength ) . $wordRest;
	}

}
|
|
@ -77,136 +77,4 @@ class ExtractFormatterTest extends MediaWikiTestCase {
|
|||
];
|
||||
}
|
||||
|
||||
/**
 * Runs every case from the data provider through the sentence truncation.
 *
 * @dataProvider provideGetFirstSentences
 * @param string $text Input text
 * @param int $sentences Number of sentences requested
 * @param string $expected Expected truncated text
 */
public function testGetFirstSentences( $text, $sentences, $expected ) {
	$actual = ExtractFormatter::getFirstSentences( $text, $sentences );
	$this->assertSame( $expected, $actual );
}
|
||||
|
||||
/**
 * Cases for testGetFirstSentences().
 *
 * @return array[] Each entry is [ input text, requested sentence count, expected output ]
 */
public function provideGetFirstSentences() {
	$longLine = str_repeat( 'word ', 1000000 );
	return [
		[
			'Foo is a bar. Such a smart boy. But completely useless.',
			2,
			'Foo is a bar. Such a smart boy.',
		],
		[
			'Foo is a bar. Such a smart boy. But completely useless.',
			1,
			'Foo is a bar.',
		],
		[
			'Foo is a bar. Such a smart boy.',
			2,
			'Foo is a bar. Such a smart boy.',
		],
		[
			'Foo is a bar.',
			1,
			'Foo is a bar.',
		],
		[
			'Foo is a bar.',
			2,
			'Foo is a bar.',
		],
		[
			'',
			1,
			'',
		],
		// Exclamation points too!!!
		[
			'Foo is a bar! Such a smart boy! But completely useless!',
			1,
			'Foo is a bar!',
		],
		// A tricky one
		[
			"Acid phosphatase (EC 3.1.3.2) is a chemical you don't want to mess with. " .
				"Polyvinyl acetate, however, is another story.",
			1,
			"Acid phosphatase (EC 3.1.3.2) is a chemical you don't want to mess with.",
		],
		// No clear sentences
		[
			"foo\nbar\nbaz",
			2,
			'foo',
		],
		// Bug T118621
		[
			'Foo was born in 1977. He enjoys listening to Siouxsie and the Banshees.',
			1,
			'Foo was born in 1977.',
		],
		// Bug T115795 - Test no cropping after initials
		[
			'P.J. Harvey is a singer. She is awesome!',
			1,
			'P.J. Harvey is a singer.',
		],
		// Bug T115817 - Non-breaking space is not a delimiter.
		// The &nbsp; entity is decoded to a real non-breaking space by html_entity_decode();
		// with a plain space after "lat." the first sentence would end there instead.
		[
			html_entity_decode( 'Pigeons (lat.&nbsp;Columbidae) are birds. ' .
				'They primarily feed on seeds.' ),
			1,
			html_entity_decode( 'Pigeons (lat.&nbsp;Columbidae) are birds.' ),
		],
		// Bug T145231 - various problems with regexes
		[
			$longLine,
			3,
			trim( $longLine ),
		],
		[
			str_repeat( 'Sentence. ', 70000 ),
			65536,
			trim( str_repeat( 'Sentence. ', 65536 ) ),
		],
	];
}
|
||||
|
||||
/**
 * Runs every case from the data provider through the character truncation.
 *
 * @dataProvider provideGetFirstChars
 * @param string $text Input text
 * @param int $chars Number of characters requested
 * @param string $expected Expected truncated text
 */
public function testGetFirstChars( $text, $chars, $expected ) {
	$actual = ExtractFormatter::getFirstChars( $text, $chars );
	$this->assertSame( $expected, $actual );
}
|
||||
|
||||
/**
 * Cases for testGetFirstChars().
 *
 * @return array[] Each entry is [ input text, requested character count, expected output ]
 */
public function provideGetFirstChars() {
	$plain = 'Lullzy lulz are lullzy!';
	$markup = 'foo<tag>bar</tag>';
	$cyrillic = str_repeat( 'тест ', 50000 );
	$cyrillicExpected = trim( str_repeat( 'тест ', 13108 ) );

	$cases = [];

	// Plain text
	$cases[] = [ $plain, -8, '' ];
	$cases[] = [ $plain, 0, '' ];
	$cases[] = [ $plain, 100, $plain ];
	$cases[] = [ $plain, 1, 'Lullzy' ];
	$cases[] = [ $plain, 6, 'Lullzy' ];
	// [ $text, 7, 'Lullzy' ],
	$cases[] = [ $plain, 8, 'Lullzy lulz' ];

	// HTML processing
	$cases[] = [ $markup, 1, 'foo' ];
	// let HTML sanitizer clean it up later
	$cases[] = [ $markup, 4, 'foo<tag>' ];
	$cases[] = [ $markup, 12, 'foo<tag>bar</tag>' ];
	$cases[] = [ $markup, 13, 'foo<tag>bar</tag>' ];
	$cases[] = [ $markup, 16, 'foo<tag>bar</tag>' ];
	$cases[] = [ $markup, 17, 'foo<tag>bar</tag>' ];

	// T143178 - previously, characters were extracted using regexps which failed when
	// requesting 64K chars or more.
	$cases[] = [ $cyrillic, 65536, $cyrillicExpected ];

	return $cases;
}
|
||||
}
|
||||
|
|
148
tests/phpunit/TextTruncatorTest.php
Normal file
148
tests/phpunit/TextTruncatorTest.php
Normal file
|
@ -0,0 +1,148 @@
|
|||
<?php
|
||||
|
||||
namespace TextExtracts\Test;
|
||||
|
||||
use TextExtracts\TextTruncator;
|
||||
|
||||
/**
 * Unit tests for the static truncation helpers in TextTruncator.
 *
 * @covers \TextExtracts\TextTruncator
 * @group TextExtracts
 *
 * @license GPL-2.0-or-later
 */
class TextTruncatorTest extends \PHPUnit\Framework\TestCase {

	/**
	 * @dataProvider provideGetFirstSentences
	 * @param string $text Input text
	 * @param int $sentences Number of sentences requested
	 * @param string $expected Expected truncated text
	 */
	public function testGetFirstSentences( $text, $sentences, $expected ) {
		$this->assertSame( $expected, TextTruncator::getFirstSentences( $text, $sentences ) );
	}

	/**
	 * @return array[] Each entry is [ input text, requested sentence count, expected output ]
	 */
	public function provideGetFirstSentences() {
		$longLine = str_repeat( 'word ', 1000000 );
		return [
			[
				'Foo is a bar. Such a smart boy. But completely useless.',
				2,
				'Foo is a bar. Such a smart boy.',
			],
			[
				'Foo is a bar. Such a smart boy. But completely useless.',
				1,
				'Foo is a bar.',
			],
			[
				'Foo is a bar. Such a smart boy.',
				2,
				'Foo is a bar. Such a smart boy.',
			],
			[
				'Foo is a bar.',
				1,
				'Foo is a bar.',
			],
			[
				'Foo is a bar.',
				2,
				'Foo is a bar.',
			],
			[
				'',
				1,
				'',
			],
			// Exclamation points too!!!
			[
				'Foo is a bar! Such a smart boy! But completely useless!',
				1,
				'Foo is a bar!',
			],
			// A tricky one
			[
				"Acid phosphatase (EC 3.1.3.2) is a chemical you don't want to mess with. " .
					"Polyvinyl acetate, however, is another story.",
				1,
				"Acid phosphatase (EC 3.1.3.2) is a chemical you don't want to mess with.",
			],
			// No clear sentences
			[
				"foo\nbar\nbaz",
				2,
				'foo',
			],
			// Bug T118621
			[
				'Foo was born in 1977. He enjoys listening to Siouxsie and the Banshees.',
				1,
				'Foo was born in 1977.',
			],
			// Bug T115795 - Test no cropping after initials
			[
				'P.J. Harvey is a singer. She is awesome!',
				1,
				'P.J. Harvey is a singer.',
			],
			// Bug T115817 - Non-breaking space is not a delimiter.
			// The &nbsp; entity is decoded to a real non-breaking space by
			// html_entity_decode(); with a plain space after "lat." the first sentence
			// would end there instead.
			[
				html_entity_decode( 'Pigeons (lat.&nbsp;Columbidae) are birds. ' .
					'They primarily feed on seeds.' ),
				1,
				html_entity_decode( 'Pigeons (lat.&nbsp;Columbidae) are birds.' ),
			],
			// Bug T145231 - various problems with regexes
			[
				$longLine,
				3,
				trim( $longLine ),
			],
			[
				str_repeat( 'Sentence. ', 70000 ),
				65536,
				trim( str_repeat( 'Sentence. ', 65536 ) ),
			],
		];
	}

	/**
	 * @dataProvider provideGetFirstChars
	 * @param string $text Input text
	 * @param int $chars Number of characters requested
	 * @param string $expected Expected truncated text
	 */
	public function testGetFirstChars( $text, $chars, $expected ) {
		$this->assertSame( $expected, TextTruncator::getFirstChars( $text, $chars ) );
	}

	/**
	 * @return array[] Each entry is [ input text, requested character count, expected output ]
	 */
	public function provideGetFirstChars() {
		$text = 'Lullzy lulz are lullzy!';
		$html = 'foo<tag>bar</tag>';
		$longText = str_repeat( 'тест ', 50000 );
		$longTextExpected = trim( str_repeat( 'тест ', 13108 ) );

		return [
			[ $text, -8, '' ],
			[ $text, 0, '' ],
			[ $text, 100, $text ],
			[ $text, 1, 'Lullzy' ],
			[ $text, 6, 'Lullzy' ],
			// [ $text, 7, 'Lullzy' ],
			[ $text, 8, 'Lullzy lulz' ],
			// HTML processing
			[ $html, 1, 'foo' ],
			// let HTML sanitizer clean it up later
			[ $html, 4, 'foo<tag>' ],
			[ $html, 12, 'foo<tag>bar</tag>' ],
			[ $html, 13, 'foo<tag>bar</tag>' ],
			[ $html, 16, 'foo<tag>bar</tag>' ],
			[ $html, 17, 'foo<tag>bar</tag>' ],
			// T143178 - previously, characters were extracted using regexps which failed when
			// requesting 64K chars or more.
			[ $longText, 65536, $longTextExpected ],
		];
	}

}
|
Loading…
Reference in a new issue