<?php

use TextExtracts\ExtractFormatter;

/**
 * Tests for TextExtracts\ExtractFormatter: HTML-to-extract formatting and the
 * static helpers that truncate text by sentence count or character count.
 *
 * @group TextExtracts
 */
class ExtractFormatterTest extends MediaWikiTestCase {
	/**
	 * Disabled for now due to Jenkins weirdness
	 *
	 * The method is private and its name does not start with "test", so PHPUnit
	 * does not collect it; the data provider annotation is kept so the test can
	 * be re-enabled by renaming it and making it public.
	 *
	 * Parses $wikiText, feeds the resulting HTML to ExtractFormatter, removes
	 * elements matching '.metadata' and compares the trimmed result with
	 * $expected.
	 *
	 * @dataProvider provideExtracts
	 * @param string $expected Expected extract after trimming
	 * @param string $wikiText Wikitext to parse
	 * @param bool $plainText Passed as the second argument to the ExtractFormatter constructor
	 */
	private function notReallyTestExtracts( $expected, $wikiText, $plainText ) {
		$title = Title::newFromText( 'Test' );
		$po = new ParserOptions();
		$po->setEditSection( true );
		$parser = new Parser();
		$text = $parser->parse( $wikiText, $title, $po )->getText();
		$config = ConfigFactory::getDefaultInstance()->makeConfig( 'textextracts' );
		$fmt = new ExtractFormatter( $text, $plainText, $config );
		$fmt->remove( '.metadata' ); // Will be added via $wgExtractsRemoveClasses on WMF
		$text = trim( $fmt->getText() );
		$this->assertEquals( $expected, $text );
	}

	/**
	 * Data sets for notReallyTestExtracts().
	 *
	 * @return array[] Each entry is array( expected extract, wikitext, plain-text flag )
	 */
	public function provideExtracts() {
		$dutch = "'''Dutch''' (<span class=\"unicode haudio\" style=\"white-space:nowrap;\"><span class=\"fn\">"
			. "[[File:Loudspeaker.svg|11px|link=File:nl-Nederlands.ogg|About this sound]] [[:Media:nl-Nederlands.ogg|''Nederlands'']]"
			. "</span> <small class=\"metadata audiolinkinfo\" style=\"cursor:help;\">([[Wikipedia:Media help|<span style=\"cursor:help;\">"
			. "help</span>]]·[[:File:nl-Nederlands.ogg|<span style=\"cursor:help;\">info</span>]])</small></span>) is a"
			. " [[West Germanic languages|West Germanic language]] and the native language of most of the population of the [[Netherlands]]";

		return array(
			// Plain-text mode
			array(
				"Dutch ( Nederlands ) is a West Germanic language and the native language of most of the population of the Netherlands",
				$dutch,
				true,
			),

			// HTML mode: the expected values drop class/style attributes but keep lang
			array(
				"<p><span><span lang=\"baz\">qux</span></span>\n</p>",
				'<span class="foo"><span lang="baz">qux</span></span>',
				false,
			),
			array(
				"<p><span><span lang=\"baz\">qux</span></span>\n</p>",
				'<span style="foo: bar;"><span lang="baz">qux</span></span>',
				false,
			),
			array(
				"<p><span><span lang=\"qux\">quux</span></span>\n</p>",
				'<span class="foo"><span style="bar: baz;" lang="qux">quux</span></span>',
				false,
			),
		);
	}

	/**
	 * Checks ExtractFormatter::getFirstSentences() against each provided data set.
	 *
	 * @dataProvider provideGetFirstSentences
	 * @param string $text Input text
	 * @param int $sentences Number of sentences requested
	 * @param string $expected Expected truncated text
	 */
	public function testGetFirstSentences( $text, $sentences, $expected ) {
		$this->assertEquals( $expected, ExtractFormatter::getFirstSentences( $text, $sentences ) );
	}

	/**
	 * Data sets for testGetFirstSentences().
	 *
	 * @return array[] Each entry is array( input text, sentence count, expected text )
	 */
	public function provideGetFirstSentences() {
		// U+00A0 NO-BREAK SPACE encoded as UTF-8. Spelled as a byte escape so the
		// character is unambiguous in the source and does not depend on the
		// default charset used by html_entity_decode().
		$nbsp = "\xC2\xA0";

		return array(
			array(
				'Foo is a bar. Such a smart boy. But completely useless.',
				2,
				'Foo is a bar. Such a smart boy.',
			),
			array(
				'Foo is a bar. Such a smart boy. But completely useless.',
				1,
				'Foo is a bar.',
			),
			array(
				'Foo is a bar.',
				1,
				'Foo is a bar.',
			),
			// Requesting more sentences than the text contains
			array(
				'Foo is a bar.',
				2,
				'Foo is a bar.',
			),
			// Empty input
			array(
				'',
				1,
				'',
			),
			// Exclamation points too!!!
			array(
				'Foo is a bar! Such a smart boy! But completely useless!',
				1,
				'Foo is a bar!',
			),
			// A tricky one
			array(
				"Acid phosphatase (EC 3.1.3.2) is a chemical you don't want to mess with. Polyvinyl acetate, however, is another story.",
				1,
				"Acid phosphatase (EC 3.1.3.2) is a chemical you don't want to mess with.",
			),
			// Bug T118621
			array(
				'Foo was born in 1977. He enjoys listening to Siouxsie and the Banshees.',
				1,
				'Foo was born in 1977.',
			),
			// Bug T115795 - Test no cropping after initials
			array(
				'P.J. Harvey is a singer. She is awesome!',
				1,
				'P.J. Harvey is a singer.',
			),
			// Bug T115817 - Non-breaking space is not a delimiter
			array(
				"Pigeons (lat.{$nbsp}Columbidae) are birds. They primarily feed on seeds.",
				1,
				"Pigeons (lat.{$nbsp}Columbidae) are birds.",
			),
		);
	}

	/**
	 * Checks ExtractFormatter::getFirstChars() against each provided data set.
	 *
	 * @dataProvider provideGetFirstChars
	 * @param string $text Input text
	 * @param int $chars Number of characters requested
	 * @param string $expected Expected truncated text
	 */
	public function testGetFirstChars( $text, $chars, $expected ) {
		$this->assertEquals( $expected, ExtractFormatter::getFirstChars( $text, $chars ) );
	}

	/**
	 * Data sets for testGetFirstChars().
	 *
	 * The expected values show the result running to the end of the word in
	 * which the requested length falls (1 => 'Lullzy', 8 => 'Lullzy lulz').
	 *
	 * @return array[] Each entry is array( input text, character count, expected text )
	 */
	public function provideGetFirstChars() {
		$text = 'Lullzy lulz are lullzy!';

		return array(
			// Disabled case, kept for reference (reason not recorded here)
			//array( $text, 0, '' ),
			array( $text, 100, $text ),
			array( $text, 1, 'Lullzy' ),
			array( $text, 6, 'Lullzy' ),
			// Disabled case, kept for reference (reason not recorded here)
			//array( $text, 7, 'Lullzy' ),
			array( $text, 8, 'Lullzy lulz' ),
		);
	}
}
|