getConfigFactory()->makeConfig( 'textextracts' );
		$fmt = new ExtractFormatter( $text, $plainText, $config );
		// .metadata class will be added via $wgExtractsRemoveClasses on WMF
		$fmt->remove( '.metadata' );
		$text = trim( $fmt->getText() );
		$this->assertEquals( $expected, $text );
	}

	/**
	 * Data sets for the extract-formatting test.
	 *
	 * Each set appears to be [ expected extract, input text, plain-text flag ]:
	 * the inline comments below label `false` as "HTML mode" and `true` as
	 * "plain text mode". TODO confirm the order against the consuming test's
	 * signature.
	 *
	 * NOTE(review): several fixtures look as if their HTML markup was lost at
	 * some point (e.g. the two identical "qux" sets and $tocText) - verify
	 * them against the upstream test file before relying on them.
	 *
	 * @return array[]
	 */
	public function provideExtracts() {
		// @codingStandardsIgnoreStart
		$dutch = 'Dutch (About this sound Nederlands (help·info)) is a West Germanic language and the native language of most of the population of the Netherlands';
		$tocText = 'Lead
TOC goes here

Section

Section text

';
		// @codingStandardsIgnoreEnd

		return [
			[
				'Dutch ( Nederlands ) is a West Germanic language and the native language of ' .
					'most of the population of the Netherlands',
				$dutch,
				true,
			],
			[
				"qux",
				'qux',
				false,
			],
			[
				"qux",
				'qux',
				false,
			],
			[
				"quux",
				'quux',
				false,
			],
			[
				// Verify that TOC is properly removed (HTML mode)
				"Lead\n

Section

\n

Section text

",
				$tocText,
				false,
			],
			[
				// Verify that TOC is properly removed (plain text mode)
				"Lead\n\n\x01\x021\2\1Section\nSection text",
				$tocText,
				true,
			],
		];
	}

	/**
	 * Checks that ExtractFormatter::getFirstSentences() returns the expected
	 * leading part of the text for the requested number of sentences.
	 *
	 * @dataProvider provideGetFirstSentences
	 * @param string $text Input text
	 * @param int $sentences Number of sentences requested
	 * @param string $expected Expected result
	 */
	public function testGetFirstSentences( $text, $sentences, $expected ) {
		$this->assertEquals( $expected, ExtractFormatter::getFirstSentences( $text, $sentences ) );
	}

	/**
	 * Data sets for testGetFirstSentences():
	 * [ input text, number of sentences, expected result ].
	 *
	 * @return array[]
	 */
	public function provideGetFirstSentences() {
		$longLine = str_repeat( 'word ', 1000000 );

		return [
			[
				'Foo is a bar. Such a smart boy. But completely useless.',
				2,
				'Foo is a bar. Such a smart boy.',
			],
			[
				'Foo is a bar. Such a smart boy. But completely useless.',
				1,
				'Foo is a bar.',
			],
			[
				'Foo is a bar. Such a smart boy.',
				2,
				'Foo is a bar. Such a smart boy.',
			],
			[
				'Foo is a bar.',
				1,
				'Foo is a bar.',
			],
			[
				'Foo is a bar.',
				2,
				'Foo is a bar.',
			],
			[
				'',
				1,
				'',
			],
			// Exclamation points too!!!
			[
				'Foo is a bar! Such a smart boy! But completely useless!',
				1,
				'Foo is a bar!',
			],
			// A tricky one
			[
				"Acid phosphatase (EC 3.1.3.2) is a chemical you don't want to mess with. " .
					"Polyvinyl acetate, however, is another story.",
				1,
				"Acid phosphatase (EC 3.1.3.2) is a chemical you don't want to mess with.",
			],
			// No clear sentences
			[
				"foo\nbar\nbaz",
				2,
				'foo',
			],
			// Bug T118621
			[
				'Foo was born in 1977. He enjoys listening to Siouxsie and the Banshees.',
				1,
				'Foo was born in 1977.',
			],
			// Bug T115795 - Test no cropping after initials
			[
				'P.J. Harvey is a singer. She is awesome!',
				1,
				'P.J. Harvey is a singer.',
			],
			// Bug T115817 - Non-breaking space is not a delimiter.
			// The entity is spelled out so that html_entity_decode() is guaranteed
			// to produce a real U+00A0 between "lat." and "Columbidae"; an ordinary
			// space there would make this case meaningless.
			[
				html_entity_decode( 'Pigeons (lat.&nbsp;Columbidae) are birds. ' .
					'They primarily feed on seeds.' ),
				1,
				html_entity_decode( 'Pigeons (lat.&nbsp;Columbidae) are birds.' ),
			],
			// Bug T145231 - various problems with regexes
			[
				$longLine,
				3,
				trim( $longLine ),
			],
			[
				// The expectation is built from the same 'Sentence. ' unit as the
				// input: the result is a prefix of the input, which contains no
				// line breaks.
				str_repeat( 'Sentence. ', 70000 ),
				65536,
				trim( str_repeat( 'Sentence. ', 65536 ) ),
			],
		];
	}

	/**
	 * Checks that ExtractFormatter::getFirstChars() returns the expected
	 * leading part of the text for the requested number of characters.
	 *
	 * @dataProvider provideGetFirstChars
	 * @param string $text Input text
	 * @param int $chars Number of characters requested
	 * @param string $expected Expected result
	 */
	public function testGetFirstChars( $text, $chars, $expected ) {
		$this->assertEquals( $expected, ExtractFormatter::getFirstChars( $text, $chars ) );
	}

	/**
	 * Data sets for testGetFirstChars():
	 * [ input text, number of characters, expected result ].
	 *
	 * NOTE(review): $html contains no markup although the sets below are
	 * labelled "HTML processing" - the tags may have been lost; verify against
	 * the upstream test file.
	 *
	 * @return array[]
	 */
	public function provideGetFirstChars() {
		$text = 'Lullzy lulz are lullzy!';
		$html = 'foobar';
		$longText = str_repeat( 'тест ', 50000 );
		$longTextExpected = trim( str_repeat( 'тест ', 13108 ) );

		return [
			[ $text, -8, '' ],
			[ $text, 0, '' ],
			[ $text, 100, $text ],
			[ $text, 1, 'Lullzy' ],
			[ $text, 6, 'Lullzy' ],
			// [ $text, 7, 'Lullzy' ],
			[ $text, 8, 'Lullzy lulz' ],
			// HTML processing
			[ $html, 1, 'foo' ],
			// let HTML sanitizer clean it up later
			[ $html, 4, 'foo' ],
			[ $html, 12, 'foobar' ],
			[ $html, 13, 'foobar' ],
			[ $html, 16, 'foobar' ],
			[ $html, 17, 'foobar' ],
			// T143178 - previously, characters were extracted using regexps which failed when
			// requesting 64K chars or more.
			[ $longText, 65536, $longTextExpected ],
		];
	}
}