mediawiki-extensions-TextEx.../tests/phpunit/ApiQueryExtractsTest.php
Thiemo Kreuz 60e1c5ad83 Skip <h2> in TOC when extracting first section
This piece of code is only relevant in case when:
- the intro section is requested (either in plaintext or html);
- the parse result for the full page is available in the parser cache;
- the full extract is not available in the TextExtracts WAN cache;
- the intro is also not available in the TextExtracts WAN cache.

In this case getFirstSection() is called with the parser output,
which is different from the the convertText() output it is called
with in other code paths, and still contains <h*> tags. A quick
regex is used to extract the first section. This stops at any <h2>.
A TOC also contains a <h2> (which will be removed later via
$wgExtractsRemoveClasses). This one needs to be ignored in case
the TOC is placed before the first section using e.g. the __TOC__
keyword.

The patch changes the regex so it ignores a h2 with
id="mw-toc-heading", but keeps working in plaintext mode when <h*>
tags are not present  (the code path when the intro section is
requested, and the full extract is available in the TextExtracts
WAN cache but the intro extract isn't).

Bug: T269967
Change-Id: I0a495d06cf1725744e556e81f17047fb53f53521
2023-03-20 07:40:07 +00:00

248 lines
6.4 KiB
PHP

<?php
namespace TextExtracts\Test;
use ILanguageConverter;
use MediaWiki\Languages\LanguageConverterFactory;
use MediaWikiCoversValidator;
use TextExtracts\ApiQueryExtracts;
use Wikimedia\TestingAccessWrapper;
/**
* @covers \TextExtracts\ApiQueryExtracts
* @group TextExtracts
*
* @license GPL-2.0-or-later
*/
class ApiQueryExtractsTest extends \MediaWikiTestCase {
use MediaWikiCoversValidator;
private function newInstance() {
$config = new \HashConfig( [
'ParserCacheExpireTime' => \IExpiringStore::TTL_INDEFINITE,
] );
$configFactory = $this->createMock( \ConfigFactory::class );
$configFactory->method( 'makeConfig' )
->with( 'textextracts' )
->willReturn( $config );
$cache = new \WANObjectCache( [ 'cache' => new \HashBagOStuff() ] );
$context = $this->createMock( \IContextSource::class );
$context->method( 'getConfig' )
->willReturn( $config );
$context->method( 'msg' )
->willReturnCallback( function ( $key, ...$params ) {
$msg = $this->createMock( \Message::class );
$msg->method( 'text' )->willReturn( "($key)" );
return $msg;
} );
$main = $this->createMock( \ApiMain::class );
$main->expects( $this->once() )
->method( 'getContext' )
->willReturn( $context );
$query = $this->createMock( \ApiQuery::class );
$query->expects( $this->once() )
->method( 'getMain' )
->willReturn( $main );
$langConvFactory = $this->createMock( LanguageConverterFactory::class );
$langConvFactory->method( 'getLanguageConverter' )
->willReturn( $this->createMock( ILanguageConverter::class ) );
return new ApiQueryExtracts( $query, '', $configFactory, $cache, $langConvFactory );
}
public function testMemCacheHelpers() {
$title = $this->createMock( \Title::class );
$title->method( 'getPageLanguage' )
->willReturn( $this->createMock( \Language::class ) );
$page = $this->createMock( \WikiPage::class );
$page->method( 'getTitle' )
->willReturn( $title );
$text = 'Text to cache';
/** @var ApiQueryExtracts $instance */
$instance = TestingAccessWrapper::newFromObject( $this->newInstance() );
// Default param values for this API module
$instance->params = [ 'intro' => false, 'plaintext' => false ];
$this->assertFalse( $instance->getFromCache( $page, false ), 'is not cached yet' );
$instance->setCache( $page, $text );
$instance->cache->clearProcessCache();
$this->assertSame( $text, $instance->getFromCache( $page, false ) );
}
public function testSelfDocumentation() {
/** @var ApiQueryExtracts $instance */
$instance = TestingAccessWrapper::newFromObject( $this->newInstance() );
$this->assertIsString( $instance->getCacheMode( [] ) );
$this->assertNotEmpty( $instance->getExamplesMessages() );
$this->assertIsString( $instance->getHelpUrls() );
$params = $instance->getAllowedParams();
$this->assertIsArray( $params );
$this->assertSame( $params['chars'][\ApiBase::PARAM_MIN], 1 );
$this->assertSame( $params['chars'][\ApiBase::PARAM_MAX], 1200 );
$this->assertSame( $params['limit'][\ApiBase::PARAM_DFLT], 20 );
$this->assertSame( $params['limit'][\ApiBase::PARAM_TYPE], 'limit' );
$this->assertSame( $params['limit'][\ApiBase::PARAM_MIN], 1 );
$this->assertSame( $params['limit'][\ApiBase::PARAM_MAX], 20 );
$this->assertSame( $params['limit'][\ApiBase::PARAM_MAX2], 20 );
}
/**
* @dataProvider provideFirstSectionsToExtract
*/
public function testGetFirstSection( $text, $isPlainText, $expected ) {
/** @var ApiQueryExtracts $instance */
$instance = TestingAccessWrapper::newFromObject( $this->newInstance() );
$this->assertSame( $expected, $instance->getFirstSection( $text, $isPlainText ) );
}
public function provideFirstSectionsToExtract() {
return [
'Plain text match' => [
"First\nsection \1\2... \1\2...",
true,
"First\nsection ",
],
'Plain text without a match' => [
'Example\1\2...',
true,
'Example\1\2...',
],
'HTML match' => [
"First\nsection <h1>...<h2>...",
false,
"First\nsection ",
],
'HTML without a match' => [
'Example <h11>...',
false,
'Example <h11>...',
],
'__TOC__ before intro (HTML)' => [
'<h2 id="mw-toc-heading">Contents</h2>Intro<h2>Actual heading</h2>...',
false,
'<h2 id="mw-toc-heading">Contents</h2>Intro',
],
'__TOC__ before intro (plaintext)' => [
"\1\2_\2\1<h2 id=\"mw-toc-heading\">Contents</h2>Intro\1\2_\2\1<h2>Actual heading</h2>...",
true,
"\1\2_\2\1<h2 id=\"mw-toc-heading\">Contents</h2>Intro",
],
];
}
/**
* @dataProvider provideTextsToTruncate
*/
public function testTruncate( $text, array $params, $expected ) {
/** @var ApiQueryExtracts $instance */
$instance = TestingAccessWrapper::newFromObject( $this->newInstance() );
$instance->params = $params + [ 'chars' => null, 'sentences' => null, 'plaintext' => true ];
$this->assertSame( $expected, $instance->truncate( $text ) );
}
public function provideTextsToTruncate() {
return [
[ '', [], '' ],
[ 'abc', [], 'abc' ],
[
'abc',
[ 'chars' => 1 ],
'abc'
],
[
'abc',
[ 'chars' => 1, 'plaintext' => false ],
'abc'
],
[
'abc',
[ 'sentences' => 1 ],
'abc'
],
[
'abc abc. xyz xyz.',
[ 'chars' => 1 ],
'abc(ellipsis)'
],
[
'abc abc. xyz xyz.',
[ 'sentences' => 1 ],
'abc abc.'
],
[
'abc abc. xyz xyz.',
[ 'chars' => 1000 ],
'abc abc. xyz xyz.'
],
[
'abc abc. xyz xyz.',
[ 'chars' => 1000, 'plaintext' => false ],
'abc abc. xyz xyz.'
],
[
'abc abc. xyz xyz.',
[ 'sentences' => 10 ],
'abc abc. xyz xyz.'
],
];
}
/**
* @dataProvider provideSectionsToFormat
*/
public function testDoSections( $text, $format, $expected ) {
/** @var ApiQueryExtracts $instance */
$instance = TestingAccessWrapper::newFromObject( $this->newInstance() );
$instance->params = [ 'sectionformat' => $format ];
$this->assertSame( $expected, $instance->doSections( $text ) );
}
public function provideSectionsToFormat() {
$level = 3;
$marker = "\1\2$level\2\1";
return [
'Raw' => [
"$marker Headline\t\nNext line",
'raw',
"$marker Headline\t\nNext line",
],
'Wiki text' => [
"$marker Headline\t\nNext line",
'wiki',
"\n=== Headline ===\nNext line",
],
'Plain text' => [
"$marker Headline\t\nNext line",
'plain',
"\nHeadline\nNext line",
],
'Multiple matches' => [
"${marker}First\n${marker}Second",
'wiki',
"\n=== First ===\n\n=== Second ===",
],
];
}
}