ApiQueryExtracts: Use raw text from ParserOutput

Use ParserOutput::getRawText() instead of the deprecated getText().
This avoids a dependency on MediaWiki\OutputTransform classes,
which call into Skin and lots of other scary code.

Small updates are needed so that we continue stripping the TOC
and section edit links from the raw (untransformed) parser output.

This simultaneously reverts two patches that were only needed to work
around problems caused by markup added by OutputTransforms:
* Revert "Skip <h2> in TOC when extracting first section"
  This reverts commit 60e1c5ad83.
* Revert "ExtractFormatter: Rescue headings from being removed"
  This reverts commit 0fafa44a20.

Change-Id: Ie436576a356d05f2c4c4b84c8c1d82ba70d357d4
This commit is contained in:
Bartosz Dziewoński 2024-06-20 21:37:36 +02:00
parent 173012d395
commit 207242f93d
5 changed files with 8 additions and 41 deletions

View file

@ -56,7 +56,9 @@
"input",
"style",
"ul.gallery",
".mw-editsection",
"mw\\:editsection",
"editsection",
"meta",
"sup.reference",
"ol.references",
".error",

View file

@ -26,7 +26,7 @@ class ApiQueryExtracts extends ApiQueryBase {
/**
* Bump when memcache needs clearing
*/
private const CACHE_VERSION = 2;
private const CACHE_VERSION = 3;
private const PREFIX = 'ex';
@ -241,10 +241,9 @@ class ApiQueryExtracts extends ApiQueryBase {
*/
private function getFirstSection( $text, $plainText ) {
if ( $plainText ) {
$regexp = '/^.*?(?=' . ExtractFormatter::SECTION_MARKER_START .
'(?!.' . ExtractFormatter::SECTION_MARKER_END . '<h2 id="mw-toc-heading"))/s';
$regexp = '/^(.*?)(?=' . ExtractFormatter::SECTION_MARKER_START . ')/s';
} else {
$regexp = '/^.*?(?=<h[1-6]\b(?! id="mw-toc-heading"))/s';
$regexp = '/^(.*?)(?=<h[1-6]\b)/s';
}
if ( preg_match( $regexp, $text, $matches ) ) {
$text = $matches[0];
@ -266,7 +265,7 @@ class ApiQueryExtracts extends ApiQueryBase {
);
if ( $status->isOK() ) {
$pout = $status->getValue();
$text = $pout->getText( [ 'unwrap' => true ] );
$text = $pout->getRawText();
if ( $this->params['intro'] ) {
$text = $this->getFirstSection( $text, false );
}

View file

@ -4,7 +4,6 @@ namespace TextExtracts;
use DOMElement;
use HtmlFormatter\HtmlFormatter;
use Wikimedia\Parsoid\Utils\DOMCompat;
/**
* Provides text-only or limited-HTML extracts of page HTML
@ -81,21 +80,9 @@ class ExtractFormatter extends HtmlFormatter {
* @return array Array of removed DOMElements
*/
public function filterContent(): array {
$doc = $this->getDoc();
// Headings in a DIV wrapper may get removed by $wgExtractsRemoveClasses,
// move it outside the header to rescue it (T363445)
// https://www.mediawiki.org/wiki/Heading_HTML_changes
$headings = DOMCompat::querySelectorAll( $doc->documentElement, 'h1, h2, h3, h4, h5, h6' );
foreach ( $headings as $heading ) {
// @phan-suppress-next-line PhanTypeMismatchArgumentSuperType
if ( DOMCompat::getClassList( $heading->parentNode )->contains( 'mw-heading' ) ) {
$heading->parentNode->parentNode->insertBefore( $heading, $heading->parentNode );
}
}
$removed = parent::filterContent();
$doc = $this->getDoc();
$spans = $doc->getElementsByTagName( 'span' );
/** @var DOMElement $span */

View file

@ -154,16 +154,6 @@ class ApiQueryExtractsTest extends \MediaWikiIntegrationTestCase {
false,
'Example <h11>...',
],
'__TOC__ before intro (HTML)' => [
'<h2 id="mw-toc-heading">Contents</h2>Intro<h2>Actual heading</h2>...',
false,
'<h2 id="mw-toc-heading">Contents</h2>Intro',
],
'__TOC__ before intro (plaintext)' => [
"\1\2_\2\1<h2 id=\"mw-toc-heading\">Contents</h2>Intro\1\2_\2\1<h2>Actual heading</h2>...",
true,
"\1\2_\2\1<h2 id=\"mw-toc-heading\">Contents</h2>Intro",
],
];
}

View file

@ -67,17 +67,6 @@ class ExtractFormatterTest extends MediaWikiIntegrationTestCase {
'<span class="foo"><span style="bar: baz;" lang="qux">quux</span></span>',
false,
],
[
// New heading markup from T13555 (T363445)
"<p>hello</p><h2 id=\"heading\">heading</h2><p>bye</p>",
'<p>hello</p>' .
'<div class="mw-heading mw-heading2">' .
'<h2 id="heading">heading</h2>' .
'<span class="mw-editsection">xxx</span>' .
'</div>' .
'<p>bye</p>',
false,
],
[
// Verify that TOC is properly removed (HTML mode)
"Lead\n<h1>Section</h1>\n<p>Section text</p>",