From 207242f93d0b6aa0de42d3207a856e5481695634 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bartosz=20Dziewo=C5=84ski?= Date: Thu, 20 Jun 2024 21:37:36 +0200 Subject: [PATCH] ApiQueryExtracts: Use raw text from ParserOutput Use ParserOutput::getRawText() instead of deprecated getText(). This avoids a dependency on MediaWiki\OutputTransform classes, which call into Skin and lots of other scary code. Small updates are needed so that we continue stripping the TOC and section edit links in this format. This simultaneously reverts two patches that were only needed to work around problems caused by markup added by OutputTransforms: * Revert "Skip

<h2> in TOC when extracting first section" This reverts commit 60e1c5ad83434aaca46615f8cb675944d1e1b722. * Revert "ExtractFormatter: Rescue headings from being removed" This reverts commit 0fafa44a20af0af865849ae52141340ab92868bf. Change-Id: Ie436576a356d05f2c4c4b84c8c1d82ba70d357d4 --- extension.json | 4 +++- includes/ApiQueryExtracts.php | 9 ++++----- includes/ExtractFormatter.php | 15 +-------------- tests/phpunit/ApiQueryExtractsTest.php | 10 ---------- tests/phpunit/ExtractFormatterTest.php | 11 ----------- 5 files changed, 8 insertions(+), 41 deletions(-) diff --git a/extension.json b/extension.json index 3f3e55c..f43fb3d 100644 --- a/extension.json +++ b/extension.json @@ -56,7 +56,9 @@ "input", "style", "ul.gallery", - ".mw-editsection", + "mw\\:editsection", + "editsection", + "meta", "sup.reference", "ol.references", ".error", diff --git a/includes/ApiQueryExtracts.php b/includes/ApiQueryExtracts.php index 1508eff..4a4cac2 100644 --- a/includes/ApiQueryExtracts.php +++ b/includes/ApiQueryExtracts.php @@ -26,7 +26,7 @@ class ApiQueryExtracts extends ApiQueryBase { /** * Bump when memcache needs clearing */ - private const CACHE_VERSION = 2; + private const CACHE_VERSION = 3; private const PREFIX = 'ex'; @@ -241,10 +241,9 @@ class ApiQueryExtracts extends ApiQueryBase { */ private function getFirstSection( $text, $plainText ) { if ( $plainText ) { - $regexp = '/^.*?(?=' . ExtractFormatter::SECTION_MARKER_START . - '(?!.' . ExtractFormatter::SECTION_MARKER_END . '

isOK() ) { $pout = $status->getValue(); - $text = $pout->getText( [ 'unwrap' => true ] ); + $text = $pout->getRawText(); if ( $this->params['intro'] ) { $text = $this->getFirstSection( $text, false ); } diff --git a/includes/ExtractFormatter.php b/includes/ExtractFormatter.php index f755ae4..1cde6f7 100644 --- a/includes/ExtractFormatter.php +++ b/includes/ExtractFormatter.php @@ -4,7 +4,6 @@ namespace TextExtracts; use DOMElement; use HtmlFormatter\HtmlFormatter; -use Wikimedia\Parsoid\Utils\DOMCompat; /** * Provides text-only or limited-HTML extracts of page HTML @@ -81,21 +80,9 @@ class ExtractFormatter extends HtmlFormatter { * @return array Array of removed DOMElements */ public function filterContent(): array { - $doc = $this->getDoc(); - - // Headings in a DIV wrapper may get removed by $wgExtractsRemoveClasses, - // move it outside the header to rescue it (T363445) - // https://www.mediawiki.org/wiki/Heading_HTML_changes - $headings = DOMCompat::querySelectorAll( $doc->documentElement, 'h1, h2, h3, h4, h5, h6' ); - foreach ( $headings as $heading ) { - // @phan-suppress-next-line PhanTypeMismatchArgumentSuperType - if ( DOMCompat::getClassList( $heading->parentNode )->contains( 'mw-heading' ) ) { - $heading->parentNode->parentNode->insertBefore( $heading, $heading->parentNode ); - } - } - $removed = parent::filterContent(); + $doc = $this->getDoc(); $spans = $doc->getElementsByTagName( 'span' ); /** @var DOMElement $span */ diff --git a/tests/phpunit/ApiQueryExtractsTest.php b/tests/phpunit/ApiQueryExtractsTest.php index 39d862f..580084b 100644 --- a/tests/phpunit/ApiQueryExtractsTest.php +++ b/tests/phpunit/ApiQueryExtractsTest.php @@ -154,16 +154,6 @@ class ApiQueryExtractsTest extends \MediaWikiIntegrationTestCase { false, 'Example ...', ], - '__TOC__ before intro (HTML)' => [ - '

Contents

Intro

Actual heading

...', - false, - '

Contents

Intro', - ], - '__TOC__ before intro (plaintext)' => [ - "\1\2_\2\1

Contents

Intro\1\2_\2\1

Actual heading

...", - true, - "\1\2_\2\1

Contents

Intro", - ], ]; } diff --git a/tests/phpunit/ExtractFormatterTest.php b/tests/phpunit/ExtractFormatterTest.php index 64c8ceb..ecc37a4 100644 --- a/tests/phpunit/ExtractFormatterTest.php +++ b/tests/phpunit/ExtractFormatterTest.php @@ -67,17 +67,6 @@ class ExtractFormatterTest extends MediaWikiIntegrationTestCase { 'quux', false, ], - [ - // New heading markup from T13555 (T363445) - "

hello

heading

bye

", - '

hello

' . - '
' . - '

heading

' . - 'xxx' . - '
' . - '

bye

', - false, - ], [ // Verify that TOC is properly removed (HTML mode) "Lead\n

Section

\n

Section text

",