From 0fafa44a20af0af865849ae52141340ab92868bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bartosz=20Dziewo=C5=84ski?= Date: Thu, 20 Jun 2024 22:34:38 +0200 Subject: [PATCH] ExtractFormatter: Rescue headings from being removed Bug: T363445 Change-Id: I662fe3dd06d6b010108c6f0ef891a8c6113b9a45 --- includes/ExtractFormatter.php | 15 ++++++++++++++- tests/phpunit/ExtractFormatterTest.php | 11 +++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/includes/ExtractFormatter.php b/includes/ExtractFormatter.php index 1cde6f7..f755ae4 100644 --- a/includes/ExtractFormatter.php +++ b/includes/ExtractFormatter.php @@ -4,6 +4,7 @@ namespace TextExtracts; use DOMElement; use HtmlFormatter\HtmlFormatter; +use Wikimedia\Parsoid\Utils\DOMCompat; /** * Provides text-only or limited-HTML extracts of page HTML @@ -80,9 +81,21 @@ class ExtractFormatter extends HtmlFormatter { * @return array Array of removed DOMElements */ public function filterContent(): array { + $doc = $this->getDoc(); + + // Headings in a DIV wrapper may get removed by $wgExtractsRemoveClasses, + // move it outside the header to rescue it (T363445) + // https://www.mediawiki.org/wiki/Heading_HTML_changes + $headings = DOMCompat::querySelectorAll( $doc->documentElement, 'h1, h2, h3, h4, h5, h6' ); + foreach ( $headings as $heading ) { + // @phan-suppress-next-line PhanTypeMismatchArgumentSuperType + if ( DOMCompat::getClassList( $heading->parentNode )->contains( 'mw-heading' ) ) { + $heading->parentNode->parentNode->insertBefore( $heading, $heading->parentNode ); + } + } + $removed = parent::filterContent(); - $doc = $this->getDoc(); $spans = $doc->getElementsByTagName( 'span' ); /** @var DOMElement $span */ diff --git a/tests/phpunit/ExtractFormatterTest.php b/tests/phpunit/ExtractFormatterTest.php index ecc37a4..64c8ceb 100644 --- a/tests/phpunit/ExtractFormatterTest.php +++ b/tests/phpunit/ExtractFormatterTest.php @@ -67,6 +67,17 @@ class ExtractFormatterTest extends MediaWikiIntegrationTestCase { 'quux', false, ], + [ + // New heading markup from T13555 (T363445) + "

hello

heading

bye

", + '

hello

' . + '
' . + '

heading

' . + 'xxx' . + '
' . + '

bye

', + false, + ], [ // Verify that TOC is properly removed (HTML mode) "Lead\n

Section

\n

Section text

",