diff --git a/includes/ExtractFormatter.php b/includes/ExtractFormatter.php index 1cde6f7..f755ae4 100644 --- a/includes/ExtractFormatter.php +++ b/includes/ExtractFormatter.php @@ -4,6 +4,7 @@ namespace TextExtracts; use DOMElement; use HtmlFormatter\HtmlFormatter; +use Wikimedia\Parsoid\Utils\DOMCompat; /** * Provides text-only or limited-HTML extracts of page HTML @@ -80,9 +81,21 @@ class ExtractFormatter extends HtmlFormatter { * @return array Array of removed DOMElements */ public function filterContent(): array { + $doc = $this->getDoc(); + + // Headings in a DIV wrapper may get removed by $wgExtractsRemoveClasses, + // move it outside the header to rescue it (T363445) + // https://www.mediawiki.org/wiki/Heading_HTML_changes + $headings = DOMCompat::querySelectorAll( $doc->documentElement, 'h1, h2, h3, h4, h5, h6' ); + foreach ( $headings as $heading ) { + // @phan-suppress-next-line PhanTypeMismatchArgumentSuperType + if ( DOMCompat::getClassList( $heading->parentNode )->contains( 'mw-heading' ) ) { + $heading->parentNode->parentNode->insertBefore( $heading, $heading->parentNode ); + } + } + $removed = parent::filterContent(); - $doc = $this->getDoc(); $spans = $doc->getElementsByTagName( 'span' ); /** @var DOMElement $span */ diff --git a/tests/phpunit/ExtractFormatterTest.php b/tests/phpunit/ExtractFormatterTest.php index ecc37a4..64c8ceb 100644 --- a/tests/phpunit/ExtractFormatterTest.php +++ b/tests/phpunit/ExtractFormatterTest.php @@ -67,6 +67,17 @@ class ExtractFormatterTest extends MediaWikiIntegrationTestCase { 'quux', false, ], + [ + // New heading markup from T13555 (T363445) + "
hello
bye
", + 'hello
' . + 'bye
', + false, + ], [ // Verify that TOC is properly removed (HTML mode) "Lead\nSection text
",