Merge "ExtractFormatter: Rescue headings from being removed"

This commit is contained in:
jenkins-bot 2024-07-16 15:05:46 +00:00 committed by Gerrit Code Review
commit 173012d395
2 changed files with 25 additions and 1 deletions

View file

@@ -4,6 +4,7 @@ namespace TextExtracts;
use DOMElement; use DOMElement;
use HtmlFormatter\HtmlFormatter; use HtmlFormatter\HtmlFormatter;
use Wikimedia\Parsoid\Utils\DOMCompat;
/** /**
* Provides text-only or limited-HTML extracts of page HTML * Provides text-only or limited-HTML extracts of page HTML
@@ -80,9 +81,21 @@ class ExtractFormatter extends HtmlFormatter {
* @return array Array of removed DOMElements * @return array Array of removed DOMElements
*/ */
public function filterContent(): array { public function filterContent(): array {
$doc = $this->getDoc();
// Headings inside a DIV wrapper may get removed by $wgExtractsRemoveClasses;
// move each heading out of its wrapper (placing it just before the wrapper) to rescue it (T363445)
// https://www.mediawiki.org/wiki/Heading_HTML_changes
$headings = DOMCompat::querySelectorAll( $doc->documentElement, 'h1, h2, h3, h4, h5, h6' );
foreach ( $headings as $heading ) {
// @phan-suppress-next-line PhanTypeMismatchArgumentSuperType
if ( DOMCompat::getClassList( $heading->parentNode )->contains( 'mw-heading' ) ) {
$heading->parentNode->parentNode->insertBefore( $heading, $heading->parentNode );
}
}
$removed = parent::filterContent(); $removed = parent::filterContent();
$doc = $this->getDoc();
$spans = $doc->getElementsByTagName( 'span' ); $spans = $doc->getElementsByTagName( 'span' );
/** @var DOMElement $span */ /** @var DOMElement $span */

View file

@@ -67,6 +67,17 @@ class ExtractFormatterTest extends MediaWikiIntegrationTestCase {
'<span class="foo"><span style="bar: baz;" lang="qux">quux</span></span>', '<span class="foo"><span style="bar: baz;" lang="qux">quux</span></span>',
false, false,
], ],
[
// New heading markup from T13555 (T363445)
"<p>hello</p><h2 id=\"heading\">heading</h2><p>bye</p>",
'<p>hello</p>' .
'<div class="mw-heading mw-heading2">' .
'<h2 id="heading">heading</h2>' .
'<span class="mw-editsection">xxx</span>' .
'</div>' .
'<p>bye</p>',
false,
],
[ [
// Verify that TOC is properly removed (HTML mode) // Verify that TOC is properly removed (HTML mode)
"Lead\n<h1>Section</h1>\n<p>Section text</p>", "Lead\n<h1>Section</h1>\n<p>Section text</p>",