Use Parsoid's HTML5-compliant helpers for HTML transformations

Previously, XML parsing and serialization were used, causing `<br>`
to be disallowed and `<br />` to turn into `<br></br>`.

Also update some uses of XPath to use the querySelector() helper.

Bug: T113791
Change-Id: Ic2069015b4cbb5d05a72eb383052319048111623
This commit is contained in:
Bartosz Dziewoński 2022-10-28 06:00:05 +02:00
parent 7aa9d03bea
commit 2cc6549898
3 changed files with 70 additions and 41 deletions

14
.phan/stubs/DomImpl.php Normal file
View file

@ -0,0 +1,14 @@
<?php
/**
 * Static-analysis stub (lives under .phan/stubs): maps PHP's built-in DOM
 * extension classes onto the class names in the Wikimedia\Parsoid\DOM
 * namespace, one literal class_alias() call per class.
 *
 * The calls are intentionally kept as literal, one-per-line statements rather
 * than a loop so that each alias pair is visible without evaluating code.
 */
class_alias( "DOMAttr", "Wikimedia\\Parsoid\\DOM\\Attr" );
class_alias( "DOMCharacterData", "Wikimedia\\Parsoid\\DOM\\CharacterData" );
class_alias( "DOMComment", "Wikimedia\\Parsoid\\DOM\\Comment" );
class_alias( "DOMDocument", "Wikimedia\\Parsoid\\DOM\\Document" );
class_alias( "DOMDocumentFragment", "Wikimedia\\Parsoid\\DOM\\DocumentFragment" );
class_alias( "DOMDocumentType", "Wikimedia\\Parsoid\\DOM\\DocumentType" );
// The built-in exception class is DOMException; "DOMDOMException" does not
// exist, so aliasing it could never resolve.
class_alias( "DOMException", "Wikimedia\\Parsoid\\DOM\\DOMException" );
// NOTE(review): "DOMDOMParser" is not a class provided by PHP's DOM extension
// and there is no obvious built-in equivalent to substitute. Left unchanged;
// confirm whether this alias should be dropped or pointed at a real class.
class_alias( "DOMDOMParser", "Wikimedia\\Parsoid\\DOM\\DOMParser" );
class_alias( "DOMElement", "Wikimedia\\Parsoid\\DOM\\Element" );
class_alias( "DOMNode", "Wikimedia\\Parsoid\\DOM\\Node" );
class_alias( "DOMNodeList", "Wikimedia\\Parsoid\\DOM\\NodeList" );
class_alias( "DOMProcessingInstruction", "Wikimedia\\Parsoid\\DOM\\ProcessingInstruction" );
class_alias( "DOMText", "Wikimedia\\Parsoid\\DOM\\Text" );

View file

@ -21,16 +21,15 @@
namespace MediaWiki\Extension\ImageMap;
use ConfigFactory;
use DOMDocument;
use DOMElement;
use DOMXPath;
use DOMDocumentFragment;
use MediaWiki\Hook\ParserFirstCallInitHook;
use MediaWiki\MediaWikiServices;
use OutputPage;
use Parser;
use Sanitizer;
use Title;
use Wikimedia\AtEase\AtEase;
use Wikimedia\Parsoid\Utils\DOMCompat;
use Wikimedia\Parsoid\Utils\DOMUtils;
use Xml;
class ImageMap implements ParserFirstCallInitHook {
@ -64,7 +63,8 @@ class ImageMap implements ParserFirstCallInitHook {
$first = true;
$scale = 1;
$imageNode = null;
$domDoc = null;
$domDoc = DOMCompat::newDocument( true );
$domFragment = null;
$thumbWidth = 0;
$thumbHeight = 0;
$imageTitle = null;
@ -120,21 +120,14 @@ class ImageMap implements ParserFirstCallInitHook {
$imageHTML = $parser->getStripState()->unstripBoth( $imageHTML );
$imageHTML = Sanitizer::normalizeCharReferences( $imageHTML );
$domDoc = new DOMDocument();
AtEase::suppressWarnings();
$ok = $domDoc->loadXML( $imageHTML );
AtEase::restoreWarnings();
if ( !$ok ) {
$domFragment = $domDoc->createDocumentFragment();
DOMUtils::setFragmentInnerHTML( $domFragment, $imageHTML );
$imageNode = DOMCompat::querySelector( $domFragment, 'img' );
if ( !$imageNode ) {
return $this->error( 'imagemap_invalid_image' );
}
$xpath = new DOMXPath( $domDoc );
$imgs = $xpath->query( '//img' );
if ( !$imgs->length ) {
return $this->error( 'imagemap_invalid_image' );
}
$imageNode = $imgs->item( 0 );
$thumbWidth = $imageNode->getAttribute( 'width' );
$thumbHeight = $imageNode->getAttribute( 'height' );
$thumbWidth = (int)$imageNode->getAttribute( 'width' );
$thumbHeight = (int)$imageNode->getAttribute( 'height' );
$imageObj = $repoGroup->findFile( $imageTitle );
if ( !$imageObj || !$imageObj->exists() ) {
@ -284,7 +277,7 @@ class ImageMap implements ParserFirstCallInitHook {
}
}
if ( $first || !$imageNode || !$domDoc ) {
if ( !$imageNode || !$domFragment ) {
return $this->error( 'imagemap_no_image' );
}
@ -307,9 +300,9 @@ class ImageMap implements ParserFirstCallInitHook {
}
if ( $mapHTML !== '' ) {
$mapDoc = new DOMDocument();
$mapDoc->loadXML( $mapHTML );
$mapNode = $domDoc->importNode( $mapDoc->documentElement, true );
$mapFragment = $domDoc->createDocumentFragment();
DOMUtils::setFragmentInnerHTML( $mapFragment, $mapHTML );
$mapNode = $mapFragment->firstChild;
}
$div = null;
@ -320,15 +313,15 @@ class ImageMap implements ParserFirstCallInitHook {
$parent = $anchor->parentNode;
// Handle cases where there are no anchors, like `|link=`
if ( $anchor instanceof DOMDocument ) {
if ( $anchor instanceof DOMDocumentFragment ) {
$parent = $anchor;
$anchor = $imageNode;
}
$div = $parent->insertBefore( new DOMElement( 'div' ), $anchor );
$div = $parent->insertBefore( $domDoc->createElement( 'div' ), $anchor );
$div->setAttribute( 'class', 'noresize' );
if ( $defaultLinkAttribs ) {
$defaultAnchor = $div->appendChild( new DOMElement( 'a' ) );
$defaultAnchor = $div->appendChild( $domDoc->createElement( 'a' ) );
foreach ( $defaultLinkAttribs as $name => $value ) {
$defaultAnchor->setAttribute( $name, $value );
}
@ -338,7 +331,6 @@ class ImageMap implements ParserFirstCallInitHook {
}
// Add the map HTML to the div
// We used to add it before the div, but that made tidy unhappy
if ( isset( $mapNode ) ) {
$div->appendChild( $mapNode );
}
@ -362,12 +354,12 @@ class ImageMap implements ParserFirstCallInitHook {
$wrapper->setAttribute( 'class', $classes );
if ( $defaultLinkAttribs ) {
$imageParent = $wrapper->ownerDocument->createElement( 'a' );
$imageParent = $domDoc->createElement( 'a' );
foreach ( $defaultLinkAttribs as $name => $value ) {
$imageParent->setAttribute( $name, $value );
}
} else {
$imageParent = new DOMElement( 'span' );
$imageParent = $domDoc->createElement( 'span' );
}
$wrapper->insertBefore( $imageParent, $anchor );
@ -377,9 +369,7 @@ class ImageMap implements ParserFirstCallInitHook {
$hasVisibleMedia = in_array( $format, [ 'Thumb', 'Frame' ], true );
if ( !$hasVisibleMedia ) {
$xpath = new DOMXPath( $wrapper->ownerDocument );
$captions = $xpath->query( '//figcaption', $wrapper );
$caption = $captions->item( 0 );
$caption = DOMCompat::querySelector( $domFragment, 'figcaption' );
$captionText = trim( $caption->textContent );
if ( $captionText ) {
$imageParent->setAttribute( 'title', $captionText );
@ -395,9 +385,8 @@ class ImageMap implements ParserFirstCallInitHook {
}
// Determine whether a "magnify" link is present
$xpath = new DOMXPath( $domDoc );
$magnify = $xpath->query( '//div[@class="magnify"]' );
if ( $enableLegacyMediaDOM && !$magnify->length && $descType !== self::NONE ) {
$magnify = DOMCompat::querySelector( $domFragment, '.magnify' );
if ( $enableLegacyMediaDOM && !$magnify && $descType !== self::NONE ) {
// Add image description link
if ( $descType === self::TOP_LEFT || $descType === self::BOTTOM_LEFT ) {
$marginLeft = 0;
@ -412,20 +401,20 @@ class ImageMap implements ParserFirstCallInitHook {
$marginTop = -20;
}
$div->setAttribute( 'style', "height: {$thumbHeight}px; width: {$thumbWidth}px; " );
$descWrapper = $div->appendChild( new DOMElement( 'div' ) );
$descWrapper = $div->appendChild( $domDoc->createElement( 'div' ) );
$descWrapper->setAttribute( 'style',
"margin-left: {$marginLeft}px; " .
"margin-top: {$marginTop}px; " .
"text-align: left;"
);
$descAnchor = $descWrapper->appendChild( new DOMElement( 'a' ) );
$descAnchor = $descWrapper->appendChild( $domDoc->createElement( 'a' ) );
$descAnchor->setAttribute( 'href', $imageTitle->getLocalURL() );
$descAnchor->setAttribute(
'title',
wfMessage( 'imagemap_description' )->inContentLanguage()->text()
);
$descImg = $descAnchor->appendChild( new DOMElement( 'img' ) );
$descImg = $descAnchor->appendChild( $domDoc->createElement( 'img' ) );
$descImg->setAttribute(
'alt',
wfMessage( 'imagemap_description' )->inContentLanguage()->text()
@ -438,10 +427,8 @@ class ImageMap implements ParserFirstCallInitHook {
$descImg->setAttribute( 'style', 'border: none;' );
}
// Output the result
// We use saveXML() not saveHTML() because then we get XHTML-compliant output.
// The disadvantage is that we have to strip out the DTD
$output = preg_replace( '/<\?xml[^?]*\?>/', '', $domDoc->saveXML( null, LIBXML_NOEMPTYTAG ) );
// Output the result (XHTML-compliant)
$output = DOMUtils::getFragmentInnerHTML( $domFragment );
// Register links
foreach ( $links as $title ) {

View file

@ -240,3 +240,31 @@ File:Foobar.jpg|link=|Hi ho
!! html/parsoid
<figure class="mw-default-size noresize" typeof="mw:File mw:Extension/imagemap" about="#mwt2" data-mw='{"name":"imagemap","attrs":{},"body":{"extsrc":"\nFile:Foobar.jpg|link=|Hi ho\n"}}'><span title="Hi ho"><img alt="Hi ho" resource="./File:Foobar.jpg" src="http://example.com/images/3/3a/Foobar.jpg" decoding="async" data-file-width="1941" data-file-height="220" data-file-type="bitmap" height="220" width="1941"/></span><figcaption>Hi ho</figcaption></figure>
!! end
!! test
File with <br> in caption (T113791)
!! config
wgParserEnableLegacyMediaDOM=false
!! wikitext
<imagemap>
File:Foobar.jpg|thumb|a<br>b
</imagemap>
!! html/php
<figure class="mw-default-size noresize" typeof="mw:File/Thumb"><span title="ab"><img alt="" src="http://example.com/images/thumb/3/3a/Foobar.jpg/180px-Foobar.jpg" decoding="async" width="180" height="20" srcset="http://example.com/images/thumb/3/3a/Foobar.jpg/270px-Foobar.jpg 1.5x, http://example.com/images/thumb/3/3a/Foobar.jpg/360px-Foobar.jpg 2x" /></span><figcaption>a<br />b</figcaption></figure>
!! html/parsoid
<figure class="mw-default-size noresize" typeof="mw:File/Thumb mw:Extension/imagemap" data-mw='{"name":"imagemap","attrs":{},"body":{"extsrc":"\nFile:Foobar.jpg|thumb|a&lt;br>b\n"}}'><span><img resource="./File:Foobar.jpg" src="http://example.com/images/thumb/3/3a/Foobar.jpg/180px-Foobar.jpg" decoding="async" data-file-width="1941" data-file-height="220" data-file-type="bitmap" height="20" width="180" srcset="http://example.com/images/thumb/3/3a/Foobar.jpg/270px-Foobar.jpg 1.5x, http://example.com/images/thumb/3/3a/Foobar.jpg/360px-Foobar.jpg 2x"/></span><figcaption>a<br/>b</figcaption></figure>
!! end
!! test
File with <br /> in caption (T113791)
!! config
wgParserEnableLegacyMediaDOM=false
!! wikitext
<imagemap>
File:Foobar.jpg|thumb|a<br />b
</imagemap>
!! html/php
<figure class="mw-default-size noresize" typeof="mw:File/Thumb"><span title="ab"><img alt="" src="http://example.com/images/thumb/3/3a/Foobar.jpg/180px-Foobar.jpg" decoding="async" width="180" height="20" srcset="http://example.com/images/thumb/3/3a/Foobar.jpg/270px-Foobar.jpg 1.5x, http://example.com/images/thumb/3/3a/Foobar.jpg/360px-Foobar.jpg 2x" /></span><figcaption>a<br />b</figcaption></figure>
!! html/parsoid
<figure class="mw-default-size noresize" typeof="mw:File/Thumb mw:Extension/imagemap" data-mw='{"name":"imagemap","attrs":{},"body":{"extsrc":"\nFile:Foobar.jpg|thumb|a&lt;br />b\n"}}'><span><img resource="./File:Foobar.jpg" src="http://example.com/images/thumb/3/3a/Foobar.jpg/180px-Foobar.jpg" decoding="async" data-file-width="1941" data-file-height="220" data-file-type="bitmap" height="20" width="180" srcset="http://example.com/images/thumb/3/3a/Foobar.jpg/270px-Foobar.jpg 1.5x, http://example.com/images/thumb/3/3a/Foobar.jpg/360px-Foobar.jpg 2x"/></span><figcaption>a<br/>b</figcaption></figure>
!! end