Use Parsoid's HTML5-compliant helpers for HTML transformations

Previously, XML parsing and serialization were used, causing `<br>`
to be disallowed and `<br />` to turn into `<br></br>`.

Also update some uses of XPath to use the querySelector() helper.

Bug: T113791
Change-Id: Ic2069015b4cbb5d05a72eb383052319048111623
This commit is contained in:
Bartosz Dziewoński 2022-10-28 06:00:05 +02:00
parent 7aa9d03bea
commit 2cc6549898
3 changed files with 70 additions and 41 deletions

14
.phan/stubs/DomImpl.php Normal file
View file

@ -0,0 +1,14 @@
<?php
// Static-analysis stub: maps PHP's built-in DOM extension classes onto the
// Wikimedia\Parsoid\DOM\* names, so that code type-hinted against the Parsoid
// names resolves to the native DOM classes.
class_alias( "DOMAttr", "Wikimedia\\Parsoid\\DOM\\Attr" );
class_alias( "DOMCharacterData", "Wikimedia\\Parsoid\\DOM\\CharacterData" );
class_alias( "DOMComment", "Wikimedia\\Parsoid\\DOM\\Comment" );
class_alias( "DOMDocument", "Wikimedia\\Parsoid\\DOM\\Document" );
class_alias( "DOMDocumentFragment", "Wikimedia\\Parsoid\\DOM\\DocumentFragment" );
class_alias( "DOMDocumentType", "Wikimedia\\Parsoid\\DOM\\DocumentType" );
// The native exception class is DOMException; the previous "DOMDOMException"
// named a class that does not exist, so this alias was never created.
class_alias( "DOMException", "Wikimedia\\Parsoid\\DOM\\DOMException" );
// NOTE(review): "DOMDOMParser" looks like the same doubled-prefix artifact, but
// the DOM extension does not appear to provide a DOMParser class to alias
// instead. Left unchanged; confirm whether this line is needed at all.
class_alias( "DOMDOMParser", "Wikimedia\\Parsoid\\DOM\\DOMParser" );
class_alias( "DOMElement", "Wikimedia\\Parsoid\\DOM\\Element" );
class_alias( "DOMNode", "Wikimedia\\Parsoid\\DOM\\Node" );
class_alias( "DOMNodeList", "Wikimedia\\Parsoid\\DOM\\NodeList" );
class_alias( "DOMProcessingInstruction", "Wikimedia\\Parsoid\\DOM\\ProcessingInstruction" );
class_alias( "DOMText", "Wikimedia\\Parsoid\\DOM\\Text" );

View file

@ -21,16 +21,15 @@
namespace MediaWiki\Extension\ImageMap; namespace MediaWiki\Extension\ImageMap;
use ConfigFactory; use ConfigFactory;
use DOMDocument; use DOMDocumentFragment;
use DOMElement;
use DOMXPath;
use MediaWiki\Hook\ParserFirstCallInitHook; use MediaWiki\Hook\ParserFirstCallInitHook;
use MediaWiki\MediaWikiServices; use MediaWiki\MediaWikiServices;
use OutputPage; use OutputPage;
use Parser; use Parser;
use Sanitizer; use Sanitizer;
use Title; use Title;
use Wikimedia\AtEase\AtEase; use Wikimedia\Parsoid\Utils\DOMCompat;
use Wikimedia\Parsoid\Utils\DOMUtils;
use Xml; use Xml;
class ImageMap implements ParserFirstCallInitHook { class ImageMap implements ParserFirstCallInitHook {
@ -64,7 +63,8 @@ class ImageMap implements ParserFirstCallInitHook {
$first = true; $first = true;
$scale = 1; $scale = 1;
$imageNode = null; $imageNode = null;
$domDoc = null; $domDoc = DOMCompat::newDocument( true );
$domFragment = null;
$thumbWidth = 0; $thumbWidth = 0;
$thumbHeight = 0; $thumbHeight = 0;
$imageTitle = null; $imageTitle = null;
@ -120,21 +120,14 @@ class ImageMap implements ParserFirstCallInitHook {
$imageHTML = $parser->getStripState()->unstripBoth( $imageHTML ); $imageHTML = $parser->getStripState()->unstripBoth( $imageHTML );
$imageHTML = Sanitizer::normalizeCharReferences( $imageHTML ); $imageHTML = Sanitizer::normalizeCharReferences( $imageHTML );
$domDoc = new DOMDocument(); $domFragment = $domDoc->createDocumentFragment();
AtEase::suppressWarnings(); DOMUtils::setFragmentInnerHTML( $domFragment, $imageHTML );
$ok = $domDoc->loadXML( $imageHTML ); $imageNode = DOMCompat::querySelector( $domFragment, 'img' );
AtEase::restoreWarnings(); if ( !$imageNode ) {
if ( !$ok ) {
return $this->error( 'imagemap_invalid_image' ); return $this->error( 'imagemap_invalid_image' );
} }
$xpath = new DOMXPath( $domDoc ); $thumbWidth = (int)$imageNode->getAttribute( 'width' );
$imgs = $xpath->query( '//img' ); $thumbHeight = (int)$imageNode->getAttribute( 'height' );
if ( !$imgs->length ) {
return $this->error( 'imagemap_invalid_image' );
}
$imageNode = $imgs->item( 0 );
$thumbWidth = $imageNode->getAttribute( 'width' );
$thumbHeight = $imageNode->getAttribute( 'height' );
$imageObj = $repoGroup->findFile( $imageTitle ); $imageObj = $repoGroup->findFile( $imageTitle );
if ( !$imageObj || !$imageObj->exists() ) { if ( !$imageObj || !$imageObj->exists() ) {
@ -284,7 +277,7 @@ class ImageMap implements ParserFirstCallInitHook {
} }
} }
if ( $first || !$imageNode || !$domDoc ) { if ( !$imageNode || !$domFragment ) {
return $this->error( 'imagemap_no_image' ); return $this->error( 'imagemap_no_image' );
} }
@ -307,9 +300,9 @@ class ImageMap implements ParserFirstCallInitHook {
} }
if ( $mapHTML !== '' ) { if ( $mapHTML !== '' ) {
$mapDoc = new DOMDocument(); $mapFragment = $domDoc->createDocumentFragment();
$mapDoc->loadXML( $mapHTML ); DOMUtils::setFragmentInnerHTML( $mapFragment, $mapHTML );
$mapNode = $domDoc->importNode( $mapDoc->documentElement, true ); $mapNode = $mapFragment->firstChild;
} }
$div = null; $div = null;
@ -320,15 +313,15 @@ class ImageMap implements ParserFirstCallInitHook {
$parent = $anchor->parentNode; $parent = $anchor->parentNode;
// Handle cases where there are no anchors, like `|link=` // Handle cases where there are no anchors, like `|link=`
if ( $anchor instanceof DOMDocument ) { if ( $anchor instanceof DOMDocumentFragment ) {
$parent = $anchor; $parent = $anchor;
$anchor = $imageNode; $anchor = $imageNode;
} }
$div = $parent->insertBefore( new DOMElement( 'div' ), $anchor ); $div = $parent->insertBefore( $domDoc->createElement( 'div' ), $anchor );
$div->setAttribute( 'class', 'noresize' ); $div->setAttribute( 'class', 'noresize' );
if ( $defaultLinkAttribs ) { if ( $defaultLinkAttribs ) {
$defaultAnchor = $div->appendChild( new DOMElement( 'a' ) ); $defaultAnchor = $div->appendChild( $domDoc->createElement( 'a' ) );
foreach ( $defaultLinkAttribs as $name => $value ) { foreach ( $defaultLinkAttribs as $name => $value ) {
$defaultAnchor->setAttribute( $name, $value ); $defaultAnchor->setAttribute( $name, $value );
} }
@ -338,7 +331,6 @@ class ImageMap implements ParserFirstCallInitHook {
} }
// Add the map HTML to the div // Add the map HTML to the div
// We used to add it before the div, but that made tidy unhappy
if ( isset( $mapNode ) ) { if ( isset( $mapNode ) ) {
$div->appendChild( $mapNode ); $div->appendChild( $mapNode );
} }
@ -362,12 +354,12 @@ class ImageMap implements ParserFirstCallInitHook {
$wrapper->setAttribute( 'class', $classes ); $wrapper->setAttribute( 'class', $classes );
if ( $defaultLinkAttribs ) { if ( $defaultLinkAttribs ) {
$imageParent = $wrapper->ownerDocument->createElement( 'a' ); $imageParent = $domDoc->createElement( 'a' );
foreach ( $defaultLinkAttribs as $name => $value ) { foreach ( $defaultLinkAttribs as $name => $value ) {
$imageParent->setAttribute( $name, $value ); $imageParent->setAttribute( $name, $value );
} }
} else { } else {
$imageParent = new DOMElement( 'span' ); $imageParent = $domDoc->createElement( 'span' );
} }
$wrapper->insertBefore( $imageParent, $anchor ); $wrapper->insertBefore( $imageParent, $anchor );
@ -377,9 +369,7 @@ class ImageMap implements ParserFirstCallInitHook {
$hasVisibleMedia = in_array( $format, [ 'Thumb', 'Frame' ], true ); $hasVisibleMedia = in_array( $format, [ 'Thumb', 'Frame' ], true );
if ( !$hasVisibleMedia ) { if ( !$hasVisibleMedia ) {
$xpath = new DOMXPath( $wrapper->ownerDocument ); $caption = DOMCompat::querySelector( $domFragment, 'figcaption' );
$captions = $xpath->query( '//figcaption', $wrapper );
$caption = $captions->item( 0 );
$captionText = trim( $caption->textContent ); $captionText = trim( $caption->textContent );
if ( $captionText ) { if ( $captionText ) {
$imageParent->setAttribute( 'title', $captionText ); $imageParent->setAttribute( 'title', $captionText );
@ -395,9 +385,8 @@ class ImageMap implements ParserFirstCallInitHook {
} }
// Determine whether a "magnify" link is present // Determine whether a "magnify" link is present
$xpath = new DOMXPath( $domDoc ); $magnify = DOMCompat::querySelector( $domFragment, '.magnify' );
$magnify = $xpath->query( '//div[@class="magnify"]' ); if ( $enableLegacyMediaDOM && !$magnify && $descType !== self::NONE ) {
if ( $enableLegacyMediaDOM && !$magnify->length && $descType !== self::NONE ) {
// Add image description link // Add image description link
if ( $descType === self::TOP_LEFT || $descType === self::BOTTOM_LEFT ) { if ( $descType === self::TOP_LEFT || $descType === self::BOTTOM_LEFT ) {
$marginLeft = 0; $marginLeft = 0;
@ -412,20 +401,20 @@ class ImageMap implements ParserFirstCallInitHook {
$marginTop = -20; $marginTop = -20;
} }
$div->setAttribute( 'style', "height: {$thumbHeight}px; width: {$thumbWidth}px; " ); $div->setAttribute( 'style', "height: {$thumbHeight}px; width: {$thumbWidth}px; " );
$descWrapper = $div->appendChild( new DOMElement( 'div' ) ); $descWrapper = $div->appendChild( $domDoc->createElement( 'div' ) );
$descWrapper->setAttribute( 'style', $descWrapper->setAttribute( 'style',
"margin-left: {$marginLeft}px; " . "margin-left: {$marginLeft}px; " .
"margin-top: {$marginTop}px; " . "margin-top: {$marginTop}px; " .
"text-align: left;" "text-align: left;"
); );
$descAnchor = $descWrapper->appendChild( new DOMElement( 'a' ) ); $descAnchor = $descWrapper->appendChild( $domDoc->createElement( 'a' ) );
$descAnchor->setAttribute( 'href', $imageTitle->getLocalURL() ); $descAnchor->setAttribute( 'href', $imageTitle->getLocalURL() );
$descAnchor->setAttribute( $descAnchor->setAttribute(
'title', 'title',
wfMessage( 'imagemap_description' )->inContentLanguage()->text() wfMessage( 'imagemap_description' )->inContentLanguage()->text()
); );
$descImg = $descAnchor->appendChild( new DOMElement( 'img' ) ); $descImg = $descAnchor->appendChild( $domDoc->createElement( 'img' ) );
$descImg->setAttribute( $descImg->setAttribute(
'alt', 'alt',
wfMessage( 'imagemap_description' )->inContentLanguage()->text() wfMessage( 'imagemap_description' )->inContentLanguage()->text()
@ -438,10 +427,8 @@ class ImageMap implements ParserFirstCallInitHook {
$descImg->setAttribute( 'style', 'border: none;' ); $descImg->setAttribute( 'style', 'border: none;' );
} }
// Output the result // Output the result (XHTML-compliant)
// We use saveXML() not saveHTML() because then we get XHTML-compliant output. $output = DOMUtils::getFragmentInnerHTML( $domFragment );
// The disadvantage is that we have to strip out the DTD
$output = preg_replace( '/<\?xml[^?]*\?>/', '', $domDoc->saveXML( null, LIBXML_NOEMPTYTAG ) );
// Register links // Register links
foreach ( $links as $title ) { foreach ( $links as $title ) {

View file

@ -240,3 +240,31 @@ File:Foobar.jpg|link=|Hi ho
!! html/parsoid !! html/parsoid
<figure class="mw-default-size noresize" typeof="mw:File mw:Extension/imagemap" about="#mwt2" data-mw='{"name":"imagemap","attrs":{},"body":{"extsrc":"\nFile:Foobar.jpg|link=|Hi ho\n"}}'><span title="Hi ho"><img alt="Hi ho" resource="./File:Foobar.jpg" src="http://example.com/images/3/3a/Foobar.jpg" decoding="async" data-file-width="1941" data-file-height="220" data-file-type="bitmap" height="220" width="1941"/></span><figcaption>Hi ho</figcaption></figure> <figure class="mw-default-size noresize" typeof="mw:File mw:Extension/imagemap" about="#mwt2" data-mw='{"name":"imagemap","attrs":{},"body":{"extsrc":"\nFile:Foobar.jpg|link=|Hi ho\n"}}'><span title="Hi ho"><img alt="Hi ho" resource="./File:Foobar.jpg" src="http://example.com/images/3/3a/Foobar.jpg" decoding="async" data-file-width="1941" data-file-height="220" data-file-type="bitmap" height="220" width="1941"/></span><figcaption>Hi ho</figcaption></figure>
!! end !! end
!! test
File with <br> in caption (T113791)
!! config
wgParserEnableLegacyMediaDOM=false
!! wikitext
<imagemap>
File:Foobar.jpg|thumb|a<br>b
</imagemap>
!! html/php
<figure class="mw-default-size noresize" typeof="mw:File/Thumb"><span title="ab"><img alt="" src="http://example.com/images/thumb/3/3a/Foobar.jpg/180px-Foobar.jpg" decoding="async" width="180" height="20" srcset="http://example.com/images/thumb/3/3a/Foobar.jpg/270px-Foobar.jpg 1.5x, http://example.com/images/thumb/3/3a/Foobar.jpg/360px-Foobar.jpg 2x" /></span><figcaption>a<br />b</figcaption></figure>
!! html/parsoid
<figure class="mw-default-size noresize" typeof="mw:File/Thumb mw:Extension/imagemap" data-mw='{"name":"imagemap","attrs":{},"body":{"extsrc":"\nFile:Foobar.jpg|thumb|a&lt;br>b\n"}}'><span><img resource="./File:Foobar.jpg" src="http://example.com/images/thumb/3/3a/Foobar.jpg/180px-Foobar.jpg" decoding="async" data-file-width="1941" data-file-height="220" data-file-type="bitmap" height="20" width="180" srcset="http://example.com/images/thumb/3/3a/Foobar.jpg/270px-Foobar.jpg 1.5x, http://example.com/images/thumb/3/3a/Foobar.jpg/360px-Foobar.jpg 2x"/></span><figcaption>a<br/>b</figcaption></figure>
!! end
!! test
File with <br /> in caption (T113791)
!! config
wgParserEnableLegacyMediaDOM=false
!! wikitext
<imagemap>
File:Foobar.jpg|thumb|a<br />b
</imagemap>
!! html/php
<figure class="mw-default-size noresize" typeof="mw:File/Thumb"><span title="ab"><img alt="" src="http://example.com/images/thumb/3/3a/Foobar.jpg/180px-Foobar.jpg" decoding="async" width="180" height="20" srcset="http://example.com/images/thumb/3/3a/Foobar.jpg/270px-Foobar.jpg 1.5x, http://example.com/images/thumb/3/3a/Foobar.jpg/360px-Foobar.jpg 2x" /></span><figcaption>a<br />b</figcaption></figure>
!! html/parsoid
<figure class="mw-default-size noresize" typeof="mw:File/Thumb mw:Extension/imagemap" data-mw='{"name":"imagemap","attrs":{},"body":{"extsrc":"\nFile:Foobar.jpg|thumb|a&lt;br />b\n"}}'><span><img resource="./File:Foobar.jpg" src="http://example.com/images/thumb/3/3a/Foobar.jpg/180px-Foobar.jpg" decoding="async" data-file-width="1941" data-file-height="220" data-file-type="bitmap" height="20" width="180" srcset="http://example.com/images/thumb/3/3a/Foobar.jpg/270px-Foobar.jpg 1.5x, http://example.com/images/thumb/3/3a/Foobar.jpg/360px-Foobar.jpg 2x"/></span><figcaption>a<br/>b</figcaption></figure>
!! end