mediawiki-extensions-Visual.../modules/ve-mw/preinit/ve.utils.parsoid.js
Bartosz Dziewoński 461c76981f Rewrite mw.libs.ve.getTargetDataFromHref with URL API
At first I was going for a more minimal replacement of mw.Uri with URL,
until I discovered that this code depends on a mw.Uri bug that would be
difficult to replicate:

  // Expected: Relative URLs are accepted
  new mw.Uri( '/foo' ).toString() // => 'https://localhost/foo'
  // Expected: Protocol is optional
  new mw.Uri( 'example.com/foo' ).toString() // => 'https://example.com/foo'
  // Unexpected: Treated as empty domain with no protocol rather than relative URL
  new mw.Uri( './foo' ).toString() // => 'https://./foo'

So I went for a bigger rewrite to preserve the intent rather than the
exact logic.

I had to change some test cases to use more realistic fake data. They
previously relied on bugs in our URL handling to pass despite the base
URLs being incorrect, particularly for non-short URLs (see T270219).
In my testing non-short URLs behave the same as before in practice.

Depends-On: I07a8c097dba0f5572c0aedf4febdf1434063ea6f
Bug: T325249
Change-Id: I232361266c1dda795b88018c3aaa3d9ecbe42b93
2023-01-23 14:59:23 +00:00

368 lines
13 KiB
JavaScript

/*!
* Parsoid utilities.
*
* @copyright 2011-2020 VisualEditor Team and others; see http://ve.mit-license.org
*/
mw.libs.ve = mw.libs.ve || {};
/**
* Decode a URI component into a mediawiki article title
*
* N.B. Illegal article titles can result from fairly reasonable input (e.g. "100%25beef");
* see https://phabricator.wikimedia.org/T137847 .
*
* @param {string} s String to decode
* @param {boolean} [preserveUnderscores=false] Don't convert underscores to spaces
* @return {string} Decoded string, or original string if decodeURIComponent failed
*/
mw.libs.ve.decodeURIComponentIntoArticleTitle = function ( s, preserveUnderscores ) {
try {
s = decodeURIComponent( s );
} catch ( e ) {
return s;
}
if ( preserveUnderscores ) {
return s;
}
return s.replace( /_/g, ' ' );
};
/**
* Unwrap Parsoid sections
*
* @param {HTMLElement} element Parent element, e.g. document body
* @param {string} [keepSection] Section to keep
*/
mw.libs.ve.unwrapParsoidSections = function ( element, keepSection ) {
Array.prototype.forEach.call( element.querySelectorAll( 'section[data-mw-section-id]' ), function ( section ) {
var parent = section.parentNode,
sectionId = section.getAttribute( 'data-mw-section-id' );
// Copy section ID to first child (should be a heading)
// Pseudo-sections (with negative section IDs) may not have a heading
if ( sectionId !== null && +sectionId > 0 ) {
section.firstChild.setAttribute( 'data-mw-section-id', sectionId );
}
if ( keepSection !== undefined && sectionId === keepSection ) {
return;
}
while ( section.firstChild ) {
parent.insertBefore( section.firstChild, section );
}
parent.removeChild( section );
} );
};
/**
* Strip legacy (non-HTML5) IDs; typically found as section IDs inside
* headings.
*
* @param {HTMLElement} element Parent element, e.g. document body
*/
mw.libs.ve.stripParsoidFallbackIds = function ( element ) {
Array.prototype.forEach.call( element.querySelectorAll( 'span[typeof="mw:FallbackId"][id]:empty' ), function ( legacySpan ) {
legacySpan.parentNode.removeChild( legacySpan );
} );
};
mw.libs.ve.restbaseIdRegExp = /^mw[a-zA-Z0-9\-_]{2,6}$/;
mw.libs.ve.stripRestbaseIds = function ( doc ) {
var restbaseIdRegExp = mw.libs.ve.restbaseIdRegExp;
Array.prototype.forEach.call( doc.querySelectorAll( '[id^="mw"]' ), function ( element ) {
if ( restbaseIdRegExp.test( element.id ) ) {
element.removeAttribute( 'id' );
}
} );
};
/**
* Re-duplicate deduplicated TemplateStyles, for correct rendering when editing a section or
* when templates are removed during the edit.
*
* @param {HTMLElement} element Parent element, e.g. document body
*/
mw.libs.ve.reduplicateStyles = function ( element ) {
Array.prototype.forEach.call( element.querySelectorAll( 'link[rel~="mw-deduplicated-inline-style"]' ), function ( link ) {
var href = link.getAttribute( 'href' );
if ( !href || href.slice( 0, 'mw-data:'.length ) !== 'mw-data:' ) {
return;
}
var key = href.slice( 'mw-data:'.length );
var style = element.querySelector( 'style[data-mw-deduplicate="' + key + '"]' );
if ( !style ) {
return;
}
var newStyle = link.ownerDocument.createElement( 'style' );
newStyle.setAttribute( 'data-mw-deduplicate', key );
// Copy content from the old `style` node (for rendering)
for ( var i = 0; i < style.childNodes.length; i++ ) {
newStyle.appendChild( style.childNodes[ i ].cloneNode( true ) );
}
// Copy attributes from the old `link` node (for selser)
Array.prototype.forEach.call( link.attributes, function ( attr ) {
if ( attr.name !== 'rel' && attr.name !== 'href' ) {
newStyle.setAttribute( attr.name, attr.value );
}
} );
link.parentNode.replaceChild( newStyle, link );
} );
Array.prototype.forEach.call( element.querySelectorAll( 'style[data-mw-deduplicate]:empty' ), function ( style ) {
var key = style.getAttribute( 'data-mw-deduplicate' );
var firstStyle = element.querySelector( 'style[data-mw-deduplicate="' + key + '"]' );
if ( !firstStyle || firstStyle === style ) {
return;
}
// Copy content from the first matching `style` node (for rendering)
for ( var i = 0; i < firstStyle.childNodes.length; i++ ) {
style.appendChild( firstStyle.childNodes[ i ].cloneNode( true ) );
}
} );
};
/**
* De-duplicate TemplateStyles, like Parsoid does.
*
* @param {HTMLElement} element Parent element, e.g. document body
*/
mw.libs.ve.deduplicateStyles = function ( element ) {
/**
* Check whether `node` is in a fosterable position. (Nodes in these positions may be moved
* elsewhere in the DOM by the HTML5 parsing algorithm, if they don't have the right tag name.)
* https://html.spec.whatwg.org/#appropriate-place-for-inserting-a-node
*
* @private
* @param {Node|null} node
* @return {boolean}
*/
function isFosterablePosition( node ) {
var fosterablePositions = [ 'table', 'thead', 'tbody', 'tfoot', 'tr' ];
return node && fosterablePositions.indexOf( node.parentNode.nodeName.toLowerCase() ) !== -1;
}
var styleTagKeys = {};
Array.prototype.forEach.call( element.querySelectorAll( 'style[data-mw-deduplicate]' ), function ( style ) {
var key = style.getAttribute( 'data-mw-deduplicate' );
if ( !styleTagKeys[ key ] ) {
// Not a dupe
styleTagKeys[ key ] = true;
return;
}
if ( !isFosterablePosition( style ) ) {
// Dupe - replace with a placeholder <link> reference
var link = style.ownerDocument.createElement( 'link' );
link.setAttribute( 'rel', 'mw-deduplicated-inline-style' );
link.setAttribute( 'href', 'mw-data:' + key );
// Copy attributes from the old `link` node (for selser)
Array.prototype.forEach.call( style.attributes, function ( attr ) {
if ( attr.name !== 'rel' && attr.name !== 'data-mw-deduplicate' ) {
link.setAttribute( attr.name, attr.value );
}
} );
style.parentNode.replaceChild( link, style );
} else {
// Duplicate style tag found in fosterable position.
// Not deduping it (to avoid corruption when the resulting HTML is parsed: T299767),
// but emptying out the style tag for consistency with Parsoid.
// Parsoid says it does this for performance reasons.
style.innerHTML = '';
}
} );
};
/**
* Fix fragment links which should be relative to the current document
*
* This prevents these links from trying to navigate to another page,
* or open in a new window.
*
* Call this after ve.targetLinksToNewWindow, as it removes the target attribute.
* Call this after LinkCache.styleParsoidElements, as it breaks that method by including the query string.
*
* @param {HTMLElement} container Parent element, e.g. document body
* @param {mw.Title} docTitle Current title, only links to this title will be normalized
* @param {string} [prefix] Prefix to add to fragment and target ID to avoid collisions
*/
mw.libs.ve.fixFragmentLinks = function ( container, docTitle, prefix ) {
var docTitleText = docTitle.getPrefixedText();
prefix = prefix || '';
Array.prototype.forEach.call( container.querySelectorAll( 'a[href*="#"]' ), function ( el ) {
var fragment = null;
if ( el.getAttribute( 'href' )[ 0 ] === '#' ) {
// Legacy parser
fragment = el.getAttribute( 'href' ).slice( 1 );
} else {
// Parsoid HTML
var targetData = mw.libs.ve.getTargetDataFromHref( el.href, el.ownerDocument );
if ( targetData.isInternal ) {
var title = mw.Title.newFromText( targetData.title );
if ( title && title.getPrefixedText() === docTitleText ) {
fragment = new URL( el.href ).hash.slice( 1 );
}
}
}
if ( fragment !== null ) {
if ( !fragment ) {
// Special case for empty fragment, even if prefix set
el.setAttribute( 'href', '#' );
} else {
if ( prefix ) {
var target = container.querySelector( '#' + $.escapeSelector( fragment ) );
// There may be multiple links to a specific target, so check the target
// hasn't already been fixed (in which case it would be null)
if ( target ) {
target.setAttribute( 'id', prefix + fragment );
target.setAttribute( 'data-mw-id-fixed', '' );
}
}
el.setAttribute( 'href', '#' + prefix + fragment );
}
el.removeAttribute( 'target' );
}
} );
// Remove any section heading anchors which weren't fixed above (T218492)
Array.prototype.forEach.call( container.querySelectorAll( 'h1, h2, h3, h4, h5, h6' ), function ( el ) {
if ( el.hasAttribute( 'id' ) && !el.hasAttribute( 'data-mw-id-fixed' ) ) {
el.removeAttribute( 'id' );
}
} );
};
/**
* Parse URL to get title it points to.
*
* @param {string} href
* @param {HTMLDocument} doc Document whose base URL to use
* @return {Object} Information about the given href
* @return {string} [return.title]
* The title of the internal link (if the href is internal)
* @return {boolean} return.isInternal
* True if the href pointed to the local wiki, false if href is external
*/
mw.libs.ve.getTargetDataFromHref = function ( href, doc ) {
function regexEscape( str ) {
return str.replace( /([.?*+^$[\]\\(){}|-])/g, '\\$1' );
}
function returnExternalData() {
return { isInternal: false };
}
function returnInternalData( titleish ) {
// This value doesn't necessarily come from Parsoid (and it might not have the "./" prefix), but
// this method will work fine.
var data = mw.libs.ve.parseParsoidResourceName( titleish );
data.isInternal = true;
return data;
}
var url = new URL( href, doc.baseURI );
// Equivalent to `ve.init.platform.getExternalLinkUrlProtocolsRegExp()`, which can not be called here
var externalLinkUrlProtocolsRegExp = new RegExp( '^(' + mw.config.get( 'wgUrlProtocols' ) + ')', 'i' );
// We don't want external links that don't start with a registered external URL protocol
// (to avoid generating 'javascript:' URLs), so treat it as internal
if ( !externalLinkUrlProtocolsRegExp.test( url.toString() ) ) {
return returnInternalData( url.toString() );
}
// Strip red link query parameters
if ( url.searchParams.get( 'action' ) === 'edit' && url.searchParams.get( 'redlink' ) === '1' ) {
url.searchParams.delete( 'action' );
url.searchParams.delete( 'redlink' );
}
// Count remaining query parameters
var keys = [];
url.searchParams.forEach( function ( val, key ) {
keys.push( key );
} );
var queryLength = keys.length;
var relativeHref = url.toString().replace( /^https?:/i, '' );
// Check if this matches the server's script path (as used by red links)
var scriptBase = new URL( mw.config.get( 'wgScript' ), doc.baseURI ).toString().replace( /^https?:/i, '' );
if ( relativeHref.indexOf( scriptBase ) === 0 ) {
if ( queryLength === 1 && url.searchParams.get( 'title' ) ) {
return returnInternalData( url.searchParams.get( 'title' ) + url.hash );
}
}
// Check if this matches the server's article path
var articleBase = new URL( mw.config.get( 'wgArticlePath' ), doc.baseURI ).toString().replace( /^https?:/i, '' );
var articleBaseRegex = new RegExp( regexEscape( articleBase ).replace( regexEscape( '$1' ), '(.*)' ) );
var matches = relativeHref.match( articleBaseRegex );
if ( matches ) {
if ( queryLength === 0 && matches && matches[ 1 ].split( '#' )[ 0 ].indexOf( '?' ) === -1 ) {
// Take the relative path
return returnInternalData( matches[ 1 ] );
}
}
// Doesn't match any of the known URL patterns, or has extra parameters
return returnExternalData();
};
/**
* Encode a page title into a Parsoid resource name.
*
* @param {string} title
* @return {string}
*/
mw.libs.ve.encodeParsoidResourceName = function ( title ) {
// Parsoid: Sanitizer::sanitizeTitleURI, Env::makeLink
var idx = title.indexOf( '#' );
var anchor = null;
if ( idx !== -1 ) {
anchor = title.slice( idx + 1 );
title = title.slice( 0, idx );
}
var encodedTitle = title.replace( /[%? [\]#|<>]/g, function ( match ) {
return mw.util.wikiUrlencode( match );
} );
if ( anchor !== null ) {
encodedTitle += '#' + mw.util.escapeIdForLink( anchor );
}
return './' + encodedTitle;
};
/**
* Split Parsoid resource name into the href prefix and the page title.
*
* @param {string} resourceName Resource name, from a `href` or `resource` attribute
* @return {Object} Object with the following properties:
* @return {string} return.title Full page title in text form (with namespace, and spaces instead of underscores)
*/
mw.libs.ve.parseParsoidResourceName = function ( resourceName ) {
// Resource names are always prefixed with './' to prevent the MediaWiki namespace from being
// interpreted as a URL protocol, consider e.g. 'href="./File:Foo.png"'.
// (We accept input without the prefix, so this can also take plain page titles.)
var matches = resourceName.match( /^(\.\/|)(.*)$/ );
return {
// '%' and '?' are valid in page titles, but normally URI-encoded. This also changes underscores
// to spaces.
title: mw.libs.ve.decodeURIComponentIntoArticleTitle( matches[ 2 ] )
};
};
/**
* Extract the page title from a Parsoid resource name.
*
* @param {string} resourceName Resource name, from a `href` or `resource` attribute
* @return {string} Full page title in text form (with namespace, and spaces instead of underscores)
*/
mw.libs.ve.normalizeParsoidResourceName = function ( resourceName ) {
return mw.libs.ve.parseParsoidResourceName( resourceName ).title;
};