mediawiki-extensions-Visual.../modules/ve-mw/preinit/ve.utils.parsoid.js
Bartosz Dziewoński 25019d7f55 Fix encoding to roundtrip links without 'rawTitle'/'origTitle'
Our encoding for the hrefs like "./Foo" that we send to Parsoid
differed slightly from how Parsoid outputs them, so to avoid dirty
diffs, we had to store the original ones we received from Parsoid
and send them back if they were unchanged.

Change the encoding to match Parsoid's exactly (by referring to the
Parsoid source code), and then remove 'rawTitle'/'origTitle'.

On a historical note, 'rawTitle'/'origTitle' were originally added to
fix other issues with links, which I hope are long behind us:
* bb45d984ca (T145978)
* fda2e6c1b5 (T44140)

Follow-up to 362df66b47, which removed
some other old stuff from the handling of Parsoid links.

Bug: T325766
Change-Id: I0ad0a655380eb2fb29b5ac01e2e399ac550ce34a
2023-01-07 20:00:42 +00:00

405 lines
14 KiB
JavaScript

/*!
* Parsoid utilities.
*
* @copyright 2011-2020 VisualEditor Team and others; see http://ve.mit-license.org
*/
// Make sure the mw.libs.ve namespace object exists, reusing it when it has already been created.
mw.libs.ve = mw.libs.ve || {};
/**
 * Resolve a URL against the base URL of a given document.
 *
 * Copied from ve.resolveUrl
 *
 * @param {string} url URL to resolve
 * @param {HTMLDocument} base Document whose base URL to use
 * @return {string} Resolved URL, or the unmodified input if it could not be resolved
 */
mw.libs.ve.resolveUrl = function ( url, base ) {
	var anchor = base.createElement( 'a' );
	anchor.setAttribute( 'href', url );

	var resolved = anchor.href;
	if ( resolved ) {
		return resolved;
	}
	// When the document has no base URI the anchor reports an empty href;
	// handing back the caller's URL untouched is the more useful answer.
	return url;
};
/**
 * Turn a URI component into a MediaWiki article title.
 *
 * N.B. Illegal article titles can result from fairly reasonable input (e.g. "100%25beef");
 * see https://phabricator.wikimedia.org/T137847 .
 *
 * @param {string} s String to decode
 * @param {boolean} [preserveUnderscores=false] Don't convert underscores to spaces
 * @return {string} Decoded string, or original string if decodeURIComponent failed
 */
mw.libs.ve.decodeURIComponentIntoArticleTitle = function ( s, preserveUnderscores ) {
	var decoded;
	try {
		decoded = decodeURIComponent( s );
	} catch ( e ) {
		// Malformed percent-encoding: give the input back exactly as received
		// (underscores are left alone in this case as well).
		return s;
	}
	return preserveUnderscores ? decoded : decoded.replace( /_/g, ' ' );
};
/**
 * Unwrap Parsoid sections
 *
 * Every `<section data-mw-section-id>` wrapper inside `element` is replaced by its own
 * children. For sections with a positive ID, the ID is first copied onto the section's
 * first child so that it survives the unwrapping.
 *
 * @param {HTMLElement} element Parent element, e.g. document body
 * @param {string} [keepSection] Section to keep; compared strictly against the value of the
 *  `data-mw-section-id` attribute, and the matching wrapper is left in place
 */
mw.libs.ve.unwrapParsoidSections = function ( element, keepSection ) {
	Array.prototype.forEach.call( element.querySelectorAll( 'section[data-mw-section-id]' ), function ( section ) {
		var parent = section.parentNode,
			sectionId = section.getAttribute( 'data-mw-section-id' ),
			firstChild = section.firstChild;
		// Copy section ID to first child (should be a heading)
		// Pseudo-sections (with negative section IDs) may not have a heading
		if (
			sectionId !== null && +sectionId > 0 &&
			// An empty section, or one starting with a text/comment node, has nothing to
			// carry the attribute; skip the copy rather than throwing and aborting the
			// unwrapping of every remaining section. (nodeType 1 is ELEMENT_NODE.)
			firstChild && firstChild.nodeType === 1
		) {
			firstChild.setAttribute( 'data-mw-section-id', sectionId );
		}
		if ( keepSection !== undefined && sectionId === keepSection ) {
			return;
		}
		// Hoist the children in front of the wrapper, then drop the emptied wrapper
		while ( section.firstChild ) {
			parent.insertBefore( section.firstChild, section );
		}
		parent.removeChild( section );
	} );
};
/**
 * Remove the empty spans that carry legacy (non-HTML5) IDs; these are typically
 * found as section IDs inside headings.
 *
 * @param {HTMLElement} element Parent element, e.g. document body
 */
mw.libs.ve.stripParsoidFallbackIds = function ( element ) {
	// querySelectorAll returns a static list, so removing nodes while walking it is safe
	var fallbackSpans = element.querySelectorAll( 'span[typeof="mw:FallbackId"][id]:empty' );
	for ( var i = 0; i < fallbackSpans.length; i++ ) {
		var span = fallbackSpans[ i ];
		span.parentNode.removeChild( span );
	}
};
/**
 * Pattern for element IDs that consist of "mw" followed by two to six characters
 * from the set of letters, digits, hyphen and underscore.
 *
 * NOTE(review): going by the name, these are the element IDs auto-assigned in
 * RESTBase/Parsoid output - confirm.
 */
mw.libs.ve.restbaseIdRegExp = /^mw[a-zA-Z0-9\-_]{2,6}$/;

/**
 * Remove the `id` attribute from every element whose ID matches
 * mw.libs.ve.restbaseIdRegExp.
 *
 * @param {HTMLDocument} doc Document (or other node supporting querySelectorAll) to clean up
 */
mw.libs.ve.stripRestbaseIds = function ( doc ) {
	var idPattern = mw.libs.ve.restbaseIdRegExp,
		// Cheap pre-filter; the regular expression makes the final decision
		candidates = doc.querySelectorAll( '[id^="mw"]' );
	for ( var i = 0; i < candidates.length; i++ ) {
		var candidate = candidates[ i ];
		if ( idPattern.test( candidate.id ) ) {
			candidate.removeAttribute( 'id' );
		}
	}
};
/**
 * Re-duplicate deduplicated TemplateStyles, for correct rendering when editing a section or
 * when templates are removed during the edit.
 *
 * Two passes are made over `element`:
 * 1. Every `<link rel="mw-deduplicated-inline-style" href="mw-data:KEY">` placeholder is replaced
 *    by a `<style data-mw-deduplicate="KEY">` holding a copy of the first matching style's content.
 *    Placeholders whose style cannot be found inside `element` are left untouched.
 * 2. Every empty `<style data-mw-deduplicate="KEY">` is filled with a copy of the content of
 *    the first style with the same key.
 *
 * @param {HTMLElement} element Parent element, e.g. document body
 */
mw.libs.ve.reduplicateStyles = function ( element ) {
	var linkPrefix = 'mw-data:';

	/**
	 * Find the first style tag in `element` carrying the given deduplication key.
	 *
	 * @private
	 * @param {string} key
	 * @return {HTMLElement|null}
	 */
	function findFirstStyle( key ) {
		// The key comes from document content, so escape it before embedding it in the
		// quoted attribute selector; an unescaped quote or backslash would make
		// querySelector throw a SyntaxError.
		return element.querySelector( 'style[data-mw-deduplicate="' + $.escapeSelector( key ) + '"]' );
	}

	/**
	 * Append deep clones of all child nodes of `from` to `to`.
	 *
	 * @private
	 * @param {Node} from
	 * @param {Node} to
	 */
	function copyChildNodes( from, to ) {
		for ( var i = 0; i < from.childNodes.length; i++ ) {
			to.appendChild( from.childNodes[ i ].cloneNode( true ) );
		}
	}

	Array.prototype.forEach.call( element.querySelectorAll( 'link[rel~="mw-deduplicated-inline-style"]' ), function ( link ) {
		var href = link.getAttribute( 'href' );
		if ( !href || href.slice( 0, linkPrefix.length ) !== linkPrefix ) {
			return;
		}
		var key = href.slice( linkPrefix.length );
		var style = findFirstStyle( key );
		if ( !style ) {
			return;
		}

		var newStyle = link.ownerDocument.createElement( 'style' );
		newStyle.setAttribute( 'data-mw-deduplicate', key );

		// Copy content from the old `style` node (for rendering)
		copyChildNodes( style, newStyle );

		// Copy attributes from the old `link` node (for selser)
		Array.prototype.forEach.call( link.attributes, function ( attr ) {
			if ( attr.name !== 'rel' && attr.name !== 'href' ) {
				newStyle.setAttribute( attr.name, attr.value );
			}
		} );

		link.parentNode.replaceChild( newStyle, link );
	} );

	Array.prototype.forEach.call( element.querySelectorAll( 'style[data-mw-deduplicate]:empty' ), function ( style ) {
		var key = style.getAttribute( 'data-mw-deduplicate' );
		var firstStyle = findFirstStyle( key );
		if ( !firstStyle || firstStyle === style ) {
			return;
		}

		// Copy content from the first matching `style` node (for rendering)
		copyChildNodes( firstStyle, style );
	} );
};
/**
 * De-duplicate TemplateStyles, like Parsoid does.
 *
 * The first `<style data-mw-deduplicate="KEY">` for each key is kept. Every later one is
 * replaced by a `<link rel="mw-deduplicated-inline-style" href="mw-data:KEY">` placeholder,
 * except in fosterable positions, where the style tag is kept but emptied.
 *
 * @param {HTMLElement} element Parent element, e.g. document body
 */
mw.libs.ve.deduplicateStyles = function ( element ) {
	// Parents whose non-table children may be foster-parented by the HTML5 parser
	var fosterableParents = [ 'table', 'thead', 'tbody', 'tfoot', 'tr' ];

	/**
	 * Check whether `node` is in a fosterable position. (Nodes in these positions may be moved
	 * elsewhere in the DOM by the HTML5 parsing algorithm, if they don't have the right tag name.)
	 * https://html.spec.whatwg.org/#appropriate-place-for-inserting-a-node
	 *
	 * @private
	 * @param {Node|null} node
	 * @return {boolean}
	 */
	function isFosterablePosition( node ) {
		// Coerce so that a null node yields `false`, as documented, rather than `null`
		return !!node && fosterableParents.indexOf( node.parentNode.nodeName.toLowerCase() ) !== -1;
	}

	// Prototype-less dictionary: keys are taken from document content, and with a plain
	// object a key such as "constructor" would read as already seen.
	var seenKeys = Object.create( null );

	Array.prototype.forEach.call( element.querySelectorAll( 'style[data-mw-deduplicate]' ), function ( style ) {
		var key = style.getAttribute( 'data-mw-deduplicate' );

		if ( !seenKeys[ key ] ) {
			// Not a dupe
			seenKeys[ key ] = true;
			return;
		}

		if ( !isFosterablePosition( style ) ) {
			// Dupe - replace with a placeholder <link> reference
			var link = style.ownerDocument.createElement( 'link' );
			link.setAttribute( 'rel', 'mw-deduplicated-inline-style' );
			link.setAttribute( 'href', 'mw-data:' + key );

			// Copy attributes from the old `style` node (for selser)
			Array.prototype.forEach.call( style.attributes, function ( attr ) {
				if ( attr.name !== 'rel' && attr.name !== 'data-mw-deduplicate' ) {
					link.setAttribute( attr.name, attr.value );
				}
			} );

			style.parentNode.replaceChild( link, style );
		} else {
			// Duplicate style tag found in fosterable position.
			// Not deduping it (to avoid corruption when the resulting HTML is parsed: T299767),
			// but emptying out the style tag for consistency with Parsoid.
			// Parsoid says it does this for performance reasons.
			style.innerHTML = '';
		}
	} );
};
/**
 * Rewrite links that point at a fragment of the current document so that they are
 * plain `#fragment` links.
 *
 * This prevents these links from trying to navigate to another page,
 * or open in a new window.
 *
 * Call this after ve.targetLinksToNewWindow, as it removes the target attribute.
 * Call this after LinkCache.styleParsoidElements, as it breaks that method by including the query string.
 *
 * @param {HTMLElement} container Parent element, e.g. document body
 * @param {mw.Title} docTitle Current title, only links to this title will be normalized
 * @param {string} [prefix] Prefix to add to fragment and target ID to avoid collisions
 */
mw.libs.ve.fixFragmentLinks = function ( container, docTitle, prefix ) {
	var currentPageText = docTitle.getPrefixedText();
	prefix = prefix || '';

	/**
	 * Work out which fragment of the current document a link points at.
	 *
	 * @private
	 * @param {HTMLAnchorElement} link
	 * @return {string|null} Fragment without the leading '#' (possibly empty), or null if
	 *  the link does not point at the current document
	 */
	function getLocalFragment( link ) {
		var rawHref = link.getAttribute( 'href' );
		if ( rawHref[ 0 ] === '#' ) {
			// Legacy parser
			return rawHref.slice( 1 );
		}
		// Parsoid HTML
		var targetData = mw.libs.ve.getTargetDataFromHref( link.href, link.ownerDocument );
		if ( !targetData.isInternal ) {
			return null;
		}
		var linkedTitle = mw.Title.newFromText( targetData.title );
		if ( !linkedTitle || linkedTitle.getPrefixedText() !== currentPageText ) {
			return null;
		}
		return new URL( link.href ).hash.slice( 1 );
	}

	var links = container.querySelectorAll( 'a[href*="#"]' );
	Array.prototype.forEach.call( links, function ( link ) {
		var fragment = getLocalFragment( link );
		if ( fragment === null ) {
			return;
		}

		if ( fragment ) {
			if ( prefix ) {
				var target = container.querySelector( '#' + $.escapeSelector( fragment ) );
				// Several links may share one target; once the target has been renamed
				// the lookup by its old ID comes back empty, so there is nothing to redo.
				if ( target ) {
					target.setAttribute( 'id', prefix + fragment );
					target.setAttribute( 'data-mw-id-fixed', '' );
				}
			}
			link.setAttribute( 'href', '#' + prefix + fragment );
		} else {
			// Special case for empty fragment, even if prefix set
			link.setAttribute( 'href', '#' );
		}
		link.removeAttribute( 'target' );
	} );

	// Remove any section heading anchors which weren't fixed above (T218492)
	var headings = container.querySelectorAll( 'h1, h2, h3, h4, h5, h6' );
	Array.prototype.forEach.call( headings, function ( heading ) {
		if ( !heading.hasAttribute( 'id' ) || heading.hasAttribute( 'data-mw-id-fixed' ) ) {
			return;
		}
		heading.removeAttribute( 'id' );
	} );
};
/**
 * Parse URL to get title it points to.
 *
 * An href that does not start with one of the protocols in the `wgUrlProtocols` config
 * value is treated as internal. An href that does is compared, with any leading
 * "http:"/"https:" removed, against the resolved `wgScript` and `wgArticlePath` values to
 * decide whether it points at a page on the local wiki.
 *
 * @param {string} href
 * @param {HTMLDocument} doc Document whose base URL to use
 * @return {Object} Information about the given href
 * @return {string} [return.title]
 *  The title of the internal link (if the href is internal)
 * @return {boolean} return.isInternal
 *  True if the href pointed to the local wiki, false if href is external
 */
mw.libs.ve.getTargetDataFromHref = function ( href, doc ) {
// Backslash-escape regular expression metacharacters so `str` can be embedded in a RegExp
function regexEscape( str ) {
return str.replace( /([.?*+^$[\]\\(){}|-])/g, '\\$1' );
}
// Tri-state: stays null until one of the checks below has reached a decision
var isInternal = null;
// Protocol relative href
var relativeHref = href.replace( /^https?:/i, '' );
var uri, queryLength;
// Equivalent to `ve.init.platform.getExternalLinkUrlProtocolsRegExp()`, which can not be called here
var externalLinkUrlProtocolsRegExp = new RegExp( '^(' + mw.config.get( 'wgUrlProtocols' ) + ')', 'i' );
// Paths that don't start with a registered external url protocol
if ( !externalLinkUrlProtocolsRegExp.test( href ) ) {
isInternal = true;
if ( href.match( /^\.\// ) ) {
// The specific case of parsoid resource URIs, which are in the form `./Title`.
// If they're redlinks they now include a querystring which should be stripped.
try {
uri = new mw.Uri( href );
} catch ( e ) {
// probably an incorrectly encoded URI, try a very-naïve fallback
href = href.replace( /\?action=edit&redlink=1$/, '' );
}
if ( uri ) {
queryLength = Object.keys( uri.query ).length;
// Only strip the query string when it is exactly the redlink pair
if (
( queryLength === 2 && uri.query.action === 'edit' && uri.query.redlink === '1' )
) {
uri.query = {};
href = '.' + uri.getRelativePath();
}
}
}
} else {
// Check if this matches the server's script path (as used by red links)
var scriptBase = mw.libs.ve.resolveUrl( mw.config.get( 'wgScript' ), doc ).replace( /^https?:/i, '' );
if ( relativeHref.indexOf( scriptBase ) === 0 ) {
try {
uri = new mw.Uri( relativeHref );
} catch ( e ) {
// probably an incorrectly encoded URI
}
if ( uri ) {
queryLength = Object.keys( uri.query ).length;
// Internal when the query is just `title`, or `title` plus the redlink pair
if (
( queryLength === 1 && uri.query.title ) ||
( queryLength === 3 && uri.query.title && uri.query.action === 'edit' && uri.query.redlink === '1' )
) {
href = uri.query.title + ( uri.fragment ? '#' + uri.fragment : '' );
isInternal = true;
} else if ( queryLength > 1 ) {
// Script URL with any other combination of several parameters: external
href = relativeHref;
isInternal = false;
}
// Otherwise isInternal is still null and the article path check below decides
}
}
if ( isInternal === null ) {
// Check if this matches the server's article path
var articleBase = mw.libs.ve.resolveUrl( mw.config.get( 'wgArticlePath' ), doc ).replace( /^https?:/i, '' );
// Turn the escaped "$1" placeholder of the article path into a capturing group
var articleBaseRegex = new RegExp( regexEscape( articleBase ).replace( regexEscape( '$1' ), '(.*)' ) );
var matches = relativeHref.match( articleBaseRegex );
// A '?' before any '#' in the captured part means there is a query string: not a plain article link
if ( matches && matches[ 1 ].split( '#' )[ 0 ].indexOf( '?' ) === -1 ) {
// Take the relative path
href = matches[ 1 ];
isInternal = true;
} else {
isInternal = false;
}
}
}
if ( !isInternal ) {
return { isInternal: false };
}
// This href doesn't necessarily come from Parsoid (and it might not have the "./" prefix), but
// this method will work fine.
var data = mw.libs.ve.parseParsoidResourceName( href );
data.isInternal = true;
return data;
};
/**
 * Build the Parsoid resource name (`./Title#anchor`) for a page title.
 *
 * Everything before the first '#' is treated as the page name, everything after it as
 * the anchor; the two parts are encoded separately.
 *
 * @param {string} title
 * @return {string}
 */
mw.libs.ve.encodeParsoidResourceName = function ( title ) {
	// Parsoid: Sanitizer::sanitizeTitleURI, Env::makeLink
	var hashIndex = title.indexOf( '#' ),
		hasAnchor = hashIndex !== -1,
		pageName = hasAnchor ? title.slice( 0, hashIndex ) : title;

	// Only this fixed set of characters is encoded, one character at a time
	var resourceName = './' + pageName.replace( /[%? [\]#|<>]/g, function ( ch ) {
		return mw.util.wikiUrlencode( ch );
	} );

	if ( hasAnchor ) {
		resourceName += '#' + mw.util.escapeIdForLink( title.slice( hashIndex + 1 ) );
	}
	return resourceName;
};
/**
 * Parse a Parsoid resource name into the page title it refers to.
 *
 * @param {string} resourceName Resource name, from a `href` or `resource` attribute
 * @return {Object} Object with the following properties:
 * @return {string} return.title Full page title in text form (with namespace, and spaces instead of underscores)
 */
mw.libs.ve.parseParsoidResourceName = function ( resourceName ) {
	// Resource names are always prefixed with './' to prevent the MediaWiki namespace from being
	// interpreted as a URL protocol, consider e.g. 'href="./File:Foo.png"'.
	// (We accept input without the prefix, so this can also take plain page titles.)
	//
	// The prefix is stripped with replace() rather than captured with a `(.*)` match: `.` does
	// not match line terminators, so matching would yield null (and a TypeError on access)
	// for any input that contains one.
	var encodedTitle = resourceName.replace( /^\.\//, '' );
	return {
		// '%' and '?' are valid in page titles, but normally URI-encoded. This also changes underscores
		// to spaces.
		title: mw.libs.ve.decodeURIComponentIntoArticleTitle( encodedTitle )
	};
};
/**
 * Get just the page title out of a Parsoid resource name.
 *
 * Convenience wrapper around mw.libs.ve.parseParsoidResourceName.
 *
 * @param {string} resourceName Resource name, from a `href` or `resource` attribute
 * @return {string} Full page title in text form (with namespace, and spaces instead of underscores)
 */
mw.libs.ve.normalizeParsoidResourceName = function ( resourceName ) {
	var parsed = mw.libs.ve.parseParsoidResourceName( resourceName );
	return parsed.title;
};