2014-03-26 23:59:04 +00:00
|
|
|
/*
|
|
|
|
* This file is part of the MediaWiki extension MediaViewer.
|
|
|
|
*
|
|
|
|
* MediaViewer is free software: you can redistribute it and/or modify
|
|
|
|
* it under the terms of the GNU General Public License as published by
|
|
|
|
* the Free Software Foundation, either version 2 of the License, or
|
|
|
|
* (at your option) any later version.
|
|
|
|
*
|
|
|
|
* MediaViewer is distributed in the hope that it will be useful,
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
* GNU General Public License for more details.
|
|
|
|
*
|
|
|
|
* You should have received a copy of the GNU General Public License
|
|
|
|
* along with MediaViewer. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
*/
|
|
|
|
|
2024-05-21 20:12:08 +00:00
|
|
|
/**
 * Shared cache between HtmlUtils instances to store the results of expensive text operations.
 *
 * Each store maps an input HTML string to the transformed output string. The stores are
 * created without a prototype so that inputs which happen to match a built-in property
 * name (for example `constructor` or `toString`) cannot be mistaken for cached entries.
 *
 * @member HtmlUtils
 * @private
 * @static
 * @type {{text: Object.<string, string>, textWithLinks: Object.<string, string>, textWithTags: Object.<string, string>}}
 */
const cache = {
	text: Object.create( null ),
	textWithLinks: Object.create( null ),
	textWithTags: Object.create( null )
};
|
2014-03-26 23:59:04 +00:00
|
|
|
|
2024-05-21 20:12:08 +00:00
|
|
|
/**
 * Helper class that does various HTML-to-text transformations.
 *
 * All methods are static. The `htmlToText*` methods memoize their results for string input
 * in the module-level `cache` object.
 */
class HtmlUtils {

	/**
	 * Returns a jQuery node which contains the given HTML (wrapped into a `<div>` - this is
	 * necessary since an arbitrary HTML string might not have a jQuery representation).
	 *
	 * jQuery objects and HTML elements are cloned before being wrapped, so the input is
	 * left where it is.
	 *
	 * @param {string|HTMLElement|jQuery} html
	 * @return {jQuery}
	 * @throws {Error} If `html` is neither a string, an HTMLElement nor a jQuery object
	 */
	static wrapAndJquerify( html ) {
		if ( HtmlUtils.isJQueryOrHTMLElement( html ) ) {
			return $( '<div>' ).append( $( html ).clone() );
		}

		if ( typeof html === 'string' ) {
			return $( '<div>' ).html( html );
		}

		mw.log.warn( 'wrapAndJquerify: unknown type', html );
		throw new Error( 'wrapAndJquerify: unknown type' );
	}

	/**
	 * Returns true if the object is a jQuery object or an HTMLElement, false otherwise.
	 *
	 * @param {string|HTMLElement|jQuery} html
	 * @return {boolean}
	 */
	static isJQueryOrHTMLElement( html ) {
		if ( html instanceof $ ) {
			return true;
		}

		// Only test against HTMLElement when the environment actually provides it.
		return Boolean( window.HTMLElement ) && html instanceof HTMLElement;
	}

	/**
	 * Filters display:none and <style></style> children of a node.
	 * The root element is never filtered, and generally ignored (i.e. whether the root element is
	 * visible won't affect the filtering).
	 * Works in place.
	 *
	 * @param {jQuery} $jq
	 */
	static filterInvisible( $jq ) {
		// We are not using :visible because
		// 1) it would require appending $jq to the document which makes things complicated;
		// 2) the main difference is that it looks for CSS rules hiding the element;
		//    since this function is intended to be used on html originating from a different
		//    document, possibly a different site, that would probably have unexpected results.
		$jq
			.find( '[style]' )
			.filter( ( i, el ) => el.style.display === 'none' )
			.remove();

		// TemplateStyles can generate inline style tags
		$jq
			.find( 'style' )
			.remove();
	}

	/**
	 * Discards all nodes which do not match the whitelist,
	 * but keeps the text and whitelisted nodes inside them.
	 * Works in-place.
	 *
	 * @param {jQuery} $el
	 * @param {string} whitelist a jQuery selector string such as 'a, span, br'
	 */
	static whitelistHtml( $el, whitelist ) {
		let $prev;
		let $child = $el.children().first();

		while ( $child && $child.length ) {
			const child = $child.get( 0 );

			// Defensive guard: stop as soon as something other than an element shows up.
			if ( child.nodeType !== child.ELEMENT_NODE ) {
				return;
			}

			// Clean up the descendants first, so that unwrapping this child (below) only
			// ever lifts already-processed content into $el.
			HtmlUtils.whitelistHtml( $child, whitelist );

			if ( !$child.is( whitelist ) ) {
				// Remember the preceding element sibling before the child disappears,
				// then replace the child with its own contents.
				$prev = $child.prev();
				$child.replaceWith( $child.contents() );
			} else {
				$prev = $child;
			}

			if ( $prev && $prev.length === 1 ) {
				// Continue with whatever element now follows the last kept position.
				$child = $prev.next();
			} else {
				// There was no preceding element to anchor on; start over from the
				// first element child of $el.
				$child = $el.children().first();
			}
		}
	}

	/**
	 * Adds a whitespace to block elements. This is useful if you want to convert the contents
	 * to text and don't want words that are visually separate (e.g. table cells) to be fused.
	 * Works in-place.
	 *
	 * @param {jQuery} $el
	 */
	static appendWhitespaceToBlockElements( $el ) {
		// the list of what elements to add whitespace to is somewhat ad-hoc (not all of these
		// are technically block-level elements, and a lot of block-level elements are missing)
		// but will hopefully cover the common cases where text is fused together.
		$el
			.find( 'blockquote, dd, dl, dt, li, td' )
			.before( ' ' )
			.after( ' ' );
		$el
			.find( 'br, tr, p' )
			.before( '\n' )
			.after( '\n' );
	}

	/**
	 * Returns the HTML code for a jQuery object.
	 * Unlike .html(), this includes HTML code for the outermost element; compare
	 * - `$('<div>').html() // ''`
	 * - `HtmlUtils.jqueryToHtml( $('<div>') ) // '<div></div>'`
	 *
	 * NOTE(review): earlier documentation said only the first element of a set is serialized,
	 * but the whole object is appended to the wrapper - confirm which is intended.
	 *
	 * @param {jQuery} $el
	 * @return {string}
	 */
	static jqueryToHtml( $el ) {
		// There are two possible implementations for this:
		// 1) load into a wrapper element and get its innerHTML;
		// 2) use outerHTML.
		// We go with 1) because it handles the case when a jQuery object contains something
		// that is not an element (this can happen with e.g. $x.contents() which returns text
		// nodes as well).
		return $( '<div>' ).append( $el ).html();
	}

	/**
	 * Cleans up superfluous whitespace.
	 * Given that the results will be displayed in a HTML environment, this doesn't have any real
	 * effect. It is mostly there to make testing easier.
	 *
	 * @protected
	 * @param {string} html a HTML (or plaintext) string
	 * @return {string}
	 */
	static mergeWhitespace( html ) {
		return html
			// strip leading and trailing whitespace
			.replace( /^\s+|\s+$/g, '' )
			// collapse any whitespace run that contains a newline into a single newline
			.replace( /\s*\n\s*/g, '\n' )
			// collapse runs of spaces into one space
			.replace( / {2,}/g, ' ' );
	}

	/**
	 * Returns the memoized result for `html` from `store`, computing and storing it on a miss.
	 *
	 * Only string input is cached: any other input would be coerced to a property name
	 * (typically `[object Object]`), making unrelated inputs share one cache entry.
	 * Presence is tested with an own-property check rather than truthiness, so that
	 * empty results are cached too and inherited properties of a plain object store
	 * (e.g. `constructor`) are never mistaken for entries.
	 *
	 * @private
	 * @param {Object.<string, string>} store One of the stores of the shared cache
	 * @param {string|HTMLElement|jQuery} html Input used as the cache key
	 * @param {function(): string} compute Produces the result when it is not cached
	 * @return {string}
	 */
	static getCachedOrCompute( store, html, compute ) {
		if ( typeof html !== 'string' ) {
			return compute();
		}

		if ( Object.prototype.hasOwnProperty.call( store, html ) ) {
			return store[ html ];
		}

		// Return the local value rather than re-reading the store: some keys
		// (e.g. `__proto__` on a plain object) cannot be stored as ordinary entries.
		const result = compute();
		store[ html ] = result;
		return result;
	}

	/**
	 * Returns the text content of a html string.
	 * Tries to give an approximation of what would be visible if the HTML would be displayed.
	 *
	 * @param {string} html
	 * @return {string}
	 */
	static htmlToText( html ) {
		return HtmlUtils.getCachedOrCompute( cache.text, html, () => {
			const $html = HtmlUtils.wrapAndJquerify( html );
			HtmlUtils.filterInvisible( $html );
			HtmlUtils.appendWhitespaceToBlockElements( $html );
			return HtmlUtils.mergeWhitespace( $html.text() );
		} );
	}

	/**
	 * Returns the text content of a html string, with the `<a>`, `<span>`, `<i>`, `<b>`,
	 * `<sup>`, `<sub>` and `<s>` tags left intact.
	 * Tries to give an approximation of what would be visible if the HTML would be displayed.
	 *
	 * @param {string} html
	 * @return {string}
	 */
	static htmlToTextWithTags( html ) {
		return HtmlUtils.getCachedOrCompute( cache.textWithTags, html, () => {
			const $html = HtmlUtils.wrapAndJquerify( html );
			HtmlUtils.filterInvisible( $html );
			HtmlUtils.appendWhitespaceToBlockElements( $html );
			HtmlUtils.whitelistHtml( $html, 'a, span, i, b, sup, sub, s' );
			return HtmlUtils.mergeWhitespace( $html.html() );
		} );
	}

	/**
	 * Returns the text content of a html string, with the `<a>` and `<span>` tags left intact.
	 * Tries to give an approximation of what would be visible if the HTML would be displayed.
	 *
	 * @param {string} html
	 * @return {string}
	 */
	static htmlToTextWithLinks( html ) {
		return HtmlUtils.getCachedOrCompute( cache.textWithLinks, html, () => {
			const $html = HtmlUtils.wrapAndJquerify( html );
			HtmlUtils.filterInvisible( $html );
			HtmlUtils.appendWhitespaceToBlockElements( $html );
			HtmlUtils.whitelistHtml( $html, 'a, span' );
			return HtmlUtils.mergeWhitespace( $html.html() );
		} );
	}

	/**
	 * Generates HTML code for a link.
	 *
	 * The passed `props` object is not modified; sanitized values are collected in a copy.
	 *
	 * @param {string} text Link text (plain text; will be sanitized)
	 * @param {Object} props Link attributes (should at a minimum include href; will be sanitized)
	 * @return {string}
	 */
	static makeLinkText( text, props ) {
		const sanitizedProps = {};

		// Own enumerable keys only; each value is reduced to its text content.
		for ( const key of Object.keys( props || {} ) ) {
			sanitizedProps[ key ] = HtmlUtils.htmlToText( props[ key ] );
		}

		return HtmlUtils.jqueryToHtml( $( '<a>' ).prop( sanitizedProps ).text( text ) );
	}
}
|
2016-06-19 12:11:30 +00:00
|
|
|
|
2024-05-21 20:12:08 +00:00
|
|
|
// Expose the HtmlUtils class as this module's export.
module.exports = HtmlUtils;
|