mirror of
https://gerrit.wikimedia.org/r/mediawiki/extensions/DiscussionTools
synced 2024-11-12 09:58:17 +00:00
33d69e26c9
Notably, JS trims the no-break space, while PHP doesn't. There are some other differences that don't come up in our tests. What we really want is to trim the ASCII whitespace as defined in the HTML spec. https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/trim https://www.php.net/manual/en/function.trim.php https://infra.spec.whatwg.org/#ascii-whitespace Change-Id: I95b8fb38878716a2fa7ec84c9f2e8065ebe77c0d
1060 lines
33 KiB
JavaScript
1060 lines
33 KiB
JavaScript
/* eslint-disable no-console */
'use strict';

var
	// Shared DOM helpers (provides closestElement)
	utils = require( './utils.js' ),
	// Per-wiki data: see DiscussionToolsHooks::getLocalData()
	data = require( './parser/data.json' ),
	// Timezone-aware date handling, used for parsing timestamps
	moment = require( './lib/moment-timezone/moment-timezone-with-data-1970-2030.js' );
/**
 * Utilities for detecting and parsing components of discussion pages: signatures, timestamps,
 * comments and threads.
 *
 * @class mw.dt.parser
 */
/**
 * Get text of localisation messages in content language.
 *
 * The texts are looked up in `data.contLangMessages`; a key that is missing there yields
 * `undefined` at the corresponding position.
 *
 * @private
 * @param {string[]} messages Message keys
 * @return {string[]} Message texts, in the same order as `messages`
 */
function getMessages( messages ) {
	return messages.map( function ( code ) {
		return data.contLangMessages[ code ];
	} );
}
/**
 * Get a regexp that matches timestamps generated using the given date format.
 *
 * This only supports format characters that are used by the default date format in any of
 * MediaWiki's languages, namely: D, d, F, G, H, i, j, l, M, n, Y, xg, xkY (and escape characters),
 * and only dates when MediaWiki existed, let's say 2000 onwards (Thai dates before 1941 are
 * complicated).
 *
 * Each supported format character contributes exactly one capturing group, in the order in which
 * it appears in the format. The last capturing group is the timezone abbreviation.
 *
 * @param {string} format Date format, as used by MediaWiki
 * @param {string} digitsRegexp Regular expression matching a single localised digit, e.g. `[0-9]`
 * @param {Object} tzAbbrs Map of localised timezone abbreviations to IANA abbreviations
 *   for the local timezone, e.g. `{EDT: "EDT", EST: "EST"}`
 * @return {string} Regular expression
 */
function getTimestampRegexp( format, digitsRegexp, tzAbbrs ) {
	var s = '',
		p, num, code, endQuote;

	// Wrap a regexp fragment in a capturing group
	function regexpGroup( regexp ) {
		return '(' + regexp + ')';
	}

	// Capturing group matching any one of the given literal strings
	function regexpAlternateGroup( array ) {
		return regexpGroup( array.map( function ( text ) {
			return mw.util.escapeRegExp( text );
		} ).join( '|' ) );
	}

	// Capturing group matching the content-language text of any of the given messages
	function messagesGroup( keys ) {
		return regexpAlternateGroup( getMessages( keys ) );
	}

	// Adapted from Language::sprintfDate()
	for ( p = 0; p < format.length; p++ ) {
		// Set to a repetition count (e.g. '1,2') when the code stands for a number
		num = false;
		code = format[ p ];
		// 'x' introduces a two-character code, 'xk' a three-character one
		if ( code === 'x' && p < format.length - 1 ) {
			code += format[ ++p ];
		}
		if ( code === 'xk' && p < format.length - 1 ) {
			code += format[ ++p ];
		}

		switch ( code ) {
			case 'xx':
				// Escaped literal 'x'
				s += 'x';
				break;
			case 'xg':
				s += messagesGroup( [
					'january-gen', 'february-gen', 'march-gen', 'april-gen', 'may-gen', 'june-gen',
					'july-gen', 'august-gen', 'september-gen', 'october-gen', 'november-gen',
					'december-gen'
				] );
				break;
			case 'D':
				s += messagesGroup( [
					'sun', 'mon', 'tue', 'wed', 'thu', 'fri', 'sat'
				] );
				break;
			case 'l':
				s += messagesGroup( [
					'sunday', 'monday', 'tuesday', 'wednesday', 'thursday',
					'friday', 'saturday'
				] );
				break;
			case 'F':
				s += messagesGroup( [
					'january', 'february', 'march', 'april', 'may_long', 'june',
					'july', 'august', 'september', 'october', 'november',
					'december'
				] );
				break;
			case 'M':
				s += messagesGroup( [
					'jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug',
					'sep', 'oct', 'nov', 'dec'
				] );
				break;
			case 'd':
			case 'H':
			case 'i':
				// Always two digits
				num = '2';
				break;
			case 'j':
			case 'n':
			case 'G':
				// One or two digits (no leading zero)
				num = '1,2';
				break;
			case 'Y':
			case 'xkY':
				// Four-digit year
				num = '4';
				break;
			case '\\':
				// Backslash escaping: the next character is a literal. A trailing backslash
				// stands for itself.
				s += mw.util.escapeRegExp( p < format.length - 1 ? format[ ++p ] : '\\' );
				break;
			case '"':
				// Quoted literal
				endQuote = p < format.length - 1 ? format.indexOf( '"', p + 1 ) : -1;
				if ( endQuote === -1 ) {
					// Quote at end of string, or no terminating quote: assume literal "
					s += '"';
				} else {
					s += mw.util.escapeRegExp( format.slice( p + 1, endQuote ) );
					p = endQuote;
				}
				break;
			default:
				s += mw.util.escapeRegExp( format[ p ] );
		}
		if ( num !== false ) {
			s += regexpGroup( digitsRegexp + '{' + num + '}' );
		}
	}

	// Hardcoded parentheses and space like in Parser::pstPass2
	// Ignore some invisible Unicode characters that often sneak into copypasted timestamps (T245784)
	return s + '[\\u200E\\u200F]? [\\u200E\\u200F]?\\(' +
		regexpAlternateGroup( Object.keys( tzAbbrs ) ) + '\\)';
}
/**
 * Get a function that parses timestamps generated using the given date format, based on the result
 * of matching the regexp returned by #getTimestampRegexp.
 *
 * @param {string} format Date format, as used by MediaWiki
 * @param {string|null} digits Localised digits from 0 to 9, e.g. `0123456789`, or null if the
 *   timestamps use the digits 0-9
 * @param {string} localTimezone Local timezone IANA name, e.g. `America/New_York`
 * @param {Object} tzAbbrs Map of localised timezone abbreviations to IANA abbreviations
 *   for the local timezone, e.g. `{EDT: "EDT", EST: "EST"}`
 * @return {Function} Parser function
 * @return {Array} return.match Regexp match data
 * @return {Object} return.return Moment object
 */
function getTimestampParser( format, digits, localTimezone, tzAbbrs ) {
	var p, code, endQuote,
		// Compiled once here rather than for every parsed number
		localDigitRegexp = digits ? new RegExp( '[' + digits + ']', 'g' ) : null,
		// Format codes in the order of the capturing groups produced by #getTimestampRegexp
		matchingGroups = [];

	for ( p = 0; p < format.length; p++ ) {
		code = format[ p ];
		if ( code === 'x' && p < format.length - 1 ) {
			code += format[ ++p ];
		}
		if ( code === 'xk' && p < format.length - 1 ) {
			code += format[ ++p ];
		}

		switch ( code ) {
			case 'xg':
			case 'd':
			case 'j':
			case 'D':
			case 'l':
			case 'F':
			case 'M':
			case 'n':
			case 'Y':
			case 'xkY':
			case 'G':
			case 'H':
			case 'i':
				matchingGroups.push( code );
				break;
			case '\\':
				// Backslash escaping: skip the escaped character
				if ( p < format.length - 1 ) {
					++p;
				}
				break;
			case '"':
				// Quoted literal: skip to the terminating quote, if there is one
				if ( p < format.length - 1 ) {
					endQuote = format.indexOf( '"', p + 1 );
					if ( endQuote !== -1 ) {
						p = endQuote;
					}
				}
				break;
			default:
				// Literal characters (including 'xx') produce no capturing group
				break;
		}
	}

	// Replace localised digits with the digits 0-9
	function untransformDigits( text ) {
		if ( !localDigitRegexp ) {
			return text;
		}
		return text.replace( localDigitRegexp, function ( m ) {
			return digits.indexOf( m );
		} );
	}

	// Read a (possibly localised) number
	function toNumber( text ) {
		return Number( untransformDigits( text ) );
	}

	return function timestampParser( match ) {
		var
			year = 0,
			monthIdx = 0,
			day = 0,
			hour = 0,
			minute = 0,
			tzAbbr, i, text, date, previousSetting;

		for ( i = 0; i < matchingGroups.length; i++ ) {
			// Capturing groups are numbered from 1
			text = match[ i + 1 ];

			switch ( matchingGroups[ i ] ) {
				case 'xg':
					monthIdx = getMessages( [
						'january-gen', 'february-gen', 'march-gen', 'april-gen', 'may-gen', 'june-gen',
						'july-gen', 'august-gen', 'september-gen', 'october-gen', 'november-gen',
						'december-gen'
					] ).indexOf( text );
					break;
				case 'd':
				case 'j':
					day = toNumber( text );
					break;
				case 'D':
				case 'l':
					// Day of the week - unused
					break;
				case 'F':
					monthIdx = getMessages( [
						'january', 'february', 'march', 'april', 'may_long', 'june',
						'july', 'august', 'september', 'october', 'november',
						'december'
					] ).indexOf( text );
					break;
				case 'M':
					monthIdx = getMessages( [
						'jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug',
						'sep', 'oct', 'nov', 'dec'
					] ).indexOf( text );
					break;
				case 'n':
					// Months are numbered from 0 in the date array
					monthIdx = toNumber( text ) - 1;
					break;
				case 'Y':
					year = toNumber( text );
					break;
				case 'xkY':
					// Thai year
					year = toNumber( text ) - 543;
					break;
				case 'G':
				case 'H':
					hour = toNumber( text );
					break;
				case 'i':
					minute = toNumber( text );
					break;
				default:
					throw new Error( 'Not implemented' );
			}
		}
		// The last matching group is the timezone abbreviation
		tzAbbr = tzAbbrs[ match[ match.length - 1 ] ];

		// Most of the time, the timezone abbreviation is not necessary to parse the date, since we
		// can assume all times are in the wiki's local timezone.
		date = moment.tz( [ year, monthIdx, day, hour, minute ], localTimezone );

		// But during the "fall back" at the end of DST, some times will happen twice. Per the docs,
		// "Moment Timezone handles this by always using the earlier instance of a duplicated hour."
		// https://momentjs.com/timezone/docs/#/using-timezones/parsing-ambiguous-inputs/

		// Since the timezone abbreviation disambiguates the DST/non-DST times, we can detect when
		// that behavior was incorrect...
		if ( date.zoneAbbr() !== tzAbbr ) {
			// ...and force the correct parsing. I can't find proper documentation for this feature,
			// but this pull request explains it: https://github.com/moment/moment-timezone/pull/101
			// This is a global setting, so put back whatever was there before, even if parsing throws.
			previousSetting = moment.tz.moveAmbiguousForward;
			moment.tz.moveAmbiguousForward = true;
			try {
				date = moment.tz( [ year, monthIdx, day, hour, minute ], localTimezone );
			} finally {
				moment.tz.moveAmbiguousForward = previousSetting;
			}
			if ( date.zoneAbbr() !== tzAbbr ) {
				// This should not be possible for "genuine" timestamps generated by MediaWiki.
				// But bots and humans get it wrong when marking up unsigned comments…
				// https://pl.wikipedia.org/w/index.php?title=Wikipedia:Kawiarenka/Artykuły&diff=prev&oldid=54772606
				console.log( 'Timestamp has timezone abbreviation for the wrong time: ' + match[ 0 ] );
			} else {
				console.log( 'Ambiguous time at DST switchover was parsed: ' + match[ 0 ] );
			}
		}

		return date;
	};
}
/**
 * Get a regexp that matches timestamps in the local date format.
 *
 * This calls #getTimestampRegexp with predefined data for the current wiki: the date format and
 * timezone abbreviations from `data`, and the localised digits from `data` if the
 * `wgTranslateNumerals` config variable is set (otherwise `\d`).
 *
 * @private
 * @return {string} Regular expression
 */
function getLocalTimestampRegexp() {
	var digitsRegexp = mw.config.get( 'wgTranslateNumerals' ) ?
		'[' + data.digits + ']' :
		'\\d';
	return getTimestampRegexp( data.dateFormat, digitsRegexp, data.timezones );
}
/**
 * Get a function that parses timestamps in the local date format, based on the result
 * of matching the regexp returned by #getLocalTimestampRegexp.
 *
 * This calls #getTimestampParser with predefined data for the current wiki: the date format,
 * local timezone and timezone abbreviations from `data`, and the localised digits from `data` if
 * the `wgTranslateNumerals` config variable is set (otherwise null).
 *
 * @private
 * @return {Function} Parser function
 * @return {Array} return.match Regexp match data
 * @return {Object} return.return Moment object
 */
function getLocalTimestampParser() {
	var digits = mw.config.get( 'wgTranslateNumerals' ) ? data.digits : null;
	return getTimestampParser( data.dateFormat, digits, data.localTimezone, data.timezones );
}
/**
 * Callback for document.createTreeWalker that will skip over nodes where we don't want to detect
 * comments (or section headings).
 *
 * @param {Node} node
 * @return {number} Appropriate NodeFilter constant
 */
function acceptOnlyNodesAllowingComments( node ) {
	// The table of contents has a heading that gets erroneously detected as a section
	return node.id === 'toc' ?
		NodeFilter.FILTER_REJECT :
		NodeFilter.FILTER_ACCEPT;
}
/**
 * Find all timestamps within a DOM subtree.
 *
 * @param {HTMLElement} rootNode Node to search
 * @return {Array[]} Results. Each result is a two-element array.
 * @return {Text} return.0 Text node containing the timestamp (or the start of it, if it is split
 *   over several nodes by 'mw:Entity' elements)
 * @return {Array} return.1 Regexp match data, which specifies the location of the match, and which
 *   can be parsed using #getLocalTimestampParser
 */
function findTimestamps( rootNode ) {
	var
		matches = [],
		treeWalker = rootNode.ownerDocument.createTreeWalker(
			rootNode,
			NodeFilter.SHOW_TEXT,
			acceptOnlyNodesAllowingComments,
			false
		),
		// Compile once, instead of implicitly for every text node
		dateRegexp = new RegExp( getLocalTimestampRegexp() ),
		node, startNode, nodeText, entity, match;

	while ( ( node = treeWalker.nextNode() ) ) {
		startNode = node;
		nodeText = '';

		while ( node ) {
			nodeText += node.nodeValue;

			// In Parsoid HTML, entities are represented as a 'mw:Entity' node, rather than normal HTML
			// entities. On Arabic Wikipedia, the "UTC" timezone name contains some non-breaking spaces,
			// which apparently are often turned into entities by buggy editing tools. To handle
			// this, we must piece together the text, so that our regexp can match those timestamps.
			entity = node.nextSibling;
			if ( !(
				entity &&
				entity.nodeType === Node.ELEMENT_NODE &&
				entity.getAttribute( 'typeof' ) === 'mw:Entity'
			) ) {
				break;
			}
			// Don't crash on an entity element without content
			if ( entity.firstChild ) {
				nodeText += entity.firstChild.nodeValue;
			}

			// If the entity is followed by more text, do this again
			node = entity.nextSibling && entity.nextSibling.nodeType === Node.TEXT_NODE ?
				entity.nextSibling :
				null;
		}

		// Technically, there could be multiple matches in a single text node. However, the ultimate
		// point of this is to find the signatures which precede the timestamps, and any later
		// timestamps in the text node can't be directly preceded by a signature (as we require them to
		// have links), so we only concern ourselves with the first match.
		match = nodeText.match( dateRegexp );
		if ( match ) {
			matches.push( [ startNode, match ] );
		}
	}
	return matches;
}
/**
 * Get a MediaWiki page title from a URL.
 *
 * The page name is taken from the URL path if it matches the `wgArticlePath` pattern, otherwise
 * from the `title` query parameter.
 *
 * @private
 * @param {string} url
 * @return {mw.Title|null} Page title, or null if this isn't a link to a page (or if the URL
 *   can't be decoded)
 */
function getTitleFromUrl( url ) {
	var uri, articlePathRegexp, match, pageName;

	try {
		uri = new mw.Uri( url );
	} catch ( err ) {
		// T106244: URL encoded values using fallback 8-bit encoding (invalid UTF-8) cause mediawiki.Uri to crash
		return null;
	}

	// Turn the article path pattern (e.g. '/wiki/$1') into a regexp capturing the page name
	articlePathRegexp = new RegExp(
		mw.util.escapeRegExp( mw.config.get( 'wgArticlePath' ) )
			.replace( mw.util.escapeRegExp( '$1' ), '(.*)' )
	);

	match = uri.path.match( articlePathRegexp );
	if ( match ) {
		try {
			pageName = decodeURIComponent( match[ 1 ] );
		} catch ( err ) {
			// Malformed percent-encoding in the path (e.g. invalid UTF-8) makes
			// decodeURIComponent throw a URIError. Such a link can't point to a page.
			return null;
		}
		return mw.Title.newFromText( pageName );
	}
	if ( uri.query.title ) {
		return mw.Title.newFromText( uri.query.title );
	}
	return null;
}
/**
 * Find a user signature preceding a timestamp.
 *
 * The signature includes the timestamp node.
 *
 * A signature must contain at least one link to the user's userpage, discussion page or
 * contributions (and may contain other links). The link may be nested in other elements.
 *
 * The preceding siblings of the timestamp are scanned backwards until `until` is reached, there
 * are no more siblings, or the scanned text is longer than `data.signatureScanLimit`.
 *
 * @private
 * @param {Text} timestampNode Text node
 * @param {Node} [until] Node to stop searching at
 * @return {Array} Result, a two-element array
 * @return {Node[]} return.0 Sibling nodes comprising the signature, in reverse order (with
 *   `timestampNode` as the first element)
 * @return {string|null} return.1 Username, null for unsigned comments
 */
function findSignature( timestampNode, until ) {
	var
		namespaceIds = mw.config.get( 'wgNamespaceIds' ),
		node = timestampNode,
		sigNodes = [ node ],
		sigUsername = null,
		length = 0,
		lastLinkNode = timestampNode,
		links;

	// Get the name of the user whose user page, talk page or contributions the link points to,
	// or null if it is not such a link.
	function getUsernameFromLink( link ) {
		var namespaceId, username, parts, userpage,
			title = getTitleFromUrl( link.href );
		if ( !title ) {
			return null;
		}
		namespaceId = title.getNamespaceId();
		if ( namespaceId === namespaceIds.user || namespaceId === namespaceIds.user_talk ) {
			username = title.getMainText();
			// Subpages of user pages are not signatures
			if ( username.indexOf( '/' ) !== -1 ) {
				return null;
			}
		} else if ( namespaceId === namespaceIds.special ) {
			parts = title.getMainText().split( '/' );
			// A link to the contributions page without a target user names nobody
			if ( parts[ 0 ] !== data.specialContributionsName || !parts[ 1 ] ) {
				return null;
			}
			// Users may link to their contributions with non-standard name
			userpage = mw.Title.makeTitle( namespaceIds.user, parts[ 1 ] );
			if ( !userpage ) {
				// Not a valid page title, so not a valid username either
				return null;
			}
			username = userpage.getMainText();
		}
		if ( !username ) {
			return null;
		}
		if ( mw.util.isIPv6Address( username ) ) {
			// Bot-generated links "Preceding unsigned comment added by" have non-standard case
			username = username.toUpperCase();
		}
		return username;
	}

	// Check whether the link points to the signing user. The first user link found (the one
	// closest to the timestamp) decides who that is.
	function isLinkToSignatureUser( link ) {
		var username = getUsernameFromLink( link );
		if ( !username ) {
			return false;
		}
		// Check that every link points to the same user
		if ( !sigUsername ) {
			sigUsername = username;
		}
		return username === sigUsername;
	}

	while ( ( node = node.previousSibling ) && length < data.signatureScanLimit && node !== until ) {
		sigNodes.push( node );
		length += ( node.textContent || '' ).length;
		if ( !node.tagName ) {
			// Not an element, can't contain links
			continue;
		}
		if ( node.tagName.toLowerCase() === 'a' ) {
			links = [ node ];
		} else {
			// Handle links nested in formatting elements.
			// Helpful accidental feature: users whose signature is not detected in full (due to
			// text formatting) can just wrap it in a span element to fix that.
			// "Ten Pound Hammer • (What did I screw up now?)"
			// "« Saper // dyskusja »"
			links = Array.prototype.slice.call( node.getElementsByTagName( 'a' ) );
		}
		// Use .some() rather than .every() to permit vanity links
		// "TonyTheTiger (T / C / WP:FOUR / WP:CHICAGO / WP:WAWARD)"
		if ( links.some( isLinkToSignatureUser ) ) {
			lastLinkNode = node;
		}
		// Keep looking if a node with links wasn't a link to a user page
		// "Doc James (talk · contribs · email)"
	}
	// Pop excess text nodes
	while ( sigNodes[ sigNodes.length - 1 ] !== lastLinkNode ) {
		sigNodes.pop();
	}
	return [ sigNodes, sigUsername ];
}
/**
 * Get the indent level of a node, relative to its ancestor node.
 *
 * The indent level is the number of list items (`li` or `dd` elements) inside of which it is
 * nested, counting the node itself but not `rootNode`.
 *
 * @private
 * @param {Node} node
 * @param {HTMLElement} rootNode Node to stop counting at
 * @return {number}
 */
function getIndentLevel( node, rootNode ) {
	var indent = 0,
		tagName;
	for ( ; node && node !== rootNode; node = node.parentNode ) {
		// Text nodes have no tag name
		tagName = node.tagName && node.tagName.toLowerCase();
		if ( tagName === 'li' || tagName === 'dd' ) {
			indent++;
		}
	}
	return indent;
}
/**
 * Trim ASCII whitespace, as defined in the HTML spec.
 *
 * Unlike String#trim, this only removes tab, line feed, form feed, carriage return and space;
 * other whitespace such as the no-break space is kept.
 *
 * @param {string} str
 * @return {string}
 */
function htmlTrim( str ) {
	// https://infra.spec.whatwg.org/#ascii-whitespace
	return str.replace( /^[\t\n\f\r ]+|[\t\n\f\r ]+$/g, '' );
}
/**
 * Return the next leaf node in the tree order that is not an empty or whitespace-only text node.
 *
 * In other words, this returns a Text node with content other than whitespace, or an Element node
 * with no children, that follows the given node in the HTML source.
 *
 * @private
 * @param {Node} node Node to start searching at. If it isn't a leaf node, its children are ignored.
 * @param {HTMLElement} rootNode Node to stop searching at
 * @return {Node} The next interesting leaf node. If there is none, the tree walker stays where
 *   it was and `node` itself is returned.
 */
function nextInterestingLeafNode( node, rootNode ) {
	var treeWalker = rootNode.ownerDocument.createTreeWalker(
		rootNode,
		// eslint-disable-next-line no-bitwise
		NodeFilter.SHOW_ELEMENT | NodeFilter.SHOW_TEXT,
		function ( n ) {
			var isNonBlankText, isEmptyElement;
			// Ignore this node and its descendants (rejecting the children rejects their subtrees)
			// (unless it's the root node, this is a special case for "fakeHeading" handling)
			if ( node !== rootNode && ( n === node || n.parentNode === node ) ) {
				return NodeFilter.FILTER_REJECT;
			}
			isNonBlankText = n.nodeType === Node.TEXT_NODE && htmlTrim( n.textContent ) !== '';
			isEmptyElement = n.nodeType === Node.ELEMENT_NODE && !n.firstChild;
			// Skipped nodes are not returned, but their descendants are still visited
			return isNonBlankText || isEmptyElement ?
				NodeFilter.FILTER_ACCEPT :
				NodeFilter.FILTER_SKIP;
		},
		false
	);
	treeWalker.currentNode = node;
	treeWalker.nextNode();
	return treeWalker.currentNode;
}
/**
 * Get all discussion comments (and headings) within a DOM subtree.
 *
 * This returns a flat list, use #groupThreads to associate replies to original messages and get a
 * tree structure starting at section headings.
 *
 * For example, for a MediaWiki discussion like this (we're dealing with HTML DOM here, the wikitext
 * syntax is just for illustration):
 *
 *     == A ==
 *     B. ~~~~
 *     : C.
 *     : C. ~~~~
 *     :: D. ~~~~
 *     ::: E. ~~~~
 *     ::: F. ~~~~
 *     : G. ~~~~
 *     H. ~~~~
 *     : I. ~~~~
 *
 * This function would return a structure like:
 *
 *     [
 *       { type: 'heading', level: 0, range: (h2: A)        },
 *       { type: 'comment', level: 1, range: (p: B)         },
 *       { type: 'comment', level: 2, range: (li: C, li: C) },
 *       { type: 'comment', level: 3, range: (li: D)        },
 *       { type: 'comment', level: 4, range: (li: E)        },
 *       { type: 'comment', level: 4, range: (li: F)        },
 *       { type: 'comment', level: 2, range: (li: G)        },
 *       { type: 'comment', level: 1, range: (p: H)         },
 *       { type: 'comment', level: 2, range: (li: I)        }
 *     ]
 *
 * If there are comments before the first heading, a placeholder heading (with the
 * `placeholderHeading` property set) is inserted at the start of the list.
 *
 * @param {HTMLElement} rootNode
 * @return {Object[]} Results. Each result is an object.
 * @return {string} return.type `heading` or `comment`
 * @return {Object} return.range Object describing the extent of the comment, including the
 *   signature and timestamp. It has the same properties as a Range object: `startContainer`,
 *   `startOffset`, `endContainer`, `endOffset` (we don't use a real Range because they change
 *   magically when the DOM structure changes).
 * @return {Object[]} [return.signatureRanges] Objects describing the extent of signatures (plus
 *   timestamps) for this comment. There is always at least one signature, but there may be
 *   multiple. The author and timestamp of the comment is determined from the first signature.
 *   The last node in every signature range is the text node containing the timestamp.
 * @return {number} return.level Indentation level of the comment. Headings are `0`, comments start
 *   at `1`.
 * @return {Object} [return.timestamp] Timestamp (Moment object), undefined for headings
 * @return {string} [return.author] Comment author's username, undefined for headings
 */
function getComments( rootNode ) {
	var
		dfParser = getLocalTimestampParser(),
		comments = [],
		timestamps = findTimestamps( rootNode ),
		// Index of the first timestamp we have not walked past yet
		nextTimestamp = 0,
		treeWalker = rootNode.ownerDocument.createTreeWalker(
			rootNode,
			// eslint-disable-next-line no-bitwise
			NodeFilter.SHOW_ELEMENT | NodeFilter.SHOW_TEXT,
			acceptOnlyNodesAllowingComments,
			false
		),
		// Placeholder heading in case there are comments in the 0th section
		fakeHeading = {
			placeholderHeading: true,
			type: 'heading',
			range: {
				startContainer: rootNode,
				startOffset: 0,
				endContainer: rootNode,
				endOffset: 0
			},
			level: 0
		},
		// The heading or comment found most recently
		curComment = fakeHeading,
		node, range, foundSignature, firstSigNode, sigRange, author, startNode, match,
		endOffset, startLevel, endLevel, level;

	// The paragraph or list item containing the node (or just its parent, if there isn't one)
	function getLineContainer( n ) {
		return utils.closestElement( n, [ 'li', 'dd', 'p' ] ) || n.parentNode;
	}

	// Offset of the node among the children of its parent
	function getOffsetInParent( n ) {
		return Array.prototype.indexOf.call( n.parentNode.childNodes, n );
	}

	while ( ( node = treeWalker.nextNode() ) ) {
		if ( node.tagName && node.tagName.match( /^h[1-6]$/i ) ) {
			curComment = {
				type: 'heading',
				range: {
					startContainer: node,
					startOffset: 0,
					endContainer: node,
					endOffset: node.childNodes.length
				},
				level: 0
			};
			comments.push( curComment );
			continue;
		}
		if ( !timestamps[ nextTimestamp ] || node !== timestamps[ nextTimestamp ][ 0 ] ) {
			// Not a heading, and not the node containing the next timestamp
			continue;
		}

		// Whatever happens below, we are done with this timestamp
		match = timestamps[ nextTimestamp ][ 1 ];
		nextTimestamp++;

		foundSignature = findSignature( node, curComment.range.endContainer );
		author = foundSignature[ 1 ];
		if ( !author ) {
			// Ignore timestamps for which we couldn't find a signature. It's probably not a real
			// comment, but just a false match due to a copypasted timestamp.
			continue;
		}
		// Signature nodes are in reverse order, so the last one is the first in the document
		firstSigNode = foundSignature[ 0 ][ foundSignature[ 0 ].length - 1 ];

		// Everything from last comment up to here is the next comment
		startNode = nextInterestingLeafNode( curComment.range.endContainer, rootNode );
		endOffset = match.index + match[ 0 ].length;
		range = {
			startContainer: startNode.parentNode,
			startOffset: getOffsetInParent( startNode ),
			endContainer: node,
			endOffset: endOffset
		};
		sigRange = {
			startContainer: firstSigNode.parentNode,
			startOffset: getOffsetInParent( firstSigNode ),
			endContainer: node,
			endOffset: endOffset
		};

		startLevel = getIndentLevel( startNode, rootNode ) + 1;
		endLevel = getIndentLevel( node, rootNode ) + 1;
		if ( startLevel !== endLevel ) {
			console.log( 'Comment starts and ends with different indentation', startNode, node );
		}
		// Should this use the indent level of `startNode` or `node`?
		level = Math.min( startLevel, endLevel );

		// Avoid generating multiple comments when there is more than one signature on a single "line".
		// Often this is done when someone edits their comment later and wants to add a note about that.
		// (Or when another person corrects a typo, or strikes out a comment, etc.) Multiple comments
		// within one paragraph/listitem result in a confusing double "Reply" button, and we also have
		// no way to indicate which one you're replying to (this might matter in the future for
		// notifications or something).
		if (
			curComment.type === 'comment' &&
			getLineContainer( node ) === getLineContainer( curComment.range.endContainer )
		) {
			// Merge this with the previous comment. Use that comment's author and timestamp.
			curComment.range.endContainer = range.endContainer;
			curComment.range.endOffset = range.endOffset;
			curComment.signatureRanges.push( sigRange );
			curComment.level = Math.min( level, curComment.level );
			continue;
		}

		curComment = {
			type: 'comment',
			timestamp: dfParser( match ),
			author: author,
			range: range,
			signatureRanges: [ sigRange ],
			level: level
		};
		comments.push( curComment );
	}

	// Insert the fake placeholder heading if there are any comments in the 0th section
	// (before the first real heading)
	if ( comments.length && comments[ 0 ].type !== 'heading' ) {
		comments.unshift( fakeHeading );
	}

	return comments;
}
/**
 * Group discussion comments into threads and associate replies to original messages.
 *
 * Each thread must begin with a heading. Original messages in the thread are treated as replies to
 * its heading. Other replies are associated based on the order and indentation level.
 *
 * Note that the objects in `comments` are extended in-place with the additional data.
 *
 * For example, for a MediaWiki discussion like this (we're dealing with HTML DOM here, the wikitext
 * syntax is just for illustration):
 *
 *     == A ==
 *     B. ~~~~
 *     : C.
 *     : C. ~~~~
 *     :: D. ~~~~
 *     ::: E. ~~~~
 *     ::: F. ~~~~
 *     : G. ~~~~
 *     H. ~~~~
 *     : I. ~~~~
 *
 * This function would return a structure like:
 *
 *     [
 *       { type: 'heading', level: 0, range: (h2: A), replies: [
 *         { type: 'comment', level: 1, range: (p: B), replies: [
 *           { type: 'comment', level: 2, range: (li: C, li: C), replies: [
 *             { type: 'comment', level: 3, range: (li: D), replies: [
 *               { type: 'comment', level: 4, range: (li: E), replies: [] },
 *               { type: 'comment', level: 4, range: (li: F), replies: [] },
 *             ] },
 *           ] },
 *           { type: 'comment', level: 2, range: (li: G), replies: [] },
 *         ] },
 *         { type: 'comment', level: 1, range: (p: H), replies: [
 *           { type: 'comment', level: 2, range: (li: I), replies: [] },
 *         ] },
 *       ] },
 *     ]
 *
 * @param {Object[]} comments Result of #getComments
 * @return {Object[]} Tree structure of comments, using the same objects as `comments`. Top-level
 *   items are the headings. The following properties are added:
 * @return {string|null} return.id Unique ID (within the page) for this comment, intended to be
 *   used to find this comment in other revisions of the same page (null for headings)
 * @return {Object[]} return.replies Comment objects which are replies to this comment
 * @return {Object|null} return.parent Comment object which this is a reply to (null for headings)
 */
function groupThreads( comments ) {
	var
		threads = [],
		// replies[ n ] is the most recent heading/comment at indentation level n
		replies = [],
		commentsById = {},
		i, comment, id, number, parent;

	for ( i = 0; i < comments.length; i++ ) {
		comment = comments[ i ];

		if ( comment.level === 0 ) {
			// We don't need ids for section headings right now, but we might in the future
			// e.g. if we allow replying directly to sections (adding top-level comments)
			id = null;
		} else {
			// username+timestamp
			id = [
				comment.author || '',
				comment.timestamp.toISOString()
			].join( '|' );

			// If there would be multiple comments with the same ID (i.e. the user left multiple comments
			// in one edit, or within a minute), append sequential numbers
			number = 0;
			while ( commentsById[ id + '|' + number ] ) {
				number++;
			}
			id = id + '|' + number;
			commentsById[ id ] = comment;
		}

		// This modifies the original objects in `comments`!
		comment.id = id;
		comment.replies = [];
		comment.parent = null;

		if ( replies.length < comment.level ) {
			// Someone skipped an indentation level (or several). Pretend that the previous reply
			// covers multiple indentation levels, so that following comments get connected to it.
			console.log( 'Comment skips indentation level', comment.range );
			while ( replies.length < comment.level ) {
				replies[ replies.length ] = replies[ replies.length - 1 ];
			}
		}

		if ( comment.level === 0 ) {
			// new root (thread)
			threads.push( comment );
		} else {
			parent = replies[ comment.level - 1 ];
			if ( parent ) {
				// add as a reply to closest less nested comment
				parent.replies.push( comment );
				comment.parent = parent;
			} else {
				console.log( 'Comment could not be connected to a thread', comment.range );
			}
		}

		replies[ comment.level ] = comment;
		// cut off more deeply nested replies
		replies.length = comment.level + 1;
	}

	return threads;
}
/**
 * Get the list of authors involved in a comment and its replies.
 *
 * You probably want to pass a thread root here (a heading).
 *
 * @param {Object} comment Comment object, as returned by #groupThreads
 * @return {string[]} Author usernames, sorted and without duplicates
 */
function getAuthors( comment ) {
	// Used as a set of usernames. No prototype, so that no name can clash with inherited properties.
	var authors = Object.create( null );

	// Add the author of the comment and of all of its replies, recursively
	function collectAuthors( c ) {
		if ( c.author ) {
			authors[ c.author ] = true;
		}
		c.replies.forEach( function ( reply ) {
			collectAuthors( reply );
		} );
	}

	collectAuthors( comment );

	return Object.keys( authors ).sort();
}
/**
 * Get the name of the page from which this comment is transcluded (if any).
 *
 * @param {Object} comment Comment object, as returned by #groupThreads
 * @return {string|boolean} `false` if this comment is not transcluded. A string if it's transcluded
 *   from a single page (the page title, in text form with spaces). `true` if it's transcluded, but
 *   we can't determine the source.
 */
function getTranscludedFrom( comment ) {
	var node, about, typeOf, dataMw, template;

	// If some template is used within the comment (e.g. {{ping|…}} or {{tl|…}}), that *does not* mean
	// the comment is transcluded. We only want to consider comments to be transcluded if the wrapper
	// element (usually a list item or paragraph) is marked as part of a transclusion.
	// TODO: This seems to work fine but I'm having a hard time explaining why it is correct...
	node = comment.range.endContainer;

	// Find the node containing information about the transclusion:
	// 1. Find the closest ancestor with an 'about' attribute
	// 2. Find the main node of the about-group (first sibling with the same 'about' attribute)
	// 3. If this is an mw:Transclusion node, return it; otherwise, go to step 1
	while ( node ) {
		// 1.
		about = node.nodeType === Node.ELEMENT_NODE ? node.getAttribute( 'about' ) : null;
		if ( about && /^#mwt\d+$/.test( about ) ) {
			// 2.
			while (
				node.previousSibling &&
				node.previousSibling.nodeType === Node.ELEMENT_NODE &&
				node.previousSibling.getAttribute( 'about' ) === about
			) {
				node = node.previousSibling;
			}

			// 3.
			typeOf = node.getAttribute( 'typeof' );
			if ( typeOf && typeOf.split( ' ' ).indexOf( 'mw:Transclusion' ) !== -1 ) {
				break;
			}
		}

		node = node.parentNode;
	}

	if ( !node ) {
		// No mw:Transclusion node found, this comment is not transcluded
		return false;
	}

	// A missing attribute parses to null; then we just can't tell what the source is
	dataMw = JSON.parse( node.getAttribute( 'data-mw' ) );
	template = dataMw && dataMw.parts && dataMw.parts.length === 1 && dataMw.parts[ 0 ].template;

	// Only return a page name if this is a simple single-template transclusion.
	if ( template && template.target && template.target.href ) {
		// Slice off the './' prefix and convert to text form (underscores to spaces, URL-decoded)
		return mw.libs.ve.normalizeParsoidResourceName( template.target.href );
	}

	// Multi-template transclusion, or a parser function call, or template-affected wikitext outside
	// of a template call, or a mix of the above
	return true;
}
module.exports = {
|
|
findTimestamps: findTimestamps,
|
|
getLocalTimestampParser: getLocalTimestampParser,
|
|
getTimestampRegexp: getTimestampRegexp,
|
|
getTimestampParser: getTimestampParser,
|
|
getComments: getComments,
|
|
groupThreads: groupThreads,
|
|
findSignature: findSignature,
|
|
getAuthors: getAuthors,
|
|
getTranscludedFrom: getTranscludedFrom
|
|
};
|