mediawiki-extensions-Discus.../modules/parser.js
Bartosz Dziewoński 515af82061 Reduce duplication between PHP parser and data gen for JS parser
Also, make the handling of TranslateNumerals and digitsRegexp the same
between PHP and JS.

Change-Id: I1d81343d0b59ab3ecd59ba1c2ad99a729d983ac4
2020-05-19 20:54:44 +02:00

1052 lines
32 KiB
JavaScript

'use strict';
var
utils = require( './utils.js' ),
// Hooks::getLocalData()
data = require( './parser/data.json' ),
moment = require( './lib/moment-timezone/moment-timezone-with-data-1970-2030.js' );
/**
* Utilities for detecting and parsing components of discussion pages: signatures, timestamps,
* comments and threads.
*
* @class mw.dt.parser
*/
/**
* Get text of localisation messages in content language.
*
* @private
* @param {string[]} messages
* @return {string[]}
*/
function getMessages( messages ) {
return messages.map( function ( code ) {
return data.contLangMessages[ code ];
} );
}
/**
* Get a regexp that matches timestamps generated using the given date format.
*
* This only supports format characters that are used by the default date format in any of
* MediaWiki's languages, namely: D, d, F, G, H, i, j, l, M, n, Y, xg, xkY (and escape characters),
* and only dates when MediaWiki existed, let's say 2000 onwards (Thai dates before 1941 are
* complicated).
*
* @param {string} format Date format, as used by MediaWiki
* @param {string} digitsRegexp Regular expression matching a single localised digit, e.g. `[0-9]`
* @param {Object} tzAbbrs Map of localised timezone abbreviations to IANA abbreviations
* for the local timezone, e.g. `{EDT: "EDT", EST: "EST"}`
* @return {string} Regular expression
*/
function getTimestampRegexp( format, digitsRegexp, tzAbbrs ) {
var s, p, num, code, endQuote, tzRegexp, regexp;
function regexpGroup( regexp ) {
return '(' + regexp + ')';
}
function regexpAlternateGroup( array ) {
return '(' + array.map( mw.util.escapeRegExp ).join( '|' ) + ')';
}
s = '';
// Adapted from Language::sprintfDate()
for ( p = 0; p < format.length; p++ ) {
num = false;
code = format[ p ];
if ( code === 'x' && p < format.length - 1 ) {
code += format[ ++p ];
}
if ( code === 'xk' && p < format.length - 1 ) {
code += format[ ++p ];
}
switch ( code ) {
case 'xx':
s += 'x';
break;
case 'xg':
s += regexpAlternateGroup( getMessages( [
'january-gen', 'february-gen', 'march-gen', 'april-gen', 'may-gen', 'june-gen',
'july-gen', 'august-gen', 'september-gen', 'october-gen', 'november-gen',
'december-gen'
] ) );
break;
case 'd':
num = '2';
break;
case 'D':
s += regexpAlternateGroup( getMessages( [
'sun', 'mon', 'tue', 'wed', 'thu', 'fri', 'sat'
] ) );
break;
case 'j':
num = '1,2';
break;
case 'l':
s += regexpAlternateGroup( getMessages( [
'sunday', 'monday', 'tuesday', 'wednesday', 'thursday',
'friday', 'saturday'
] ) );
break;
case 'F':
s += regexpAlternateGroup( getMessages( [
'january', 'february', 'march', 'april', 'may_long', 'june',
'july', 'august', 'september', 'october', 'november',
'december'
] ) );
break;
case 'M':
s += regexpAlternateGroup( getMessages( [
'jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug',
'sep', 'oct', 'nov', 'dec'
] ) );
break;
case 'n':
num = '1,2';
break;
case 'Y':
num = '4';
break;
case 'xkY':
num = '4';
break;
case 'G':
num = '1,2';
break;
case 'H':
num = '2';
break;
case 'i':
num = '2';
break;
case '\\':
// Backslash escaping
if ( p < format.length - 1 ) {
s += mw.util.escapeRegExp( format[ ++p ] );
} else {
s += mw.util.escapeRegExp( '\\' );
}
break;
case '"':
// Quoted literal
if ( p < format.length - 1 ) {
endQuote = format.indexOf( '"', p + 1 );
if ( endQuote === -1 ) {
// No terminating quote, assume literal "
s += '"';
} else {
s += mw.util.escapeRegExp( format.substr( p + 1, endQuote - p - 1 ) );
p = endQuote;
}
} else {
// Quote at end of string, assume literal "
s += '"';
}
break;
default:
s += mw.util.escapeRegExp( format[ p ] );
}
if ( num !== false ) {
s += regexpGroup( digitsRegexp + '{' + num + '}' );
}
}
tzRegexp = regexpAlternateGroup( Object.keys( tzAbbrs ) );
// Hard-coded parentheses and space like in Parser::pstPass2
// Ignore some invisible Unicode characters that often sneak into copy-pasted timestamps (T245784)
regexp = s + '[\\u200E\\u200F]? [\\u200E\\u200F]?\\(' + tzRegexp + '\\)';
return regexp;
}
/**
* Get a function that parses timestamps generated using the given date format, based on the result
* of matching the regexp returned by #getTimestampRegexp.
*
* @param {string} format Date format, as used by MediaWiki
* @param {string|null} digits Localised digits from 0 to 9, e.g. `0123456789`
* @param {string} localTimezone Local timezone IANA name, e.g. `America/New_York`
* @param {Object} tzAbbrs Map of localised timezone abbreviations to IANA abbreviations
* for the local timezone, e.g. `{EDT: "EDT", EST: "EST"}`
* @return {Function} Parser function
* @return {Array} return.match Regexp match data
* @return {Object} return.return Moment object
*/
function getTimestampParser( format, digits, localTimezone, tzAbbrs ) {
var p, code, endQuote, matchingGroups = [];
for ( p = 0; p < format.length; p++ ) {
code = format[ p ];
if ( code === 'x' && p < format.length - 1 ) {
code += format[ ++p ];
}
if ( code === 'xk' && p < format.length - 1 ) {
code += format[ ++p ];
}
switch ( code ) {
case 'xx':
break;
case 'xg':
case 'd':
case 'j':
case 'D':
case 'l':
case 'F':
case 'M':
case 'n':
case 'Y':
case 'xkY':
case 'G':
case 'H':
case 'i':
matchingGroups.push( code );
break;
case '\\':
// Backslash escaping
if ( p < format.length - 1 ) {
++p;
}
break;
case '"':
// Quoted literal
if ( p < format.length - 1 ) {
endQuote = format.indexOf( '"', p + 1 );
if ( endQuote !== -1 ) {
p = endQuote;
}
}
break;
default:
break;
}
}
function untransformDigits( text ) {
if ( !digits ) {
return text;
}
return text.replace(
new RegExp( '[' + digits + ']', 'g' ),
function ( m ) {
return digits.indexOf( m );
}
);
}
return function timestampParser( match ) {
var
year = 0,
monthIdx = 0,
day = 0,
hour = 0,
minute = 0,
tzAbbr,
i, code, text,
date;
for ( i = 0; i < matchingGroups.length; i++ ) {
code = matchingGroups[ i ];
text = match[ i + 1 ];
switch ( code ) {
case 'xg':
monthIdx = getMessages( [
'january-gen', 'february-gen', 'march-gen', 'april-gen', 'may-gen', 'june-gen',
'july-gen', 'august-gen', 'september-gen', 'october-gen', 'november-gen',
'december-gen'
] ).indexOf( text );
break;
case 'd':
case 'j':
day = Number( untransformDigits( text ) );
break;
case 'D':
case 'l':
// Day of the week - unused
break;
case 'F':
monthIdx = getMessages( [
'january', 'february', 'march', 'april', 'may_long', 'june',
'july', 'august', 'september', 'october', 'november',
'december'
] ).indexOf( text );
break;
case 'M':
monthIdx = getMessages( [
'jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug',
'sep', 'oct', 'nov', 'dec'
] ).indexOf( text );
break;
case 'n':
monthIdx = Number( untransformDigits( text ) ) - 1;
break;
case 'Y':
year = Number( untransformDigits( text ) );
break;
case 'xkY':
// Thai year
year = Number( untransformDigits( text ) ) - 543;
break;
case 'G':
case 'H':
hour = Number( untransformDigits( text ) );
break;
case 'i':
minute = Number( untransformDigits( text ) );
break;
default:
throw new Error( 'Not implemented' );
}
}
// The last matching group is the timezone abbreviation
tzAbbr = tzAbbrs[ match[ match.length - 1 ] ];
// Most of the time, the timezone abbreviation is not necessary to parse the date, since we
// can assume all times are in the wiki's local timezone.
date = moment.tz( [ year, monthIdx, day, hour, minute ], localTimezone );
// But during the "fall back" at the end of DST, some times will happen twice. Per the docs,
// "Moment Timezone handles this by always using the earlier instance of a duplicated hour."
// https://momentjs.com/timezone/docs/#/using-timezones/parsing-ambiguous-inputs/
// Since the timezone abbreviation disambiguates the DST/non-DST times, we can detect when
// that behavior was incorrect...
if ( date.zoneAbbr() !== tzAbbr ) {
// ...and force the correct parsing. I can't find proper documentation for this feature,
// but this pull request explains it: https://github.com/moment/moment-timezone/pull/101
moment.tz.moveAmbiguousForward = true;
date = moment.tz( [ year, monthIdx, day, hour, minute ], localTimezone );
moment.tz.moveAmbiguousForward = false;
if ( date.zoneAbbr() !== tzAbbr ) {
// This should not be possible for "genuine" timestamps generated by MediaWiki.
// But bots and humans get it wrong when marking up unsigned comments…
// https://pl.wikipedia.org/w/index.php?title=Wikipedia:Kawiarenka/Artykuły&diff=prev&oldid=54772606
date.discussionToolsWarning = 'Timestamp has timezone abbreviation for the wrong time';
} else {
date.discussionToolsWarning = 'Ambiguous time at DST switchover was parsed';
}
}
return date;
};
}
/**
* Get a regexp that matches timestamps in the local date format.
*
* This calls #getTimestampRegexp with predefined data for the current wiki.
*
* @private
* @return {string} Regular expression
*/
function getLocalTimestampRegexp() {
return getTimestampRegexp(
data.dateFormat,
data.digits ? '[' + data.digits + ']' : '\\d',
data.timezones
);
}
/**
* Get a function that parses timestamps in the local date format, based on the result
* of matching the regexp returned by #getLocalTimestampRegexp.
*
* This calls #getTimestampParser with predefined data for the current wiki.
*
* @private
* @return {Function} Parser function
* @return {Array} return.match Regexp match data
* @return {Date} return.return
*/
function getLocalTimestampParser() {
return getTimestampParser(
data.dateFormat,
data.digits,
data.localTimezone,
data.timezones
);
}
/**
* Callback for document.createTreeWalker that will skip over nodes where we don't want to detect
* comments (or section headings).
*
* @param {Node} node
* @return {number} Appropriate NodeFilter constant
*/
function acceptOnlyNodesAllowingComments( node ) {
// The table of contents has a heading that gets erroneously detected as a section
if ( node.id === 'toc' ) {
return NodeFilter.FILTER_REJECT;
}
return NodeFilter.FILTER_ACCEPT;
}
/**
* Find all timestamps within a DOM subtree.
*
* @param {HTMLElement} rootNode Node to search
* @return {Array[]} Results. Each result is a two-element array.
* @return {Text} return.0 Text node containing the timestamp
* @return {Array} return.1 Regexp match data, which specifies the location of the match, and which
* can be parsed using #getLocalTimestampParser
*/
function findTimestamps( rootNode ) {
var
matches = [],
treeWalker = rootNode.ownerDocument.createTreeWalker(
rootNode,
NodeFilter.SHOW_TEXT,
acceptOnlyNodesAllowingComments,
false
),
dateRegexp = getLocalTimestampRegexp(),
node, startNode, nodeText, match;
while ( ( node = treeWalker.nextNode() ) ) {
startNode = node;
nodeText = '';
while ( node ) {
nodeText += node.nodeValue;
// In Parsoid HTML, entities are represented as a 'mw:Entity' node, rather than normal HTML
// entities. On Arabic Wikipedia, the "UTC" timezone name contains some non-breaking spaces,
// which apparently are often turned into &nbsp; entities by buggy editing tools. To handle
// this, we must piece together the text, so that our regexp can match those timestamps.
if (
node.nextSibling &&
node.nextSibling.nodeType === Node.ELEMENT_NODE &&
node.nextSibling.getAttribute( 'typeof' ) === 'mw:Entity'
) {
nodeText += node.nextSibling.firstChild.nodeValue;
// If the entity is followed by more text, do this again
if (
node.nextSibling.nextSibling &&
node.nextSibling.nextSibling.nodeType === Node.TEXT_NODE
) {
node = node.nextSibling.nextSibling;
} else {
node = null;
}
} else {
node = null;
}
}
// Technically, there could be multiple matches in a single text node. However, the ultimate
// point of this is to find the signatures which precede the timestamps, and any later
// timestamps in the text node can't be directly preceded by a signature (as we require them to
// have links), so we only concern ourselves with the first match.
if ( ( match = nodeText.match( dateRegexp ) ) ) {
matches.push( [ startNode, match ] );
}
}
return matches;
}
/**
* Get a MediaWiki page title from a URL.
*
* @private
* @param {string} url
* @return {mw.Title|null} Page title, or null if this isn't a link to a page
*/
function getTitleFromUrl( url ) {
var articlePathRegexp, match;
try {
url = new mw.Uri( url );
} catch ( err ) {
// T106244: URL encoded values using fallback 8-bit encoding (invalid UTF-8) cause mediawiki.Uri to crash
return null;
}
articlePathRegexp = new RegExp(
mw.util.escapeRegExp( mw.config.get( 'wgArticlePath' ) )
.replace( mw.util.escapeRegExp( '$1' ), '(.*)' )
);
if ( ( match = url.path.match( articlePathRegexp ) ) ) {
return mw.Title.newFromText( decodeURIComponent( match[ 1 ] ) );
}
if ( url.query.title ) {
return mw.Title.newFromText( url.query.title );
}
return null;
}
/**
* Find a user signature preceding a timestamp.
*
* The signature includes the timestamp node.
*
* A signature must contain at least one link to the user's userpage, discussion page or
* contributions (and may contain other links). The link may be nested in other elements.
*
* @private
* @param {Text} timestampNode Text node
* @param {Node} [until] Node to stop searching at
* @return {Array} Result, a two-element array
* @return {Node[]} return.0 Sibling nodes comprising the signature, in reverse order (with
* `timestampNode` as the first element)
* @return {string|null} return.1 Username, null for unsigned comments
*/
function findSignature( timestampNode, until ) {
var
node = timestampNode,
sigNodes = [ node ],
sigUsername = null,
length = 0,
lastLinkNode = timestampNode,
links, nodes;
while ( ( node = node.previousSibling ) && length < data.signatureScanLimit && node !== until ) {
sigNodes.push( node );
length += ( node.textContent || '' ).length;
if ( !node.tagName ) {
continue;
}
links = [];
if ( node.tagName.toLowerCase() === 'a' ) {
links.push( node );
} else {
// Handle links nested in formatting elements.
// Helpful accidental feature: users whose signature is not detected in full (due to
// text formatting) can just wrap it in a <span> to fix that.
// "Ten Pound Hammer • (What did I screw up now?)"
// "« Saper // dyskusja »"
nodes = node.getElementsByTagName( 'a' );
links.push.apply( links, nodes );
}
if ( !links.length ) {
continue;
}
// Use .some() rather than .every() to permit vanity links
// "TonyTheTiger (T / C / WP:FOUR / WP:CHICAGO / WP:WAWARD)"
// eslint-disable-next-line no-loop-func
if ( links.some( function ( link ) {
var username, title;
title = getTitleFromUrl( link.href );
if ( !title ) {
return false;
}
if (
title.getNamespaceId() === mw.config.get( 'wgNamespaceIds' ).user ||
title.getNamespaceId() === mw.config.get( 'wgNamespaceIds' ).user_talk
) {
username = title.getMainText();
if ( username.indexOf( '/' ) !== -1 ) {
return false;
}
} else if (
title.getNamespaceId() === mw.config.get( 'wgNamespaceIds' ).special &&
title.getMainText().split( '/' )[ 0 ] === data.specialContributionsName
) {
username = title.getMainText().split( '/' )[ 1 ];
// Normalize the username: users may link to their contributions with an unnormalized name
username = mw.Title.makeTitle( mw.config.get( 'wgNamespaceIds' ).user, username ).getMainText();
}
if ( !username ) {
return false;
}
if ( mw.util.isIPv6Address( username ) ) {
// Bot-generated links "Preceding unsigned comment added by" have non-standard case
username = username.toUpperCase();
}
// Accept the first link to the user namespace, then only accept links to that user
if ( !sigUsername ) {
sigUsername = username;
}
return username === sigUsername;
} ) ) {
lastLinkNode = node;
}
// Keep looking if a node with links wasn't a link to a user page
// "Doc James (talk · contribs · email)"
}
// Pop excess text nodes
while ( sigNodes[ sigNodes.length - 1 ] !== lastLinkNode ) {
sigNodes.pop();
}
return [ sigNodes, sigUsername ];
}
/**
* Get the indent level of a node, relative to its ancestor node.
*
* The indent level is the number of lists inside of which it is nested.
*
* @private
* @param {Node} node
* @param {HTMLElement} rootNode Node to stop counting at
* @return {number}
*/
function getIndentLevel( node, rootNode ) {
var indent = 0, tagName;
while ( node ) {
if ( node === rootNode ) {
break;
}
tagName = node.tagName && node.tagName.toLowerCase();
if ( tagName === 'li' || tagName === 'dd' ) {
indent++;
}
node = node.parentNode;
}
return indent;
}
/**
* Return the next leaf node in the tree order that is not an empty or whitespace-only text node.
*
* In other words, this returns a Text node with content other than whitespace, or an Element node
* with no children, that follows the given node in the HTML source.
*
* @private
* @param {Node} node Node to start searching at. If it isn't a leaf node, its children are ignored.
* @param {HTMLElement} rootNode Node to stop searching at
* @return {Node|null}
*/
function nextInterestingLeafNode( node, rootNode ) {
var treeWalker = rootNode.ownerDocument.createTreeWalker(
rootNode,
// eslint-disable-next-line no-bitwise
NodeFilter.SHOW_ELEMENT | NodeFilter.SHOW_TEXT,
function ( n ) {
// Ignore this node and its descendants
// (unless it's the root node, this is a special case for "fakeHeading" handling)
if ( node !== rootNode && ( n === node || n.parentNode === node ) ) {
return NodeFilter.FILTER_REJECT;
}
if (
( n.nodeType === Node.TEXT_NODE && utils.htmlTrim( n.textContent ) !== '' ) ||
( n.nodeType === Node.CDATA_SECTION_NODE && utils.htmlTrim( n.textContent ) !== '' ) ||
( n.nodeType === Node.ELEMENT_NODE && !n.firstChild )
) {
return NodeFilter.FILTER_ACCEPT;
}
return NodeFilter.FILTER_SKIP;
},
false
);
treeWalker.currentNode = node;
treeWalker.nextNode();
return treeWalker.currentNode;
}
/**
* Get all discussion comments (and headings) within a DOM subtree.
*
* This returns a flat list, use #groupThreads to associate replies to original messages and get a
* tree structure starting at section headings.
*
* For example, for a MediaWiki discussion like this (we're dealing with HTML DOM here, the wikitext
* syntax is just for illustration):
*
* == A ==
* B. ~~~~
* : C.
* : C. ~~~~
* :: D. ~~~~
* ::: E. ~~~~
* ::: F. ~~~~
* : G. ~~~~
* H. ~~~~
* : I. ~~~~
*
* This function would return a structure like:
*
* [
* { type: 'heading', level: 0, range: (h2: A) },
* { type: 'comment', level: 1, range: (p: B) },
* { type: 'comment', level: 2, range: (li: C, li: C) },
* { type: 'comment', level: 3, range: (li: D) },
* { type: 'comment', level: 4, range: (li: E) },
* { type: 'comment', level: 4, range: (li: F) },
* { type: 'comment', level: 2, range: (li: G) },
* { type: 'comment', level: 1, range: (p: H) },
* { type: 'comment', level: 2, range: (li: I) }
* ]
*
* @param {HTMLElement} rootNode
* @return {Object[]} Results. Each result is an object.
* @return {string} return.type `heading` or `comment`
* @return {Object} return.range Object describing the extent of the comment, including the
* signature and timestamp. It has the same properties as a Range object: `startContainer`,
* `startOffset`, `endContainer`, `endOffset` (we don't use a real Range because they change
* magically when the DOM structure changes).
* @return {Object[]} [return.signatureRanges] Objects describing the extent of signatures (plus
* timestamps) for this comment. There is always at least one signature, but there may be
* multiple. The author and timestamp of the comment is determined from the first signature.
* The last node in every signature range is the text node containing the timestamp.
* @return {number} return.level Indentation level of the comment. Headings are `0`, comments start
* at `1`.
* @return {Object} [return.timestamp] Timestamp (Moment object), undefined for headings
* @return {string} [return.author] Comment author's username, undefined for headings
*/
function getComments( rootNode ) {
var
dfParser = getLocalTimestampParser(),
comments = [],
timestamps, nextTimestamp, treeWalker,
node, range, fakeHeading, curComment,
foundSignature, firstSigNode, sigRange, author, startNode, match, startLevel, endLevel, dateTime, warnings;
timestamps = findTimestamps( rootNode );
treeWalker = rootNode.ownerDocument.createTreeWalker(
rootNode,
// eslint-disable-next-line no-bitwise
NodeFilter.SHOW_ELEMENT | NodeFilter.SHOW_TEXT,
acceptOnlyNodesAllowingComments,
false
);
// Placeholder heading in case there are comments in the 0th section
range = {
startContainer: rootNode,
startOffset: 0,
endContainer: rootNode,
endOffset: 0
};
fakeHeading = {
placeholderHeading: true,
type: 'heading',
range: range,
level: 0
};
curComment = fakeHeading;
nextTimestamp = 0;
while ( ( node = treeWalker.nextNode() ) ) {
if ( node.tagName && node.tagName.match( /^h[1-6]$/i ) ) {
range = {
startContainer: node,
startOffset: 0,
endContainer: node,
endOffset: node.childNodes.length
};
curComment = {
type: 'heading',
range: range,
level: 0
};
comments.push( curComment );
} else if ( timestamps[ nextTimestamp ] && node === timestamps[ nextTimestamp ][ 0 ] ) {
warnings = [];
foundSignature = findSignature( node, curComment.range.endContainer );
author = foundSignature[ 1 ];
firstSigNode = foundSignature[ 0 ][ foundSignature[ 0 ].length - 1 ];
if ( !author ) {
// Ignore timestamps for which we couldn't find a signature. It's probably not a real
// comment, but just a false match due to a copypasted timestamp.
nextTimestamp++;
continue;
}
// Everything from last comment up to here is the next comment
startNode = nextInterestingLeafNode( curComment.range.endContainer, rootNode );
match = timestamps[ nextTimestamp ][ 1 ];
range = {
startContainer: startNode.parentNode,
startOffset: Array.prototype.indexOf.call( startNode.parentNode.childNodes, startNode ),
endContainer: node,
endOffset: match.index + match[ 0 ].length
};
sigRange = {
startContainer: firstSigNode.parentNode,
startOffset: Array.prototype.indexOf.call( firstSigNode.parentNode.childNodes, firstSigNode ),
endContainer: node,
endOffset: match.index + match[ 0 ].length
};
startLevel = getIndentLevel( startNode, rootNode ) + 1;
endLevel = getIndentLevel( node, rootNode ) + 1;
if ( startLevel !== endLevel ) {
warnings.push( 'Comment starts and ends with different indentation' );
}
// Avoid generating multiple comments when there is more than one signature on a single "line".
// Often this is done when someone edits their comment later and wants to add a note about that.
// (Or when another person corrects a typo, or strikes out a comment, etc.) Multiple comments
// within one paragraph/list-item result in a confusing double "Reply" button, and we also have
// no way to indicate which one you're replying to (this might matter in the future for
// notifications or something).
if (
curComment.type === 'comment' &&
( utils.closestElement( node, [ 'li', 'dd', 'p' ] ) || node.parentNode ) ===
( utils.closestElement( curComment.range.endContainer, [ 'li', 'dd', 'p' ] ) || curComment.range.endContainer.parentNode )
) {
// Merge this with the previous comment. Use that comment's author and timestamp.
curComment.range.endContainer = range.endContainer;
curComment.range.endOffset = range.endOffset;
curComment.signatureRanges.push( sigRange );
curComment.level = Math.min( Math.min( startLevel, endLevel ), curComment.level );
nextTimestamp++;
continue;
}
dateTime = dfParser( match );
if ( dateTime.discussionToolsWarning ) {
warnings.push( dateTime.discussionToolsWarning );
}
curComment = {
type: 'comment',
timestamp: dateTime,
author: author,
range: range,
signatureRanges: [ sigRange ],
// Should this use the indent level of `startNode` or `node`?
level: Math.min( startLevel, endLevel )
};
if ( warnings.length ) {
curComment.warnings = warnings;
}
comments.push( curComment );
nextTimestamp++;
}
}
// Insert the fake placeholder heading if there are any comments in the 0th section
// (before the first real heading)
if ( comments.length && comments[ 0 ].type !== 'heading' ) {
comments.unshift( fakeHeading );
}
return comments;
}
/**
* Group discussion comments into threads and associate replies to original messages.
*
* Each thread must begin with a heading. Original messages in the thread are treated as replies to
* its heading. Other replies are associated based on the order and indentation level.
*
* Note that the objects in `comments` are extended in-place with the additional data.
*
* For example, for a MediaWiki discussion like this (we're dealing with HTML DOM here, the wikitext
* syntax is just for illustration):
*
* == A ==
* B. ~~~~
* : C.
* : C. ~~~~
* :: D. ~~~~
* ::: E. ~~~~
* ::: F. ~~~~
* : G. ~~~~
* H. ~~~~
* : I. ~~~~
*
* This function would return a structure like:
*
* [
* { type: 'heading', level: 0, range: (h2: A), replies: [
* { type: 'comment', level: 1, range: (p: B), replies: [
* { type: 'comment', level: 2, range: (li: C, li: C), replies: [
* { type: 'comment', level: 3, range: (li: D), replies: [
* { type: 'comment', level: 4, range: (li: E), replies: [] },
* { type: 'comment', level: 4, range: (li: F), replies: [] },
* ] },
* ] },
* { type: 'comment', level: 2, range: (li: G), replies: [] },
* ] },
* { type: 'comment', level: 1, range: (p: H), replies: [
* { type: 'comment', level: 2, range: (li: I), replies: [] },
* ] },
* ] },
* ]
*
* @param {Object} comments Result of #getComments
* @return {Object[]} Tree structure of comments, using the same objects as `comments`. Top-level
* items are the headings. The following properties are added:
* @return {string} return.id Unique ID (within the page) for this comment, intended to be used to
* find this comment in other revisions of the same page
* @return {Object[]} return.replies Comment objects which are replies to this comment
* @return {Object|null} return.parent Comment object which this is a reply to (null for headings)
*/
function groupThreads( comments ) {
var
threads = [],
replies = [],
commentsById = {},
i, comment, id, number;
for ( i = 0; i < comments.length; i++ ) {
comment = comments[ i ];
if ( comment.level === 0 ) {
// We don't need ids for section headings right now, but we might in the future
// e.g. if we allow replying directly to sections (adding top-level comments)
id = null;
} else {
// username+timestamp
id = [
comment.author || '',
comment.timestamp.toISOString()
].join( '|' );
// If there would be multiple comments with the same ID (i.e. the user left multiple comments
// in one edit, or within a minute), append sequential numbers
number = 0;
while ( commentsById[ id + '|' + number ] ) {
number++;
}
id = id + '|' + number;
}
if ( id ) {
commentsById[ id ] = comment;
}
// This modifies the original objects in `comments`!
comment.id = id;
comment.replies = [];
comment.parent = null;
if ( replies.length < comment.level ) {
// Someone skipped an indentation level (or several). Pretend that the previous reply
// covers multiple indentation levels, so that following comments get connected to it.
comment.warnings = comment.warnings || [];
comment.warnings.push( 'Comment skips indentation level' );
while ( replies.length < comment.level ) {
replies[ replies.length ] = replies[ replies.length - 1 ];
}
}
if ( comment.level === 0 ) {
// New root (thread)
threads.push( comment );
} else if ( replies[ comment.level - 1 ] ) {
// Add as a reply to the closest less-nested comment
comment.parent = replies[ comment.level - 1 ];
comment.parent.replies.push( comment );
} else {
comment.warnings = comment.warnings || [];
comment.warnings.push( 'Comment could not be connected to a thread' );
}
replies[ comment.level ] = comment;
// Cut off more deeply nested replies
replies.length = comment.level + 1;
}
return threads;
}
/**
* Get the list of authors involved in a comment and its replies.
*
* You probably want to pass a thread root here (a heading).
*
* @param {Object} comment Comment object, as returned by #groupThreads
* @return {string[]} Author usernames
*/
function getAuthors( comment ) {
var authors = {};
function getAuthorSet( comment ) {
if ( comment.author ) {
authors[ comment.author ] = true;
}
// Get the set of authors in the same format from each reply
comment.replies.map( getAuthorSet );
}
getAuthorSet( comment );
return Object.keys( authors ).sort();
}
/**
* Get the name of the page from which this comment is transcluded (if any).
*
* @param {Object} comment Comment object, as returned by #groupThreads
* @return {string|boolean} `false` if this comment is not transcluded. A string if it's transcluded
* from a single page (the page title, in text form with spaces). `true` if it's transcluded, but
* we can't determine the source.
*/
function getTranscludedFrom( comment ) {
var node, about, dataMw;
// If some template is used within the comment (e.g. {{ping|…}} or {{tl|…}}), that *does not* mean
// the comment is transcluded. We only want to consider comments to be transcluded if the wrapper
// element (usually <li> or <p>) is marked as part of a transclusion.
// TODO: This seems to work fine but I'm having a hard time explaining why it is correct...
node = comment.range.endContainer;
// Find the node containing information about the transclusion:
// 1. Find the closest ancestor with an 'about' attribute
// 2. Find the main node of the about-group (first sibling with the same 'about' attribute)
// 3. If this is an mw:Transclusion node, return it; otherwise, go to step 1
while ( node ) {
// 1.
if (
node.nodeType === Node.ELEMENT_NODE &&
node.getAttribute( 'about' ) &&
/^#mwt\d+$/.test( node.getAttribute( 'about' ) )
) {
about = node.getAttribute( 'about' );
// 2.
while (
node.previousSibling &&
node.previousSibling.nodeType === Node.ELEMENT_NODE &&
node.previousSibling.getAttribute( 'about' ) === about
) {
node = node.previousSibling;
}
// 3.
if (
node.getAttribute( 'typeof' ) &&
node.getAttribute( 'typeof' ).split( ' ' ).indexOf( 'mw:Transclusion' ) !== -1
) {
break;
}
}
node = node.parentNode;
}
if ( !node ) {
// No mw:Transclusion node found, this comment is not transcluded
return false;
}
dataMw = JSON.parse( node.getAttribute( 'data-mw' ) );
// Only return a page name if this is a simple single-template transclusion.
if (
dataMw.parts &&
dataMw.parts.length === 1 &&
dataMw.parts[ 0 ].template &&
dataMw.parts[ 0 ].template.target.href
) {
// Slice off the './' prefix and convert to text form (underscores to spaces, URL-decoded)
return mw.libs.ve.normalizeParsoidResourceName( dataMw.parts[ 0 ].template.target.href );
}
// Multi-template transclusion, or a parser function call, or template-affected wikitext outside
// of a template call, or a mix of the above
return true;
}
module.exports = {
findTimestamps: findTimestamps,
getLocalTimestampParser: getLocalTimestampParser,
getTimestampRegexp: getTimestampRegexp,
getTimestampParser: getTimestampParser,
getComments: getComments,
groupThreads: groupThreads,
findSignature: findSignature,
getAuthors: getAuthors,
getTranscludedFrom: getTranscludedFrom
};