mediawiki-extensions-Discus.../modules/parser.js

/* eslint-disable no-console */
'use strict';

var
	utils = require( './utils.js' ),
	// DiscussionToolsHooks::getLocalData()
	data = require( './parser/data.json' ),
	moment = require( './lib/moment-timezone/moment-timezone-with-data-1970-2030.js' );

/**
 * Utilities for detecting and parsing components of discussion pages: signatures, timestamps,
 * comments and threads.
 *
 * @class mw.dt.parser
 */

/**
 * Look up the content-language text of each of the given localisation messages.
 *
 * The texts are read from `data.contLangMessages`; a key that has no entry there yields
 * `undefined` at the corresponding position.
 *
 * @private
 * @param {string[]} messages Message keys
 * @return {string[]} Message texts, in the same order as the keys
 */
function getMessages( messages ) {
	var texts = [], i;
	for ( i = 0; i < messages.length; i++ ) {
		texts.push( data.contLangMessages[ messages[ i ] ] );
	}
	return texts;
}

/**
 * Build the source of a regexp matching timestamps written in the given date format.
 *
 * Only the format characters used by the default date format of some MediaWiki language are
 * supported: D, d, F, G, H, i, j, l, M, n, Y, xg, xkY (plus `xx`, backslash escapes and quoted
 * literals). Only dates from the time MediaWiki existed are considered, let's say 2000 onwards
 * (Thai dates before 1941 are complicated).
 *
 * Every supported format code contributes one capturing group, in format order; the final
 * capturing group is the timezone abbreviation.
 *
 * @param {string} format Date format, as used by MediaWiki
 * @param {string} digitsRegexp Regular expression matching a single localised digit, e.g. `[0-9]`
 * @param {Object} tzAbbrs Map of localised timezone abbreviations to IANA abbreviations
 *   for the local timezone, e.g. `{EDT: "EDT", EST: "EST"}`
 * @return {string} Regular expression
 */
function getTimestampRegexp( format, digitsRegexp, tzAbbrs ) {
	var
		hasOwn = Object.prototype.hasOwnProperty,
		// Format codes that stand for a localised name, with the message keys of all possible names
		messageKeys = {
			xg: [
				'january-gen', 'february-gen', 'march-gen', 'april-gen', 'may-gen', 'june-gen',
				'july-gen', 'august-gen', 'september-gen', 'october-gen', 'november-gen',
				'december-gen'
			],
			D: [ 'sun', 'mon', 'tue', 'wed', 'thu', 'fri', 'sat' ],
			l: [
				'sunday', 'monday', 'tuesday', 'wednesday', 'thursday',
				'friday', 'saturday'
			],
			F: [
				'january', 'february', 'march', 'april', 'may_long', 'june',
				'july', 'august', 'september', 'october', 'november',
				'december'
			],
			M: [
				'jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug',
				'sep', 'oct', 'nov', 'dec'
			]
		},
		// Format codes that stand for a number, with the allowed number of digits
		digitCounts = {
			d: '2',
			j: '1,2',
			n: '1,2',
			Y: '4',
			xkY: '4',
			G: '1,2',
			H: '2',
			i: '2'
		},
		pieces = [],
		pos = 0,
		token, closingQuote;

	// Capturing group matching any one of the given literal strings
	function alternation( literals ) {
		return '(' + literals.map( mw.util.escapeRegExp ).join( '|' ) + ')';
	}

	// Adapted from Language::sprintfDate()
	while ( pos < format.length ) {
		// Read one format code: a single character, or a two/three-character code starting with 'x'
		token = format[ pos ];
		if ( token === 'x' && pos < format.length - 1 ) {
			token += format[ ++pos ];
		}
		if ( token === 'xk' && pos < format.length - 1 ) {
			token += format[ ++pos ];
		}

		if ( hasOwn.call( digitCounts, token ) ) {
			pieces.push( '(' + digitsRegexp + '{' + digitCounts[ token ] + '}' + ')' );
		} else if ( hasOwn.call( messageKeys, token ) ) {
			pieces.push( alternation( getMessages( messageKeys[ token ] ) ) );
		} else if ( token === 'xx' ) {
			// Escaped literal 'x'
			pieces.push( 'x' );
		} else if ( token === '\\' ) {
			// Backslash escaping: the next character is literal (a trailing backslash is itself literal)
			pieces.push( mw.util.escapeRegExp( pos < format.length - 1 ? format[ ++pos ] : '\\' ) );
		} else if ( token === '"' ) {
			// Quoted literal; a quote with no terminating partner is assumed to be a literal "
			closingQuote = pos < format.length - 1 ? format.indexOf( '"', pos + 1 ) : -1;
			if ( closingQuote === -1 ) {
				pieces.push( '"' );
			} else {
				pieces.push( mw.util.escapeRegExp( format.substr( pos + 1, closingQuote - pos - 1 ) ) );
				pos = closingQuote;
			}
		} else {
			// Anything else is copied literally (for unknown 'x…' codes, only their last character)
			pieces.push( mw.util.escapeRegExp( format[ pos ] ) );
		}
		pos++;
	}

	// Hardcoded parentheses and space like in Parser::pstPass2
	// Ignore some invisible Unicode characters that often sneak into copypasted timestamps (T245784)
	return pieces.join( '' ) +
		'[\\u200E\\u200F]? [\\u200E\\u200F]?\\(' + alternation( Object.keys( tzAbbrs ) ) + '\\)';
}

/**
 * Build a function that converts a timestamp regexp match into a date.
 *
 * The returned function expects match data produced by the regexp from #getTimestampRegexp for
 * the same `format`: one capturing group per supported format code, in format order, followed by
 * a last group holding the timezone abbreviation.
 *
 * @param {string} format Date format, as used by MediaWiki
 * @param {string|null} digits Localised digits from 0 to 9, e.g. `0123456789`
 * @param {string} localTimezone Local timezone IANA name, e.g. `America/New_York`
 * @param {Object} tzAbbrs Map of localised timezone abbreviations to IANA abbreviations
 *   for the local timezone, e.g. `{EDT: "EDT", EST: "EST"}`
 * @return {Function} Parser function
 * @return {Array} return.match Regexp match data
 * @return {Object} return.return Moment object
 */
function getTimestampParser( format, digits, localTimezone, tzAbbrs ) {
	var
		// Format codes for which the timestamp regexp contains a capturing group
		capturingCodes = [ 'xg', 'd', 'j', 'D', 'l', 'F', 'M', 'n', 'Y', 'xkY', 'G', 'H', 'i' ],
		// Message keys of the month names, for each format code that stands for a month name
		monthKeys = {
			xg: [
				'january-gen', 'february-gen', 'march-gen', 'april-gen', 'may-gen', 'june-gen',
				'july-gen', 'august-gen', 'september-gen', 'october-gen', 'november-gen',
				'december-gen'
			],
			F: [
				'january', 'february', 'march', 'april', 'may_long', 'june',
				'july', 'august', 'september', 'october', 'november',
				'december'
			],
			M: [
				'jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug',
				'sep', 'oct', 'nov', 'dec'
			]
		},
		// Format code of every capturing group, in the order the groups appear in the regexp
		groupCodes = [],
		pos, token, closingQuote;

	// Walk the format the same way as the regexp builder does, only recording the capturing codes
	for ( pos = 0; pos < format.length; pos++ ) {
		token = format[ pos ];
		if ( token === 'x' && pos < format.length - 1 ) {
			token += format[ ++pos ];
		}
		if ( token === 'xk' && pos < format.length - 1 ) {
			token += format[ ++pos ];
		}

		if ( capturingCodes.indexOf( token ) !== -1 ) {
			groupCodes.push( token );
		} else if ( token === '\\' ) {
			// Backslash escaping: skip the escaped character
			if ( pos < format.length - 1 ) {
				pos++;
			}
		} else if ( token === '"' && pos < format.length - 1 ) {
			// Quoted literal: skip to the closing quote, if there is one
			closingQuote = format.indexOf( '"', pos + 1 );
			if ( closingQuote !== -1 ) {
				pos = closingQuote;
			}
		}
	}

	// Replace localised digits with the ASCII digits they stand for
	function toAsciiDigits( text ) {
		if ( !digits ) {
			return text;
		}
		return text.replace(
			new RegExp( '[' + digits + ']', 'g' ),
			function ( digit ) {
				return digits.indexOf( digit );
			}
		);
	}

	function toNumber( text ) {
		return Number( toAsciiDigits( text ) );
	}

	return function timestampParser( match ) {
		var
			year = 0,
			monthIdx = 0,
			day = 0,
			hour = 0,
			minute = 0,
			expectedAbbr, groupIdx, groupCode, groupText, date;

		function parseInLocalTimezone() {
			return moment.tz( [ year, monthIdx, day, hour, minute ], localTimezone );
		}

		for ( groupIdx = 0; groupIdx < groupCodes.length; groupIdx++ ) {
			groupCode = groupCodes[ groupIdx ];
			groupText = match[ groupIdx + 1 ];

			switch ( groupCode ) {
				case 'xg':
				case 'F':
				case 'M':
					monthIdx = getMessages( monthKeys[ groupCode ] ).indexOf( groupText );
					break;
				case 'n':
					monthIdx = toNumber( groupText ) - 1;
					break;
				case 'd':
				case 'j':
					day = toNumber( groupText );
					break;
				case 'Y':
					year = toNumber( groupText );
					break;
				case 'xkY':
					// Thai year
					year = toNumber( groupText ) - 543;
					break;
				case 'G':
				case 'H':
					hour = toNumber( groupText );
					break;
				case 'i':
					minute = toNumber( groupText );
					break;
				case 'D':
				case 'l':
					// Day of the week - unused
					break;
				default:
					throw new Error( 'Not implemented' );
			}
		}

		// The last matching group is the timezone abbreviation
		expectedAbbr = tzAbbrs[ match[ match.length - 1 ] ];

		// Most of the time, the timezone abbreviation is not necessary to parse the date, since we
		// can assume all times are in the wiki's local timezone.
		date = parseInLocalTimezone();

		// But during the "fall back" at the end of DST, some times will happen twice. Per the docs,
		// "Moment Timezone handles this by always using the earlier instance of a duplicated hour."
		// https://momentjs.com/timezone/docs/#/using-timezones/parsing-ambiguous-inputs/
		if ( date.zoneAbbr() === expectedAbbr ) {
			return date;
		}

		// The timezone abbreviation disambiguates the DST/non-DST times, so here we know that the
		// default behavior was incorrect, and force the other parsing. I can't find proper
		// documentation for this feature, but this pull request explains it:
		// https://github.com/moment/moment-timezone/pull/101
		moment.tz.moveAmbiguousForward = true;
		date = parseInLocalTimezone();
		moment.tz.moveAmbiguousForward = false;

		if ( date.zoneAbbr() === expectedAbbr ) {
			console.log( 'Ambiguous time at DST switchover was parsed: ' + match[ 0 ] );
		} else {
			// This should not be possible for "genuine" timestamps generated by MediaWiki.
			// But bots and humans get it wrong when marking up unsigned comments…
			// https://pl.wikipedia.org/w/index.php?title=Wikipedia:Kawiarenka/Artykuły&diff=prev&oldid=54772606
			console.log( 'Timestamp has timezone abbreviation for the wrong time: ' + match[ 0 ] );
		}

		return date;
	};
}

/**
 * Build the regexp matching timestamps in the current wiki's date format.
 *
 * This is #getTimestampRegexp applied to the date format and timezone abbreviations from the
 * local data. Localised digits are matched only if the wiki translates numerals
 * (`wgTranslateNumerals`); otherwise `\d` is used.
 *
 * @private
 * @return {string} Regular expression
 */
function getLocalTimestampRegexp() {
	var digitsRegexp;
	if ( mw.config.get( 'wgTranslateNumerals' ) ) {
		digitsRegexp = '[' + data.digits + ']';
	} else {
		digitsRegexp = '\\d';
	}
	return getTimestampRegexp( data.dateFormat, digitsRegexp, data.timezones );
}

/**
 * Build the function that parses timestamps in the current wiki's date format, from the result of
 * matching the regexp returned by #getLocalTimestampRegexp.
 *
 * This is #getTimestampParser applied to the date format, timezone and timezone abbreviations
 * from the local data. Localised digits are passed on only if the wiki translates numerals
 * (`wgTranslateNumerals`).
 *
 * @private
 * @return {Function} Parser function
 * @return {Array} return.match Regexp match data
 * @return {Object} return.return Moment object, as produced by the #getTimestampParser parser
 */
function getLocalTimestampParser() {
	var digits = null;
	if ( mw.config.get( 'wgTranslateNumerals' ) ) {
		digits = data.digits;
	}
	return getTimestampParser( data.dateFormat, digits, data.localTimezone, data.timezones );
}

/**
 * Node filter for document.createTreeWalker that leaves out the parts of the page in which we
 * don't want to detect comments (or section headings).
 *
 * Currently that is only the table of contents (the node with the id `toc`), whose own heading
 * would otherwise be erroneously detected as a section.
 *
 * @param {Node} node
 * @return {number} Appropriate NodeFilter constant
 */
function acceptOnlyNodesAllowingComments( node ) {
	return node.id === 'toc' ? NodeFilter.FILTER_REJECT : NodeFilter.FILTER_ACCEPT;
}

/**
 * Collect the timestamps found in the text nodes of a DOM subtree.
 *
 * @param {HTMLElement} rootNode Node to search
 * @return {Array[]} Results. Each result is a two-element array.
 * @return {Text} return.0 Text node containing the timestamp
 * @return {Array} return.1 Regexp match data, which specifies the location of the match, and which
 *   can be parsed using #getLocalTimestampParser
 */
function findTimestamps( rootNode ) {
	var
		found = [],
		walker = rootNode.ownerDocument.createTreeWalker(
			rootNode,
			NodeFilter.SHOW_TEXT,
			acceptOnlyNodesAllowingComments,
			false
		),
		timestampRegexp = getLocalTimestampRegexp(),
		textNode, matchData;

	function isEntityElement( candidate ) {
		return candidate &&
			candidate.nodeType === Node.ELEMENT_NODE &&
			candidate.getAttribute( 'typeof' ) === 'mw:Entity';
	}

	// In Parsoid HTML, entities are represented as a 'mw:Entity' node, rather than normal HTML
	// entities. On Arabic Wikipedia, the "UTC" timezone name contains some non-breaking spaces,
	// which apparently are often turned into &nbsp; entities by buggy editing tools. To handle
	// this, we must piece together the text, so that our regexp can match those timestamps.
	function textWithEntities( firstTextNode ) {
		var
			text = '',
			current = firstTextNode,
			entity, afterEntity;
		while ( current ) {
			text += current.nodeValue;
			entity = current.nextSibling;
			if ( !isEntityElement( entity ) ) {
				break;
			}
			text += entity.firstChild.nodeValue;
			// If the entity is followed by more text, do this again
			afterEntity = entity.nextSibling;
			current = afterEntity && afterEntity.nodeType === Node.TEXT_NODE ? afterEntity : null;
		}
		return text;
	}

	for ( textNode = walker.nextNode(); textNode; textNode = walker.nextNode() ) {
		// Technically, there could be multiple matches in a single text node. However, the ultimate
		// point of this is to find the signatures which precede the timestamps, and any later
		// timestamps in the text node can't be directly preceded by a signature (as we require them to
		// have links), so we only concern ourselves with the first match.
		matchData = textWithEntities( textNode ).match( timestampRegexp );
		if ( matchData ) {
			found.push( [ textNode, matchData ] );
		}
	}
	return found;
}

/**
 * Get a MediaWiki page title from a URL.
 *
 * The title is taken from the part of the path matching `wgArticlePath`, or failing that, from
 * the `title` query parameter. URLs that can't be parsed or decoded are treated as not being
 * links to a page.
 *
 * @private
 * @param {string} url
 * @return {mw.Title|null} Page title, or null if this isn't a link to a page
 */
function getTitleFromUrl( url ) {
	var uri, articlePathRegexp, match, pageName;

	try {
		uri = new mw.Uri( url );
	} catch ( err ) {
		// T106244: URL encoded values using fallback 8-bit encoding (invalid UTF-8) cause mediawiki.Uri to crash
		return null;
	}
	articlePathRegexp = new RegExp(
		mw.util.escapeRegExp( mw.config.get( 'wgArticlePath' ) )
			.replace( mw.util.escapeRegExp( '$1' ), '(.*)' )
	);

	if ( ( match = uri.path.match( articlePathRegexp ) ) ) {
		try {
			pageName = decodeURIComponent( match[ 1 ] );
		} catch ( err ) {
			// decodeURIComponent() throws URIError on malformed percent-escapes (e.g. invalid UTF-8
			// like '%E9'). A single bad link must not abort parsing of the whole page.
			return null;
		}
		return mw.Title.newFromText( pageName );
	}
	if ( uri.query.title ) {
		return mw.Title.newFromText( uri.query.title );
	}
	return null;
}

/**
 * Find a user signature preceding a timestamp.
 *
 * The signature includes the timestamp node.
 *
 * A signature must contain at least one link to the user's userpage, discussion page or
 * contributions (and may contain other links). The link may be nested in other elements.
 *
 * The search walks backwards over the previous siblings of the timestamp node, and stops at
 * `until`, at the first sibling, or once the scanned text exceeds `data.signatureScanLimit`.
 *
 * @private
 * @param {Text} timestampNode Text node
 * @param {Node} [until] Node to stop searching at
 * @return {Array} Result, a two-element array
 * @return {Node[]} return.0 Sibling nodes comprising the signature, in reverse order (with
 *   `timestampNode` as the first element)
 * @return {string|null} return.1 Username, null for unsigned comments
 */
function findSignature( timestampNode, until ) {
	var
		node = timestampNode,
		sigNodes = [ node ],
		sigUsername = null,
		length = 0,
		lastLinkNode = timestampNode,
		links;

	// Get the username that a link points to (via the user page, user talk page or contributions
	// page), or null if it is not a link to any of a user's pages
	function getLinkedUsername( link ) {
		var
			namespaceIds = mw.config.get( 'wgNamespaceIds' ),
			title = getTitleFromUrl( link.href ),
			username, userTitle;
		if ( !title ) {
			return null;
		}
		if (
			title.getNamespaceId() === namespaceIds.user ||
			title.getNamespaceId() === namespaceIds.user_talk
		) {
			username = title.getMainText();
			if ( username.indexOf( '/' ) !== -1 ) {
				// Subpage of a user page, not the user page itself
				return null;
			}
		} else if (
			title.getNamespaceId() === namespaceIds.special &&
			title.getMainText().split( '/' )[ 0 ] === data.specialContributionsName
		) {
			username = title.getMainText().split( '/' )[ 1 ];
			if ( !username ) {
				// Link to the contributions page without a user name. Bail out before normalizing,
				// otherwise the missing name would be turned into the username "Undefined".
				return null;
			}
			// Users may link to their contributions with non-standard name
			userTitle = mw.Title.makeTitle( namespaceIds.user, username );
			if ( !userTitle ) {
				// Not a valid title, so not a valid username either
				return null;
			}
			username = userTitle.getMainText();
		}
		if ( !username ) {
			return null;
		}
		if ( mw.util.isIPv6Address( username ) ) {
			// Bot-generated links "Preceding unsigned comment added by" have non-standard case
			username = username.toUpperCase();
		}
		return username;
	}

	// Whether a link points to the signing user. The first user link found (i.e. the one closest
	// to the timestamp) determines who the signing user is.
	function isLinkToSignatureUser( link ) {
		var username = getLinkedUsername( link );
		if ( !username ) {
			return false;
		}
		// Check that every link points to the same user
		if ( !sigUsername ) {
			sigUsername = username;
		}
		return username === sigUsername;
	}

	while ( ( node = node.previousSibling ) && length < data.signatureScanLimit && node !== until ) {
		sigNodes.push( node );
		length += ( node.textContent || '' ).length;
		if ( !node.tagName ) {
			// Not an element, so it can't be or contain a link
			continue;
		}
		links = [];
		if ( node.tagName.toLowerCase() === 'a' ) {
			links.push( node );
		} else {
			// Handle links nested in formatting elements.
			// Helpful accidental feature: users whose signature is not detected in full (due to
			// text formatting) can just wrap it in a <span> to fix that.
			// "Ten Pound Hammer • (What did I screw up now?)"
			// "« Saper // dyskusja »"
			links.push.apply( links, node.getElementsByTagName( 'a' ) );
		}
		if ( !links.length ) {
			continue;
		}
		// Use .some() rather than .every() to permit vanity links
		// "TonyTheTiger (T / C / WP:FOUR / WP:CHICAGO / WP:WAWARD)"
		if ( links.some( isLinkToSignatureUser ) ) {
			lastLinkNode = node;
		}
		// Keep looking if a node with links wasn't a link to a user page
		// "Doc James (talk · contribs · email)"
	}
	// Pop excess text nodes
	while ( sigNodes[ sigNodes.length - 1 ] !== lastLinkNode ) {
		sigNodes.pop();
	}
	return [ sigNodes, sigUsername ];
}

/**
 * Compute how deeply a node is indented, relative to an ancestor node.
 *
 * The indent level is the number of list items (`<li>` or `<dd>`) among the node itself and its
 * ancestors, up to but excluding `rootNode`.
 *
 * @private
 * @param {Node} node
 * @param {HTMLElement} rootNode Node to stop counting at
 * @return {number}
 */
function getIndentLevel( node, rootNode ) {
	var depth = 0, current, name;
	for ( current = node; current && current !== rootNode; current = current.parentNode ) {
		name = current.tagName && current.tagName.toLowerCase();
		if ( [ 'li', 'dd' ].indexOf( name ) !== -1 ) {
			depth++;
		}
	}
	return depth;
}

/**
 * Strip leading and trailing ASCII whitespace, as defined in the HTML spec (tab, line feed,
 * form feed, carriage return and space). Other whitespace, e.g. non-breaking spaces, is kept.
 *
 * @param {string} str
 * @return {string}
 */
function htmlTrim( str ) {
	// https://infra.spec.whatwg.org/#ascii-whitespace
	return str.replace( /^[\t\n\f\r ]+|[\t\n\f\r ]+$/g, '' );
}

/**
 * Find the next leaf node in the tree order that is not an empty or whitespace-only text node.
 *
 * In other words, the result is a Text node with content other than whitespace, or an Element
 * node with no children, that follows the given node in the HTML source.
 *
 * The result is whatever the tree walker's `currentNode` is after advancing it once from `node`.
 * NOTE(review): when there is no further matching node, that would presumably be `node` itself
 * rather than null — confirm against the TreeWalker behaviour.
 *
 * @private
 * @param {Node} node Node to start searching at. If it isn't a leaf node, its children are ignored.
 * @param {HTMLElement} rootNode Node to stop searching at
 * @return {Node|null}
 */
function nextInterestingLeafNode( node, rootNode ) {
	var walker;

	function interestingLeavesOnly( candidate ) {
		// Ignore the starting node and its descendants
		// (unless it's the root node, this is a special case for "fakeHeading" handling)
		var isInsideStartNode = node !== rootNode &&
			( candidate === node || candidate.parentNode === node );
		if ( isInsideStartNode ) {
			return NodeFilter.FILTER_REJECT;
		}
		if ( candidate.nodeType === Node.TEXT_NODE ) {
			return htmlTrim( candidate.textContent ) !== '' ?
				NodeFilter.FILTER_ACCEPT :
				NodeFilter.FILTER_SKIP;
		}
		if ( candidate.nodeType === Node.ELEMENT_NODE && !candidate.firstChild ) {
			return NodeFilter.FILTER_ACCEPT;
		}
		return NodeFilter.FILTER_SKIP;
	}

	walker = rootNode.ownerDocument.createTreeWalker(
		rootNode,
		// eslint-disable-next-line no-bitwise
		NodeFilter.SHOW_ELEMENT | NodeFilter.SHOW_TEXT,
		interestingLeavesOnly,
		false
	);
	walker.currentNode = node;
	walker.nextNode();
	return walker.currentNode;
}

/**
 * Get all discussion comments (and headings) within a DOM subtree.
 *
 * This returns a flat list, use #groupThreads to associate replies to original messages and get a
 * tree structure starting at section headings.
 *
 * For example, for a MediaWiki discussion like this (we're dealing with HTML DOM here, the wikitext
 * syntax is just for illustration):
 *
 *     == A ==
 *     B. ~~~~
 *     : C.
 *     : C. ~~~~
 *     :: D. ~~~~
 *     ::: E. ~~~~
 *     ::: F. ~~~~
 *     : G. ~~~~
 *     H. ~~~~
 *     : I. ~~~~
 *
 * This function would return a structure like:
 *
 *     [
 *       { type: 'heading', level: 0, range: (h2: A)        },
 *       { type: 'comment', level: 1, range: (p: B)         },
 *       { type: 'comment', level: 2, range: (li: C, li: C) },
 *       { type: 'comment', level: 3, range: (li: D)        },
 *       { type: 'comment', level: 4, range: (li: E)        },
 *       { type: 'comment', level: 4, range: (li: F)        },
 *       { type: 'comment', level: 2, range: (li: G)        },
 *       { type: 'comment', level: 1, range: (p: H)         },
 *       { type: 'comment', level: 2, range: (li: I)        }
 *     ]
 *
 * If the first item found is a comment rather than a heading, a placeholder heading (with an
 * empty range at the start of `rootNode`) is inserted at the beginning of the list.
 *
 * @param {HTMLElement} rootNode
 * @return {Object[]} Results. Each result is an object.
 * @return {string} return.type `heading` or `comment`
 * @return {Object} return.range Object describing the extent of the comment, including the
 *   signature and timestamp. It has the same properties as a Range object: `startContainer`,
 *   `startOffset`, `endContainer`, `endOffset` (we don't use a real Range because they change
 *   magically when the DOM structure changes).
 * @return {Object[]} [return.signatureRanges] Objects describing the extent of signatures (plus
 *   timestamps) for this comment. There is always at least one signature, but there may be
 *   multiple. The author and timestamp of the comment is determined from the first signature.
 *   The last node in every signature range is the text node containing the timestamp.
 * @return {number} return.level Indentation level of the comment. Headings are `0`, comments start
 *   at `1`.
 * @return {Object} [return.timestamp] Timestamp (Moment object), undefined for headings
 * @return {string} [return.author] Comment author's username, undefined for headings
 * @return {boolean} [return.placeholderHeading] `true` on the placeholder heading, undefined for
 *   everything else
 */
function getComments( rootNode ) {
	var
		dfParser = getLocalTimestampParser(),
		comments = [],
		timestamps, nextTimestamp, treeWalker,
		node, range, fakeHeading, curComment,
		foundSignature, firstSigNode, sigRange, author, startNode, match, startLevel, endLevel;

	// Locate all timestamps up front. The walk below visits nodes in the same (document) order,
	// so the timestamps are consumed one by one using the `nextTimestamp` index.
	timestamps = findTimestamps( rootNode );

	treeWalker = rootNode.ownerDocument.createTreeWalker(
		rootNode,
		// eslint-disable-next-line no-bitwise
		NodeFilter.SHOW_ELEMENT | NodeFilter.SHOW_TEXT,
		acceptOnlyNodesAllowingComments,
		false
	);

	// Placeholder heading in case there are comments in the 0th section
	range = {
		startContainer: rootNode,
		startOffset: 0,
		endContainer: rootNode,
		endOffset: 0
	};
	fakeHeading = {
		placeholderHeading: true,
		type: 'heading',
		range: range,
		level: 0
	};

	// The most recently found heading or comment. The next comment starts right after its end.
	curComment = fakeHeading;

	nextTimestamp = 0;
	while ( ( node = treeWalker.nextNode() ) ) {
		if ( node.tagName && node.tagName.match( /^h[1-6]$/i ) ) {
			// Section heading: its range covers all of the heading element's children
			range = {
				startContainer: node,
				startOffset: 0,
				endContainer: node,
				endOffset: node.childNodes.length
			};
			curComment = {
				type: 'heading',
				range: range,
				level: 0
			};
			comments.push( curComment );
		} else if ( timestamps[ nextTimestamp ] && node === timestamps[ nextTimestamp ][ 0 ] ) {
			// Text node containing the next timestamp. Look for the signature before it, but don't
			// look further back than the end of the previous comment (or heading).
			foundSignature = findSignature( node, curComment.range.endContainer );
			author = foundSignature[ 1 ];
			// The signature nodes are in reverse order, so the last one is where the signature begins
			firstSigNode = foundSignature[ 0 ][ foundSignature[ 0 ].length - 1 ];

			if ( !author ) {
				// Ignore timestamps for which we couldn't find a signature. It's probably not a real
				// comment, but just a false match due to a copypasted timestamp.
				nextTimestamp++;
				continue;
			}

			// Everything from last comment up to here is the next comment
			startNode = nextInterestingLeafNode( curComment.range.endContainer, rootNode );
			match = timestamps[ nextTimestamp ][ 1 ];
			// The comment ends in the timestamp's text node, right after the matched timestamp text
			range = {
				startContainer: startNode.parentNode,
				startOffset: Array.prototype.indexOf.call( startNode.parentNode.childNodes, startNode ),
				endContainer: node,
				endOffset: match.index + match[ 0 ].length
			};
			sigRange = {
				startContainer: firstSigNode.parentNode,
				startOffset: Array.prototype.indexOf.call( firstSigNode.parentNode.childNodes, firstSigNode ),
				endContainer: node,
				endOffset: match.index + match[ 0 ].length
			};

			// Comment levels start at 1 (0 is for headings), hence the `+ 1`
			startLevel = getIndentLevel( startNode, rootNode ) + 1;
			endLevel = getIndentLevel( node, rootNode ) + 1;
			if ( startLevel !== endLevel ) {
				console.log( 'Comment starts and ends with different indentation', startNode, node );
			}

			// Avoid generating multiple comments when there is more than one signature on a single "line".
			// Often this is done when someone edits their comment later and wants to add a note about that.
			// (Or when another person corrects a typo, or strikes out a comment, etc.) Multiple comments
			// within one paragraph/listitem result in a confusing double "Reply" button, and we also have
			// no way to indicate which one you're replying to (this might matter in the future for
			// notifications or something).
			if (
				curComment.type === 'comment' &&
				( utils.closestElement( node, [ 'li', 'dd', 'p' ] ) || node.parentNode ) ===
					( utils.closestElement( curComment.range.endContainer, [ 'li', 'dd', 'p' ] ) || curComment.range.endContainer.parentNode )
			) {
				// Merge this with the previous comment. Use that comment's author and timestamp.
				curComment.range.endContainer = range.endContainer;
				curComment.range.endOffset = range.endOffset;
				curComment.signatureRanges.push( sigRange );
				curComment.level = Math.min( Math.min( startLevel, endLevel ), curComment.level );

				nextTimestamp++;
				continue;
			}

			curComment = {
				type: 'comment',
				timestamp: dfParser( match ),
				author: author,
				range: range,
				signatureRanges: [ sigRange ],
				// Should this use the indent level of `startNode` or `node`?
				level: Math.min( startLevel, endLevel )
			};
			comments.push( curComment );
			nextTimestamp++;
		}
	}

	// Insert the fake placeholder heading if there are any comments in the 0th section
	// (before the first real heading)
	if ( comments.length && comments[ 0 ].type !== 'heading' ) {
		comments.unshift( fakeHeading );
	}

	return comments;
}

/**
 * Turn the flat list of headings and comments into threads, connecting every reply to the message
 * it replies to.
 *
 * Each thread must begin with a heading. Original messages in the thread are treated as replies to
 * its heading. Other replies are associated based on the order and indentation level.
 *
 * Note that the objects in `comments` are extended in-place with the additional data.
 *
 * For example, for a MediaWiki discussion like this (we're dealing with HTML DOM here, the wikitext
 * syntax is just for illustration):
 *
 *     == A ==
 *     B. ~~~~
 *     : C.
 *     : C. ~~~~
 *     :: D. ~~~~
 *     ::: E. ~~~~
 *     ::: F. ~~~~
 *     : G. ~~~~
 *     H. ~~~~
 *     : I. ~~~~
 *
 * This function would return a structure like:
 *
 *     [
 *       { type: 'heading', level: 0, range: (h2: A), replies: [
 *         { type: 'comment', level: 1, range: (p: B), replies: [
 *           { type: 'comment', level: 2, range: (li: C, li: C), replies: [
 *             { type: 'comment', level: 3, range: (li: D), replies: [
 *               { type: 'comment', level: 4, range: (li: E), replies: [] },
 *               { type: 'comment', level: 4, range: (li: F), replies: [] },
 *             ] },
 *           ] },
 *           { type: 'comment', level: 2, range: (li: G), replies: [] },
 *         ] },
 *         { type: 'comment', level: 1, range: (p: H), replies: [
 *           { type: 'comment', level: 2, range: (li: I), replies: [] },
 *         ] },
 *       ] },
 *     ]
 *
 * @param {Object[]} comments Result of #getComments
 * @return {Object[]} Tree structure of comments, using the same objects as `comments`. Top-level
 *   items are the headings. The following properties are added:
 * @return {string|null} return.id Unique ID (within the page) for this comment, intended to be used
 *   to find this comment in other revisions of the same page (null for headings)
 * @return {Object[]} return.replies Comment objects which are replies to this comment
 * @return {Object|null} return.parent Comment object which this is a reply to (null for headings)
 */
function groupThreads( comments ) {
	var
		roots = [],
		// ancestors[ n ] is the most recent item at indentation level n on the current branch
		ancestors = [],
		itemsById = {},
		idx, item, itemId, baseId, suffix, parentItem;

	for ( idx = 0; idx < comments.length; idx++ ) {
		item = comments[ idx ];

		// We don't need ids for section headings right now, but we might in the future
		// e.g. if we allow replying directly to sections (adding top-level comments)
		itemId = null;
		if ( item.level !== 0 ) {
			// username+timestamp
			baseId = ( item.author || '' ) + '|' + item.timestamp.toISOString();

			// If there would be multiple comments with the same ID (i.e. the user left multiple comments
			// in one edit, or within a minute), append sequential numbers
			suffix = 0;
			while ( itemsById[ baseId + '|' + suffix ] ) {
				suffix++;
			}
			itemId = baseId + '|' + suffix;
			itemsById[ itemId ] = item;
		}

		// This modifies the original objects in `comments`!
		item.id = itemId;
		item.replies = [];
		item.parent = null;

		if ( ancestors.length < item.level ) {
			// Someone skipped an indentation level (or several). Pretend that the previous reply
			// covers multiple indentation levels, so that following comments get connected to it.
			console.log( 'Comment skips indentation level', item.range );
			while ( ancestors.length < item.level ) {
				ancestors.push( ancestors[ ancestors.length - 1 ] );
			}
		}

		if ( item.level === 0 ) {
			// new root (thread)
			roots.push( item );
		} else {
			// add as a reply to closest less nested comment
			parentItem = ancestors[ item.level - 1 ];
			if ( parentItem ) {
				parentItem.replies.push( item );
				item.parent = parentItem;
			} else {
				console.log( 'Comment could not be connected to a thread', item.range );
			}
		}

		ancestors[ item.level ] = item;
		// cut off more deeply nested replies
		ancestors.splice( item.level + 1 );
	}

	return roots;
}

/**
 * Get the list of authors involved in a comment and its replies.
 *
 * You probably want to pass a thread root here (a heading).
 *
 * @param {Object} comment Comment object, as returned by #groupThreads
 * @return {string[]} Author usernames, without duplicates, sorted
 */
function getAuthors( comment ) {
	// Set of usernames (as object keys). The object has no prototype, so that no username can
	// collide with an inherited property such as `__proto__`.
	var authors = Object.create( null );

	// Add the author of the item (headings have none) and of all of its replies, recursively
	function collectAuthors( item ) {
		if ( item.author ) {
			authors[ item.author ] = true;
		}
		item.replies.forEach( collectAuthors );
	}

	collectAuthors( comment );

	return Object.keys( authors ).sort();
}

/**
 * Get the name of the page from which this comment is transcluded (if any).
 *
 * @param {Object} comment Comment object, as returned by #groupThreads
 * @return {string|boolean} `false` if this comment is not transcluded. A string if it's transcluded
 *   from a single page (the page title, in text form with spaces). `true` if it's transcluded, but
 *   we can't determine the source.
 */
function getTranscludedFrom( comment ) {
	var node, about, typeOf, dataMw;

	// If some template is used within the comment (e.g. {{ping|…}} or {{tl|…}}), that *does not* mean
	// the comment is transcluded. We only want to consider comments to be transcluded if the wrapper
	// element (usually <li> or <p>) is marked as part of a transclusion.
	// TODO: This seems to work fine but I'm having a hard time explaining why it is correct...
	node = comment.range.endContainer;

	// Find the node containing information about the transclusion:
	// 1. Find the closest ancestor with an 'about' attribute
	// 2. Find the main node of the about-group (first sibling with the same 'about' attribute)
	// 3. If this is an mw:Transclusion node, return it; otherwise, go to step 1
	while ( node ) {
		// 1.
		about = node.nodeType === Node.ELEMENT_NODE ? node.getAttribute( 'about' ) : null;
		if ( about && /^#mwt\d+$/.test( about ) ) {
			// 2.
			while (
				node.previousSibling &&
				node.previousSibling.nodeType === Node.ELEMENT_NODE &&
				node.previousSibling.getAttribute( 'about' ) === about
			) {
				node = node.previousSibling;
			}

			// 3.
			typeOf = node.getAttribute( 'typeof' );
			if ( typeOf && typeOf.split( ' ' ).indexOf( 'mw:Transclusion' ) !== -1 ) {
				break;
			}
		}

		node = node.parentNode;
	}

	if ( !node ) {
		// No mw:Transclusion node found, this comment is not transcluded
		return false;
	}

	// JSON.parse() returns null when the 'data-mw' attribute is missing (getAttribute() gives null),
	// hence the check for `dataMw` itself below
	dataMw = JSON.parse( node.getAttribute( 'data-mw' ) );

	// Only return a page name if this is a simple single-template transclusion.
	if (
		dataMw &&
		dataMw.parts &&
		dataMw.parts.length === 1 &&
		dataMw.parts[ 0 ].template &&
		dataMw.parts[ 0 ].template.target.href
	) {
		// Slice off the './' prefix and convert to text form (underscores to spaces, URL-decoded)
		return mw.libs.ve.normalizeParsoidResourceName( dataMw.parts[ 0 ].template.target.href );
	}

	// Multi-template transclusion, or a parser function call, or template-affected wikitext outside
	// of a template call, or a mix of the above; or the transclusion data is missing
	return true;
}

// Functions made available to other modules. The remaining functions in this file are not exported.
module.exports = {
	findTimestamps: findTimestamps,
	getLocalTimestampParser: getLocalTimestampParser,
	getTimestampRegexp: getTimestampRegexp,
	getTimestampParser: getTimestampParser,
	getComments: getComments,
	groupThreads: groupThreads,
	findSignature: findSignature,
	getAuthors: getAuthors,
	getTranscludedFrom: getTranscludedFrom
};