2019-09-26 07:06:56 +00:00
|
|
|
'use strict';
|
2020-05-22 16:26:05 +00:00
|
|
|
/* global $:off */
|
|
|
|
|
2019-10-18 12:12:55 +00:00
|
|
|
var
|
2020-03-08 14:32:38 +00:00
|
|
|
utils = require( './utils.js' ),
|
2020-09-08 00:23:53 +00:00
|
|
|
codePointLength = require( 'mediawiki.String' ).codePointLength,
|
2021-02-02 17:40:18 +00:00
|
|
|
trimByteLength = require( 'mediawiki.String' ).trimByteLength,
|
2020-05-22 16:26:05 +00:00
|
|
|
CommentItem = require( './CommentItem.js' ),
|
|
|
|
HeadingItem = require( './HeadingItem.js' ),
|
2021-01-26 18:58:58 +00:00
|
|
|
ThreadItem = require( './ThreadItem.js' ),
|
2021-01-29 18:36:04 +00:00
|
|
|
// Data::getLocalData()
|
2020-02-25 02:10:27 +00:00
|
|
|
data = require( './parser/data.json' ),
|
2019-10-18 12:12:55 +00:00
|
|
|
moment = require( './lib/moment-timezone/moment-timezone-with-data-1970-2030.js' );
|
2019-09-26 07:06:56 +00:00
|
|
|
|
2019-10-20 16:36:43 +00:00
|
|
|
/**
 * Utilities for detecting and parsing components of discussion pages: signatures, timestamps,
 * comments and threads.
 *
 * @class mw.dt.Parser
 *
 * @constructor
 * @param {Node} rootNode Root node of the DOM subtree this parser works on
 */
function Parser( rootNode ) {
	// The subtree to work on; tree walkers created by the parser are rooted here
	this.rootNode = rootNode;

	// Everything below is unset (null) on a freshly constructed parser
	this.threadItems = null;
	this.commentItems = null;
	this.threadItemsByName = null;
	this.threadItemsById = null;
	this.threads = null;
}
|
|
|
|
|
|
|
|
OO.initClass( Parser );
|
|
|
|
|
2019-10-20 16:36:43 +00:00
|
|
|
/**
 * Look up the content-language text of several localisation messages.
 *
 * The values are read from `data.contLangMessages`, keyed first by language variant and then
 * by message key. A key with no entry yields `undefined` at its position in the result.
 *
 * @private
 * @param {string} contLangVariant Content language variant
 * @param {string[]} messages Message keys
 * @return {string[]} Message values, in the same order as the keys
 */
function getMessages( contLangVariant, messages ) {
	function lookup( key ) {
		return data.contLangMessages[ contLangVariant ][ key ];
	}

	return messages.map( lookup );
}
|
|
|
|
|
2019-10-20 16:36:43 +00:00
|
|
|
/**
 * Build the source of a regexp that matches timestamps generated using the given date format.
 *
 * This only supports format characters that are used by the default date format in any of
 * MediaWiki's languages, namely: D, d, F, G, H, i, j, l, M, n, Y, xg, xkY (and escape characters),
 * and only dates when MediaWiki existed, let's say 2000 onwards (Thai dates before 1941 are
 * complicated).
 *
 * Each supported format character contributes exactly one capturing group, in the order it
 * appears in the format. The timezone abbreviation is captured by one more group at the end.
 *
 * @private
 * @param {string} contLangVariant Content language variant
 * @param {string} format Date format, as used by MediaWiki
 * @param {string} digitsRegexp Regular expression matching a single localised digit, e.g. `[0-9]`
 * @param {Object} tzAbbrs Map of localised timezone abbreviations to IANA abbreviations
 *   for the local timezone, e.g. `{EDT: "EDT", EST: "EST"}`
 * @return {string} Regular expression
 */
Parser.prototype.getTimestampRegexp = function ( contLangVariant, format, digitsRegexp, tzAbbrs ) {
	var hasOwn = Object.prototype.hasOwnProperty;

	// Format codes that are matched against a fixed list of localised messages
	var messageKeysByCode = {
		xg: [
			'january-gen', 'february-gen', 'march-gen', 'april-gen', 'may-gen', 'june-gen',
			'july-gen', 'august-gen', 'september-gen', 'october-gen', 'november-gen',
			'december-gen'
		],
		D: [ 'sun', 'mon', 'tue', 'wed', 'thu', 'fri', 'sat' ],
		l: [
			'sunday', 'monday', 'tuesday', 'wednesday', 'thursday',
			'friday', 'saturday'
		],
		F: [
			'january', 'february', 'march', 'april', 'may_long', 'june',
			'july', 'august', 'september', 'october', 'november',
			'december'
		],
		M: [
			'jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug',
			'sep', 'oct', 'nov', 'dec'
		]
	};

	// Format codes that are matched as a run of localised digits; the value is the body of the
	// `{…}` quantifier applied to `digitsRegexp`
	var digitCountByCode = {
		d: '2',
		j: '1,2',
		n: '1,2',
		Y: '4',
		xkY: '4',
		G: '1,2',
		H: '2',
		i: '2'
	};

	// Capturing group matching any one of the given literal strings
	function alternation( literals ) {
		return '(' + literals.map( mw.util.escapeRegExp ).join( '|' ) + ')';
	}

	var pattern = '';
	var pos = 0;

	// Adapted from Language::sprintfDate()
	while ( pos < format.length ) {
		var code = format[ pos ];
		// 'x' introduces a two-character code, and 'xk' a three-character one
		if ( code === 'x' && pos < format.length - 1 ) {
			code += format[ ++pos ];
		}
		if ( code === 'xk' && pos < format.length - 1 ) {
			code += format[ ++pos ];
		}

		if ( hasOwn.call( messageKeysByCode, code ) ) {
			pattern += alternation( getMessages( contLangVariant, messageKeysByCode[ code ] ) );
		} else if ( hasOwn.call( digitCountByCode, code ) ) {
			pattern += '(' + digitsRegexp + '{' + digitCountByCode[ code ] + '}' + ')';
		} else if ( code === 'xx' ) {
			// Escaped literal "x"
			pattern += 'x';
		} else if ( code === '\\' ) {
			// Backslash escaping: the next character is literal. A trailing backslash stands
			// for itself.
			if ( pos < format.length - 1 ) {
				pattern += mw.util.escapeRegExp( format[ ++pos ] );
			} else {
				pattern += mw.util.escapeRegExp( '\\' );
			}
		} else if ( code === '"' ) {
			// Quoted literal. When there is no terminating quote (including a quote at the very
			// end of the string), assume a literal ".
			var endQuote = format.indexOf( '"', pos + 1 );
			if ( endQuote === -1 ) {
				pattern += '"';
			} else {
				pattern += mw.util.escapeRegExp( format.slice( pos + 1, endQuote ) );
				pos = endQuote;
			}
		} else {
			// Anything else is copied through as a literal character
			pattern += mw.util.escapeRegExp( format[ pos ] );
		}

		pos++;
	}

	var timezoneGroup = alternation( Object.keys( tzAbbrs ) );

	// Hard-coded parentheses and space like in Parser::pstPass2
	// Ignore some invisible Unicode characters that often sneak into copy-pasted timestamps (T245784)
	return pattern + '[\\u200E\\u200F]? [\\u200E\\u200F]?\\(' + timezoneGroup + '\\)';
};
|
2019-09-26 07:06:56 +00:00
|
|
|
|
2019-10-20 16:36:43 +00:00
|
|
|
/**
 * Get a function that parses timestamps generated using the given date format, based on the result
 * of matching the regexp returned by #getTimestampRegexp.
 *
 * The format is scanned once up front to record which format code each capturing group of the
 * regexp corresponds to; the returned function then reads the groups in that order.
 *
 * @private
 * @param {string} contLangVariant Content language variant
 * @param {string} format Date format, as used by MediaWiki
 * @param {string[]|null} digits Localised digits from 0 to 9, e.g. `[ '0', '1', ..., '9' ]`
 * @param {string} localTimezone Local timezone IANA name, e.g. `America/New_York`
 * @param {Object} tzAbbrs Map of localised timezone abbreviations to IANA abbreviations
 *   for the local timezone, e.g. `{EDT: "EDT", EST: "EST"}`
 * @return {TimestampParser} Timestamp parser function
 */
Parser.prototype.getTimestampParser = function ( contLangVariant, format, digits, localTimezone, tzAbbrs ) {
	// Format codes that produce a capturing group in #getTimestampRegexp
	var groupCodes = [ 'xg', 'd', 'j', 'D', 'l', 'F', 'M', 'n', 'Y', 'xkY', 'G', 'H', 'i' ];

	// Message keys of the month names, for each format code that is matched as a month name
	var monthMessageKeys = {
		xg: [
			'january-gen', 'february-gen', 'march-gen', 'april-gen', 'may-gen', 'june-gen',
			'july-gen', 'august-gen', 'september-gen', 'october-gen', 'november-gen',
			'december-gen'
		],
		F: [
			'january', 'february', 'march', 'april', 'may_long', 'june',
			'july', 'august', 'september', 'october', 'november',
			'december'
		],
		M: [
			'jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug',
			'sep', 'oct', 'nov', 'dec'
		]
	};

	// Tokenise the format the same way as #getTimestampRegexp, keeping only the codes that
	// correspond to capturing groups
	var matchingGroups = [];
	for ( var p = 0; p < format.length; p++ ) {
		var code = format[ p ];
		if ( code === 'x' && p < format.length - 1 ) {
			code += format[ ++p ];
		}
		if ( code === 'xk' && p < format.length - 1 ) {
			code += format[ ++p ];
		}

		if ( groupCodes.indexOf( code ) !== -1 ) {
			matchingGroups.push( code );
		} else if ( code === '\\' ) {
			// Backslash escaping: skip the escaped character
			if ( p < format.length - 1 ) {
				++p;
			}
		} else if ( code === '"' ) {
			// Quoted literal: skip to the closing quote, if there is one
			var endQuote = format.indexOf( '"', p + 1 );
			if ( endQuote !== -1 ) {
				p = endQuote;
			}
		}
		// 'xx' and all other characters are literals and produce no group
	}

	/**
	 * Replace localised digits in the text with ASCII digits.
	 *
	 * @param {string} text
	 * @return {string}
	 */
	function untransformDigits( text ) {
		if ( !digits ) {
			return text;
		}
		return text.replace(
			new RegExp( '[' + digits.join( '' ) + ']', 'g' ),
			function ( m ) {
				// The position of a localised digit in the list is its numeric value
				return String( digits.indexOf( m ) );
			}
		);
	}

	/**
	 * @typedef {function(Array):moment} TimestampParser
	 */

	/**
	 * Timestamp parser
	 *
	 * @param {Array} match RegExp match data
	 * @return {moment} Moment date object
	 */
	return function timestampParser( match ) {
		var
			year = 0,
			monthIdx = 0,
			day = 0,
			hour = 0,
			minute = 0;

		for ( var i = 0; i < matchingGroups.length; i++ ) {
			var code2 = matchingGroups[ i ];
			// Group 0 is the whole match, so capturing groups start at index 1
			var text = match[ i + 1 ];

			switch ( code2 ) {
				case 'xg':
				case 'F':
				case 'M':
					monthIdx = getMessages( contLangVariant, monthMessageKeys[ code2 ] ).indexOf( text );
					break;
				case 'd':
				case 'j':
					day = Number( untransformDigits( text ) );
					break;
				case 'D':
				case 'l':
					// Day of the week - unused
					break;
				case 'n':
					monthIdx = Number( untransformDigits( text ) ) - 1;
					break;
				case 'Y':
					year = Number( untransformDigits( text ) );
					break;
				case 'xkY':
					// Thai year
					year = Number( untransformDigits( text ) ) - 543;
					break;
				case 'G':
				case 'H':
					hour = Number( untransformDigits( text ) );
					break;
				case 'i':
					minute = Number( untransformDigits( text ) );
					break;
				default:
					throw new Error( 'Not implemented' );
			}
		}

		// The last matching group is the timezone abbreviation
		var tzAbbr = tzAbbrs[ match[ match.length - 1 ] ];

		// Most of the time, the timezone abbreviation is not necessary to parse the date, since we
		// can assume all times are in the wiki's local timezone.
		var dateParts = [ year, monthIdx, day, hour, minute ];
		var date = moment.tz( dateParts, localTimezone );

		// But during the "fall back" at the end of DST, some times will happen twice. Per the docs,
		// "Moment Timezone handles this by always using the earlier instance of a duplicated hour."
		// https://momentjs.com/timezone/docs/#/using-timezones/parsing-ambiguous-inputs/

		// Since the timezone abbreviation disambiguates the DST/non-DST times, we can detect when
		// that behavior was incorrect...
		if ( date.zoneAbbr() !== tzAbbr ) {
			// ...and force the correct parsing. I can't find proper documentation for this feature,
			// but this pull request explains it: https://github.com/moment/moment-timezone/pull/101
			//
			// The flag is global to the moment library, so make sure it is always switched back
			// off, even if parsing throws.
			moment.tz.moveAmbiguousForward = true;
			try {
				date = moment.tz( dateParts, localTimezone );
			} finally {
				moment.tz.moveAmbiguousForward = false;
			}
			if ( date.zoneAbbr() !== tzAbbr ) {
				// This should not be possible for "genuine" timestamps generated by MediaWiki.
				// But bots and humans get it wrong when marking up unsigned comments…
				// https://pl.wikipedia.org/w/index.php?title=Wikipedia:Kawiarenka/Artykuły&diff=prev&oldid=54772606
				date.discussionToolsWarning = 'Timestamp has timezone abbreviation for the wrong time';
			} else {
				date.discussionToolsWarning = 'Ambiguous time at DST switchover was parsed';
			}
		}

		return date;
	};
};
|
2019-09-26 07:06:56 +00:00
|
|
|
|
2019-10-20 16:36:43 +00:00
|
|
|
/**
 * Get regexps that match timestamps in the local date format, one for each language variant.
 *
 * This calls #getTimestampRegexp with predefined data for the current wiki. The result has one
 * entry per key of `data.dateFormat`, in key order.
 *
 * @private
 * @return {string[]} Regular expressions
 */
Parser.prototype.getLocalTimestampRegexps = function () {
	var variants = Object.keys( data.dateFormat );

	return variants.map( function ( variant ) {
		return this.getTimestampRegexp(
			variant,
			data.dateFormat[ variant ],
			// Character class matching any single digit of this variant
			'[' + data.digits[ variant ].join( '' ) + ']',
			data.timezones[ variant ]
		);
	}, this );
};
|
2019-09-26 07:06:56 +00:00
|
|
|
|
2019-10-20 16:36:43 +00:00
|
|
|
/**
 * Get functions that parse timestamps in the local date format, one for each language variant,
 * based on the result of matching the regexps returned by #getLocalTimestampRegexps.
 *
 * This calls #getTimestampParser with predefined data for the current wiki. The result has one
 * entry per key of `data.dateFormat`, in key order.
 *
 * @private
 * @return {TimestampParser[]} Timestamp parser functions
 */
Parser.prototype.getLocalTimestampParsers = function () {
	var variants = Object.keys( data.dateFormat );

	return variants.map( function ( variant ) {
		return this.getTimestampParser(
			variant,
			data.dateFormat[ variant ],
			data.digits[ variant ],
			data.localTimezone,
			data.timezones[ variant ]
		);
	}, this );
};
|
2019-09-26 07:06:56 +00:00
|
|
|
|
2019-10-24 16:24:59 +00:00
|
|
|
/**
 * Node filter callback for document.createTreeWalker that rejects nodes in which comments
 * (or section headings) should not be detected.
 *
 * @param {Node} node
 * @return {number} Appropriate NodeFilter constant
 */
function acceptOnlyNodesAllowingComments( node ) {
	// The table of contents has a heading that gets erroneously detected as a section
	if ( node.id === 'toc' ) {
		return NodeFilter.FILTER_REJECT;
	}

	// Don't detect comments within quotes (T275881)
	if ( node instanceof HTMLElement ) {
		var tagName = node.tagName.toLowerCase();
		if ( [ 'blockquote', 'cite', 'q' ].indexOf( tagName ) !== -1 ) {
			return NodeFilter.FILTER_REJECT;
		}
	}

	// Don't detect comments within headings (but don't reject the headings themselves):
	// only nodes whose parent is a heading are rejected
	var parent = node.parentNode;
	if ( parent instanceof HTMLElement && /^h[1-6]$/i.test( parent.tagName ) ) {
		return NodeFilter.FILTER_REJECT;
	}

	return NodeFilter.FILTER_ACCEPT;
}
|
|
|
|
|
2019-10-20 16:36:43 +00:00
|
|
|
/**
 * Find a timestamp in a given text node
 *
 * Text of directly preceding 'mw:Entity' elements (and of the text nodes before them) is
 * prepended to the node's own text before matching.
 *
 * @private
 * @param {Text} node Text node
 * @param {string[]} timestampRegexps Timestamp regexps
 * @return {Object|null} Object with the following keys:
 *   - {number} offset Length of extra text preceding the node that was used for matching
 *   - {number} parserIndex Which of the regexps matched
 *   - {Array} matchData Regexp match data, which specifies the location of the match,
 *     and which can be parsed using #getLocalTimestampParsers
 */
Parser.prototype.findTimestamp = function ( node, timestampRegexps ) {
	var text = '';
	var extraLength = 0;
	var current = node;

	while ( current ) {
		text = current.nodeValue + text;

		// In Parsoid HTML, entities are represented as a 'mw:Entity' node, rather than normal HTML
		// entities. On Arabic Wikipedia, the "UTC" timezone name contains some non-breaking spaces,
		// which apparently are often turned into entities by buggy editing tools. To handle
		// this, we must piece together the text, so that our regexp can match those timestamps.
		var entity = current.previousSibling;
		if (
			!entity ||
			entity.nodeType !== Node.ELEMENT_NODE ||
			entity.getAttribute( 'typeof' ) !== 'mw:Entity'
		) {
			break;
		}

		var entityText = entity.firstChild.nodeValue;
		text = entityText + text;
		extraLength += entityText.length;

		// If the entity is preceded by more text, do this again with that text node
		var before = entity.previousSibling;
		if ( before && before.nodeType === Node.TEXT_NODE ) {
			extraLength += before.nodeValue.length;
			current = before;
		} else {
			current = null;
		}
	}

	for ( var index = 0; index < timestampRegexps.length; index++ ) {
		// Technically, there could be multiple matches in a single text node. However, the ultimate
		// point of this is to find the signatures which precede the timestamps, and any later
		// timestamps in the text node can't be directly preceded by a signature (as we require them to
		// have links), so we only concern ourselves with the first match.
		var found = text.match( timestampRegexps[ index ] );
		if ( found ) {
			return {
				matchData: found,
				offset: extraLength,
				parserIndex: index
			};
		}
	}

	return null;
};
|
2019-09-26 07:06:56 +00:00
|
|
|
|
2021-01-26 18:58:58 +00:00
|
|
|
/**
 * Given a link node (`<a>`), if it's a link to a user-related page, return their username.
 *
 * Recognised targets are pages in the user and user talk namespaces whose title contains no
 * slash, and the special page named by `data.specialContributionsName` followed by
 * `/<username>`.
 *
 * @param {HTMLElement} link
 * @return {string|null} Username, or null if the link is not a user-related link
 */
function getUsernameFromLink( link ) {
	var title = utils.getTitleFromUrl( link.href );
	if ( !title ) {
		return null;
	}

	var namespaceIds = mw.config.get( 'wgNamespaceIds' );
	var namespace = title.getNamespaceId();
	var mainText = title.getMainText();
	var username = null;

	if ( namespace === namespaceIds.user || namespace === namespaceIds.user_talk ) {
		// Titles containing a slash are not accepted
		if ( mainText.indexOf( '/' ) !== -1 ) {
			return null;
		}
		username = mainText;
	} else if ( namespace === namespaceIds.special ) {
		var parts = mainText.split( '/' );
		if ( parts[ 0 ] === data.specialContributionsName ) {
			if ( !parts[ 1 ] ) {
				return null;
			}
			// Normalize the username: users may link to their contributions with an unnormalized name
			var userpage = mw.Title.makeTitle( namespaceIds.user, parts[ 1 ] );
			if ( !userpage ) {
				return null;
			}
			username = userpage.getMainText();
		}
	}

	if ( !username ) {
		return null;
	}

	// Bot-generated links "Preceding unsigned comment added by" have non-standard case
	return mw.util.isIPv6Address( username ) ? username.toUpperCase() : username;
}
|
|
|
|
|
2019-10-20 16:36:43 +00:00
|
|
|
/**
 * Find a user signature preceding a timestamp.
 *
 * The signature includes the timestamp node.
 *
 * A signature must contain at least one link to the user's userpage, discussion page or
 * contributions (and may contain other links). The link may be nested in other elements.
 *
 * @private
 * @param {Text} timestampNode Text node
 * @param {Node} [until] Node to stop searching at
 * @return {Array} Result, a tuple containing:
 *   - {Node[]} Sibling nodes comprising the signature, in reverse order (with
 *     `timestampNode` or its parent node as the first element)
 *   - {string|null} Username, null for unsigned comments
 */
Parser.prototype.findSignature = function ( timestampNode, until ) {
	var sigUsername = null;
	// Number of code points of text walked over so far (the timestamp node itself is not counted)
	var scannedLength = 0;
	// Earliest node in the document that belongs to the signature; stays at the timestamp node
	// when no user link is found
	var firstSigNode = timestampNode;

	utils.linearWalkBackwards(
		timestampNode,
		function ( event, node ) {
			// Returning true stops the walk
			if (
				( event === 'enter' && node === until ) ||
				scannedLength >= data.signatureScanLimit ||
				// Don't allow reaching into preceding paragraphs
				utils.isBlockElement( node )
			) {
				return true;
			}

			if ( event === 'leave' ) {
				if ( node !== timestampNode && node.nodeType === Node.TEXT_NODE ) {
					scannedLength += codePointLength( node.textContent );
				}

				// Find the closest link before timestamp that links to the user's user page.
				// Links nested in formatting elements are handled too, because every `<a>`
				// element the walk leaves is inspected.
				if ( node.nodeType === Node.ELEMENT_NODE && node.tagName.toLowerCase() === 'a' ) {
					var username = getUsernameFromLink( node );
					if ( username ) {
						// Accept the first link to the user namespace, then only accept links to that user
						if ( sigUsername === null ) {
							sigUsername = username;
						}
						if ( username === sigUsername ) {
							firstSigNode = node;
						}
					}
					// Keep looking if a node with links wasn't a link to a user page
					// "Doc James (talk · contribs · email)"
				}
			}
		}
	);

	// Range from just before the first signature node to just after the timestamp node
	var range = {
		startContainer: firstSigNode.parentNode,
		startOffset: utils.childIndexOf( firstSigNode ),
		endContainer: timestampNode.parentNode,
		endOffset: utils.childIndexOf( timestampNode ) + 1
	};
	var nativeRange = ThreadItem.prototype.getNativeRange.call( { range: range } );

	// Expand the range so that it covers sibling nodes.
	// This will include any wrapping formatting elements as part of the signature.
	//
	// Helpful accidental feature: users whose signature is not detected in full (due to
	// text formatting) can just wrap it in a <span> to fix that.
	// "Ten Pound Hammer • (What did I screw up now?)"
	// "« Saper // dyskusja »"
	//
	// TODO Not sure if this is actually good, might be better to just use the range...
	var sigNodes = utils.getCoveredSiblings( nativeRange ).reverse();

	return [ sigNodes, sigUsername ];
};
|
2019-09-26 07:06:56 +00:00
|
|
|
|
2020-12-14 20:13:55 +00:00
|
|
|
/**
 * Check whether the node is a comment separator (instead of a part of the comment).
 *
 * Only element nodes can be separators: `<br>`, `<hr>`, and elements whose `class` attribute
 * is exactly `outdent-template`.
 *
 * @param {Node} node
 * @return {boolean}
 */
function isCommentSeparator( node ) {
	if ( node.nodeType !== Node.ELEMENT_NODE ) {
		return false;
	}

	var name = node.nodeName.toLowerCase();
	return (
		// Empty paragraphs (`<p><br></p>`) between indented comments mess up indentation detection
		name === 'br' ||
		// Horizontal line
		name === 'hr' ||
		// {{outdent}} templates
		node.getAttribute( 'class' ) === 'outdent-template'
	);
}
|
|
|
|
|
2019-10-20 16:36:43 +00:00
|
|
|
/**
 * Return the next leaf node in the tree order that is likely a part of a discussion comment,
 * rather than some boring "separator" element.
 *
 * Currently, this can return a Text node with content other than whitespace, or an Element node
 * that is a "void element" or "text element", except some special cases that we treat as comment
 * separators (isCommentSeparator()).
 *
 * @private
 * @param {Node} node Node to start searching at. If it isn't a leaf node, its children are ignored.
 * @return {Node}
 * @throws {Error} If no such node follows `node` within the root node
 */
Parser.prototype.nextInterestingLeafNode = function ( node ) {
	var rootNode = this.rootNode;

	var treeWalker = rootNode.ownerDocument.createTreeWalker(
		rootNode,
		// eslint-disable-next-line no-bitwise
		NodeFilter.SHOW_ELEMENT | NodeFilter.SHOW_TEXT,
		function ( n ) {
			// Ignore this node and its descendants
			// (unless it's the root node, this is a special case for "fakeHeading" handling)
			if ( node !== rootNode && ( n === node || n.parentNode === node ) ) {
				return NodeFilter.FILTER_REJECT;
			}
			// Ignore some elements usually used as separators or headers (and their descendants)
			if ( isCommentSeparator( n ) ) {
				return NodeFilter.FILTER_REJECT;
			}
			// Ignore nodes with no rendering that mess up our indentation detection
			if ( utils.isRenderingTransparentNode( n ) ) {
				return NodeFilter.FILTER_REJECT;
			}
			if (
				( n.nodeType === Node.TEXT_NODE && utils.htmlTrim( n.textContent ) !== '' ) ||
				( n.nodeType === Node.CDATA_SECTION_NODE && utils.htmlTrim( n.textContent ) !== '' ) ||
				( utils.cantHaveElementChildren( n ) )
			) {
				return NodeFilter.FILTER_ACCEPT;
			}
			// Not interesting itself, but its descendants may be
			return NodeFilter.FILTER_SKIP;
		},
		false
	);
	treeWalker.currentNode = node;

	// When there is no next node, nextNode() returns null and leaves currentNode pointing at
	// the start node. Check the return value, otherwise the start node (which is supposed to
	// be ignored) would be returned and the error below could never be thrown.
	var nextNode = treeWalker.nextNode();
	if ( !nextNode ) {
		throw new Error( 'nextInterestingLeafNode not found' );
	}
	return nextNode;
};
|
2019-09-26 07:06:56 +00:00
|
|
|
|
2019-10-20 16:36:43 +00:00
|
|
|
/**
 * Get all discussion comments (and headings) found by this parser.
 *
 * This returns a flat list, use #getThreads to get a tree structure starting at section headings.
 * The list is built on first use (via #buildThreads) and cached afterwards.
 *
 * For example, for a MediaWiki discussion like this (we're dealing with HTML DOM here, the wikitext
 * syntax is just for illustration):
 *
 *     == A ==
 *     B. ~~~~
 *     : C.
 *     : C. ~~~~
 *     :: D. ~~~~
 *     ::: E. ~~~~
 *     ::: F. ~~~~
 *     : G. ~~~~
 *     H. ~~~~
 *     : I. ~~~~
 *
 * This function would return a structure like:
 *
 *     [
 *       HeadingItem( { level: 0, range: (h2: A) } ),
 *       CommentItem( { level: 1, range: (p: B) } ),
 *       CommentItem( { level: 2, range: (li: C, li: C) } ),
 *       CommentItem( { level: 3, range: (li: D) } ),
 *       CommentItem( { level: 4, range: (li: E) } ),
 *       CommentItem( { level: 4, range: (li: F) } ),
 *       CommentItem( { level: 2, range: (li: G) } ),
 *       CommentItem( { level: 1, range: (p: H) } ),
 *       CommentItem( { level: 2, range: (li: I) } )
 *     ]
 *
 * @return {ThreadItem[]} Thread items
 */
Parser.prototype.getThreadItems = function () {
	if ( this.threadItems ) {
		return this.threadItems;
	}
	this.buildThreads();
	return this.threadItems;
};
|
|
|
|
|
|
|
|
/**
 * Same as #getThreadItems, but only returns the CommentItems.
 *
 * The list is built on first use (via #buildThreads) and cached afterwards.
 *
 * @return {CommentItem[]} Comment items
 */
Parser.prototype.getCommentItems = function () {
	if ( this.commentItems ) {
		return this.commentItems;
	}
	this.buildThreads();
	return this.commentItems;
};
|
|
|
|
|
2021-02-12 19:16:13 +00:00
|
|
|
/**
 * Find ThreadItems by their name
 *
 * This will usually return a single-element array, but it may return multiple comments if they're
 * indistinguishable by name. In that case, use their IDs to disambiguate.
 *
 * @param {string} name Name
 * @return {ThreadItem[]} Thread items, empty array if not found
 */
Parser.prototype.findCommentsByName = function ( name ) {
	if ( !this.threadItemsByName ) {
		this.buildThreads();
	}
	// The index is a plain object, so only consult its own properties: otherwise a name such as
	// "constructor" or "toString" would resolve to a value inherited from Object.prototype.
	if ( !Object.prototype.hasOwnProperty.call( this.threadItemsByName, name ) ) {
		return [];
	}
	return this.threadItemsByName[ name ] || [];
};
|
|
|
|
|
2020-07-20 21:15:03 +00:00
|
|
|
/**
 * Find a ThreadItem by its ID
 *
 * @param {string} id ID
 * @return {ThreadItem|null} Thread item, null if not found
 */
Parser.prototype.findCommentById = function ( id ) {
	if ( !this.threadItemsById ) {
		this.buildThreads();
	}
	// The index is a plain object, so only consult its own properties: otherwise an ID such as
	// "constructor" or "toString" would resolve to a value inherited from Object.prototype.
	if ( !Object.prototype.hasOwnProperty.call( this.threadItemsById, id ) ) {
		return null;
	}
	return this.threadItemsById[ id ] || null;
};
|
|
|
|
|
2021-02-26 20:57:22 +00:00
|
|
|
/**
 * Compute the range covered by a signature.
 *
 * The range starts just before the last entry of `sigNodes` and ends after its first entry.
 * When that first entry is the timestamp text node itself, the range ends inside the text node,
 * at the end of the matched timestamp; otherwise it ends right after the node, expressed as a
 * child index in its parent.
 *
 * TODO Document why this needs to be so complicated
 *
 * @param {Node[]} sigNodes Signature nodes; index 0 is used as the end of the range and the
 *  last index as its start
 * @param {Object} match Timestamp match; `matchData.index`, `matchData[ 0 ]` and `offset` are read
 * @param {Text} node Text node in which the timestamp was matched
 * @return {Object} Range-like object with startContainer, startOffset, endContainer, endOffset
 */
function adjustSigRange( sigNodes, match, node ) {
	var rangeStartNode = sigNodes[ sigNodes.length - 1 ],
		rangeEndNode = sigNodes[ 0 ],
		endsInTimestampNode = rangeEndNode === node,
		endOffset;

	if ( endsInTimestampNode ) {
		// Character offset within the text node: end of the timestamp match, corrected by
		// match.offset
		endOffset = match.matchData.index + match.matchData[ 0 ].length - match.offset;
	} else {
		// Child index in the parent, just past the last signature node
		endOffset = utils.childIndexOf( rangeEndNode ) + 1;
	}

	return {
		startContainer: rangeStartNode.parentNode,
		startOffset: utils.childIndexOf( rangeStartNode ),
		endContainer: endsInTimestampNode ? node : rangeEndNode.parentNode,
		endOffset: endOffset
	};
}
|
|
|
|
|
2020-07-20 21:15:03 +00:00
|
|
|
/**
 * Walk the DOM under the root node and collect headings and signed comments.
 *
 * Every h1-h6 element produces a HeadingItem. Every text node in which a timestamp is found,
 * and for which a signature with an author can be found, produces a CommentItem spanning from
 * the end of the previous item up to the end of the "paragraph" containing the signature.
 *
 * The results are stored in `this.threadItems` (headings and comments, in document order) and
 * `this.commentItems` (comments only). If the first item found is not a heading, a placeholder
 * heading is inserted at the front of `this.threadItems`.
 */
Parser.prototype.buildThreadItems = function () {
	var parser = this,
		rootNode = this.rootNode,
		dfParsers = this.getLocalTimestampParsers(),
		timestampRegexps = this.getLocalTimestampRegexps(),
		commentItems = [],
		threadItems = [];

	var treeWalker = rootNode.ownerDocument.createTreeWalker(
		rootNode,
		// eslint-disable-next-line no-bitwise
		NodeFilter.SHOW_ELEMENT | NodeFilter.SHOW_TEXT,
		acceptOnlyNodesAllowingComments,
		false
	);

	// Placeholder heading in case there are comments in the 0th section
	var placeholderRange = {
		startContainer: rootNode,
		startOffset: 0,
		endContainer: rootNode,
		endOffset: 0
	};
	var fakeHeading = new HeadingItem( placeholderRange, 99, true );
	fakeHeading.rootNode = this.rootNode;

	// Node at which the most recently found item ends; the next comment starts after it
	var curCommentEnd = placeholderRange.endContainer;

	var node, lastSigNode, curComment;
	while ( ( node = treeWalker.nextNode() ) ) {
		var headingMatch = node.tagName && node.tagName.match( /^h([1-6])$/i );
		if ( headingMatch ) {
			var headline = utils.getHeadlineNodeAndOffset( node );
			var headingNode = headline.node;
			curComment = new HeadingItem(
				{
					startContainer: headingNode,
					startOffset: headline.offset,
					endContainer: headingNode,
					endOffset: headingNode.childNodes.length
				},
				+headingMatch[ 1 ]
			);
			curComment.rootNode = this.rootNode;
			threadItems.push( curComment );
			curCommentEnd = node;
			continue;
		}

		if ( node.nodeType !== Node.TEXT_NODE ) {
			continue;
		}
		var match = this.findTimestamp( node, timestampRegexps );
		if ( !match ) {
			continue;
		}

		var warnings = [];
		var foundSignature = this.findSignature( node, lastSigNode );
		var author = foundSignature[ 1 ];
		lastSigNode = foundSignature[ 0 ][ 0 ];

		if ( !author ) {
			// Ignore timestamps for which we couldn't find a signature. It's probably not a real
			// comment, but just a false match due to a copypasted timestamp.
			continue;
		}

		var sigRanges = [ adjustSigRange( foundSignature[ 0 ], match, node ) ];

		// Everything from the last comment up to here is the next comment
		var startNode = this.nextInterestingLeafNode( curCommentEnd );
		var endNode = lastSigNode;

		// Skip to the end of the "paragraph". This only looks at tag names and can be fooled by CSS, but
		// avoiding that would be more difficult and slower.
		//
		// If this skips over another potential signature, also skip it in the main TreeWalker loop, to
		// avoid generating multiple comments when there is more than one signature on a single "line".
		// Often this is done when someone edits their comment later and wants to add a note about that.
		// (Or when another person corrects a typo, or strikes out a comment, etc.) Multiple comments
		// within one paragraph/list-item result in a confusing double "Reply" button, and we also have
		// no way to indicate which one you're replying to (this might matter in the future for
		// notifications or something).
		utils.linearWalk(
			lastSigNode,
			// eslint-disable-next-line no-loop-func
			function ( event, n ) {
				if ( utils.isBlockElement( n ) ) {
					// Stop when entering or leaving a block node
					return true;
				}
				if ( event === 'leave' ) {
					if ( n.nodeType === Node.TEXT_NODE && n !== node ) {
						var extraMatch = parser.findTimestamp( n, timestampRegexps );
						if ( extraMatch ) {
							// If this skips over another potential signature, also skip it in the main TreeWalker loop
							treeWalker.currentNode = n;
							// …and add it as another signature to this comment (regardless of the author and timestamp)
							var extraSignature = parser.findSignature( n, node );
							if ( extraSignature[ 1 ] ) {
								sigRanges.push( adjustSigRange( extraSignature[ 0 ], extraMatch, n ) );
							}
						}
					}
					// Take the last complete node which we skipped past
					endNode = n;
				}
			}
		);

		// For a text node, end before its trailing whitespace; otherwise end after its last child
		var endOffset;
		if ( endNode.nodeType === Node.TEXT_NODE ) {
			endOffset = endNode.textContent.replace( /[\t\n\f\r ]+$/, '' ).length;
		} else {
			endOffset = endNode.childNodes.length;
		}
		var commentRange = {
			startContainer: startNode.parentNode,
			startOffset: utils.childIndexOf( startNode ),
			endContainer: endNode,
			endOffset: endOffset
		};

		var startLevel = utils.getIndentLevel( startNode, this.rootNode ) + 1;
		var endLevel = utils.getIndentLevel( node, this.rootNode ) + 1;
		if ( startLevel !== endLevel ) {
			warnings.push( 'Comment starts and ends with different indentation' );
		}
		// Should this use the indent level of `startNode` or `node`?
		var level = Math.min( startLevel, endLevel );

		var dateTime = dfParsers[ match.parserIndex ]( match.matchData );
		if ( dateTime.discussionToolsWarning ) {
			warnings.push( dateTime.discussionToolsWarning );
		}

		curComment = new CommentItem( level, commentRange, sigRanges, dateTime, author );
		curComment.rootNode = this.rootNode;
		if ( warnings.length ) {
			curComment.warnings = warnings;
		}
		commentItems.push( curComment );
		threadItems.push( curComment );
		curCommentEnd = curComment.range.endContainer;
	}

	// Insert the fake placeholder heading if there are any comments in the 0th section
	// (before the first real heading)
	if ( threadItems.length && !( threadItems[ 0 ] instanceof HeadingItem ) ) {
		threadItems.unshift( fakeHeading );
	}

	this.commentItems = commentItems;
	this.threadItems = threadItems;
};
|
2019-09-26 07:06:56 +00:00
|
|
|
|
2019-10-20 16:36:43 +00:00
|
|
|
/**
 * Group discussion comments into threads and associate replies to original messages.
 *
 * Each thread must begin with a heading. Original messages in the thread are treated as replies to
 * its heading. Other replies are associated based on the order and indentation level.
 *
 * The tree is built on first use (via #buildThreads) and cached afterwards. Note that the
 * thread item objects are extended in-place with the additional data.
 *
 * For example, for a MediaWiki discussion like this (we're dealing with HTML DOM here, the wikitext
 * syntax is just for illustration):
 *
 *     == A ==
 *     B. ~~~~
 *     : C.
 *     : C. ~~~~
 *     :: D. ~~~~
 *     ::: E. ~~~~
 *     ::: F. ~~~~
 *     : G. ~~~~
 *     H. ~~~~
 *     : I. ~~~~
 *
 * This function would return a structure like:
 *
 *     [
 *       HeadingItem( { level: 0, range: (h2: A), replies: [
 *         CommentItem( { level: 1, range: (p: B), replies: [
 *           CommentItem( { level: 2, range: (li: C, li: C), replies: [
 *             CommentItem( { level: 3, range: (li: D), replies: [
 *               CommentItem( { level: 4, range: (li: E), replies: [] },
 *               CommentItem( { level: 4, range: (li: F), replies: [] },
 *             ] },
 *           ] },
 *           CommentItem( { level: 2, range: (li: G), replies: [] },
 *         ] },
 *         CommentItem( { level: 1, range: (p: H), replies: [
 *           CommentItem( { level: 2, range: (li: I), replies: [] },
 *         ] },
 *       ] } )
 *     ]
 *
 * @return {HeadingItem[]} Tree structure of comments, top-level items are the headings.
 */
Parser.prototype.getThreads = function () {
	if ( this.threads ) {
		return this.threads;
	}
	this.buildThreads();
	return this.threads;
};
|
|
|
|
|
2021-02-02 17:40:18 +00:00
|
|
|
/**
 * Truncate user generated parts of IDs so full ID always fits within a database field of length 255
 *
 * The text is limited to 80 bytes using mediawiki.String's trimByteLength.
 *
 * @param {string} text Text
 * @return {string} Truncated text
 */
Parser.prototype.truncateForId = function ( text ) {
	var trimmed = trimByteLength( '', text, 80 );
	return trimmed.newVal;
};
|
|
|
|
|
2020-10-21 15:52:04 +00:00
|
|
|
/**
 * Given a thread item, return an identifier for it that is unique within the page.
 *
 * The ID is assembled from dash-separated parts: the item's own part ("h-" plus the headline's
 * `id` attribute, or "c-" plus author and timestamp), then a part describing its parent (if
 * any), then, for headings, the timestamp of the comment that started the thread. If the
 * result is already present in `this.threadItemsById`, a sequential number is appended and a
 * warning is added to the item.
 *
 * @param {ThreadItem} threadItem
 * @return {string}
 * @throws {Error} If the item is neither a HeadingItem nor a CommentItem
 */
Parser.prototype.computeId = function ( threadItem ) {
	var parser = this;

	// ID part for a (non-placeholder) heading: the truncated `id` attribute of its headline node
	function headlinePart( heading ) {
		var headlineNode = utils.getHeadlineNodeAndOffset( heading.range.startContainer ).node;
		return parser.truncateForId( headlineNode.getAttribute( 'id' ) || '' );
	}

	// ID part for a comment: truncated author (spaces replaced by underscores) and ISO timestamp
	function commentPart( comment ) {
		return parser.truncateForId( comment.author || '' ).replace( / /g, '_' ) +
			'-' + comment.timestamp.toISOString();
	}

	var parts = [];
	var isHeading = threadItem instanceof HeadingItem;

	if ( isHeading && threadItem.placeholderHeading ) {
		// The range points to the root node, using it like below results in silly values
		parts.push( 'h-' );
	} else if ( isHeading ) {
		parts.push( 'h-' + headlinePart( threadItem ) );
	} else if ( threadItem instanceof CommentItem ) {
		parts.push( 'c-' + commentPart( threadItem ) );
	} else {
		throw new Error( 'Unknown ThreadItem type' );
	}

	// Include the parent item, to tell apart identically signed comments (i.e. the user left
	// multiple comments in one edit, or within a minute) posted in different places
	var parentItem = threadItem.parent;
	if ( parentItem instanceof HeadingItem && !parentItem.placeholderHeading ) {
		parts.push( headlinePart( parentItem ) );
	} else if ( parentItem instanceof CommentItem ) {
		parts.push( commentPart( parentItem ) );
	}

	if ( isHeading ) {
		// To avoid old threads re-appearing on popular pages when someone uses a vague title
		// (e.g. dozens of threads titled "question" on [[Wikipedia:Help desk]]: https://w.wiki/fbN),
		// include the oldest timestamp in the thread (i.e. date the thread was started) in the
		// heading ID.
		var threadStarter = this.getThreadStartComment( threadItem );
		if ( threadStarter ) {
			parts.push( threadStarter.timestamp.toISOString() );
		}
	}

	var id = parts.join( '-' );

	if ( this.threadItemsById[ id ] ) {
		// Well, that's tough
		threadItem.warnings.push( 'Duplicate comment ID' );
		// Finally, disambiguate by adding sequential numbers, to allow replying to both comments
		var suffix = 1;
		while ( this.threadItemsById[ id + '-' + suffix ] ) {
			suffix++;
		}
		id += '-' + suffix;
	}

	return id;
};
|
|
|
|
|
2021-02-12 19:16:13 +00:00
|
|
|
/**
 * Given a thread item, return an identifier for it that is consistent across all pages and
 * revisions where this comment might appear.
 *
 * Multiple comments on a page can have the same name; use ID to distinguish them.
 *
 * The name is "c-" (comments) or "h-" (headings), followed by the author and timestamp of the
 * comment itself, or of the comment that started the thread for headings. A heading without
 * such a comment is named just "h-".
 *
 * @param {ThreadItem} threadItem
 * @return {string}
 * @throws {Error} If the item is neither a HeadingItem nor a CommentItem
 */
Parser.prototype.computeName = function ( threadItem ) {
	var prefix, namingComment;

	if ( threadItem instanceof HeadingItem ) {
		prefix = 'h-';
		namingComment = this.getThreadStartComment( threadItem );
	} else if ( threadItem instanceof CommentItem ) {
		prefix = 'c-';
		namingComment = threadItem;
	} else {
		throw new Error( 'Unknown ThreadItem type' );
	}

	if ( !namingComment ) {
		return prefix;
	}

	var authorPart = this.truncateForId( namingComment.author || '' ).replace( / /g, '_' );
	return prefix + authorPart + '-' + namingComment.timestamp.toISOString();
};
|
|
|
|
|
2020-07-20 21:15:03 +00:00
|
|
|
/**
 * Connect the flat list of thread items into a tree, and assign names and IDs to them.
 *
 * Builds the flat list first (via #buildThreadItems) if needed. Then, in a first pass, sets
 * `parent` and `replies` on the items based on their order and indentation level, and collects
 * the headings into `this.threads`. In a second pass, sets `name` and `id` on every item and
 * fills the `this.threadItemsByName` and `this.threadItemsById` lookup tables.
 */
Parser.prototype.buildThreads = function () {
	var headings = [],
		// ancestors[ n ] is the most recent item at indentation level n
		ancestors = [];

	if ( !this.threadItems ) {
		this.buildThreadItems();
	}

	this.threadItemsById = {};
	this.threadItemsByName = {};

	this.threadItems.forEach( function ( item ) {
		if ( ancestors.length < item.level ) {
			// Someone skipped an indentation level (or several). Pretend that the previous reply
			// covers multiple indentation levels, so that following comments get connected to it.
			item.warnings.push( 'Comment skips indentation level' );
			do {
				ancestors.push( ancestors[ ancestors.length - 1 ] );
			} while ( ancestors.length < item.level );
		}

		if ( item instanceof HeadingItem ) {
			// Attach as a sub-thread to preceding higher-level heading.
			// Any replies will appear in the tree twice, under the main-thread and the sub-thread.
			var superHeading = headings.length ? headings[ headings.length - 1 ] : null;
			// New root (thread)
			headings.push( item );
			while ( superHeading && superHeading.headingLevel >= item.headingLevel ) {
				superHeading = superHeading.parent;
			}
			if ( superHeading ) {
				item.parent = superHeading;
				superHeading.replies.push( item );
			}
		} else if ( ancestors[ item.level - 1 ] ) {
			// Add as a reply to the closest less-nested comment
			item.parent = ancestors[ item.level - 1 ];
			item.parent.replies.push( item );
		} else {
			item.warnings.push( 'Comment could not be connected to a thread' );
		}

		ancestors[ item.level ] = item;
		// Cut off more deeply nested replies
		ancestors.length = item.level + 1;
	} );

	this.threads = headings;

	// Set the names and IDs used to refer to comments and headings.
	// This has to be a separate pass because we don't have the list of replies before
	// this point.
	this.threadItems.forEach( function ( item ) {
		var name = this.computeName( item );
		item.name = name;
		if ( !this.threadItemsByName[ name ] ) {
			this.threadItemsByName[ name ] = [];
		}
		this.threadItemsByName[ name ].push( item );

		var id = this.computeId( item );
		item.id = id;
		this.threadItemsById[ id ] = item;
	}, this );
};
|
2019-09-26 07:06:56 +00:00
|
|
|
|
2020-10-07 15:48:20 +00:00
|
|
|
/**
 * Find the comment with the oldest timestamp among the given item (if it is a comment) and
 * all comments replying to it, directly or indirectly. Replies that are not comments
 * (sub-threads) are not looked into.
 *
 * @param {ThreadItem} threadItem
 * @return {CommentItem|null} Oldest comment, null if there are no comments
 */
Parser.prototype.getThreadStartComment = function ( threadItem ) {
	var earliest = threadItem instanceof CommentItem ? threadItem : null;

	// Check all replies. This can't just use the first comment because threads are often summarized
	// at the top when the discussion is closed.
	threadItem.replies.forEach( function ( reply ) {
		// Don't include sub-threads to avoid changing the ID when threads are "merged".
		if ( !( reply instanceof CommentItem ) ) {
			return;
		}
		var candidate = this.getThreadStartComment( reply );
		if ( !earliest || candidate.timestamp.isBefore( earliest.timestamp ) ) {
			earliest = candidate;
		}
	}, this );

	return earliest;
};
|
2020-07-20 21:15:03 +00:00
|
|
|
|
|
|
|
// Export the Parser class as this module's public interface.
module.exports = Parser;
|