From 092cfd6075a456b0139ca6617c3232b8034ddb13 Mon Sep 17 00:00:00 2001
From: Ed Sanders <esanders@wikimedia.org>
Date: Fri, 10 Jul 2020 15:16:49 +0100
Subject: [PATCH] Parser: Replace findTimestamps with findTimestamp

Instead of doing a separate tree walk and finding all timestamps
separately, make it part of the getComments tree walk, and find
timestamps one at a time.

Change-Id: I47f466eaf228504faa189fd99e07493bc7f022cd
---
 includes/CommentParser.php |  52 +++++++------------
 modules/dt.debug.js        |  22 ++++----
 modules/parser.js          | 100 ++++++++++++++-----------------------
 modules/utils.js           |   2 +-
 4 files changed, 69 insertions(+), 107 deletions(-)

diff --git a/includes/CommentParser.php b/includes/CommentParser.php
index 37df56e56..78c037773 100644
--- a/includes/CommentParser.php
+++ b/includes/CommentParser.php
@@ -10,7 +10,6 @@ use DateTimeZone;
 use DOMElement;
 use DOMNode;
 use DOMText;
-use DOMXPath;
 use IP;
 use Language;
 use MediaWiki\MediaWikiServices;
@@ -18,8 +17,6 @@ use MWException;
 use Title;
 
 // TODO clean up static vs non-static
-
-// TODO consider rewriting as single traversal, without XPath
 // TODO consider making timestamp parsing not a returned function
 
 class CommentParser {
@@ -635,18 +632,13 @@ class CommentParser {
 	}
 
 	/**
-	 * Find all timestamps within a DOM subtree.
+	 * Find a timestamps in a given text node
 	 *
-	 * @param DOMElement $rootNode
-	 * @return array Array of [node, matchData] pairs
+	 * @param DOMText $node Text node
+	 * @param string $timestampRegex Timestamp regex
+	 * @return array|null Match data
 	 */
-	public function findTimestamps( DOMElement $rootNode ) : array {
-		$xpath = new DOMXPath( $rootNode->ownerDocument );
-		$textNodes = $xpath->query( '//text()', $rootNode );
-		$matches = [];
-		$timestampRegex = $this->getLocalTimestampRegexp();
-		foreach ( $textNodes as $node ) {
-			$startNode = $node;
+	public function findTimestamp( DOMText $node, string $timestampRegex ) : ?array {
 			$nodeText = '';
 
 			while ( $node ) {
@@ -657,18 +649,18 @@ class CommentParser {
 				// which apparently are often turned into &nbsp; entities by buggy editing tools. To handle
 				// this, we must piece together the text, so that our regexp can match those timestamps.
 				if (
-					$node->nextSibling &&
-					$node->nextSibling->nodeType === XML_ELEMENT_NODE &&
-					$node->nextSibling->getAttribute( 'typeof' ) === 'mw:Entity'
+				( $nextSibling = $node->nextSibling ) &&
+				$nextSibling instanceof DOMElement &&
+				$nextSibling->getAttribute( 'typeof' ) === 'mw:Entity'
 				) {
-					$nodeText .= $node->nextSibling->firstChild->nodeValue;
+				$nodeText .= $nextSibling->firstChild->nodeValue;
 
 					// If the entity is followed by more text, do this again
 					if (
-						$node->nextSibling->nextSibling &&
-						$node->nextSibling->nextSibling->nodeType === XML_TEXT_NODE
+					$nextSibling->nextSibling &&
+					$nextSibling->nextSibling instanceof DOMText
 					) {
-						$node = $node->nextSibling->nextSibling;
+					$node = $nextSibling->nextSibling;
 					} else {
 						$node = null;
 					}
@@ -680,10 +672,9 @@ class CommentParser {
 			$matchData = null;
 			// Allows us to mimic match.index in #getComments
 			if ( preg_match( $timestampRegex, $nodeText, $matchData, PREG_OFFSET_CAPTURE ) ) {
-				$matches[] = [ $startNode, $matchData ];
+			return $matchData;
 			}
-		}
-		return $matches;
+		return null;
 	}
 
 	/**
@@ -724,8 +715,7 @@ class CommentParser {
 	 * @return ThreadItem[] Thread items
 	 */
 	public function getComments( DOMElement $rootNode ) : array {
-		$timestamps = $this->findTimestamps( $rootNode );
-
+		$timestampRegex = $this->getLocalTimestampRegexp();
 		$comments = [];
 		$dfParser = $this->getLocalTimestampParser();
 
@@ -735,21 +725,17 @@ class CommentParser {
 
 		$curComment = $fakeHeading;
 
-		$nextTimestamp = 0;
 		$treeWalker = new TreeWalker(
 			$rootNode,
 			NodeFilter::SHOW_ELEMENT | NodeFilter::SHOW_TEXT,
 			[ self::class, 'acceptOnlyNodesAllowingComments' ]
 		);
 		while ( $node = $treeWalker->nextNode() ) {
-			if ( $node->nodeType === XML_ELEMENT_NODE && preg_match( '/^h[1-6]$/i', $node->nodeName ) ) {
+			if ( $node instanceof DOMElement && preg_match( '/^h[1-6]$/i', $node->tagName ) ) {
 				$range = new ImmutableRange( $node, 0, $node, $node->childNodes->length );
 				$curComment = new HeadingItem( $range );
 				$comments[] = $curComment;
-			} elseif (
-				$node instanceof DOMText &&
-				isset( $timestamps[$nextTimestamp] ) && $node === $timestamps[$nextTimestamp][0]
-			) {
+			} elseif ( $node instanceof DOMText && ( $match = $this->findTimestamp( $node, $timestampRegex ) ) ) {
 				$warnings = [];
 				$foundSignature = $this->findSignature( $node, $curComment->getRange()->endContainer );
 				$author = $foundSignature[1];
@@ -759,13 +745,11 @@ class CommentParser {
 				if ( !$author ) {
 					// Ignore timestamps for which we couldn't find a signature. It's probably not a real
 					// comment, but just a false match due to a copypasted timestamp.
-					$nextTimestamp++;
 					continue;
 				}
 
 				// Everything from the last comment up to here is the next comment
 				$startNode = $this->nextInterestingLeafNode( $curComment->getRange()->endContainer, $rootNode );
-				$match = $timestamps[$nextTimestamp][1];
 				$offset = $lastSigNode === $node ?
 					$match[0][1] + strlen( $match[0][0] ) :
 					CommentUtils::childIndexOf( $lastSigNode ) + 1;
@@ -814,7 +798,6 @@ class CommentParser {
 					$curComment->addSignatureRange( $sigRange );
 					$curComment->setLevel( min( min( $startLevel, $endLevel ), $curComment->getLevel() ) );
 
-					$nextTimestamp++;
 					continue;
 				}
 
@@ -837,7 +820,6 @@ class CommentParser {
 					$curComment->addWarnings( $warnings );
 				}
 				$comments[] = $curComment;
-				$nextTimestamp++;
 			}
 		}
 
diff --git a/modules/dt.debug.js b/modules/dt.debug.js
index 27ee2e475..1d86c43b4 100644
--- a/modules/dt.debug.js
+++ b/modules/dt.debug.js
@@ -1,18 +1,22 @@
 var
 	parser = require( 'ext.discussionTools.init' ).parser,
 	highlighter = require( './highlighter.js' ),
-	timestamps, comments, threads, i, node, match, signature, emptySignature;
-
-timestamps = parser.findTimestamps( document.getElementById( 'mw-content-text' ) );
-comments = parser.getComments( document.getElementById( 'mw-content-text' ) );
-threads = parser.groupThreads( comments );
+	comments = parser.getComments( document.getElementById( 'mw-content-text' ) ),
+	threads = parser.groupThreads( comments ),
+	timestampRegex = parser.getLocalTimestampRegexp();
 
 highlighter.markThreads( threads );
 
 // TODO: Use comment.signatureRanges to mark up signatures/timestamps
-for ( i = 0; i < timestamps.length; i++ ) {
-	node = timestamps[ i ][ 0 ];
-	match = timestamps[ i ][ 1 ];
+comments.forEach( function ( comment ) {
+	var signature, emptySignature, node, match;
+
+	if ( comment.type !== 'comment' ) {
+		return;
+	}
+
+	node = comment.range.endContainer;
+	match = parser.findTimestamp( node, timestampRegex );
 	signature = parser.findSignature( node )[ 0 ];
 	emptySignature = signature.length === 1 && signature[ 0 ] === node;
 	// Note that additional content may follow the timestamp (e.g. in some voting formats), but we
@@ -22,4 +26,4 @@ for ( i = 0; i < timestamps.length; i++ ) {
 	if ( !emptySignature ) {
 		highlighter.markSignature( signature );
 	}
-}
+} );
diff --git a/modules/parser.js b/modules/parser.js
index 8a52ddd2f..fb3de07b8 100644
--- a/modules/parser.js
+++ b/modules/parser.js
@@ -398,67 +398,48 @@ function acceptOnlyNodesAllowingComments( node ) {
 }
 
 /**
- * Find all timestamps within a DOM subtree.
+ * Find a timestamps in a given text node
  *
- * @param {HTMLElement} rootNode Node to search
- * @return {[Text, Array]} Results. Each result is a tuple containing:
- *  - Text node containing the timestamp
- *  - Regexp match data, which specifies the location of the match, and which
- *    can be parsed using #getLocalTimestampParser
+ * @param {Text} node Text node
+ * @param {string} timestampRegex Timestamp regex
+ * @return {Array} Regexp match data, which specifies the location of the match,
+ *  and which can be parsed using #getLocalTimestampParser
  */
-function findTimestamps( rootNode ) {
-	var
-		matches = [],
-		treeWalker = rootNode.ownerDocument.createTreeWalker(
-			rootNode,
-			NodeFilter.SHOW_TEXT,
-			acceptOnlyNodesAllowingComments,
-			false
-		),
-		dateRegexp = getLocalTimestampRegexp(),
-		node, startNode, nodeText, match;
+function findTimestamp( node, timestampRegex ) {
+	var nodeText = '';
+	while ( node ) {
+		nodeText += node.nodeValue;
 
-	while ( ( node = treeWalker.nextNode() ) ) {
-		startNode = node;
-		nodeText = '';
+		// In Parsoid HTML, entities are represented as a 'mw:Entity' node, rather than normal HTML
+		// entities. On Arabic Wikipedia, the "UTC" timezone name contains some non-breaking spaces,
+		// which apparently are often turned into &nbsp; entities by buggy editing tools. To handle
+		// this, we must piece together the text, so that our regexp can match those timestamps.
+		if (
+			node.nextSibling &&
+			node.nextSibling.nodeType === Node.ELEMENT_NODE &&
+			node.nextSibling.getAttribute( 'typeof' ) === 'mw:Entity'
+		) {
+			nodeText += node.nextSibling.firstChild.nodeValue;
 
-		while ( node ) {
-			nodeText += node.nodeValue;
-
-			// In Parsoid HTML, entities are represented as a 'mw:Entity' node, rather than normal HTML
-			// entities. On Arabic Wikipedia, the "UTC" timezone name contains some non-breaking spaces,
-			// which apparently are often turned into &nbsp; entities by buggy editing tools. To handle
-			// this, we must piece together the text, so that our regexp can match those timestamps.
+			// If the entity is followed by more text, do this again
 			if (
-				node.nextSibling &&
-				node.nextSibling.nodeType === Node.ELEMENT_NODE &&
-				node.nextSibling.getAttribute( 'typeof' ) === 'mw:Entity'
+				node.nextSibling.nextSibling &&
+				node.nextSibling.nextSibling.nodeType === Node.TEXT_NODE
 			) {
-				nodeText += node.nextSibling.firstChild.nodeValue;
-
-				// If the entity is followed by more text, do this again
-				if (
-					node.nextSibling.nextSibling &&
-					node.nextSibling.nextSibling.nodeType === Node.TEXT_NODE
-				) {
-					node = node.nextSibling.nextSibling;
-				} else {
-					node = null;
-				}
+				node = node.nextSibling.nextSibling;
 			} else {
 				node = null;
 			}
-		}
-
-		// Technically, there could be multiple matches in a single text node. However, the ultimate
-		// point of this is to find the signatures which precede the timestamps, and any later
-		// timestamps in the text node can't be directly preceded by a signature (as we require them to
-		// have links), so we only concern ourselves with the first match.
-		if ( ( match = nodeText.match( dateRegexp ) ) ) {
-			matches.push( [ startNode, match ] );
+		} else {
+			node = null;
 		}
 	}
-	return matches;
+
+	// Technically, there could be multiple matches in a single text node. However, the ultimate
+	// point of this is to find the signatures which precede the timestamps, and any later
+	// timestamps in the text node can't be directly preceded by a signature (as we require them to
+	// have links), so we only concern ourselves with the first match.
+	return nodeText.match( timestampRegex );
 }
 
 /**
@@ -703,13 +684,12 @@ function nextInterestingLeafNode( node, rootNode ) {
 function getComments( rootNode ) {
 	var
 		dfParser = getLocalTimestampParser(),
+		timestampRegex = getLocalTimestampRegexp(),
 		comments = [],
-		timestamps, nextTimestamp, treeWalker,
+		treeWalker,
 		node, range, fakeHeading, curComment,
 		foundSignature, firstSigNode, lastSigNode, sigRange, author, startNode, match, startLevel, endLevel, dateTime, warnings;
 
-	timestamps = findTimestamps( rootNode );
-
 	treeWalker = rootNode.ownerDocument.createTreeWalker(
 		rootNode,
 		// eslint-disable-next-line no-bitwise
@@ -729,7 +709,6 @@ function getComments( rootNode ) {
 
 	curComment = fakeHeading;
 
-	nextTimestamp = 0;
 	while ( ( node = treeWalker.nextNode() ) ) {
 		if ( node.tagName && node.tagName.match( /^h[1-6]$/i ) ) {
 			range = {
@@ -740,7 +719,7 @@ function getComments( rootNode ) {
 			};
 			curComment = new HeadingItem( range );
 			comments.push( curComment );
-		} else if ( timestamps[ nextTimestamp ] && node === timestamps[ nextTimestamp ][ 0 ] ) {
+		} else if ( node.nodeType === Node.TEXT_NODE && ( match = findTimestamp( node, timestampRegex ) ) ) {
 			warnings = [];
 			foundSignature = findSignature( node, curComment.range.endContainer );
 			author = foundSignature[ 1 ];
@@ -750,13 +729,11 @@ function getComments( rootNode ) {
 			if ( !author ) {
 				// Ignore timestamps for which we couldn't find a signature. It's probably not a real
 				// comment, but just a false match due to a copypasted timestamp.
-				nextTimestamp++;
 				continue;
 			}
 
 			// Everything from last comment up to here is the next comment
 			startNode = nextInterestingLeafNode( curComment.range.endContainer, rootNode );
-			match = timestamps[ nextTimestamp ][ 1 ];
 			range = {
 				startContainer: startNode.parentNode,
 				startOffset: utils.childIndexOf( startNode ),
@@ -792,8 +769,6 @@ function getComments( rootNode ) {
 				curComment.range.endOffset = range.endOffset;
 				curComment.signatureRanges.push( sigRange );
 				curComment.level = Math.min( Math.min( startLevel, endLevel ), curComment.level );
-
-				nextTimestamp++;
 				continue;
 			}
 
@@ -814,7 +789,6 @@ function getComments( rootNode ) {
 				curComment.warnings = warnings;
 			}
 			comments.push( curComment );
-			nextTimestamp++;
 		}
 	}
 
@@ -1002,7 +976,6 @@ function getTranscludedFrom( comment ) {
 }
 
 module.exports = {
-	findTimestamps: findTimestamps,
 	getLocalTimestampParser: getLocalTimestampParser,
 	getTimestampRegexp: getTimestampRegexp,
 	getTimestampParser: getTimestampParser,
@@ -1010,5 +983,8 @@ module.exports = {
 	groupThreads: groupThreads,
 	findSignature: findSignature,
 	getAuthors: getAuthors,
-	getTranscludedFrom: getTranscludedFrom
+	getTranscludedFrom: getTranscludedFrom,
+	// Only used by dtdebug
+	findTimestamp: findTimestamp,
+	getLocalTimestampRegexp: getLocalTimestampRegexp
 };
diff --git a/modules/utils.js b/modules/utils.js
index d9be37942..82fc03dcf 100644
--- a/modules/utils.js
+++ b/modules/utils.js
@@ -13,7 +13,7 @@ function getNativeRange( comment ) {
 		nativeRange = doc.createRange();
 	nativeRange.setStart( comment.range.startContainer, comment.range.startOffset );
 	// HACK: When the offset is outside the container, assume this is because of
-	// the 'mw:Entity' hack in parser#findTimestamps and adjust accordingly.
+	// the 'mw:Entity' hack in parser#findTimestamp and adjust accordingly.
 	// TODO: The parser should produce valid ranges!
 	endContainer = comment.range.endContainer;
 	endOffset = comment.range.endOffset;