From 092cfd6075a456b0139ca6617c3232b8034ddb13 Mon Sep 17 00:00:00 2001 From: Ed Sanders Date: Fri, 10 Jul 2020 15:16:49 +0100 Subject: [PATCH] Parser: Replace findTimestamps with findTimestamp Instead of doing a separate tree walk and finding all timestamps separately, make it part of the getComments tree walk, and find timestamps one at a time. Change-Id: I47f466eaf228504faa189fd99e07493bc7f022cd --- includes/CommentParser.php | 52 +++++++------------ modules/dt.debug.js | 22 ++++---- modules/parser.js | 100 ++++++++++++++----------------------- modules/utils.js | 2 +- 4 files changed, 69 insertions(+), 107 deletions(-) diff --git a/includes/CommentParser.php b/includes/CommentParser.php index 37df56e56..78c037773 100644 --- a/includes/CommentParser.php +++ b/includes/CommentParser.php @@ -10,7 +10,6 @@ use DateTimeZone; use DOMElement; use DOMNode; use DOMText; -use DOMXPath; use IP; use Language; use MediaWiki\MediaWikiServices; @@ -18,8 +17,6 @@ use MWException; use Title; // TODO clean up static vs non-static - -// TODO consider rewriting as single traversal, without XPath // TODO consider making timestamp parsing not a returned function class CommentParser { @@ -635,18 +632,13 @@ class CommentParser { } /** - * Find all timestamps within a DOM subtree. + * Find a timestamps in a given text node * - * @param DOMElement $rootNode - * @return array Array of [node, matchData] pairs + * @param DOMText $node Text node + * @param string $timestampRegex Timestamp regex + * @return array|null Match data */ - public function findTimestamps( DOMElement $rootNode ) : array { - $xpath = new DOMXPath( $rootNode->ownerDocument ); - $textNodes = $xpath->query( '//text()', $rootNode ); - $matches = []; - $timestampRegex = $this->getLocalTimestampRegexp(); - foreach ( $textNodes as $node ) { - $startNode = $node; + public function findTimestamp( DOMText $node, string $timestampRegex ) : ?array { $nodeText = ''; while ( $node ) { @@ -657,18 +649,18 @@ class CommentParser { // which apparently are often turned into   entities by buggy editing tools. To handle // this, we must piece together the text, so that our regexp can match those timestamps. if ( - $node->nextSibling && - $node->nextSibling->nodeType === XML_ELEMENT_NODE && - $node->nextSibling->getAttribute( 'typeof' ) === 'mw:Entity' + ( $nextSibling = $node->nextSibling ) && + $nextSibling instanceof DOMElement && + $nextSibling->getAttribute( 'typeof' ) === 'mw:Entity' ) { - $nodeText .= $node->nextSibling->firstChild->nodeValue; + $nodeText .= $nextSibling->firstChild->nodeValue; // If the entity is followed by more text, do this again if ( - $node->nextSibling->nextSibling && - $node->nextSibling->nextSibling->nodeType === XML_TEXT_NODE + $nextSibling->nextSibling && + $nextSibling->nextSibling instanceof DOMText ) { - $node = $node->nextSibling->nextSibling; + $node = $nextSibling->nextSibling; } else { $node = null; } @@ -680,10 +672,9 @@ class CommentParser { $matchData = null; // Allows us to mimic match.index in #getComments if ( preg_match( $timestampRegex, $nodeText, $matchData, PREG_OFFSET_CAPTURE ) ) { - $matches[] = [ $startNode, $matchData ]; + return $matchData; } - } - return $matches; + return null; } /** @@ -724,8 +715,7 @@ class CommentParser { * @return ThreadItem[] Thread items */ public function getComments( DOMElement $rootNode ) : array { - $timestamps = $this->findTimestamps( $rootNode ); - + $timestampRegex = $this->getLocalTimestampRegexp(); $comments = []; $dfParser = $this->getLocalTimestampParser(); @@ -735,21 +725,17 @@ class CommentParser { $curComment = $fakeHeading; - $nextTimestamp = 0; $treeWalker = new TreeWalker( $rootNode, NodeFilter::SHOW_ELEMENT | NodeFilter::SHOW_TEXT, [ self::class, 'acceptOnlyNodesAllowingComments' ] ); while ( $node = $treeWalker->nextNode() ) { - if ( $node->nodeType === XML_ELEMENT_NODE && preg_match( '/^h[1-6]$/i', $node->nodeName ) ) { + if ( $node instanceof DOMElement && preg_match( '/^h[1-6]$/i', $node->tagName ) ) { $range = new ImmutableRange( $node, 0, $node, $node->childNodes->length ); $curComment = new HeadingItem( $range ); $comments[] = $curComment; - } elseif ( - $node instanceof DOMText && - isset( $timestamps[$nextTimestamp] ) && $node === $timestamps[$nextTimestamp][0] - ) { + } elseif ( $node instanceof DOMText && ( $match = $this->findTimestamp( $node, $timestampRegex ) ) ) { $warnings = []; $foundSignature = $this->findSignature( $node, $curComment->getRange()->endContainer ); $author = $foundSignature[1]; @@ -759,13 +745,11 @@ class CommentParser { if ( !$author ) { // Ignore timestamps for which we couldn't find a signature. It's probably not a real // comment, but just a false match due to a copypasted timestamp. - $nextTimestamp++; continue; } // Everything from the last comment up to here is the next comment $startNode = $this->nextInterestingLeafNode( $curComment->getRange()->endContainer, $rootNode ); - $match = $timestamps[$nextTimestamp][1]; $offset = $lastSigNode === $node ? $match[0][1] + strlen( $match[0][0] ) : CommentUtils::childIndexOf( $lastSigNode ) + 1; @@ -814,7 +798,6 @@ class CommentParser { $curComment->addSignatureRange( $sigRange ); $curComment->setLevel( min( min( $startLevel, $endLevel ), $curComment->getLevel() ) ); - $nextTimestamp++; continue; } @@ -837,7 +820,6 @@ class CommentParser { $curComment->addWarnings( $warnings ); } $comments[] = $curComment; - $nextTimestamp++; } } diff --git a/modules/dt.debug.js b/modules/dt.debug.js index 27ee2e475..1d86c43b4 100644 --- a/modules/dt.debug.js +++ b/modules/dt.debug.js @@ -1,18 +1,22 @@ var parser = require( 'ext.discussionTools.init' ).parser, highlighter = require( './highlighter.js' ), - timestamps, comments, threads, i, node, match, signature, emptySignature; - -timestamps = parser.findTimestamps( document.getElementById( 'mw-content-text' ) ); -comments = parser.getComments( document.getElementById( 'mw-content-text' ) ); -threads = parser.groupThreads( comments ); + comments = parser.getComments( document.getElementById( 'mw-content-text' ) ), + threads = parser.groupThreads( comments ), + timestampRegex = parser.getLocalTimestampRegexp(); highlighter.markThreads( threads ); // TODO: Use comment.signatureRanges to mark up signatures/timestamps -for ( i = 0; i < timestamps.length; i++ ) { - node = timestamps[ i ][ 0 ]; - match = timestamps[ i ][ 1 ]; +comments.forEach( function ( comment ) { + var signature, emptySignature, node, match; + + if ( comment.type !== 'comment' ) { + return; + } + + node = comment.range.endContainer; + match = parser.findTimestamp( node, timestampRegex ); signature = parser.findSignature( node )[ 0 ]; emptySignature = signature.length === 1 && signature[ 0 ] === node; // Note that additional content may follow the timestamp (e.g. in some voting formats), but we @@ -22,4 +26,4 @@ for ( i = 0; i < timestamps.length; i++ ) { if ( !emptySignature ) { highlighter.markSignature( signature ); } -} +} ); diff --git a/modules/parser.js b/modules/parser.js index 8a52ddd2f..fb3de07b8 100644 --- a/modules/parser.js +++ b/modules/parser.js @@ -398,67 +398,48 @@ function acceptOnlyNodesAllowingComments( node ) { } /** - * Find all timestamps within a DOM subtree. + * Find a timestamps in a given text node * - * @param {HTMLElement} rootNode Node to search - * @return {[Text, Array]} Results. Each result is a tuple containing: - * - Text node containing the timestamp - * - Regexp match data, which specifies the location of the match, and which - * can be parsed using #getLocalTimestampParser + * @param {Text} node Text node + * @param {string} timestampRegex Timestamp regex + * @return {Array} Regexp match data, which specifies the location of the match, + * and which can be parsed using #getLocalTimestampParser */ -function findTimestamps( rootNode ) { - var - matches = [], - treeWalker = rootNode.ownerDocument.createTreeWalker( - rootNode, - NodeFilter.SHOW_TEXT, - acceptOnlyNodesAllowingComments, - false - ), - dateRegexp = getLocalTimestampRegexp(), - node, startNode, nodeText, match; +function findTimestamp( node, timestampRegex ) { + var nodeText = ''; + while ( node ) { + nodeText += node.nodeValue; - while ( ( node = treeWalker.nextNode() ) ) { - startNode = node; - nodeText = ''; + // In Parsoid HTML, entities are represented as a 'mw:Entity' node, rather than normal HTML + // entities. On Arabic Wikipedia, the "UTC" timezone name contains some non-breaking spaces, + // which apparently are often turned into   entities by buggy editing tools. To handle + // this, we must piece together the text, so that our regexp can match those timestamps. + if ( + node.nextSibling && + node.nextSibling.nodeType === Node.ELEMENT_NODE && + node.nextSibling.getAttribute( 'typeof' ) === 'mw:Entity' + ) { + nodeText += node.nextSibling.firstChild.nodeValue; - while ( node ) { - nodeText += node.nodeValue; - - // In Parsoid HTML, entities are represented as a 'mw:Entity' node, rather than normal HTML - // entities. On Arabic Wikipedia, the "UTC" timezone name contains some non-breaking spaces, - // which apparently are often turned into   entities by buggy editing tools. To handle - // this, we must piece together the text, so that our regexp can match those timestamps. + // If the entity is followed by more text, do this again if ( - node.nextSibling && - node.nextSibling.nodeType === Node.ELEMENT_NODE && - node.nextSibling.getAttribute( 'typeof' ) === 'mw:Entity' + node.nextSibling.nextSibling && + node.nextSibling.nextSibling.nodeType === Node.TEXT_NODE ) { - nodeText += node.nextSibling.firstChild.nodeValue; - - // If the entity is followed by more text, do this again - if ( - node.nextSibling.nextSibling && - node.nextSibling.nextSibling.nodeType === Node.TEXT_NODE - ) { - node = node.nextSibling.nextSibling; - } else { - node = null; - } + node = node.nextSibling.nextSibling; } else { node = null; } - } - - // Technically, there could be multiple matches in a single text node. However, the ultimate - // point of this is to find the signatures which precede the timestamps, and any later - // timestamps in the text node can't be directly preceded by a signature (as we require them to - // have links), so we only concern ourselves with the first match. - if ( ( match = nodeText.match( dateRegexp ) ) ) { - matches.push( [ startNode, match ] ); + } else { + node = null; } } - return matches; + + // Technically, there could be multiple matches in a single text node. However, the ultimate + // point of this is to find the signatures which precede the timestamps, and any later + // timestamps in the text node can't be directly preceded by a signature (as we require them to + // have links), so we only concern ourselves with the first match. + return nodeText.match( timestampRegex ); } /** @@ -703,13 +684,12 @@ function nextInterestingLeafNode( node, rootNode ) { function getComments( rootNode ) { var dfParser = getLocalTimestampParser(), + timestampRegex = getLocalTimestampRegexp(), comments = [], - timestamps, nextTimestamp, treeWalker, + treeWalker, node, range, fakeHeading, curComment, foundSignature, firstSigNode, lastSigNode, sigRange, author, startNode, match, startLevel, endLevel, dateTime, warnings; - timestamps = findTimestamps( rootNode ); - treeWalker = rootNode.ownerDocument.createTreeWalker( rootNode, // eslint-disable-next-line no-bitwise @@ -729,7 +709,6 @@ function getComments( rootNode ) { curComment = fakeHeading; - nextTimestamp = 0; while ( ( node = treeWalker.nextNode() ) ) { if ( node.tagName && node.tagName.match( /^h[1-6]$/i ) ) { range = { @@ -740,7 +719,7 @@ function getComments( rootNode ) { }; curComment = new HeadingItem( range ); comments.push( curComment ); - } else if ( timestamps[ nextTimestamp ] && node === timestamps[ nextTimestamp ][ 0 ] ) { + } else if ( node.nodeType === Node.TEXT_NODE && ( match = findTimestamp( node, timestampRegex ) ) ) { warnings = []; foundSignature = findSignature( node, curComment.range.endContainer ); author = foundSignature[ 1 ]; @@ -750,13 +729,11 @@ function getComments( rootNode ) { if ( !author ) { // Ignore timestamps for which we couldn't find a signature. It's probably not a real // comment, but just a false match due to a copypasted timestamp. - nextTimestamp++; continue; } // Everything from last comment up to here is the next comment startNode = nextInterestingLeafNode( curComment.range.endContainer, rootNode ); - match = timestamps[ nextTimestamp ][ 1 ]; range = { startContainer: startNode.parentNode, startOffset: utils.childIndexOf( startNode ), @@ -792,8 +769,6 @@ function getComments( rootNode ) { curComment.range.endOffset = range.endOffset; curComment.signatureRanges.push( sigRange ); curComment.level = Math.min( Math.min( startLevel, endLevel ), curComment.level ); - - nextTimestamp++; continue; } @@ -814,7 +789,6 @@ function getComments( rootNode ) { curComment.warnings = warnings; } comments.push( curComment ); - nextTimestamp++; } } @@ -1002,7 +976,6 @@ function getTranscludedFrom( comment ) { } module.exports = { - findTimestamps: findTimestamps, getLocalTimestampParser: getLocalTimestampParser, getTimestampRegexp: getTimestampRegexp, getTimestampParser: getTimestampParser, @@ -1010,5 +983,8 @@ module.exports = { groupThreads: groupThreads, findSignature: findSignature, getAuthors: getAuthors, - getTranscludedFrom: getTranscludedFrom + getTranscludedFrom: getTranscludedFrom, + // Only used by dtdebug + findTimestamp: findTimestamp, + getLocalTimestampRegexp: getLocalTimestampRegexp }; diff --git a/modules/utils.js b/modules/utils.js index d9be37942..82fc03dcf 100644 --- a/modules/utils.js +++ b/modules/utils.js @@ -13,7 +13,7 @@ function getNativeRange( comment ) { nativeRange = doc.createRange(); nativeRange.setStart( comment.range.startContainer, comment.range.startOffset ); // HACK: When the offset is outside the container, assume this is because of - // the 'mw:Entity' hack in parser#findTimestamps and adjust accordingly. + // the 'mw:Entity' hack in parser#findTimestamp and adjust accordingly. // TODO: The parser should produce valid ranges! endContainer = comment.range.endContainer; endOffset = comment.range.endOffset;