Parser: Replace findTimestamps with findTimestamp

Instead of doing a separate tree walk and finding all timestamps
separately, make it part of the getComments tree walk, and find
timestamps one at a time.

Change-Id: I47f466eaf228504faa189fd99e07493bc7f022cd
This commit is contained in:
Ed Sanders 2020-07-10 15:16:49 +01:00 committed by Bartosz Dziewoński
parent 308c2747b0
commit 092cfd6075
4 changed files with 69 additions and 107 deletions

View file

@ -10,7 +10,6 @@ use DateTimeZone;
use DOMElement;
use DOMNode;
use DOMText;
use DOMXPath;
use IP;
use Language;
use MediaWiki\MediaWikiServices;
@ -18,8 +17,6 @@ use MWException;
use Title;
// TODO clean up static vs non-static
// TODO consider rewriting as single traversal, without XPath
// TODO consider making timestamp parsing not a returned function
class CommentParser {
@ -635,18 +632,13 @@ class CommentParser {
}
/**
* Find all timestamps within a DOM subtree.
* Find a timestamps in a given text node
*
* @param DOMElement $rootNode
* @return array Array of [node, matchData] pairs
* @param DOMText $node Text node
* @param string $timestampRegex Timestamp regex
* @return array|null Match data
*/
public function findTimestamps( DOMElement $rootNode ) : array {
$xpath = new DOMXPath( $rootNode->ownerDocument );
$textNodes = $xpath->query( '//text()', $rootNode );
$matches = [];
$timestampRegex = $this->getLocalTimestampRegexp();
foreach ( $textNodes as $node ) {
$startNode = $node;
public function findTimestamp( DOMText $node, string $timestampRegex ) : ?array {
$nodeText = '';
while ( $node ) {
@ -657,18 +649,18 @@ class CommentParser {
// which apparently are often turned into   entities by buggy editing tools. To handle
// this, we must piece together the text, so that our regexp can match those timestamps.
if (
$node->nextSibling &&
$node->nextSibling->nodeType === XML_ELEMENT_NODE &&
$node->nextSibling->getAttribute( 'typeof' ) === 'mw:Entity'
( $nextSibling = $node->nextSibling ) &&
$nextSibling instanceof DOMElement &&
$nextSibling->getAttribute( 'typeof' ) === 'mw:Entity'
) {
$nodeText .= $node->nextSibling->firstChild->nodeValue;
$nodeText .= $nextSibling->firstChild->nodeValue;
// If the entity is followed by more text, do this again
if (
$node->nextSibling->nextSibling &&
$node->nextSibling->nextSibling->nodeType === XML_TEXT_NODE
$nextSibling->nextSibling &&
$nextSibling->nextSibling instanceof DOMText
) {
$node = $node->nextSibling->nextSibling;
$node = $nextSibling->nextSibling;
} else {
$node = null;
}
@ -680,10 +672,9 @@ class CommentParser {
$matchData = null;
// Allows us to mimic match.index in #getComments
if ( preg_match( $timestampRegex, $nodeText, $matchData, PREG_OFFSET_CAPTURE ) ) {
$matches[] = [ $startNode, $matchData ];
return $matchData;
}
}
return $matches;
return null;
}
/**
@ -724,8 +715,7 @@ class CommentParser {
* @return ThreadItem[] Thread items
*/
public function getComments( DOMElement $rootNode ) : array {
$timestamps = $this->findTimestamps( $rootNode );
$timestampRegex = $this->getLocalTimestampRegexp();
$comments = [];
$dfParser = $this->getLocalTimestampParser();
@ -735,21 +725,17 @@ class CommentParser {
$curComment = $fakeHeading;
$nextTimestamp = 0;
$treeWalker = new TreeWalker(
$rootNode,
NodeFilter::SHOW_ELEMENT | NodeFilter::SHOW_TEXT,
[ self::class, 'acceptOnlyNodesAllowingComments' ]
);
while ( $node = $treeWalker->nextNode() ) {
if ( $node->nodeType === XML_ELEMENT_NODE && preg_match( '/^h[1-6]$/i', $node->nodeName ) ) {
if ( $node instanceof DOMElement && preg_match( '/^h[1-6]$/i', $node->tagName ) ) {
$range = new ImmutableRange( $node, 0, $node, $node->childNodes->length );
$curComment = new HeadingItem( $range );
$comments[] = $curComment;
} elseif (
$node instanceof DOMText &&
isset( $timestamps[$nextTimestamp] ) && $node === $timestamps[$nextTimestamp][0]
) {
} elseif ( $node instanceof DOMText && ( $match = $this->findTimestamp( $node, $timestampRegex ) ) ) {
$warnings = [];
$foundSignature = $this->findSignature( $node, $curComment->getRange()->endContainer );
$author = $foundSignature[1];
@ -759,13 +745,11 @@ class CommentParser {
if ( !$author ) {
// Ignore timestamps for which we couldn't find a signature. It's probably not a real
// comment, but just a false match due to a copypasted timestamp.
$nextTimestamp++;
continue;
}
// Everything from the last comment up to here is the next comment
$startNode = $this->nextInterestingLeafNode( $curComment->getRange()->endContainer, $rootNode );
$match = $timestamps[$nextTimestamp][1];
$offset = $lastSigNode === $node ?
$match[0][1] + strlen( $match[0][0] ) :
CommentUtils::childIndexOf( $lastSigNode ) + 1;
@ -814,7 +798,6 @@ class CommentParser {
$curComment->addSignatureRange( $sigRange );
$curComment->setLevel( min( min( $startLevel, $endLevel ), $curComment->getLevel() ) );
$nextTimestamp++;
continue;
}
@ -837,7 +820,6 @@ class CommentParser {
$curComment->addWarnings( $warnings );
}
$comments[] = $curComment;
$nextTimestamp++;
}
}

View file

@ -1,18 +1,22 @@
var
parser = require( 'ext.discussionTools.init' ).parser,
highlighter = require( './highlighter.js' ),
timestamps, comments, threads, i, node, match, signature, emptySignature;
timestamps = parser.findTimestamps( document.getElementById( 'mw-content-text' ) );
comments = parser.getComments( document.getElementById( 'mw-content-text' ) );
threads = parser.groupThreads( comments );
comments = parser.getComments( document.getElementById( 'mw-content-text' ) ),
threads = parser.groupThreads( comments ),
timestampRegex = parser.getLocalTimestampRegexp();
highlighter.markThreads( threads );
// TODO: Use comment.signatureRanges to mark up signatures/timestamps
for ( i = 0; i < timestamps.length; i++ ) {
node = timestamps[ i ][ 0 ];
match = timestamps[ i ][ 1 ];
comments.forEach( function ( comment ) {
var signature, emptySignature, node, match;
if ( comment.type !== 'comment' ) {
return;
}
node = comment.range.endContainer;
match = parser.findTimestamp( node, timestampRegex );
signature = parser.findSignature( node )[ 0 ];
emptySignature = signature.length === 1 && signature[ 0 ] === node;
// Note that additional content may follow the timestamp (e.g. in some voting formats), but we
@ -22,4 +26,4 @@ for ( i = 0; i < timestamps.length; i++ ) {
if ( !emptySignature ) {
highlighter.markSignature( signature );
}
}
} );

View file

@ -398,67 +398,48 @@ function acceptOnlyNodesAllowingComments( node ) {
}
/**
* Find all timestamps within a DOM subtree.
* Find a timestamps in a given text node
*
* @param {HTMLElement} rootNode Node to search
* @return {[Text, Array]} Results. Each result is a tuple containing:
* - Text node containing the timestamp
* - Regexp match data, which specifies the location of the match, and which
* can be parsed using #getLocalTimestampParser
* @param {Text} node Text node
* @param {string} timestampRegex Timestamp regex
* @return {Array} Regexp match data, which specifies the location of the match,
* and which can be parsed using #getLocalTimestampParser
*/
function findTimestamps( rootNode ) {
var
matches = [],
treeWalker = rootNode.ownerDocument.createTreeWalker(
rootNode,
NodeFilter.SHOW_TEXT,
acceptOnlyNodesAllowingComments,
false
),
dateRegexp = getLocalTimestampRegexp(),
node, startNode, nodeText, match;
function findTimestamp( node, timestampRegex ) {
var nodeText = '';
while ( node ) {
nodeText += node.nodeValue;
while ( ( node = treeWalker.nextNode() ) ) {
startNode = node;
nodeText = '';
// In Parsoid HTML, entities are represented as a 'mw:Entity' node, rather than normal HTML
// entities. On Arabic Wikipedia, the "UTC" timezone name contains some non-breaking spaces,
// which apparently are often turned into &nbsp; entities by buggy editing tools. To handle
// this, we must piece together the text, so that our regexp can match those timestamps.
if (
node.nextSibling &&
node.nextSibling.nodeType === Node.ELEMENT_NODE &&
node.nextSibling.getAttribute( 'typeof' ) === 'mw:Entity'
) {
nodeText += node.nextSibling.firstChild.nodeValue;
while ( node ) {
nodeText += node.nodeValue;
// In Parsoid HTML, entities are represented as a 'mw:Entity' node, rather than normal HTML
// entities. On Arabic Wikipedia, the "UTC" timezone name contains some non-breaking spaces,
// which apparently are often turned into &nbsp; entities by buggy editing tools. To handle
// this, we must piece together the text, so that our regexp can match those timestamps.
// If the entity is followed by more text, do this again
if (
node.nextSibling &&
node.nextSibling.nodeType === Node.ELEMENT_NODE &&
node.nextSibling.getAttribute( 'typeof' ) === 'mw:Entity'
node.nextSibling.nextSibling &&
node.nextSibling.nextSibling.nodeType === Node.TEXT_NODE
) {
nodeText += node.nextSibling.firstChild.nodeValue;
// If the entity is followed by more text, do this again
if (
node.nextSibling.nextSibling &&
node.nextSibling.nextSibling.nodeType === Node.TEXT_NODE
) {
node = node.nextSibling.nextSibling;
} else {
node = null;
}
node = node.nextSibling.nextSibling;
} else {
node = null;
}
}
// Technically, there could be multiple matches in a single text node. However, the ultimate
// point of this is to find the signatures which precede the timestamps, and any later
// timestamps in the text node can't be directly preceded by a signature (as we require them to
// have links), so we only concern ourselves with the first match.
if ( ( match = nodeText.match( dateRegexp ) ) ) {
matches.push( [ startNode, match ] );
} else {
node = null;
}
}
return matches;
// Technically, there could be multiple matches in a single text node. However, the ultimate
// point of this is to find the signatures which precede the timestamps, and any later
// timestamps in the text node can't be directly preceded by a signature (as we require them to
// have links), so we only concern ourselves with the first match.
return nodeText.match( timestampRegex );
}
/**
@ -703,13 +684,12 @@ function nextInterestingLeafNode( node, rootNode ) {
function getComments( rootNode ) {
var
dfParser = getLocalTimestampParser(),
timestampRegex = getLocalTimestampRegexp(),
comments = [],
timestamps, nextTimestamp, treeWalker,
treeWalker,
node, range, fakeHeading, curComment,
foundSignature, firstSigNode, lastSigNode, sigRange, author, startNode, match, startLevel, endLevel, dateTime, warnings;
timestamps = findTimestamps( rootNode );
treeWalker = rootNode.ownerDocument.createTreeWalker(
rootNode,
// eslint-disable-next-line no-bitwise
@ -729,7 +709,6 @@ function getComments( rootNode ) {
curComment = fakeHeading;
nextTimestamp = 0;
while ( ( node = treeWalker.nextNode() ) ) {
if ( node.tagName && node.tagName.match( /^h[1-6]$/i ) ) {
range = {
@ -740,7 +719,7 @@ function getComments( rootNode ) {
};
curComment = new HeadingItem( range );
comments.push( curComment );
} else if ( timestamps[ nextTimestamp ] && node === timestamps[ nextTimestamp ][ 0 ] ) {
} else if ( node.nodeType === Node.TEXT_NODE && ( match = findTimestamp( node, timestampRegex ) ) ) {
warnings = [];
foundSignature = findSignature( node, curComment.range.endContainer );
author = foundSignature[ 1 ];
@ -750,13 +729,11 @@ function getComments( rootNode ) {
if ( !author ) {
// Ignore timestamps for which we couldn't find a signature. It's probably not a real
// comment, but just a false match due to a copypasted timestamp.
nextTimestamp++;
continue;
}
// Everything from last comment up to here is the next comment
startNode = nextInterestingLeafNode( curComment.range.endContainer, rootNode );
match = timestamps[ nextTimestamp ][ 1 ];
range = {
startContainer: startNode.parentNode,
startOffset: utils.childIndexOf( startNode ),
@ -792,8 +769,6 @@ function getComments( rootNode ) {
curComment.range.endOffset = range.endOffset;
curComment.signatureRanges.push( sigRange );
curComment.level = Math.min( Math.min( startLevel, endLevel ), curComment.level );
nextTimestamp++;
continue;
}
@ -814,7 +789,6 @@ function getComments( rootNode ) {
curComment.warnings = warnings;
}
comments.push( curComment );
nextTimestamp++;
}
}
@ -1002,7 +976,6 @@ function getTranscludedFrom( comment ) {
}
module.exports = {
findTimestamps: findTimestamps,
getLocalTimestampParser: getLocalTimestampParser,
getTimestampRegexp: getTimestampRegexp,
getTimestampParser: getTimestampParser,
@ -1010,5 +983,8 @@ module.exports = {
groupThreads: groupThreads,
findSignature: findSignature,
getAuthors: getAuthors,
getTranscludedFrom: getTranscludedFrom
getTranscludedFrom: getTranscludedFrom,
// Only used by dtdebug
findTimestamp: findTimestamp,
getLocalTimestampRegexp: getLocalTimestampRegexp
};

View file

@ -13,7 +13,7 @@ function getNativeRange( comment ) {
nativeRange = doc.createRange();
nativeRange.setStart( comment.range.startContainer, comment.range.startOffset );
// HACK: When the offset is outside the container, assume this is because of
// the 'mw:Entity' hack in parser#findTimestamps and adjust accordingly.
// the 'mw:Entity' hack in parser#findTimestamp and adjust accordingly.
// TODO: The parser should produce valid ranges!
endContainer = comment.range.endContainer;
endOffset = comment.range.endOffset;