mirror of
https://gerrit.wikimedia.org/r/mediawiki/extensions/DiscussionTools
synced 2024-11-23 16:06:53 +00:00
Parser: Replace findTimestamps with findTimestamp
Instead of doing a separate tree walk and finding all timestamps separately, make it part of the getComments tree walk, and find timestamps one at a time. Change-Id: I47f466eaf228504faa189fd99e07493bc7f022cd
This commit is contained in:
parent
308c2747b0
commit
092cfd6075
|
@ -10,7 +10,6 @@ use DateTimeZone;
|
|||
use DOMElement;
|
||||
use DOMNode;
|
||||
use DOMText;
|
||||
use DOMXPath;
|
||||
use IP;
|
||||
use Language;
|
||||
use MediaWiki\MediaWikiServices;
|
||||
|
@ -18,8 +17,6 @@ use MWException;
|
|||
use Title;
|
||||
|
||||
// TODO clean up static vs non-static
|
||||
|
||||
// TODO consider rewriting as single traversal, without XPath
|
||||
// TODO consider making timestamp parsing not a returned function
|
||||
|
||||
class CommentParser {
|
||||
|
@ -635,18 +632,13 @@ class CommentParser {
|
|||
}
|
||||
|
||||
/**
|
||||
* Find all timestamps within a DOM subtree.
|
||||
* Find a timestamps in a given text node
|
||||
*
|
||||
* @param DOMElement $rootNode
|
||||
* @return array Array of [node, matchData] pairs
|
||||
* @param DOMText $node Text node
|
||||
* @param string $timestampRegex Timestamp regex
|
||||
* @return array|null Match data
|
||||
*/
|
||||
public function findTimestamps( DOMElement $rootNode ) : array {
|
||||
$xpath = new DOMXPath( $rootNode->ownerDocument );
|
||||
$textNodes = $xpath->query( '//text()', $rootNode );
|
||||
$matches = [];
|
||||
$timestampRegex = $this->getLocalTimestampRegexp();
|
||||
foreach ( $textNodes as $node ) {
|
||||
$startNode = $node;
|
||||
public function findTimestamp( DOMText $node, string $timestampRegex ) : ?array {
|
||||
$nodeText = '';
|
||||
|
||||
while ( $node ) {
|
||||
|
@ -657,18 +649,18 @@ class CommentParser {
|
|||
// which apparently are often turned into entities by buggy editing tools. To handle
|
||||
// this, we must piece together the text, so that our regexp can match those timestamps.
|
||||
if (
|
||||
$node->nextSibling &&
|
||||
$node->nextSibling->nodeType === XML_ELEMENT_NODE &&
|
||||
$node->nextSibling->getAttribute( 'typeof' ) === 'mw:Entity'
|
||||
( $nextSibling = $node->nextSibling ) &&
|
||||
$nextSibling instanceof DOMElement &&
|
||||
$nextSibling->getAttribute( 'typeof' ) === 'mw:Entity'
|
||||
) {
|
||||
$nodeText .= $node->nextSibling->firstChild->nodeValue;
|
||||
$nodeText .= $nextSibling->firstChild->nodeValue;
|
||||
|
||||
// If the entity is followed by more text, do this again
|
||||
if (
|
||||
$node->nextSibling->nextSibling &&
|
||||
$node->nextSibling->nextSibling->nodeType === XML_TEXT_NODE
|
||||
$nextSibling->nextSibling &&
|
||||
$nextSibling->nextSibling instanceof DOMText
|
||||
) {
|
||||
$node = $node->nextSibling->nextSibling;
|
||||
$node = $nextSibling->nextSibling;
|
||||
} else {
|
||||
$node = null;
|
||||
}
|
||||
|
@ -680,10 +672,9 @@ class CommentParser {
|
|||
$matchData = null;
|
||||
// Allows us to mimic match.index in #getComments
|
||||
if ( preg_match( $timestampRegex, $nodeText, $matchData, PREG_OFFSET_CAPTURE ) ) {
|
||||
$matches[] = [ $startNode, $matchData ];
|
||||
return $matchData;
|
||||
}
|
||||
}
|
||||
return $matches;
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -724,8 +715,7 @@ class CommentParser {
|
|||
* @return ThreadItem[] Thread items
|
||||
*/
|
||||
public function getComments( DOMElement $rootNode ) : array {
|
||||
$timestamps = $this->findTimestamps( $rootNode );
|
||||
|
||||
$timestampRegex = $this->getLocalTimestampRegexp();
|
||||
$comments = [];
|
||||
$dfParser = $this->getLocalTimestampParser();
|
||||
|
||||
|
@ -735,21 +725,17 @@ class CommentParser {
|
|||
|
||||
$curComment = $fakeHeading;
|
||||
|
||||
$nextTimestamp = 0;
|
||||
$treeWalker = new TreeWalker(
|
||||
$rootNode,
|
||||
NodeFilter::SHOW_ELEMENT | NodeFilter::SHOW_TEXT,
|
||||
[ self::class, 'acceptOnlyNodesAllowingComments' ]
|
||||
);
|
||||
while ( $node = $treeWalker->nextNode() ) {
|
||||
if ( $node->nodeType === XML_ELEMENT_NODE && preg_match( '/^h[1-6]$/i', $node->nodeName ) ) {
|
||||
if ( $node instanceof DOMElement && preg_match( '/^h[1-6]$/i', $node->tagName ) ) {
|
||||
$range = new ImmutableRange( $node, 0, $node, $node->childNodes->length );
|
||||
$curComment = new HeadingItem( $range );
|
||||
$comments[] = $curComment;
|
||||
} elseif (
|
||||
$node instanceof DOMText &&
|
||||
isset( $timestamps[$nextTimestamp] ) && $node === $timestamps[$nextTimestamp][0]
|
||||
) {
|
||||
} elseif ( $node instanceof DOMText && ( $match = $this->findTimestamp( $node, $timestampRegex ) ) ) {
|
||||
$warnings = [];
|
||||
$foundSignature = $this->findSignature( $node, $curComment->getRange()->endContainer );
|
||||
$author = $foundSignature[1];
|
||||
|
@ -759,13 +745,11 @@ class CommentParser {
|
|||
if ( !$author ) {
|
||||
// Ignore timestamps for which we couldn't find a signature. It's probably not a real
|
||||
// comment, but just a false match due to a copypasted timestamp.
|
||||
$nextTimestamp++;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Everything from the last comment up to here is the next comment
|
||||
$startNode = $this->nextInterestingLeafNode( $curComment->getRange()->endContainer, $rootNode );
|
||||
$match = $timestamps[$nextTimestamp][1];
|
||||
$offset = $lastSigNode === $node ?
|
||||
$match[0][1] + strlen( $match[0][0] ) :
|
||||
CommentUtils::childIndexOf( $lastSigNode ) + 1;
|
||||
|
@ -814,7 +798,6 @@ class CommentParser {
|
|||
$curComment->addSignatureRange( $sigRange );
|
||||
$curComment->setLevel( min( min( $startLevel, $endLevel ), $curComment->getLevel() ) );
|
||||
|
||||
$nextTimestamp++;
|
||||
continue;
|
||||
}
|
||||
|
||||
|
@ -837,7 +820,6 @@ class CommentParser {
|
|||
$curComment->addWarnings( $warnings );
|
||||
}
|
||||
$comments[] = $curComment;
|
||||
$nextTimestamp++;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -1,18 +1,22 @@
|
|||
var
|
||||
parser = require( 'ext.discussionTools.init' ).parser,
|
||||
highlighter = require( './highlighter.js' ),
|
||||
timestamps, comments, threads, i, node, match, signature, emptySignature;
|
||||
|
||||
timestamps = parser.findTimestamps( document.getElementById( 'mw-content-text' ) );
|
||||
comments = parser.getComments( document.getElementById( 'mw-content-text' ) );
|
||||
threads = parser.groupThreads( comments );
|
||||
comments = parser.getComments( document.getElementById( 'mw-content-text' ) ),
|
||||
threads = parser.groupThreads( comments ),
|
||||
timestampRegex = parser.getLocalTimestampRegexp();
|
||||
|
||||
highlighter.markThreads( threads );
|
||||
|
||||
// TODO: Use comment.signatureRanges to mark up signatures/timestamps
|
||||
for ( i = 0; i < timestamps.length; i++ ) {
|
||||
node = timestamps[ i ][ 0 ];
|
||||
match = timestamps[ i ][ 1 ];
|
||||
comments.forEach( function ( comment ) {
|
||||
var signature, emptySignature, node, match;
|
||||
|
||||
if ( comment.type !== 'comment' ) {
|
||||
return;
|
||||
}
|
||||
|
||||
node = comment.range.endContainer;
|
||||
match = parser.findTimestamp( node, timestampRegex );
|
||||
signature = parser.findSignature( node )[ 0 ];
|
||||
emptySignature = signature.length === 1 && signature[ 0 ] === node;
|
||||
// Note that additional content may follow the timestamp (e.g. in some voting formats), but we
|
||||
|
@ -22,4 +26,4 @@ for ( i = 0; i < timestamps.length; i++ ) {
|
|||
if ( !emptySignature ) {
|
||||
highlighter.markSignature( signature );
|
||||
}
|
||||
}
|
||||
} );
|
||||
|
|
|
@ -398,67 +398,48 @@ function acceptOnlyNodesAllowingComments( node ) {
|
|||
}
|
||||
|
||||
/**
|
||||
* Find all timestamps within a DOM subtree.
|
||||
* Find a timestamps in a given text node
|
||||
*
|
||||
* @param {HTMLElement} rootNode Node to search
|
||||
* @return {[Text, Array]} Results. Each result is a tuple containing:
|
||||
* - Text node containing the timestamp
|
||||
* - Regexp match data, which specifies the location of the match, and which
|
||||
* can be parsed using #getLocalTimestampParser
|
||||
* @param {Text} node Text node
|
||||
* @param {string} timestampRegex Timestamp regex
|
||||
* @return {Array} Regexp match data, which specifies the location of the match,
|
||||
* and which can be parsed using #getLocalTimestampParser
|
||||
*/
|
||||
function findTimestamps( rootNode ) {
|
||||
var
|
||||
matches = [],
|
||||
treeWalker = rootNode.ownerDocument.createTreeWalker(
|
||||
rootNode,
|
||||
NodeFilter.SHOW_TEXT,
|
||||
acceptOnlyNodesAllowingComments,
|
||||
false
|
||||
),
|
||||
dateRegexp = getLocalTimestampRegexp(),
|
||||
node, startNode, nodeText, match;
|
||||
function findTimestamp( node, timestampRegex ) {
|
||||
var nodeText = '';
|
||||
while ( node ) {
|
||||
nodeText += node.nodeValue;
|
||||
|
||||
while ( ( node = treeWalker.nextNode() ) ) {
|
||||
startNode = node;
|
||||
nodeText = '';
|
||||
// In Parsoid HTML, entities are represented as a 'mw:Entity' node, rather than normal HTML
|
||||
// entities. On Arabic Wikipedia, the "UTC" timezone name contains some non-breaking spaces,
|
||||
// which apparently are often turned into entities by buggy editing tools. To handle
|
||||
// this, we must piece together the text, so that our regexp can match those timestamps.
|
||||
if (
|
||||
node.nextSibling &&
|
||||
node.nextSibling.nodeType === Node.ELEMENT_NODE &&
|
||||
node.nextSibling.getAttribute( 'typeof' ) === 'mw:Entity'
|
||||
) {
|
||||
nodeText += node.nextSibling.firstChild.nodeValue;
|
||||
|
||||
while ( node ) {
|
||||
nodeText += node.nodeValue;
|
||||
|
||||
// In Parsoid HTML, entities are represented as a 'mw:Entity' node, rather than normal HTML
|
||||
// entities. On Arabic Wikipedia, the "UTC" timezone name contains some non-breaking spaces,
|
||||
// which apparently are often turned into entities by buggy editing tools. To handle
|
||||
// this, we must piece together the text, so that our regexp can match those timestamps.
|
||||
// If the entity is followed by more text, do this again
|
||||
if (
|
||||
node.nextSibling &&
|
||||
node.nextSibling.nodeType === Node.ELEMENT_NODE &&
|
||||
node.nextSibling.getAttribute( 'typeof' ) === 'mw:Entity'
|
||||
node.nextSibling.nextSibling &&
|
||||
node.nextSibling.nextSibling.nodeType === Node.TEXT_NODE
|
||||
) {
|
||||
nodeText += node.nextSibling.firstChild.nodeValue;
|
||||
|
||||
// If the entity is followed by more text, do this again
|
||||
if (
|
||||
node.nextSibling.nextSibling &&
|
||||
node.nextSibling.nextSibling.nodeType === Node.TEXT_NODE
|
||||
) {
|
||||
node = node.nextSibling.nextSibling;
|
||||
} else {
|
||||
node = null;
|
||||
}
|
||||
node = node.nextSibling.nextSibling;
|
||||
} else {
|
||||
node = null;
|
||||
}
|
||||
}
|
||||
|
||||
// Technically, there could be multiple matches in a single text node. However, the ultimate
|
||||
// point of this is to find the signatures which precede the timestamps, and any later
|
||||
// timestamps in the text node can't be directly preceded by a signature (as we require them to
|
||||
// have links), so we only concern ourselves with the first match.
|
||||
if ( ( match = nodeText.match( dateRegexp ) ) ) {
|
||||
matches.push( [ startNode, match ] );
|
||||
} else {
|
||||
node = null;
|
||||
}
|
||||
}
|
||||
return matches;
|
||||
|
||||
// Technically, there could be multiple matches in a single text node. However, the ultimate
|
||||
// point of this is to find the signatures which precede the timestamps, and any later
|
||||
// timestamps in the text node can't be directly preceded by a signature (as we require them to
|
||||
// have links), so we only concern ourselves with the first match.
|
||||
return nodeText.match( timestampRegex );
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -703,13 +684,12 @@ function nextInterestingLeafNode( node, rootNode ) {
|
|||
function getComments( rootNode ) {
|
||||
var
|
||||
dfParser = getLocalTimestampParser(),
|
||||
timestampRegex = getLocalTimestampRegexp(),
|
||||
comments = [],
|
||||
timestamps, nextTimestamp, treeWalker,
|
||||
treeWalker,
|
||||
node, range, fakeHeading, curComment,
|
||||
foundSignature, firstSigNode, lastSigNode, sigRange, author, startNode, match, startLevel, endLevel, dateTime, warnings;
|
||||
|
||||
timestamps = findTimestamps( rootNode );
|
||||
|
||||
treeWalker = rootNode.ownerDocument.createTreeWalker(
|
||||
rootNode,
|
||||
// eslint-disable-next-line no-bitwise
|
||||
|
@ -729,7 +709,6 @@ function getComments( rootNode ) {
|
|||
|
||||
curComment = fakeHeading;
|
||||
|
||||
nextTimestamp = 0;
|
||||
while ( ( node = treeWalker.nextNode() ) ) {
|
||||
if ( node.tagName && node.tagName.match( /^h[1-6]$/i ) ) {
|
||||
range = {
|
||||
|
@ -740,7 +719,7 @@ function getComments( rootNode ) {
|
|||
};
|
||||
curComment = new HeadingItem( range );
|
||||
comments.push( curComment );
|
||||
} else if ( timestamps[ nextTimestamp ] && node === timestamps[ nextTimestamp ][ 0 ] ) {
|
||||
} else if ( node.nodeType === Node.TEXT_NODE && ( match = findTimestamp( node, timestampRegex ) ) ) {
|
||||
warnings = [];
|
||||
foundSignature = findSignature( node, curComment.range.endContainer );
|
||||
author = foundSignature[ 1 ];
|
||||
|
@ -750,13 +729,11 @@ function getComments( rootNode ) {
|
|||
if ( !author ) {
|
||||
// Ignore timestamps for which we couldn't find a signature. It's probably not a real
|
||||
// comment, but just a false match due to a copypasted timestamp.
|
||||
nextTimestamp++;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Everything from last comment up to here is the next comment
|
||||
startNode = nextInterestingLeafNode( curComment.range.endContainer, rootNode );
|
||||
match = timestamps[ nextTimestamp ][ 1 ];
|
||||
range = {
|
||||
startContainer: startNode.parentNode,
|
||||
startOffset: utils.childIndexOf( startNode ),
|
||||
|
@ -792,8 +769,6 @@ function getComments( rootNode ) {
|
|||
curComment.range.endOffset = range.endOffset;
|
||||
curComment.signatureRanges.push( sigRange );
|
||||
curComment.level = Math.min( Math.min( startLevel, endLevel ), curComment.level );
|
||||
|
||||
nextTimestamp++;
|
||||
continue;
|
||||
}
|
||||
|
||||
|
@ -814,7 +789,6 @@ function getComments( rootNode ) {
|
|||
curComment.warnings = warnings;
|
||||
}
|
||||
comments.push( curComment );
|
||||
nextTimestamp++;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1002,7 +976,6 @@ function getTranscludedFrom( comment ) {
|
|||
}
|
||||
|
||||
module.exports = {
|
||||
findTimestamps: findTimestamps,
|
||||
getLocalTimestampParser: getLocalTimestampParser,
|
||||
getTimestampRegexp: getTimestampRegexp,
|
||||
getTimestampParser: getTimestampParser,
|
||||
|
@ -1010,5 +983,8 @@ module.exports = {
|
|||
groupThreads: groupThreads,
|
||||
findSignature: findSignature,
|
||||
getAuthors: getAuthors,
|
||||
getTranscludedFrom: getTranscludedFrom
|
||||
getTranscludedFrom: getTranscludedFrom,
|
||||
// Only used by dtdebug
|
||||
findTimestamp: findTimestamp,
|
||||
getLocalTimestampRegexp: getLocalTimestampRegexp
|
||||
};
|
||||
|
|
|
@ -13,7 +13,7 @@ function getNativeRange( comment ) {
|
|||
nativeRange = doc.createRange();
|
||||
nativeRange.setStart( comment.range.startContainer, comment.range.startOffset );
|
||||
// HACK: When the offset is outside the container, assume this is because of
|
||||
// the 'mw:Entity' hack in parser#findTimestamps and adjust accordingly.
|
||||
// the 'mw:Entity' hack in parser#findTimestamp and adjust accordingly.
|
||||
// TODO: The parser should produce valid ranges!
|
||||
endContainer = comment.range.endContainer;
|
||||
endOffset = comment.range.endOffset;
|
||||
|
|
Loading…
Reference in a new issue