CommentParser.php: Use tree walking instead of XPath

This is similar to what the JS version does.

The TreeWalker and NodeFilter classes are adapted from
https://github.com/Krinkle/dom-TreeWalker-polyfill
(MIT license).

This makes #getComments twice as fast on en-big-oldparser.html.

Change-Id: I2441f33e6e7bad753ac830d277e6a2e81ee8c93d
This commit is contained in:
Bartosz Dziewoński 2020-07-15 16:53:11 +02:00
parent 08b467bf9f
commit 308c2747b0
3 changed files with 238 additions and 29 deletions

View file

@ -103,21 +103,16 @@ class CommentParser {
* @return DOMNode
*/
private function nextInterestingLeafNode( DOMNode $node, DOMElement $rootNode ) : DOMNode {
$n = $node;
do {
if ( $n->firstChild && ( $node === $rootNode || $n !== $node ) ) {
$n = $n->firstChild;
} elseif ( $n->nextSibling ) {
$n = $n->nextSibling;
} else {
while ( $n && $n !== $rootNode && !$n->nextSibling ) {
$n = $n->parentNode;
$treeWalker = new TreeWalker(
$rootNode,
NodeFilter::SHOW_ELEMENT | NodeFilter::SHOW_TEXT,
function ( $n ) use ( $node, $rootNode ) {
// Ignore this node and its descendants
// (unless it's the root node, this is a special case for "fakeHeading" handling)
if ( $node !== $rootNode && ( $n === $node || $n->parentNode === $node ) ) {
return NodeFilter::FILTER_REJECT;
}
$n = $n->nextSibling;
}
if (
$n && (
if (
(
$n->nodeType === XML_TEXT_NODE &&
CommentUtils::htmlTrim( $n->nodeValue ) !== ''
@ -127,12 +122,18 @@ class CommentParser {
CommentUtils::htmlTrim( $n->nodeValue ) !== ''
) ||
( $n->nodeType === XML_ELEMENT_NODE && !$n->firstChild )
)
) {
return $n;
) {
return NodeFilter::FILTER_ACCEPT;
}
return NodeFilter::FILTER_SKIP;
}
} while ( $n && $n !== $rootNode );
throw new MWException( 'nextInterestingLeafNode not found' );
);
$treeWalker->currentNode = $node;
$treeWalker->nextNode();
if ( !$treeWalker->currentNode ) {
throw new MWException( 'nextInterestingLeafNode not found' );
}
return $treeWalker->currentNode;
}
/**
@ -618,6 +619,21 @@ class CommentParser {
return [ $sigNodes, $sigUsername ];
}
/**
 * TreeWalker filter callback that excludes the parts of the document where
 * comments (and section headings) must not be detected.
 *
 * @param DOMNode $node Node currently being visited by the walker
 * @return int NodeFilter::FILTER_REJECT for the element with id="toc",
 *  NodeFilter::FILTER_ACCEPT for every other node
 */
public static function acceptOnlyNodesAllowingComments( DOMNode $node ) {
	// The heading inside the table of contents would otherwise be
	// erroneously detected as a section
	$isTableOfContents = $node instanceof DOMElement && $node->getAttribute( 'id' ) === 'toc';

	return $isTableOfContents ? NodeFilter::FILTER_REJECT : NodeFilter::FILTER_ACCEPT;
}
/**
* Find all timestamps within a DOM subtree.
*
@ -710,9 +726,6 @@ class CommentParser {
public function getComments( DOMElement $rootNode ) : array {
$timestamps = $this->findTimestamps( $rootNode );
$xpath = new DOMXPath( $rootNode->ownerDocument );
$allNodes = $xpath->query( '//text()|//node()', $rootNode );
$tocNode = $rootNode->ownerDocument->getElementById( 'toc' );
$comments = [];
$dfParser = $this->getLocalTimestampParser();
@ -723,17 +736,20 @@ class CommentParser {
$curComment = $fakeHeading;
$nextTimestamp = 0;
foreach ( $allNodes as $node ) {
// Skip nodes inside <div id="toc">
if ( $tocNode && CommentUtils::contains( $tocNode, $node ) ) {
continue;
}
$treeWalker = new TreeWalker(
$rootNode,
NodeFilter::SHOW_ELEMENT | NodeFilter::SHOW_TEXT,
[ self::class, 'acceptOnlyNodesAllowingComments' ]
);
while ( $node = $treeWalker->nextNode() ) {
if ( $node->nodeType === XML_ELEMENT_NODE && preg_match( '/^h[1-6]$/i', $node->nodeName ) ) {
$range = new ImmutableRange( $node, 0, $node, $node->childNodes->length );
$curComment = new HeadingItem( $range );
$comments[] = $curComment;
} elseif ( isset( $timestamps[$nextTimestamp] ) && $node === $timestamps[$nextTimestamp][0] ) {
} elseif (
$node instanceof DOMText &&
isset( $timestamps[$nextTimestamp] ) && $node === $timestamps[$nextTimestamp][0]
) {
$warnings = [];
$foundSignature = $this->findSignature( $node, $curComment->getRange()->endContainer );
$author = $foundSignature[1];

60
includes/NodeFilter.php Normal file
View file

@ -0,0 +1,60 @@
<?php
namespace MediaWiki\Extension\DiscussionTools;
use DOMNode;
use Exception;
/**
 * Partial implementation of W3 DOM4 NodeFilter interface.
 *
 * Holds a user-supplied callback in $filter and runs it through acceptNode(),
 * which additionally refuses to be re-entered while the callback is running.
 *
 * See also:
 * - https://dom.spec.whatwg.org/#interface-nodefilter
 *
 * Adapted from https://github.com/Krinkle/dom-TreeWalker-polyfill/blob/master/src/TreeWalker-polyfill.js
 */
class NodeFilter {

	// Constants for acceptNode()
	public const FILTER_ACCEPT = 1;
	public const FILTER_REJECT = 2;
	public const FILTER_SKIP = 3;

	// Constants for whatToShow
	public const SHOW_ALL = 0xFFFFFFFF;
	public const SHOW_ELEMENT = 0x1;
	public const SHOW_ATTRIBUTE = 0x2;
	public const SHOW_TEXT = 0x4;
	public const SHOW_CDATA_SECTION = 0x8;
	public const SHOW_ENTITY_REFERENCE = 0x10;
	public const SHOW_ENTITY = 0x20;
	public const SHOW_PROCESSING_INSTRUCTION = 0x40;
	public const SHOW_COMMENT = 0x80;
	public const SHOW_DOCUMENT = 0x100;
	public const SHOW_DOCUMENT_TYPE = 0x200;
	public const SHOW_DOCUMENT_FRAGMENT = 0x400;
	public const SHOW_NOTATION = 0x800;

	/** @var callable Callback that receives a node and returns one of the FILTER_* constants */
	public $filter;

	/** @var bool True only while the callback is executing; used to detect re-entrant calls */
	private $active = false;

	/**
	 * See https://dom.spec.whatwg.org/#dom-nodefilter-acceptnode
	 *
	 * @param DOMNode $node
	 * @return int Constant NodeFilter::FILTER_ACCEPT,
	 *  NodeFilter::FILTER_REJECT or NodeFilter::FILTER_SKIP.
	 * @throws Exception If called again while the callback is still running.
	 *  Anything thrown by the callback itself propagates unchanged.
	 */
	public function acceptNode( $node ) {
		if ( $this->active ) {
			throw new Exception( 'DOMException: INVALID_STATE_ERR' );
		}

		$this->active = true;
		try {
			return ( $this->filter )( $node );
		} finally {
			// Always clear the flag, even when the callback throws. Otherwise a
			// single failing callback would make every later call report
			// INVALID_STATE_ERR.
			$this->active = false;
		}
	}
}

133
includes/TreeWalker.php Normal file
View file

@ -0,0 +1,133 @@
<?php
namespace MediaWiki\Extension\DiscussionTools;
use DOMNode;
use Exception;
/**
 * Partial implementation of W3 DOM4 TreeWalker interface.
 *
 * Only forward traversal via nextNode() is implemented.
 *
 * See also:
 * - https://dom.spec.whatwg.org/#interface-treewalker
 *
 * Adapted from https://github.com/Krinkle/dom-TreeWalker-polyfill/blob/master/src/TreeWalker-polyfill.js
 */
class TreeWalker {

	/** @var DOMNode Node passed to the constructor; nextNode() stops once it climbs back up to it */
	public $root;

	/** @var int Bitmask of NodeFilter::SHOW_* constants selecting which node types are considered */
	public $whatToShow;

	/** @var DOMNode Node from which the next call to nextNode() continues */
	public $currentNode;

	/** @var NodeFilter|null Wrapper around the callback given to the constructor, null if none */
	public $filter;

	/**
	 * See https://dom.spec.whatwg.org/#concept-node-filter
	 *
	 * @param TreeWalker $tw
	 * @param DOMNode $node
	 * @return int Constant NodeFilter::FILTER_ACCEPT,
	 *  NodeFilter::FILTER_REJECT or NodeFilter::FILTER_SKIP.
	 */
	private function nodeFilter( TreeWalker $tw, DOMNode $node ) {
		// Maps nodeType to whatToShow: node type N corresponds to bit N-1 of the mask
		if ( !( ( ( 1 << ( $node->nodeType - 1 ) ) & $tw->whatToShow ) ) ) {
			return NodeFilter::FILTER_SKIP;
		}

		if ( $tw->filter === null ) {
			return NodeFilter::FILTER_ACCEPT;
		}

		return $tw->filter->acceptNode( $node );
	}

	/**
	 * Find the node that follows $node in document order without descending
	 * into $node's children, never going past $stayWithin.
	 *
	 * Based on WebKit's NodeTraversal::nextSkippingChildren
	 * https://trac.webkit.org/browser/trunk/Source/WebCore/dom/NodeTraversal.h?rev=137221#L103
	 *
	 * @param DOMNode $node
	 * @param DOMNode $stayWithin
	 * @return DOMNode|null
	 */
	private function nextSkippingChildren( DOMNode $node, DOMNode $stayWithin ) : ?DOMNode {
		if ( $node === $stayWithin ) {
			return null;
		}
		if ( $node->nextSibling !== null ) {
			return $node->nextSibling;
		}

		/**
		 * Based on WebKit's NodeTraversal::nextAncestorSibling
		 * https://trac.webkit.org/browser/trunk/Source/WebCore/dom/NodeTraversal.cpp?rev=137221#L43
		 */
		while ( $node->parentNode !== null ) {
			$node = $node->parentNode;
			if ( $node === $stayWithin ) {
				return null;
			}
			if ( $node->nextSibling !== null ) {
				return $node->nextSibling;
			}
		}
		return null;
	}

	/**
	 * See https://dom.spec.whatwg.org/#interface-treewalker
	 *
	 * @param DOMNode $root
	 * @param int|null $whatToShow Bitmask of NodeFilter::SHOW_* constants.
	 *  Null (or omitted) means NodeFilter::SHOW_ALL, matching the default of
	 *  Document.createTreeWalker() in the DOM Standard. An explicit 0 shows nothing.
	 * @param callable|null $filter Receives a DOMNode, returns a NodeFilter::FILTER_* constant
	 * @throws Exception
	 */
	public function __construct( DOMNode $root, $whatToShow = null, ?callable $filter = null ) {
		if ( !$root->nodeType ) {
			throw new Exception( 'DOMException: NOT_SUPPORTED_ERR' );
		}

		$this->root = $root;
		// A missing mask must not be turned into 0: that would filter out every
		// node and make the walker silently return nothing.
		$this->whatToShow = $whatToShow === null ? NodeFilter::SHOW_ALL : (int)$whatToShow;
		$this->currentNode = $root;

		if ( $filter === null ) {
			$this->filter = null;
		} else {
			$this->filter = new NodeFilter();
			$this->filter->filter = $filter;
		}
	}

	/**
	 * Advance to the next accepted node in document order.
	 *
	 * When no further node is accepted, null is returned and $currentNode is
	 * left unchanged.
	 *
	 * See https://dom.spec.whatwg.org/#dom-treewalker-nextnode
	 *
	 * @return DOMNode|null The new current node, or null if there is none
	 */
	public function nextNode() : ?DOMNode {
		$node = $this->currentNode;
		$result = NodeFilter::FILTER_ACCEPT;

		while ( true ) {
			// Descend depth-first, unless the node we are on was rejected
			// (FILTER_REJECT excludes the whole subtree, FILTER_SKIP only the node)
			while ( $result !== NodeFilter::FILTER_REJECT && $node->firstChild !== null ) {
				$node = $node->firstChild;
				$result = $this->nodeFilter( $this, $node );
				if ( $result === NodeFilter::FILTER_ACCEPT ) {
					$this->currentNode = $node;
					return $node;
				}
			}

			// No (more) children to visit: move on to the next sibling, or to
			// the next sibling of the closest ancestor that has one
			$following = $this->nextSkippingChildren( $node, $this->root );
			if ( $following !== null ) {
				$node = $following;
			} else {
				return null;
			}

			$result = $this->nodeFilter( $this, $node );
			if ( $result === NodeFilter::FILTER_ACCEPT ) {
				$this->currentNode = $node;
				return $node;
			}
		}
	}
}