From 308c2747b03d35223edff1dc07eb81227f111358 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bartosz=20Dziewo=C5=84ski?= Date: Wed, 15 Jul 2020 16:53:11 +0200 Subject: [PATCH] CommentParser.php: Use tree walking instead of XPath This is similar to what the JS version does. The TreeWalker and NodeFilter classes are adapted from https://github.com/Krinkle/dom-TreeWalker-polyfill (MIT license). This makes #getComments twice as fast on en-big-oldparser.html Change-Id: I2441f33e6e7bad753ac830d277e6a2e81ee8c93d --- includes/CommentParser.php | 74 +++++++++++++-------- includes/NodeFilter.php | 60 +++++++++++++++++ includes/TreeWalker.php | 133 +++++++++++++++++++++++++++++++++++++ 3 files changed, 238 insertions(+), 29 deletions(-) create mode 100644 includes/NodeFilter.php create mode 100644 includes/TreeWalker.php diff --git a/includes/CommentParser.php b/includes/CommentParser.php index ca32e00b5..37df56e56 100644 --- a/includes/CommentParser.php +++ b/includes/CommentParser.php @@ -103,21 +103,16 @@ class CommentParser { * @return DOMNode */ private function nextInterestingLeafNode( DOMNode $node, DOMElement $rootNode ) : DOMNode { - $n = $node; - do { - if ( $n->firstChild && ( $node === $rootNode || $n !== $node ) ) { - $n = $n->firstChild; - } elseif ( $n->nextSibling ) { - $n = $n->nextSibling; - } else { - while ( $n && $n !== $rootNode && !$n->nextSibling ) { - $n = $n->parentNode; + $treeWalker = new TreeWalker( + $rootNode, + NodeFilter::SHOW_ELEMENT | NodeFilter::SHOW_TEXT, + function ( $n ) use ( $node, $rootNode ) { + // Ignore this node and its descendants + // (unless it's the root node, this is a special case for "fakeHeading" handling) + if ( $node !== $rootNode && ( $n === $node || $n->parentNode === $node ) ) { + return NodeFilter::FILTER_REJECT; } - $n = $n->nextSibling; - } - - if ( - $n && ( + if ( ( $n->nodeType === XML_TEXT_NODE && CommentUtils::htmlTrim( $n->nodeValue ) !== '' @@ -127,12 +122,18 @@ class CommentParser { CommentUtils::htmlTrim( $n->nodeValue ) !== '' ) || ( $n->nodeType === XML_ELEMENT_NODE && !$n->firstChild ) - ) - ) { - return $n; + ) { + return NodeFilter::FILTER_ACCEPT; + } + return NodeFilter::FILTER_SKIP; } - } while ( $n && $n !== $rootNode ); - throw new MWException( 'nextInterestingLeafNode not found' ); + ); + $treeWalker->currentNode = $node; + $treeWalker->nextNode(); + if ( !$treeWalker->currentNode ) { + throw new MWException( 'nextInterestingLeafNode not found' ); + } + return $treeWalker->currentNode; } /** @@ -618,6 +619,21 @@ class CommentParser { return [ $sigNodes, $sigUsername ]; } + /** + * Callback for TreeWalker that will skip over nodes where we don't want to detect + * comments (or section headings). + * + * @param DOMNode $node + * @return int Appropriate NodeFilter constant + */ + public static function acceptOnlyNodesAllowingComments( DOMNode $node ) { + // The table of contents has a heading that gets erroneously detected as a section + if ( $node instanceof DOMElement && $node->getAttribute( 'id' ) === 'toc' ) { + return NodeFilter::FILTER_REJECT; + } + return NodeFilter::FILTER_ACCEPT; + } + /** * Find all timestamps within a DOM subtree. * @@ -710,9 +726,6 @@ class CommentParser { public function getComments( DOMElement $rootNode ) : array { $timestamps = $this->findTimestamps( $rootNode ); - $xpath = new DOMXPath( $rootNode->ownerDocument ); - $allNodes = $xpath->query( '//text()|//node()', $rootNode ); - $tocNode = $rootNode->ownerDocument->getElementById( 'toc' ); $comments = []; $dfParser = $this->getLocalTimestampParser(); @@ -723,17 +736,20 @@ class CommentParser { $curComment = $fakeHeading; $nextTimestamp = 0; - foreach ( $allNodes as $node ) { - // Skip nodes inside
- if ( $tocNode && CommentUtils::contains( $tocNode, $node ) ) { - continue; - } - + $treeWalker = new TreeWalker( + $rootNode, + NodeFilter::SHOW_ELEMENT | NodeFilter::SHOW_TEXT, + [ self::class, 'acceptOnlyNodesAllowingComments' ] + ); + while ( $node = $treeWalker->nextNode() ) { if ( $node->nodeType === XML_ELEMENT_NODE && preg_match( '/^h[1-6]$/i', $node->nodeName ) ) { $range = new ImmutableRange( $node, 0, $node, $node->childNodes->length ); $curComment = new HeadingItem( $range ); $comments[] = $curComment; - } elseif ( isset( $timestamps[$nextTimestamp] ) && $node === $timestamps[$nextTimestamp][0] ) { + } elseif ( + $node instanceof DOMText && + isset( $timestamps[$nextTimestamp] ) && $node === $timestamps[$nextTimestamp][0] + ) { $warnings = []; $foundSignature = $this->findSignature( $node, $curComment->getRange()->endContainer ); $author = $foundSignature[1]; diff --git a/includes/NodeFilter.php b/includes/NodeFilter.php new file mode 100644 index 000000000..8b8b38f4c --- /dev/null +++ b/includes/NodeFilter.php @@ -0,0 +1,60 @@ +active ) { + throw new Exception( 'DOMException: INVALID_STATE_ERR' ); + } + + $this->active = true; + $result = call_user_func( $this->filter, $node ); + $this->active = false; + + return $result; + } +} diff --git a/includes/TreeWalker.php b/includes/TreeWalker.php new file mode 100644 index 000000000..e546c9b81 --- /dev/null +++ b/includes/TreeWalker.php @@ -0,0 +1,133 @@ +nodeType - 1 ) ) & $tw->whatToShow ) ) ) { + return NodeFilter::FILTER_SKIP; + } + + if ( $tw->filter === null ) { + return NodeFilter::FILTER_ACCEPT; + } + + return $tw->filter->acceptNode( $node ); + } + + /** + * Based on WebKit's NodeTraversal::nextSkippingChildren + * https://trac.webkit.org/browser/trunk/Source/WebCore/dom/NodeTraversal.h?rev=137221#L103 + * + * @param DOMNode $node + * @param DOMNode $stayWithin + * @return DOMNode|null + */ + private function nextSkippingChildren( DOMNode $node, DOMNode $stayWithin ) : ?DOMNode { + if ( $node === $stayWithin ) { + return null; + } + if ( $node->nextSibling !== null ) { + return $node->nextSibling; + } + + /** + * Based on WebKit's NodeTraversal::nextAncestorSibling + * https://trac.webkit.org/browser/trunk/Source/WebCore/dom/NodeTraversal.cpp?rev=137221#L43 + */ + while ( $node->parentNode !== null ) { + $node = $node->parentNode; + if ( $node === $stayWithin ) { + return null; + } + if ( $node->nextSibling !== null ) { + return $node->nextSibling; + } + } + return null; + } + + /** + * See https://dom.spec.whatwg.org/#interface-treewalker + * + * @param DOMNode $root + * @param int|null $whatToShow + * @param callable|null $filter + * @throws Exception + */ + public function __construct( DOMNode $root, $whatToShow = null, callable $filter = null ) { + if ( !$root->nodeType ) { + throw new Exception( 'DOMException: NOT_SUPPORTED_ERR' ); + } + + $this->root = $root; + $this->whatToShow = (int)$whatToShow ?: 0; + + $this->currentNode = $root; + + if ( !$filter ) { + $this->filter = null; + } else { + $this->filter = new NodeFilter(); + $this->filter->filter = $filter; + } + } + + /** + * See https://dom.spec.whatwg.org/#dom-treewalker-nextnode + * + * @return DOMNode|null The current node + */ + public function nextNode() : ?DOMNode { + $node = $this->currentNode; + $result = NodeFilter::FILTER_ACCEPT; + + while ( true ) { + while ( $result !== NodeFilter::FILTER_REJECT && $node->firstChild !== null ) { + $node = $node->firstChild; + $result = $this->nodeFilter( $this, $node ); + if ( $result === NodeFilter::FILTER_ACCEPT ) { + $this->currentNode = $node; + return $node; + } + } + $following = $this->nextSkippingChildren( $node, $this->root ); + if ( $following !== null ) { + $node = $following; + } else { + return null; + } + $result = $this->nodeFilter( $this, $node ); + if ( $result === NodeFilter::FILTER_ACCEPT ) { + $this->currentNode = $node; + return $node; + } + } + } +}