mirror of
https://gerrit.wikimedia.org/r/mediawiki/extensions/DiscussionTools
synced 2024-11-24 00:13:36 +00:00
CommentParser.php: Use tree walking instead of XPath
This is similar to what the JS version does. The TreeWalker and NodeFilter classes are adapted from https://github.com/Krinkle/dom-TreeWalker-polyfill (MIT license). This makes #getComments twice as fast on en-big-oldparser.html Change-Id: I2441f33e6e7bad753ac830d277e6a2e81ee8c93d
This commit is contained in:
parent
08b467bf9f
commit
308c2747b0
|
@ -103,21 +103,16 @@ class CommentParser {
|
|||
* @return DOMNode
|
||||
*/
|
||||
private function nextInterestingLeafNode( DOMNode $node, DOMElement $rootNode ) : DOMNode {
|
||||
$n = $node;
|
||||
do {
|
||||
if ( $n->firstChild && ( $node === $rootNode || $n !== $node ) ) {
|
||||
$n = $n->firstChild;
|
||||
} elseif ( $n->nextSibling ) {
|
||||
$n = $n->nextSibling;
|
||||
} else {
|
||||
while ( $n && $n !== $rootNode && !$n->nextSibling ) {
|
||||
$n = $n->parentNode;
|
||||
$treeWalker = new TreeWalker(
|
||||
$rootNode,
|
||||
NodeFilter::SHOW_ELEMENT | NodeFilter::SHOW_TEXT,
|
||||
function ( $n ) use ( $node, $rootNode ) {
|
||||
// Ignore this node and its descendants
|
||||
// (unless it's the root node, this is a special case for "fakeHeading" handling)
|
||||
if ( $node !== $rootNode && ( $n === $node || $n->parentNode === $node ) ) {
|
||||
return NodeFilter::FILTER_REJECT;
|
||||
}
|
||||
$n = $n->nextSibling;
|
||||
}
|
||||
|
||||
if (
|
||||
$n && (
|
||||
if (
|
||||
(
|
||||
$n->nodeType === XML_TEXT_NODE &&
|
||||
CommentUtils::htmlTrim( $n->nodeValue ) !== ''
|
||||
|
@ -127,12 +122,18 @@ class CommentParser {
|
|||
CommentUtils::htmlTrim( $n->nodeValue ) !== ''
|
||||
) ||
|
||||
( $n->nodeType === XML_ELEMENT_NODE && !$n->firstChild )
|
||||
)
|
||||
) {
|
||||
return $n;
|
||||
) {
|
||||
return NodeFilter::FILTER_ACCEPT;
|
||||
}
|
||||
return NodeFilter::FILTER_SKIP;
|
||||
}
|
||||
} while ( $n && $n !== $rootNode );
|
||||
throw new MWException( 'nextInterestingLeafNode not found' );
|
||||
);
|
||||
$treeWalker->currentNode = $node;
|
||||
$treeWalker->nextNode();
|
||||
if ( !$treeWalker->currentNode ) {
|
||||
throw new MWException( 'nextInterestingLeafNode not found' );
|
||||
}
|
||||
return $treeWalker->currentNode;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -618,6 +619,21 @@ class CommentParser {
|
|||
return [ $sigNodes, $sigUsername ];
|
||||
}
|
||||
|
||||
/**
|
||||
* Callback for TreeWalker that will skip over nodes where we don't want to detect
|
||||
* comments (or section headings).
|
||||
*
|
||||
* @param DOMNode $node
|
||||
* @return int Appropriate NodeFilter constant
|
||||
*/
|
||||
public static function acceptOnlyNodesAllowingComments( DOMNode $node ) {
|
||||
// The table of contents has a heading that gets erroneously detected as a section
|
||||
if ( $node instanceof DOMElement && $node->getAttribute( 'id' ) === 'toc' ) {
|
||||
return NodeFilter::FILTER_REJECT;
|
||||
}
|
||||
return NodeFilter::FILTER_ACCEPT;
|
||||
}
|
||||
|
||||
/**
|
||||
* Find all timestamps within a DOM subtree.
|
||||
*
|
||||
|
@ -710,9 +726,6 @@ class CommentParser {
|
|||
public function getComments( DOMElement $rootNode ) : array {
|
||||
$timestamps = $this->findTimestamps( $rootNode );
|
||||
|
||||
$xpath = new DOMXPath( $rootNode->ownerDocument );
|
||||
$allNodes = $xpath->query( '//text()|//node()', $rootNode );
|
||||
$tocNode = $rootNode->ownerDocument->getElementById( 'toc' );
|
||||
$comments = [];
|
||||
$dfParser = $this->getLocalTimestampParser();
|
||||
|
||||
|
@ -723,17 +736,20 @@ class CommentParser {
|
|||
$curComment = $fakeHeading;
|
||||
|
||||
$nextTimestamp = 0;
|
||||
foreach ( $allNodes as $node ) {
|
||||
// Skip nodes inside <div id="toc">
|
||||
if ( $tocNode && CommentUtils::contains( $tocNode, $node ) ) {
|
||||
continue;
|
||||
}
|
||||
|
||||
$treeWalker = new TreeWalker(
|
||||
$rootNode,
|
||||
NodeFilter::SHOW_ELEMENT | NodeFilter::SHOW_TEXT,
|
||||
[ self::class, 'acceptOnlyNodesAllowingComments' ]
|
||||
);
|
||||
while ( $node = $treeWalker->nextNode() ) {
|
||||
if ( $node->nodeType === XML_ELEMENT_NODE && preg_match( '/^h[1-6]$/i', $node->nodeName ) ) {
|
||||
$range = new ImmutableRange( $node, 0, $node, $node->childNodes->length );
|
||||
$curComment = new HeadingItem( $range );
|
||||
$comments[] = $curComment;
|
||||
} elseif ( isset( $timestamps[$nextTimestamp] ) && $node === $timestamps[$nextTimestamp][0] ) {
|
||||
} elseif (
|
||||
$node instanceof DOMText &&
|
||||
isset( $timestamps[$nextTimestamp] ) && $node === $timestamps[$nextTimestamp][0]
|
||||
) {
|
||||
$warnings = [];
|
||||
$foundSignature = $this->findSignature( $node, $curComment->getRange()->endContainer );
|
||||
$author = $foundSignature[1];
|
||||
|
|
60
includes/NodeFilter.php
Normal file
60
includes/NodeFilter.php
Normal file
|
@ -0,0 +1,60 @@
|
|||
<?php
|
||||
|
||||
namespace MediaWiki\Extension\DiscussionTools;
|
||||
|
||||
use DOMNode;
|
||||
use Exception;
|
||||
|
||||
/**
|
||||
* Partial implementation of W3 DOM4 NodeFilter interface.
|
||||
*
|
||||
* See also:
|
||||
* - https://dom.spec.whatwg.org/#interface-nodefilter
|
||||
*
|
||||
* Adapted from https://github.com/Krinkle/dom-TreeWalker-polyfill/blob/master/src/TreeWalker-polyfill.js
|
||||
*/
|
||||
class NodeFilter {
|
||||
|
||||
// Constants for acceptNode()
|
||||
public const FILTER_ACCEPT = 1;
|
||||
public const FILTER_REJECT = 2;
|
||||
public const FILTER_SKIP = 3;
|
||||
|
||||
// Constants for whatToShow
|
||||
public const SHOW_ALL = 0xFFFFFFFF;
|
||||
public const SHOW_ELEMENT = 0x1;
|
||||
public const SHOW_ATTRIBUTE = 0x2;
|
||||
public const SHOW_TEXT = 0x4;
|
||||
public const SHOW_CDATA_SECTION = 0x8;
|
||||
public const SHOW_ENTITY_REFERENCE = 0x10;
|
||||
public const SHOW_ENTITY = 0x20;
|
||||
public const SHOW_PROCESSING_INSTRUCTION = 0x40;
|
||||
public const SHOW_COMMENT = 0x80;
|
||||
public const SHOW_DOCUMENT = 0x100;
|
||||
public const SHOW_DOCUMENT_TYPE = 0x200;
|
||||
public const SHOW_DOCUMENT_FRAGMENT = 0x400;
|
||||
public const SHOW_NOTATION = 0x800;
|
||||
|
||||
public $filter;
|
||||
|
||||
private $active = false;
|
||||
|
||||
/**
|
||||
* See https://dom.spec.whatwg.org/#dom-nodefilter-acceptnode
|
||||
*
|
||||
* @param DOMNode $node
|
||||
* @return int Constant NodeFilter::FILTER_ACCEPT,
|
||||
* NodeFilter::FILTER_REJECT or NodeFilter::FILTER_SKIP.
|
||||
*/
|
||||
public function acceptNode( $node ) {
|
||||
if ( $this->active ) {
|
||||
throw new Exception( 'DOMException: INVALID_STATE_ERR' );
|
||||
}
|
||||
|
||||
$this->active = true;
|
||||
$result = call_user_func( $this->filter, $node );
|
||||
$this->active = false;
|
||||
|
||||
return $result;
|
||||
}
|
||||
}
|
133
includes/TreeWalker.php
Normal file
133
includes/TreeWalker.php
Normal file
|
@ -0,0 +1,133 @@
|
|||
<?php
|
||||
|
||||
namespace MediaWiki\Extension\DiscussionTools;
|
||||
|
||||
use DOMNode;
|
||||
use Exception;
|
||||
|
||||
/**
|
||||
* Partial implementation of W3 DOM4 TreeWalker interface.
|
||||
*
|
||||
* See also:
|
||||
* - https://dom.spec.whatwg.org/#interface-treewalker
|
||||
*
|
||||
* Adapted from https://github.com/Krinkle/dom-TreeWalker-polyfill/blob/master/src/TreeWalker-polyfill.js
|
||||
*/
|
||||
class TreeWalker {
|
||||
|
||||
public $root;
|
||||
public $whatToShow;
|
||||
public $currentNode;
|
||||
public $filter;
|
||||
|
||||
/**
|
||||
* See https://dom.spec.whatwg.org/#concept-node-filter
|
||||
*
|
||||
* @param TreeWalker $tw
|
||||
* @param DOMNode $node
|
||||
* @return int Constant NodeFilter::FILTER_ACCEPT,
|
||||
* NodeFilter::FILTER_REJECT or NodeFilter::FILTER_SKIP.
|
||||
*/
|
||||
private function nodeFilter( TreeWalker $tw, DOMNode $node ) {
|
||||
// Maps nodeType to whatToShow
|
||||
if ( !( ( ( 1 << ( $node->nodeType - 1 ) ) & $tw->whatToShow ) ) ) {
|
||||
return NodeFilter::FILTER_SKIP;
|
||||
}
|
||||
|
||||
if ( $tw->filter === null ) {
|
||||
return NodeFilter::FILTER_ACCEPT;
|
||||
}
|
||||
|
||||
return $tw->filter->acceptNode( $node );
|
||||
}
|
||||
|
||||
/**
|
||||
* Based on WebKit's NodeTraversal::nextSkippingChildren
|
||||
* https://trac.webkit.org/browser/trunk/Source/WebCore/dom/NodeTraversal.h?rev=137221#L103
|
||||
*
|
||||
* @param DOMNode $node
|
||||
* @param DOMNode $stayWithin
|
||||
* @return DOMNode|null
|
||||
*/
|
||||
private function nextSkippingChildren( DOMNode $node, DOMNode $stayWithin ) : ?DOMNode {
|
||||
if ( $node === $stayWithin ) {
|
||||
return null;
|
||||
}
|
||||
if ( $node->nextSibling !== null ) {
|
||||
return $node->nextSibling;
|
||||
}
|
||||
|
||||
/**
|
||||
* Based on WebKit's NodeTraversal::nextAncestorSibling
|
||||
* https://trac.webkit.org/browser/trunk/Source/WebCore/dom/NodeTraversal.cpp?rev=137221#L43
|
||||
*/
|
||||
while ( $node->parentNode !== null ) {
|
||||
$node = $node->parentNode;
|
||||
if ( $node === $stayWithin ) {
|
||||
return null;
|
||||
}
|
||||
if ( $node->nextSibling !== null ) {
|
||||
return $node->nextSibling;
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* See https://dom.spec.whatwg.org/#interface-treewalker
|
||||
*
|
||||
* @param DOMNode $root
|
||||
* @param int|null $whatToShow
|
||||
* @param callable|null $filter
|
||||
* @throws Exception
|
||||
*/
|
||||
public function __construct( DOMNode $root, $whatToShow = null, callable $filter = null ) {
|
||||
if ( !$root->nodeType ) {
|
||||
throw new Exception( 'DOMException: NOT_SUPPORTED_ERR' );
|
||||
}
|
||||
|
||||
$this->root = $root;
|
||||
$this->whatToShow = (int)$whatToShow ?: 0;
|
||||
|
||||
$this->currentNode = $root;
|
||||
|
||||
if ( !$filter ) {
|
||||
$this->filter = null;
|
||||
} else {
|
||||
$this->filter = new NodeFilter();
|
||||
$this->filter->filter = $filter;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* See https://dom.spec.whatwg.org/#dom-treewalker-nextnode
|
||||
*
|
||||
* @return DOMNode|null The current node
|
||||
*/
|
||||
public function nextNode() : ?DOMNode {
|
||||
$node = $this->currentNode;
|
||||
$result = NodeFilter::FILTER_ACCEPT;
|
||||
|
||||
while ( true ) {
|
||||
while ( $result !== NodeFilter::FILTER_REJECT && $node->firstChild !== null ) {
|
||||
$node = $node->firstChild;
|
||||
$result = $this->nodeFilter( $this, $node );
|
||||
if ( $result === NodeFilter::FILTER_ACCEPT ) {
|
||||
$this->currentNode = $node;
|
||||
return $node;
|
||||
}
|
||||
}
|
||||
$following = $this->nextSkippingChildren( $node, $this->root );
|
||||
if ( $following !== null ) {
|
||||
$node = $following;
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
$result = $this->nodeFilter( $this, $node );
|
||||
if ( $result === NodeFilter::FILTER_ACCEPT ) {
|
||||
$this->currentNode = $node;
|
||||
return $node;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
Loading…
Reference in a new issue