From 6f32369b6a5e90b149e8aa26afb8ab2221b72004 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bartosz=20Dziewo=C5=84ski?= Date: Wed, 13 May 2020 22:24:35 +0200 Subject: [PATCH] tests: Fix comparing PHP and JS ranges MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In JS, strings are internally encoded as UTF-16, and properties like .length return values in UTF-16 code units. In PHP, strings are internally encoded as UTF-8, and we have the option of using methods that return bytes like strlen() or UTF-8 code units like mb_strlen(). However, the values produced by preg_match( …, PREG_OFFSET_CAPTURE ) are in bytes, and there's nothing we can do about that. So let's use bytes throughout, mixing the two types results in meaningless numbers. Then in the test code, we have to calculate UTF-16 code units offsets based on the UTF-8 byte offsets. We also have to copy the entire workaround for mw:Entity nodes… Maybe the parser should be fixed to return the real nodes for ranges' ends in this case. Change-Id: I05804489d7de0d60be6e9f84e6a49a885e9fb870 --- includes/DiscussionToolsCommentParser.php | 4 +- .../DiscussionToolsCommentParserTest.php | 64 +++++++++++++++---- 2 files changed, 52 insertions(+), 16 deletions(-) diff --git a/includes/DiscussionToolsCommentParser.php b/includes/DiscussionToolsCommentParser.php index 7876267df..8ebd2fe9f 100644 --- a/includes/DiscussionToolsCommentParser.php +++ b/includes/DiscussionToolsCommentParser.php @@ -843,13 +843,13 @@ class DiscussionToolsCommentParser { 'startContainer' => $startNode->parentNode, 'startOffset' => self::childIndexOf( $startNode ), 'endContainer' => $node, - 'endOffset' => $match[0][1] + mb_strlen( $match[0][0] ) + 'endOffset' => $match[0][1] + strlen( $match[0][0] ) ]; $sigRange = (object)[ 'startContainer' => $firstSigNode->parentNode, 'startOffset' => self::childIndexOf( $firstSigNode ), 'endContainer' => $node, - 'endOffset' => $match[0][1] + mb_strlen( $match[0][0] ) + 'endOffset' => $match[0][1] + strlen( $match[0][0] ) ]; $startLevel = $this->getIndentLevel( $startNode, $rootNode ) + 1; diff --git a/tests/phpunit/DiscussionToolsCommentParserTest.php b/tests/phpunit/DiscussionToolsCommentParserTest.php index 4f8be01f4..60b4fb9c3 100644 --- a/tests/phpunit/DiscussionToolsCommentParserTest.php +++ b/tests/phpunit/DiscussionToolsCommentParserTest.php @@ -7,7 +7,55 @@ use Wikimedia\TestingAccessWrapper; */ class DiscussionToolsCommentParserTest extends DiscussionToolsTestCase { - private static function getOffsetPath( $ancestor, $node, $nodeOffset ) { + /** + * Convert UTF-8 byte offsets to UTF-16 code unit offsets. + * + * @param DOMElement $ancestor + * @param DOMNode $node + * @param int $nodeOffset + * @return int + */ + private static function getOffsetPath( DOMElement $ancestor, DOMNode $node, $nodeOffset ) { + if ( $node->nodeType === XML_TEXT_NODE ) { + $startNode = $node; + $nodeText = ''; + + while ( $node ) { + $nodeText .= $node->nodeValue; + + // In Parsoid HTML, entities are represented as a 'mw:Entity' node, rather than normal HTML + // entities. On Arabic Wikipedia, the "UTC" timezone name contains some non-breaking spaces, + // which apparently are often turned into   entities by buggy editing tools. To handle + // this, we must piece together the text, so that our regexp can match those timestamps. + if ( + $node->nextSibling && + $node->nextSibling->nodeType === XML_ELEMENT_NODE && + $node->nextSibling->getAttribute( 'typeof' ) === 'mw:Entity' + ) { + $nodeText .= $node->nextSibling->firstChild->nodeValue; + + // If the entity is followed by more text, do this again + if ( + $node->nextSibling->nextSibling && + $node->nextSibling->nextSibling->nodeType === XML_TEXT_NODE + ) { + $node = $node->nextSibling->nextSibling; + } else { + $node = null; + } + } else { + $node = null; + } + } + + $str = substr( $nodeText, 0, $nodeOffset ); + // Count characters that require two code units to encode in UTF-16 + $count = preg_match_all( '/[\x{010000}-\x{10FFFF}]/u', $str ); + $nodeOffset = mb_strlen( $str ) + $count; + + $node = $startNode; + } + $path = [ $nodeOffset ]; while ( $node !== $ancestor ) { if ( !$node->parentNode ) { @@ -181,18 +229,6 @@ class DiscussionToolsCommentParserTest extends DiscussionToolsTestCase { } public function provideComments() { - return [ - self::getJson( './cases/comments.json' )[0], - self::getJson( './cases/comments.json' )[1], - // self::getJson( './cases/comments.json' )[2], - // self::getJson( './cases/comments.json' )[3], - self::getJson( './cases/comments.json' )[4], - self::getJson( './cases/comments.json' )[5], - self::getJson( './cases/comments.json' )[6], - self::getJson( './cases/comments.json' )[7], - self::getJson( './cases/comments.json' )[8], - self::getJson( './cases/comments.json' )[9], - self::getJson( './cases/comments.json' )[10] - ]; + return self::getJson( './cases/comments.json' ); } }