mirror of
https://gerrit.wikimedia.org/r/mediawiki/extensions/DiscussionTools
synced 2024-09-29 13:26:28 +00:00
6f32369b6a
In JS, strings are internally encoded as UTF-16, and properties like .length return values in UTF-16 code units. In PHP, strings are internally encoded as UTF-8, and we have the option of using functions that count bytes, like strlen(), or functions that count characters (code points), like mb_strlen(). However, the offsets produced by preg_match( …, PREG_OFFSET_CAPTURE ) are in bytes, and there's nothing we can do about that. So let's use bytes throughout; mixing the two units results in meaningless numbers. Then in the test code, we have to calculate UTF-16 code unit offsets based on the UTF-8 byte offsets. We also have to copy the entire workaround for mw:Entity nodes… Maybe the parser should be fixed to return the real nodes for ranges' ends in this case. Change-Id: I05804489d7de0d60be6e9f84e6a49a885e9fb870
235 lines
6.8 KiB
PHP
235 lines
6.8 KiB
PHP
<?php
|
|
|
|
use Wikimedia\TestingAccessWrapper;
|
|
|
|
/**
|
|
* @coversDefaultClass DiscussionToolsCommentParser
|
|
*/
|
|
class DiscussionToolsCommentParserTest extends DiscussionToolsTestCase {
|
|
|
|
/**
 * Describe a DOM position as a "/"-separated path relative to an ancestor element.
 *
 * The path lists the child index of every node on the way from $ancestor down to
 * $node, followed by the offset inside $node. When $node is a text node, the
 * offset is treated as a UTF-8 byte offset and converted to UTF-16 code units
 * first, so that the result can be compared with offsets produced by JS code.
 *
 * @param DOMElement $ancestor Element the path is relative to
 * @param DOMNode $node Node that contains the position
 * @param int $nodeOffset Offset within $node (in bytes when $node is a text node)
 * @return string Child indexes and final offset joined with "/"
 * @throws Error If $node is not a descendant of $ancestor
 */
private static function getOffsetPath( DOMElement $ancestor, DOMNode $node, $nodeOffset ) {
	if ( $node->nodeType === XML_TEXT_NODE ) {
		$nodeText = '';
		// Walk the siblings with a separate cursor so that $node itself stays untouched
		$textNode = $node;

		while ( $textNode ) {
			$nodeText .= $textNode->nodeValue;

			// In Parsoid HTML, entities are represented as a 'mw:Entity' node, rather than normal HTML
			// entities. On Arabic Wikipedia, the "UTC" timezone name contains some non-breaking spaces,
			// which apparently are often turned into entities by buggy editing tools. To handle
			// this, we must piece together the text, so that our regexp can match those timestamps.
			$entity = $textNode->nextSibling;
			if (
				!$entity ||
				$entity->nodeType !== XML_ELEMENT_NODE ||
				$entity->getAttribute( 'typeof' ) !== 'mw:Entity'
			) {
				break;
			}

			// An entity wrapper without any child contributes no text
			if ( $entity->firstChild ) {
				$nodeText .= $entity->firstChild->nodeValue;
			}

			// If the entity is followed by more text, do this again
			$following = $entity->nextSibling;
			$textNode = ( $following && $following->nodeType === XML_TEXT_NODE ) ? $following : null;
		}

		// substr() works on bytes, matching the unit of the incoming offset
		$str = substr( $nodeText, 0, $nodeOffset );
		// Count characters that require two code units to encode in UTF-16
		$count = preg_match_all( '/[\x{010000}-\x{10FFFF}]/u', $str );
		$nodeOffset = mb_strlen( $str ) + $count;
	}

	$path = [ $nodeOffset ];
	while ( $node !== $ancestor ) {
		if ( !$node->parentNode ) {
			throw new Error( 'Not a descendant' );
		}
		array_unshift( $path, DiscussionToolsCommentParser::childIndexOf( $node ) );
		$node = $node->parentNode;
	}
	return implode( '/', $path );
}
|
|
|
|
/**
 * Recursively rewrite a thread item so that it no longer refers to DOM nodes.
 *
 * The item's 'parent' property is removed, and its 'range' (plus every entry of
 * 'signatureRanges', when set) is replaced with a pair of offset paths computed
 * by getOffsetPath(). The same is then done for each entry of 'replies'.
 *
 * @param stdClass &$parent Thread item to modify in place
 * @param DOMElement $root Element the offset paths are relative to
 */
private static function serializeComments( &$parent, $root ) {
	unset( $parent->parent );

	// Can't serialize the DOM nodes involved in the range,
	// instead use their offsets within their parent nodes
	$toOffsetPaths = function ( $range ) use ( $root ) {
		$start = self::getOffsetPath( $root, $range->startContainer, $range->startOffset );
		$end = self::getOffsetPath( $root, $range->endContainer, $range->endOffset );
		return [ $start, $end ];
	};

	$parent->range = $toOffsetPaths( $parent->range );
	if ( isset( $parent->signatureRanges ) ) {
		$parent->signatureRanges = array_map( $toOffsetPaths, $parent->signatureRanges );
	}

	foreach ( $parent->replies as $reply ) {
		self::serializeComments( $reply, $root );
	}
}
|
|
|
|
/**
 * Check the regexp built for a date format against the expectation from the test data.
 *
 * @dataProvider provideTimestampRegexps
 * @covers ::getTimestampRegexp
 *
 * @param string $format Date format passed to getTimestampRegexp()
 * @param string $expected Expected regexp source, written in JS syntax
 * @param string $message Assertion message
 */
public function testGetTimestampRegexp( $format, $expected, $message ) {
	// HACK: Fix differences between JS & PHP regexes
	// TODO: We may just have to have two versions in the test data
	$phpRegexp = preg_replace( '/\\\\u([0-9A-F]+)/', '\\\\x{$1}', $expected );
	$phpRegexp = '/' . str_replace( ':', '\:', $phpRegexp ) . '/u';

	// The method under test is not public, hence the access wrapper
	$parser = TestingAccessWrapper::newFromObject(
		DiscussionToolsCommentParser::newFromGlobalState()
	);

	self::assertSame(
		$phpRegexp,
		$parser->getTimestampRegexp( $format, '\\d', [ 'UTC' => 'UTC' ] ),
		$message
	);
}
|
|
|
|
/**
 * Data provider for testGetTimestampRegexp().
 *
 * @return mixed Result of self::getJson() for the timestamp regexp cases file
 */
public function provideTimestampRegexps() {
	$cases = self::getJson( './cases/timestamp-regex.json' );
	return $cases;
}
|
|
|
|
/**
 * Check that the callback returned by getTimestampParser() yields the expected date.
 *
 * @dataProvider provideTimestampParser
 * @covers ::getTimestampParser
 *
 * @param string $format Date format passed to getTimestampParser()
 * @param mixed $data Input handed to the generated parser callback
 * @param string $expected Expected date, in a form accepted by DateTimeImmutable
 * @param string $message Assertion message
 */
public function testGetTimestampParser( $format, $data, $expected, $message ) {
	$wrappedParser = TestingAccessWrapper::newFromObject(
		DiscussionToolsCommentParser::newFromGlobalState()
	);

	$expectedDate = new DateTimeImmutable( $expected );

	$parseTimestamp = $wrappedParser->getTimestampParser( $format, null, 'UTC', [ 'UTC' => 'UTC' ] );
	$actualDate = $parseTimestamp( $data );

	self::assertEquals( $expectedDate, $actualDate, $message );
}
|
|
|
|
/**
 * Data provider for testGetTimestampParser().
 *
 * @return mixed Result of self::getJson() for the timestamp parser cases file
 */
public function provideTimestampParser() {
	$cases = self::getJson( './cases/timestamp-parser.json' );
	return $cases;
}
|
|
|
|
/**
 * Match a sample timestamp with the generated regexp, parse the match, and
 * compare the resulting date with two expected representations.
 *
 * @dataProvider provideTimestampParserDST
 * @covers ::getTimestampParser
 *
 * @param string $sample Text containing the timestamp to match
 * @param string $expected Expected date, in a form accepted by DateTimeImmutable
 * @param string $expectedUtc Second expected date string, compared against the same result
 * @param string $format Date format passed to the parser
 * @param string $timezone Timezone passed to getTimestampParser()
 * @param array $timezoneAbbrs Timezone abbreviations passed to both parser methods
 * @param string $message Assertion message
 */
public function testGetTimestampParserDST(
	$sample, $expected, $expectedUtc, $format, $timezone, $timezoneAbbrs, $message
) {
	$parser = TestingAccessWrapper::newFromObject(
		DiscussionToolsCommentParser::newFromGlobalState()
	);

	$regexp = $parser->getTimestampRegexp( $format, '\\d', $timezoneAbbrs );
	$tsParser = $parser->getTimestampParser( $format, null, $timezone, $timezoneAbbrs );

	$expected = new DateTimeImmutable( $expected );
	$expectedUtc = new DateTimeImmutable( $expectedUtc );

	// Fail with a clear message when the sample does not match (0) or the
	// regexp is broken (false), instead of handing an empty match to the parser.
	$matched = preg_match( $regexp, $sample, $match, PREG_OFFSET_CAPTURE );
	self::assertSame( 1, $matched, $message . ' (timestamp regexp must match the sample)' );

	$date = $tsParser( $match );

	self::assertEquals( $expected, $date, $message );
	self::assertEquals( $expectedUtc, $date, $message );
}
|
|
|
|
/**
 * Data provider for testGetTimestampParserDST().
 *
 * @return mixed Result of self::getJson() for the DST timestamp parser cases file
 */
public function provideTimestampParserDST() {
	$cases = self::getJson( './cases/timestamp-parser-dst.json' );
	return $cases;
}
|
|
|
|
/**
 * Check the list of authors reported for a thread.
 *
 * @dataProvider provideAuthors
 * @covers ::getAuthors
 *
 * @param stdClass $thread Thread structure passed to getAuthors()
 * @param array $expected Expected return value of getAuthors()
 */
public function testGetAuthors( $thread, $expected ) {
	$authors = DiscussionToolsCommentParser::newFromGlobalState()->getAuthors( $thread );

	self::assertEquals( $expected, $authors );
}
|
|
|
|
/**
 * Data provider for testGetAuthors().
 *
 * Supplies a single case: a thread whose top-level replies are by Eve and Bob,
 * where Bob's comment has one reply by Alice.
 *
 * @return array[] List of cases, each with 'thread' and 'expected' entries
 */
public function provideAuthors() {
	$aliceReply = (object)[
		'author' => 'Alice',
		'replies' => []
	];
	$bobReply = (object)[
		'author' => 'Bob',
		'replies' => [ $aliceReply ]
	];
	$eveReply = (object)[
		'author' => 'Eve',
		'replies' => []
	];
	$thread = (object)[
		'replies' => [ $eveReply, $bobReply ]
	];

	return [
		[
			'thread' => $thread,
			'expected' => [ 'Alice', 'Bob', 'Eve' ]
		]
	];
}
|
|
|
|
/**
 * Parse a page, group its comments into threads, and compare every serialized
 * thread with the expected data for the case.
 *
 * @dataProvider provideComments
 * @covers ::getComments
 * @covers ::groupThreads
 *
 * @param string $name Case name, used as a prefix in assertion messages
 * @param mixed $dom Passed to self::getHtml() to obtain the input HTML
 * @param mixed $expected Passed to self::getJson() to obtain the expected threads
 * @param mixed $config Passed to self::getJson() to obtain the config for setupEnv()
 * @param mixed $data Passed to self::getJson() to obtain the data for setupEnv() and createParser()
 */
public function testGetComments( $name, $dom, $expected, $config, $data ) {
	$dom = self::getHtml( $dom );
	$expected = self::getJson( $expected );
	$config = self::getJson( $config );
	$data = self::getJson( $data );

	$this->setupEnv( $config, $data );
	$parser = self::createParser( $data );

	$doc = self::createDocument( $dom );
	$container = $doc->documentElement->childNodes[0];

	$comments = $parser->getComments( $container );
	$threads = $parser->groupThreads( $comments );

	$processedThreads = [];

	foreach ( $threads as $i => $thread ) {
		self::serializeComments( $thread, $container );
		// Round-trip through JSON so that objects become plain arrays,
		// comparable with the expected data
		$thread = json_decode( json_encode( $thread ), true );
		$processedThreads[] = $thread;
		self::assertEquals( $expected[$i], $thread, $name . ' section ' . $i );
	}

	// The loop only visits threads that were actually produced; make sure none
	// of the expected sections is missing from the output.
	self::assertCount( count( $expected ), $processedThreads, $name . ' number of sections' );
}
|
|
|
|
/**
 * Data provider for testGetComments().
 *
 * @return mixed Result of self::getJson() for the comments cases file
 */
public function provideComments() {
	$cases = self::getJson( './cases/comments.json' );
	return $cases;
}
|
|
}
|