mirror of
https://gerrit.wikimedia.org/r/mediawiki/extensions/DiscussionTools
synced 2024-11-17 05:10:50 +00:00
375bfe028e
Previously, parser would output offsets that don't exist in their containers, because we were pretending that entities are parts of their neighboring text nodes. Turns out it's much easier to do it right when going backwards. Change-Id: I9bccca2d403f1a976ae517449989170cdd99721e
219 lines
6.7 KiB
PHP
219 lines
6.7 KiB
PHP
<?php
|
|
|
|
namespace MediaWiki\Extension\DiscussionTools\Tests;
|
|
|
|
use DateTimeImmutable;
|
|
use DOMElement;
|
|
use DOMNode;
|
|
use Error;
|
|
use MediaWiki\Extension\DiscussionTools\CommentItem;
|
|
use MediaWiki\Extension\DiscussionTools\CommentParser;
|
|
use MediaWiki\Extension\DiscussionTools\CommentUtils;
|
|
use MediaWiki\Extension\DiscussionTools\HeadingItem;
|
|
use MediaWiki\Extension\DiscussionTools\ImmutableRange;
|
|
use MediaWiki\Extension\DiscussionTools\ThreadItem;
|
|
use stdClass;
|
|
use Wikimedia\TestingAccessWrapper;
|
|
|
|
/**
|
|
* @coversDefaultClass \MediaWiki\Extension\DiscussionTools\CommentParser
|
|
*
|
|
* @group DiscussionTools
|
|
*/
|
|
class CommentParserTest extends CommentTestCase {
|
|
|
|
/**
|
|
* Convert UTF-8 byte offsets to UTF-16 code unit offsets.
|
|
*
|
|
* @param DOMElement $ancestor
|
|
* @param DOMNode $node
|
|
* @param int $nodeOffset
|
|
* @return string
|
|
*/
|
|
private static function getOffsetPath(
|
|
DOMElement $ancestor, DOMNode $node, int $nodeOffset
|
|
) : string {
|
|
if ( $node->nodeType === XML_TEXT_NODE ) {
|
|
$str = substr( $node->nodeValue, 0, $nodeOffset );
|
|
// Count characters that require two code units to encode in UTF-16
|
|
$count = preg_match_all( '/[\x{010000}-\x{10FFFF}]/u', $str );
|
|
$nodeOffset = mb_strlen( $str ) + $count;
|
|
}
|
|
|
|
$path = [ $nodeOffset ];
|
|
while ( $node !== $ancestor ) {
|
|
if ( !$node->parentNode ) {
|
|
throw new Error( 'Not a descendant' );
|
|
}
|
|
array_unshift( $path, CommentUtils::childIndexOf( $node ) );
|
|
$node = $node->parentNode;
|
|
}
|
|
return implode( '/', $path );
|
|
}
|
|
|
|
private static function serializeComments( ThreadItem &$threadItem, DOMElement $root ) : stdClass {
|
|
$serialized = new stdClass();
|
|
|
|
if ( $threadItem instanceof HeadingItem && $threadItem->isPlaceholderHeading() ) {
|
|
$serialized->placeholderHeading = $threadItem->isPlaceholderHeading();
|
|
}
|
|
|
|
$serialized->type = $threadItem->getType();
|
|
|
|
if ( $threadItem instanceof CommentItem ) {
|
|
$serialized->timestamp = $threadItem->getTimestamp();
|
|
$serialized->author = $threadItem->getAuthor();
|
|
}
|
|
|
|
// Can't serialize the DOM nodes involved in the range,
|
|
// instead use their offsets within their parent nodes
|
|
$range = $threadItem->getRange();
|
|
$serialized->range = [
|
|
self::getOffsetPath( $root, $range->startContainer, $range->startOffset ),
|
|
self::getOffsetPath( $root, $range->endContainer, $range->endOffset )
|
|
];
|
|
|
|
if ( $threadItem instanceof CommentItem ) {
|
|
$serialized->signatureRanges = array_map( function ( ImmutableRange $range ) use ( $root ) {
|
|
return [
|
|
self::getOffsetPath( $root, $range->startContainer, $range->startOffset ),
|
|
self::getOffsetPath( $root, $range->endContainer, $range->endOffset )
|
|
];
|
|
}, $threadItem->getSignatureRanges() );
|
|
}
|
|
|
|
$serialized->level = $threadItem->getLevel();
|
|
$serialized->id = $threadItem->getId();
|
|
|
|
if ( $threadItem instanceof CommentItem ) {
|
|
$warnings = $threadItem->getWarnings();
|
|
if ( count( $warnings ) ) {
|
|
$serialized->warnings = $threadItem->getWarnings();
|
|
}
|
|
}
|
|
|
|
$serialized->replies = [];
|
|
foreach ( $threadItem->getReplies() as $reply ) {
|
|
$serialized->replies[] = self::serializeComments( $reply, $root );
|
|
}
|
|
|
|
return $serialized;
|
|
}
|
|
|
|
/**
|
|
* @dataProvider provideTimestampRegexps
|
|
* @covers ::getTimestampRegexp
|
|
*/
|
|
public function testGetTimestampRegexp(
|
|
string $format, string $expected, string $message
|
|
) : void {
|
|
$parser = TestingAccessWrapper::newFromObject(
|
|
CommentParser::newFromGlobalState( new DOMElement( 'div' ) )
|
|
);
|
|
|
|
// HACK: Fix differences between JS & PHP regexes
|
|
// TODO: We may just have to have two version in the test data
|
|
$expected = preg_replace( '/\\\\u([0-9A-F]+)/', '\\\\x{$1}', $expected );
|
|
$expected = str_replace( ':', '\:', $expected );
|
|
$expected = '/' . $expected . '/u';
|
|
|
|
$result = $parser->getTimestampRegexp( $format, '\\d', [ 'UTC' => 'UTC' ] );
|
|
self::assertSame( $expected, $result, $message );
|
|
}
|
|
|
|
public function provideTimestampRegexps() : array {
|
|
return self::getJson( '../cases/timestamp-regex.json' );
|
|
}
|
|
|
|
/**
|
|
* @dataProvider provideTimestampParser
|
|
* @covers ::getTimestampParser
|
|
*/
|
|
public function testGetTimestampParser(
|
|
string $format, array $data, string $expected, string $message
|
|
) : void {
|
|
$parser = TestingAccessWrapper::newFromObject(
|
|
CommentParser::newFromGlobalState( new DOMELement( 'div' ) )
|
|
);
|
|
|
|
$expected = new DateTimeImmutable( $expected );
|
|
|
|
$tsParser = $parser->getTimestampParser( $format, null, 'UTC', [ 'UTC' => 'UTC' ] );
|
|
self::assertEquals( $expected, $tsParser( $data ), $message );
|
|
}
|
|
|
|
public function provideTimestampParser() : array {
|
|
return self::getJson( '../cases/timestamp-parser.json' );
|
|
}
|
|
|
|
/**
|
|
* @dataProvider provideTimestampParserDST
|
|
* @covers ::getTimestampParser
|
|
*/
|
|
public function testGetTimestampParserDST(
|
|
string $sample, string $expected, string $expectedUtc, string $format,
|
|
string $timezone, array $timezoneAbbrs, string $message
|
|
) : void {
|
|
$parser = TestingAccessWrapper::newFromObject(
|
|
CommentParser::newFromGlobalState( new DOMELement( 'div' ) )
|
|
);
|
|
|
|
$regexp = $parser->getTimestampRegexp( $format, '\\d', $timezoneAbbrs );
|
|
$tsParser = $parser->getTimestampParser( $format, null, $timezone, $timezoneAbbrs );
|
|
|
|
$expected = new DateTimeImmutable( $expected );
|
|
$expectedUtc = new DateTimeImmutable( $expectedUtc );
|
|
|
|
preg_match( $regexp, $sample, $match, PREG_OFFSET_CAPTURE );
|
|
$date = $tsParser( $match );
|
|
|
|
self::assertEquals( $expected, $date, $message );
|
|
self::assertEquals( $expectedUtc, $date, $message );
|
|
}
|
|
|
|
public function provideTimestampParserDST() : array {
|
|
return self::getJson( '../cases/timestamp-parser-dst.json' );
|
|
}
|
|
|
|
/**
|
|
* @dataProvider provideComments
|
|
* @covers ::getThreads
|
|
*/
|
|
public function testGetThreads(
|
|
string $name, string $dom, string $expected, string $config, string $data
|
|
) : void {
|
|
$dom = self::getHtml( $dom );
|
|
$expectedPath = $expected;
|
|
$expected = self::getJson( $expected );
|
|
$config = self::getJson( $config );
|
|
$data = self::getJson( $data );
|
|
|
|
$doc = self::createDocument( $dom );
|
|
$body = $doc->getElementsByTagName( 'body' )->item( 0 );
|
|
|
|
$this->setupEnv( $config, $data );
|
|
$parser = self::createParser( $body, $data );
|
|
$threads = $parser->getThreads();
|
|
|
|
$processedThreads = [];
|
|
|
|
foreach ( $threads as $i => $thread ) {
|
|
$thread = self::serializeComments( $thread, $body );
|
|
$thread = json_decode( json_encode( $thread ), true );
|
|
$processedThreads[] = $thread;
|
|
}
|
|
|
|
// Uncomment this to write updated content to the JSON files:
|
|
// self::overwriteJsonFile( $expectedPath, $processedThreads );
|
|
|
|
foreach ( $threads as $i => $thread ) {
|
|
self::assertEquals( $expected[$i], $processedThreads[$i], $name . ' section ' . $i );
|
|
}
|
|
}
|
|
|
|
public function provideComments() : array {
|
|
return self::getJson( '../cases/comments.json' );
|
|
}
|
|
|
|
}
|