2019-12-10 02:38:17 +00:00
|
|
|
<?php
|
|
|
|
|
2020-05-14 22:44:49 +00:00
|
|
|
namespace MediaWiki\Extension\DiscussionTools\Tests;
|
|
|
|
|
|
|
|
use DateTimeImmutable;
|
2023-05-19 07:44:23 +00:00
|
|
|
use MediaWiki\Extension\DiscussionTools\CommentParser;
|
2020-05-14 22:44:49 +00:00
|
|
|
use MediaWiki\Extension\DiscussionTools\CommentUtils;
|
2020-05-22 13:47:21 +00:00
|
|
|
use MediaWiki\Extension\DiscussionTools\ImmutableRange;
|
2022-03-18 03:28:06 +00:00
|
|
|
use MediaWiki\Extension\DiscussionTools\ThreadItem\ContentCommentItem;
|
|
|
|
use MediaWiki\Extension\DiscussionTools\ThreadItem\ContentHeadingItem;
|
|
|
|
use MediaWiki\Extension\DiscussionTools\ThreadItem\ContentThreadItem;
|
2022-10-28 18:24:02 +00:00
|
|
|
use RuntimeException;
|
2020-05-14 23:09:20 +00:00
|
|
|
use stdClass;
|
2021-07-29 02:16:15 +00:00
|
|
|
use Wikimedia\Parsoid\DOM\Element;
|
|
|
|
use Wikimedia\Parsoid\DOM\Node;
|
2022-02-21 18:42:36 +00:00
|
|
|
use Wikimedia\Parsoid\DOM\Text;
|
2019-12-10 02:38:17 +00:00
|
|
|
use Wikimedia\TestingAccessWrapper;
|
|
|
|
|
|
|
|
/**
|
2020-05-14 22:44:49 +00:00
|
|
|
* @group DiscussionTools
|
2022-09-15 12:08:30 +00:00
|
|
|
* @covers \MediaWiki\Extension\DiscussionTools\CommentParser
|
2022-09-16 11:22:43 +00:00
|
|
|
* @covers \MediaWiki\Extension\DiscussionTools\CommentUtils
|
2019-12-10 02:38:17 +00:00
|
|
|
*/
|
2021-02-02 14:12:51 +00:00
|
|
|
class CommentParserTest extends IntegrationTestCase {
|
2019-12-10 02:38:17 +00:00
|
|
|
|
2020-05-13 20:24:35 +00:00
|
|
|
/**
 * Build a slash-separated path describing a position below an ancestor element
 *
 * The path consists of the index (as reported by CommentUtils::childIndexOf())
 * of every node on the way down from $ancestor to $node, followed by the
 * offset inside $node.
 *
 * For text nodes, the offset is converted from Unicode codepoints to
 * UTF-16 code units.
 *
 * @param Element $ancestor
 * @param Node $node
 * @param int $nodeOffset
 * @return string
 * @throws RuntimeException If the top of the tree is reached without meeting $ancestor
 */
private static function getOffsetPath(
	Element $ancestor, Node $node, int $nodeOffset
): string {
	$offset = $nodeOffset;
	if ( $node instanceof Text ) {
		// Each character outside the Basic Multilingual Plane needs two code units
		// in UTF-16, so it adds one extra unit to the offset
		$prefix = mb_substr( $node->nodeValue, 0, $nodeOffset );
		$offset += preg_match_all( '/[\x{010000}-\x{10FFFF}]/u', $prefix );
	}

	// Walk upwards collecting child indexes (innermost first)
	$indexes = [];
	for ( $current = $node; $current !== $ancestor; $current = $current->parentNode ) {
		if ( !$current->parentNode ) {
			throw new RuntimeException( 'Not a descendant' );
		}
		$indexes[] = CommentUtils::childIndexOf( $current );
	}

	// Outermost index first, offset within the node last
	$segments = array_merge( array_reverse( $indexes ), [ $offset ] );
	return implode( '/', $segments );
}
|
|
|
|
|
2022-10-04 12:50:57 +00:00
|
|
|
/**
 * Express both boundary points of a range as offset paths relative to $root
 *
 * @param ImmutableRange $range
 * @param Element $root
 * @return string[] Two paths: the start of the range, then its end
 */
private static function getPathsFromRange( ImmutableRange $range, Element $root ): array {
	$startPath = static::getOffsetPath( $root, $range->startContainer, $range->startOffset );
	$endPath = static::getOffsetPath( $root, $range->endContainer, $range->endOffset );

	return [ $startPath, $endPath ];
}
|
|
|
|
|
2022-03-18 03:28:06 +00:00
|
|
|
/**
 * Recursively convert a thread item and its replies into a plain object
 *
 * Headings additionally get 'placeholderHeading' and 'headingLevel';
 * comments additionally get 'timestamp', 'author', 'displayName' (only when
 * it is truthy), 'signatureRanges' and 'timestampRanges'.
 *
 * The assignment order below is deliberate and must be kept: it determines
 * the property order of the resulting object (and of any JSON made from it).
 *
 * @param ContentThreadItem $threadItem
 * @param Element $root Element that the range paths are relative to
 * @return stdClass
 */
private static function serializeComments( ContentThreadItem $threadItem, Element $root ): stdClass {
	$result = new stdClass();

	if ( $threadItem instanceof ContentHeadingItem ) {
		$result->placeholderHeading = $threadItem->isPlaceholderHeading();
	}

	$result->type = $threadItem->getType();

	if ( $threadItem instanceof ContentCommentItem ) {
		$result->timestamp = $threadItem->getTimestampString();
		$result->author = $threadItem->getAuthor();
		$displayName = $threadItem->getDisplayName();
		if ( $displayName ) {
			$result->displayName = $displayName;
		}
	}

	// The DOM nodes that delimit a range can't be serialized,
	// so each range is recorded as a pair of offset paths instead
	$result->range = static::getPathsFromRange( $threadItem->getRange(), $root );

	if ( $threadItem instanceof ContentCommentItem ) {
		$toPaths = static function ( ImmutableRange $range ) use ( $root ) {
			return static::getPathsFromRange( $range, $root );
		};
		$result->signatureRanges = array_map( $toPaths, $threadItem->getSignatureRanges() );
		$result->timestampRanges = array_map( $toPaths, $threadItem->getTimestampRanges() );
	}

	if ( $threadItem instanceof ContentHeadingItem ) {
		$result->headingLevel = $threadItem->getHeadingLevel();
	}
	$result->level = $threadItem->getLevel();
	$result->name = $threadItem->getName();
	$result->id = $threadItem->getId();

	$result->warnings = $threadItem->getWarnings();

	// Descend into the replies, keeping their order
	$result->replies = [];
	foreach ( $threadItem->getReplies() as $childItem ) {
		$result->replies[] = static::serializeComments( $childItem, $root );
	}

	return $result;
}
|
|
|
|
|
2019-12-10 02:38:17 +00:00
|
|
|
/**
 * Check the regexp built by CommentParser::getTimestampRegexp() for a date format
 *
 * @dataProvider provideTimestampRegexps
 */
public function testGetTimestampRegexp(
	string $format, string $expected, string $message
): void {
	/** @var CommentParser $parser */
	$parser = TestingAccessWrapper::newFromObject(
		$this->createParser(
			static::getJson( "../data/enwiki-config.json" ),
			static::getJson( "../data/enwiki-data.json" )
		)
	);

	// HACK: Paper over the differences between JS and PHP regexes: rewrite
	// \uXXXX escapes as \x{XXXX}, backslash-escape colons, and add the
	// delimiters and the 'u' modifier.
	// TODO: We may just have to keep two versions in the test data
	$pattern = preg_replace( '/\\\\u([0-9A-F]+)/', '\\\\x{$1}', $expected );
	$pattern = '/' . str_replace( ':', '\:', $pattern ) . '/u';

	static::assertSame(
		$pattern,
		$parser->getTimestampRegexp( 'en', $format, '\\d', [ 'UTC' => 'UTC' ] ),
		$message
	);
}
|
|
|
|
|
2023-05-20 13:57:13 +00:00
|
|
|
/**
 * Data provider for testGetTimestampRegexp(), read from a JSON case file
 *
 * @return array
 */
public static function provideTimestampRegexps(): array {
	$cases = static::getJson( '../cases/timestamp-regex.json' );

	return $cases;
}
|
|
|
|
|
|
|
|
/**
 * Check the date produced by the callback from CommentParser::getTimestampParser()
 *
 * @dataProvider provideTimestampParser
 */
public function testGetTimestampParser(
	string $format, ?array $digits, array $matchData, string $expected, string $message
): void {
	/** @var CommentParser $parser */
	$parser = TestingAccessWrapper::newFromObject(
		$this->createParser(
			static::getJson( "../data/enwiki-config.json" ),
			static::getJson( "../data/enwiki-data.json" )
		)
	);

	$expectedDate = new DateTimeImmutable( $expected );

	$parseTimestamp = $parser->getTimestampParser( 'en', $format, $digits, 'UTC', [ 'UTC' => 'UTC' ] );
	$parsed = $parseTimestamp( $matchData );

	static::assertEquals( $expectedDate, $parsed['date'], $message );
}
|
|
|
|
|
2023-05-20 13:57:13 +00:00
|
|
|
/**
 * Data provider for testGetTimestampParser(), read from a JSON case file
 *
 * @return array
 */
public static function provideTimestampParser(): array {
	$cases = static::getJson( '../cases/timestamp-parser.json' );

	return $cases;
}
|
|
|
|
|
|
|
|
/**
 * Match a sample string with the generated timestamp regexp, parse the match,
 * and compare the resulting date with two expected date strings
 *
 * @dataProvider provideTimestampParserDST
 *
 * @param string $sample Text that the timestamp regexp is run against
 * @param string $expected Expected date, in a form accepted by DateTimeImmutable
 * @param string $expectedUtc Second expected date for the same result
 *  (presumably the same instant written in UTC - confirm against the case file)
 * @param string $format Date format passed to the parser
 * @param string $timezone Timezone passed to getTimestampParser()
 * @param array $timezoneAbbrs Timezone abbreviations passed to both parser methods
 * @param string $message Assertion message
 */
public function testGetTimestampParserDST(
	string $sample, string $expected, string $expectedUtc, string $format,
	string $timezone, array $timezoneAbbrs, string $message
): void {
	$config = static::getJson( "../data/enwiki-config.json" );
	$data = static::getJson( "../data/enwiki-data.json" );
	/** @var CommentParser $parser */
	$parser = TestingAccessWrapper::newFromObject(
		$this->createParser( $config, $data )
	);

	$regexp = $parser->getTimestampRegexp( 'en', $format, '\\d', $timezoneAbbrs );
	$tsParser = $parser->getTimestampParser( 'en', $format, null, $timezone, $timezoneAbbrs );

	$expected = new DateTimeImmutable( $expected );
	$expectedUtc = new DateTimeImmutable( $expectedUtc );

	// Verify the match explicitly: preg_match() returns 0 (no match) or false (error),
	// and in both cases $match would not contain the groups the parser needs, which
	// would otherwise surface as an unrelated error inside the timestamp parser.
	$matched = preg_match( $regexp, $sample, $match, PREG_OFFSET_CAPTURE );
	static::assertSame( 1, $matched, $message . ' (timestamp regexp must match the sample)' );

	$date = $tsParser( $match )['date'];

	static::assertEquals( $expected, $date, $message );
	static::assertEquals( $expectedUtc, $date, $message );
}
|
|
|
|
|
2023-05-20 13:57:13 +00:00
|
|
|
/**
 * Data provider for testGetTimestampParserDST(), read from a JSON case file
 *
 * @return array
 */
public static function provideTimestampParserDST(): array {
	$cases = static::getJson( '../cases/timestamp-parser-dst.json' );

	return $cases;
}
|
|
|
|
|
2020-05-05 13:12:51 +00:00
|
|
|
/**
 * Parse a page, serialize the resulting threads and compare them with the
 * expected data stored in a JSON file
 *
 * When the DISCUSSIONTOOLS_OVERWRITE_TESTS environment variable is set, the
 * expected-data file is overwritten with the current output before the
 * comparison is made.
 *
 * @dataProvider provideComments
 *
 * @param string $name Name of the test case, used in assertion messages
 * @param string $title Page title, parsed with the title parser built from the config
 * @param string $dom Path of the HTML file to parse
 * @param string $expected Path of the JSON file with the expected threads
 * @param string $config Path of the JSON file with the wiki config
 * @param string $data Path of the JSON file with the wiki data
 */
public function testGetThreads(
	string $name, string $title, string $dom, string $expected, string $config, string $data
): void {
	$dom = static::getHtml( $dom );
	$expectedPath = $expected;
	$expected = static::getJson( $expected );
	$config = static::getJson( $config );
	$data = static::getJson( $data );

	$doc = static::createDocument( $dom );
	$container = static::getThreadContainer( $doc );

	$title = $this->createTitleParser( $config )->parseTitle( $title );
	$threadItemSet = $this->createParser( $config, $data )->parse( $container, $title );
	$threads = $threadItemSet->getThreads();

	$processedThreads = [];

	foreach ( $threads as $thread ) {
		$thread = static::serializeComments( $thread, $container );
		// Round-trip through JSON to turn the nested objects into plain arrays
		$thread = json_decode( json_encode( $thread ), true );
		$processedThreads[] = $thread;
	}

	// Optionally write updated content to the JSON files
	if ( getenv( 'DISCUSSIONTOOLS_OVERWRITE_TESTS' ) ) {
		static::overwriteJsonFile( $expectedPath, $processedThreads );
	}

	foreach ( $processedThreads as $i => $processedThread ) {
		// A thread with no expected counterpart must fail cleanly rather than
		// read an undefined offset
		static::assertArrayHasKey( $i, $expected, $name . ' section ' . $i . ' is not expected' );
		static::assertEquals( $expected[$i], $processedThread, $name . ' section ' . $i );
	}

	// The loop above only covers threads that were actually found; without this
	// check, missing threads (including "no threads at all") would go unnoticed
	static::assertCount( count( $expected ), $processedThreads, $name . ' number of sections' );
}
|
|
|
|
|
2023-05-20 13:57:13 +00:00
|
|
|
/**
 * Data provider for testGetThreads(), read from a JSON case file
 *
 * @return array
 */
public static function provideComments(): array {
	$cases = static::getJson( '../cases/comments.json' );

	return $cases;
}
|
2020-05-15 00:51:36 +00:00
|
|
|
|
2019-12-10 02:38:17 +00:00
|
|
|
}
|