Bartosz Dziewoński 42ce942c86 Introduce comment "names" to identify comments across revisions/pages
The existing comment IDs can't be used to find the same comment on
a different revision or page (when it's transcluded), because they
depend on the comment's parent and its position on the page.

Comment names depend only on the author and timestamp. The trade-off
is that they can't distinguish comments posted within the same minute,
or in the same edit, so we will still need the IDs sometimes.

Prefer using comment names when replying, if they're not ambiguous.
This fixes T273413 and T275821.

Heading names depend on the author and timestamp of the oldest comment.
This way we don't have to detect changes to the heading text, but we
can't distinguish headings without any comments.

Bug: T274685
Bug: T273413
Bug: T275821
Change-Id: Id85c50ba38d1e532cec106708c077b908a3fcd49
2021-03-23 16:08:42 +00:00

222 lines
6.9 KiB

namespace MediaWiki\Extension\DiscussionTools\Tests;
use DateTimeImmutable;
use DOMElement;
use DOMNode;
use Error;
use MediaWiki\Extension\DiscussionTools\CommentItem;
use MediaWiki\Extension\DiscussionTools\CommentParser;
use MediaWiki\Extension\DiscussionTools\CommentUtils;
use MediaWiki\Extension\DiscussionTools\HeadingItem;
use MediaWiki\Extension\DiscussionTools\ImmutableRange;
use MediaWiki\Extension\DiscussionTools\ThreadItem;
use stdClass;
use Wikimedia\TestingAccessWrapper;
/**
 * @coversDefaultClass \MediaWiki\Extension\DiscussionTools\CommentParser
 *
 * @group DiscussionTools
 */
class CommentParserTest extends IntegrationTestCase {
/**
 * Describe a position below $ancestor as a slash-separated path.
 *
 * The path consists of the CommentUtils::childIndexOf() value of every node
 * between $ancestor and $node (outermost first), followed by the offset
 * inside $node. For text nodes the offset is converted from a UTF-8 byte
 * offset to a UTF-16 code unit offset.
 *
 * @param DOMElement $ancestor Node at which the path starts
 * @param DOMNode $node Node containing the position
 * @param int $nodeOffset Offset within $node
 * @return string Path segments joined with '/'
 * @throws Error If walking up from $node runs out of parents before reaching $ancestor
 */
private static function getOffsetPath(
	DOMElement $ancestor, DOMNode $node, int $nodeOffset
) : string {
	$offset = $nodeOffset;
	if ( $node->nodeType === XML_TEXT_NODE ) {
		// Text before the position, cut at the byte offset
		$prefix = substr( $node->nodeValue, 0, $nodeOffset );
		// Characters outside the BMP take two code units in UTF-16, so count them twice
		$astralCount = preg_match_all( '/[\x{010000}-\x{10FFFF}]/u', $prefix );
		$offset = mb_strlen( $prefix ) + $astralCount;
	}

	$segments = [ $offset ];
	// Climb towards the ancestor, prepending each node's index among its siblings
	for ( $current = $node; $current !== $ancestor; $current = $current->parentNode ) {
		if ( !$current->parentNode ) {
			throw new Error( 'Not a descendant' );
		}

		array_unshift( $segments, CommentUtils::childIndexOf( $current ) );
	}

	return implode( '/', $segments );
}
/**
 * Recursively convert a thread item and its replies into a plain object
 * that can be JSON-encoded and compared with the expected test data.
 *
 * @param ThreadItem $threadItem Item to serialize
 * @param DOMElement $root Element that the serialized range paths are relative to
 * @return stdClass Serialized item; its 'replies' property holds the serialized replies
 */
private static function serializeComments( ThreadItem $threadItem, DOMElement $root ) : stdClass {
	$serialized = new stdClass();

	// Properties are assigned in a fixed order so that the encoded output stays stable
	if ( $threadItem instanceof HeadingItem ) {
		$serialized->placeholderHeading = $threadItem->isPlaceholderHeading();
	}

	$serialized->type = $threadItem->getType();

	if ( $threadItem instanceof CommentItem ) {
		$serialized->timestamp = $threadItem->getTimestamp();
		$serialized->author = $threadItem->getAuthor();
	}

	// Can't serialize the DOM nodes involved in the range,
	// instead use their offsets within their parent nodes
	$range = $threadItem->getRange();
	$serialized->range = [
		self::getOffsetPath( $root, $range->startContainer, $range->startOffset ),
		self::getOffsetPath( $root, $range->endContainer, $range->endOffset )
	];

	if ( $threadItem instanceof CommentItem ) {
		$serialized->signatureRanges = array_map(
			function ( ImmutableRange $signatureRange ) use ( $root ) {
				return [
					self::getOffsetPath(
						$root, $signatureRange->startContainer, $signatureRange->startOffset
					),
					self::getOffsetPath(
						$root, $signatureRange->endContainer, $signatureRange->endOffset
					)
				];
			},
			$threadItem->getSignatureRanges()
		);
	}

	if ( $threadItem instanceof HeadingItem ) {
		$serialized->headingLevel = $threadItem->getHeadingLevel();
	}
	$serialized->level = $threadItem->getLevel();
	$serialized->name = $threadItem->getName();
	$serialized->id = $threadItem->getId();

	// Ignore warnings about legacy IDs (we don't have them in JS)
	$serialized->warnings = array_values(
		array_diff( $threadItem->getWarnings(), [ 'Duplicate comment legacy ID' ] )
	);

	$serialized->replies = [];
	foreach ( $threadItem->getReplies() as $reply ) {
		$serialized->replies[] = self::serializeComments( $reply, $root );
	}

	return $serialized;
}
/**
 * Check the regexp generated for a timestamp format against the expected
 * pattern from the shared test data.
 *
 * @dataProvider provideTimestampRegexps
 * @covers ::getTimestampRegexp
 *
 * @param string $format Timestamp format passed to getTimestampRegexp()
 * @param string $expected Expected pattern, written in the JS regexp flavour
 * @param string $message Assertion message
 */
public function testGetTimestampRegexp(
	string $format, string $expected, string $message
) : void {
	$wrappedParser = TestingAccessWrapper::newFromObject(
		CommentParser::newFromGlobalState( new DOMElement( 'div' ) )
	);

	// HACK: Fix differences between JS & PHP regexes
	// TODO: We may just have to have two versions in the test data
	// \uXXXX escapes become \x{XXXX}, colons are escaped, and delimiters plus the 'u' flag are added
	$phpPattern = preg_replace( '/\\\\u([0-9A-F]+)/', '\\\\x{$1}', $expected );
	$phpPattern = '/' . str_replace( ':', '\:', $phpPattern ) . '/u';

	$actualPattern = $wrappedParser->getTimestampRegexp( 'en', $format, '\\d', [ 'UTC' => 'UTC' ] );

	self::assertSame( $phpPattern, $actualPattern, $message );
}
/**
 * Data provider for testGetTimestampRegexp().
 *
 * @return array Test cases read with getJson() from '../cases/timestamp-regex.json'
 */
public function provideTimestampRegexps() : array {
	$cases = self::getJson( '../cases/timestamp-regex.json' );

	return $cases;
}
/**
 * Check that the parser callback built for a timestamp format turns the
 * given match data into the expected date.
 *
 * @dataProvider provideTimestampParser
 * @covers ::getTimestampParser
 *
 * @param string $format Timestamp format passed to getTimestampParser()
 * @param array $data Match data handed to the generated parser callback
 * @param string $expected Expected date, as a string accepted by DateTimeImmutable
 * @param string $message Assertion message
 */
public function testGetTimestampParser(
	string $format, array $data, string $expected, string $message
) : void {
	$wrappedParser = TestingAccessWrapper::newFromObject(
		CommentParser::newFromGlobalState( new DOMElement( 'div' ) )
	);

	$expectedDate = new DateTimeImmutable( $expected );

	$parseTimestamp = $wrappedParser->getTimestampParser(
		'en', $format, null, 'UTC', [ 'UTC' => 'UTC' ]
	);
	$actualDate = $parseTimestamp( $data );

	self::assertEquals( $expectedDate, $actualDate, $message );
}
/**
 * Data provider for testGetTimestampParser().
 *
 * @return array Test cases read with getJson() from '../cases/timestamp-parser.json'
 */
public function provideTimestampParser() : array {
	$cases = self::getJson( '../cases/timestamp-parser.json' );

	return $cases;
}
/**
 * Match a sample timestamp with the generated regexp, parse the match with
 * the generated parser, and compare the result with both expected dates.
 *
 * @dataProvider provideTimestampParserDST
 * @covers ::getTimestampParser
 *
 * @param string $sample Text that the generated timestamp regexp is matched against
 * @param string $expected Expected date, as a string accepted by DateTimeImmutable
 * @param string $expectedUtc Second expected date string, compared against the same result
 * @param string $format Timestamp format passed to the regexp and parser builders
 * @param string $timezone Timezone passed to getTimestampParser()
 * @param array $timezoneAbbrs Timezone abbreviations passed to both builders
 * @param string $message Assertion message
 */
public function testGetTimestampParserDST(
	string $sample, string $expected, string $expectedUtc, string $format,
	string $timezone, array $timezoneAbbrs, string $message
) : void {
	$parser = TestingAccessWrapper::newFromObject(
		CommentParser::newFromGlobalState( new DOMElement( 'div' ) )
	);

	$regexp = $parser->getTimestampRegexp( 'en', $format, '\\d', $timezoneAbbrs );
	$tsParser = $parser->getTimestampParser( 'en', $format, null, $timezone, $timezoneAbbrs );

	$expected = new DateTimeImmutable( $expected );
	$expectedUtc = new DateTimeImmutable( $expectedUtc );

	// preg_match() returns 1 only on a match (0 for no match, false on error).
	// Check it so that a sample which does not match fails here, instead of
	// handing an empty $match to the parser callback.
	$matched = preg_match( $regexp, $sample, $match, PREG_OFFSET_CAPTURE );
	self::assertSame( 1, $matched, $message . ': sample does not match the timestamp regexp' );

	$date = $tsParser( $match );

	self::assertEquals( $expected, $date, $message );
	self::assertEquals( $expectedUtc, $date, $message );
}
/**
 * Data provider for testGetTimestampParserDST().
 *
 * @return array Test cases read with getJson() from '../cases/timestamp-parser-dst.json'
 */
public function provideTimestampParserDST() : array {
	$cases = self::getJson( '../cases/timestamp-parser-dst.json' );

	return $cases;
}
/**
 * Parse a page, serialize every thread found, and compare the result with
 * the expected JSON data for that page.
 *
 * @dataProvider provideComments
 * @covers ::getThreads
 *
 * @param string $name Test case name, used in assertion messages
 * @param string $dom Path of the HTML file to parse
 * @param string $expected Path of the JSON file with the expected serialized threads
 * @param string $config Path of the JSON file passed to setupEnv() as config
 * @param string $data Path of the JSON file passed to setupEnv() and createParser()
 */
public function testGetThreads(
	string $name, string $dom, string $expected, string $config, string $data
) : void {
	$dom = self::getHtml( $dom );
	$expectedPath = $expected;
	$expected = self::getJson( $expected );
	$config = self::getJson( $config );
	$data = self::getJson( $data );

	$doc = self::createDocument( $dom );
	$body = $doc->getElementsByTagName( 'body' )->item( 0 );

	$this->setupEnv( $config, $data );
	$parser = self::createParser( $body, $data );
	$threads = $parser->getThreads();

	$processedThreads = [];
	foreach ( $threads as $thread ) {
		$serialized = self::serializeComments( $thread, $body );
		// Round-trip through JSON to turn the objects into plain arrays,
		// the same shape as the decoded expected data
		$processedThreads[] = json_decode( json_encode( $serialized ), true );
	}

	// Optionally write updated content to the JSON files
	self::overwriteJsonFile( $expectedPath, $processedThreads );

	// Comparing section by section only covers the threads that were found;
	// check the total as well so that missing threads are noticed
	self::assertCount( count( $expected ), $processedThreads, $name . ' thread count' );

	foreach ( $processedThreads as $i => $processedThread ) {
		self::assertEquals( $expected[$i], $processedThread, $name . ' section ' . $i );
	}
}
/**
 * Data provider for testGetThreads().
 *
 * @return array Test cases read with getJson() from '../cases/comments.json'
 */
public function provideComments() : array {
	$cases = self::getJson( '../cases/comments.json' );

	return $cases;
}