From a27e27fc68bab5f5102bcf359cd5b1742f217588 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bartosz=20Dziewo=C5=84ski?= Date: Thu, 7 Dec 2023 23:57:21 +0100 Subject: [PATCH] Move finding transclusion source from ContentThreadItem to CommentParser Reasons: * Various other methods dealing with ranges already live there * It would be neat if ContentThreadItem was just a value class without a lot of logic, similar to DatabaseThreadItem, particularly for writing unit tests * The methods access global state through Title, which can't be fixed while they're in ContentThreadItem (see I9dfccc83) The computation is now always done, instead of only when needed, but that's a small drawback, since it's fast (fast enough that I don't see the difference in the time taken when running tests), and we were already computing it for all comments in many places. Change-Id: Ic718a964e309ae3a8e15e299081f46d4db860731 --- includes/ApiDiscussionToolsPageInfo.php | 2 +- includes/CommentParser.php | 219 ++++++++++++++++++++- includes/ThreadItem/ContentCommentItem.php | 5 +- includes/ThreadItem/ContentHeadingItem.php | 21 +- includes/ThreadItem/ContentThreadItem.php | 213 +------------------- tests/phpunit/ContentThreadItemTest.php | 4 +- 6 files changed, 233 insertions(+), 231 deletions(-) diff --git a/includes/ApiDiscussionToolsPageInfo.php b/includes/ApiDiscussionToolsPageInfo.php index 09912e14e..02ba9a4b1 100644 --- a/includes/ApiDiscussionToolsPageInfo.php +++ b/includes/ApiDiscussionToolsPageInfo.php @@ -173,7 +173,7 @@ class ApiDiscussionToolsPageInfo extends ApiBase { $closest = CommentUtils::closestElementWithSibling( $firstRange->startContainer, 'previous' ); if ( $closest && !$rootNode->isSameNode( $closest ) ) { $range = new ImmutableRange( $rootNode, 0, $rootNode, 0 ); - $fakeHeading = new ContentHeadingItem( $range, null ); + $fakeHeading = new ContentHeadingItem( $range, false, null ); $fakeHeading->setRootNode( $rootNode ); $fakeHeading->setName( 'h-' ); $fakeHeading->setId( 'h-' ); diff --git a/includes/CommentParser.php b/includes/CommentParser.php index d800c2c7d..865a149bb 100644 --- a/includes/CommentParser.php +++ b/includes/CommentParser.php @@ -15,6 +15,7 @@ use MediaWiki\Extension\DiscussionTools\ThreadItem\ContentHeadingItem; use MediaWiki\Extension\DiscussionTools\ThreadItem\ContentThreadItem; use MediaWiki\Languages\LanguageConverterFactory; use MediaWiki\Title\MalformedTitleException; +use MediaWiki\Title\Title; use MediaWiki\Title\TitleParser; use MediaWiki\Title\TitleValue; use MediaWiki\Utils\MWTimestamp; @@ -911,7 +912,8 @@ class CommentParser { $range = new ImmutableRange( $headingNode, $startOffset, $headingNode, $headingNode->childNodes->length ); - $curComment = new ContentHeadingItem( $range, (int)( $match[ 1 ] ) ); + $transcludedFrom = $this->computeTranscludedFrom( $range ); + $curComment = new ContentHeadingItem( $range, $transcludedFrom, (int)( $match[ 1 ] ) ); $curComment->setRootNode( $this->rootNode ); $result->addThreadItem( $curComment ); $curCommentEnd = $node; @@ -989,6 +991,7 @@ class CommentParser { $endNode, $length ); + $transcludedFrom = $this->computeTranscludedFrom( $range ); $startLevel = CommentUtils::getIndentLevel( $startNode, $this->rootNode ) + 1; $endLevel = CommentUtils::getIndentLevel( $node, $this->rootNode ) + 1; @@ -1011,6 +1014,7 @@ class CommentParser { $curComment = new ContentCommentItem( $level, $range, + $transcludedFrom, $sigRanges, $timestampRanges, $dateTime, @@ -1025,7 +1029,7 @@ class CommentParser { // Add a fake placeholder heading if there are any comments in the 0th section // (before the first real heading) $range = new ImmutableRange( $this->rootNode, 0, $this->rootNode, 0 ); - $fakeHeading = new ContentHeadingItem( $range, null ); + $fakeHeading = new ContentHeadingItem( $range, false, null ); $fakeHeading->setRootNode( $this->rootNode ); $result->addThreadItem( $fakeHeading ); } @@ -1037,6 +1041,217 @@ class CommentParser { return $result; } + /** + * Get the name of the page from which this thread item is transcluded (if any). Replies to + * transcluded items must be posted on that page, instead of the current one. + * + * This is tricky, because we don't want to mark items as trancluded when they're just using a + * template (e.g. {{ping|…}} or a non-substituted signature template). Sometimes the whole comment + * can be template-generated (e.g. when using some wrapper templates), but as long as a reply can + * be added outside of that template, we should not treat it as transcluded. + * + * The start/end boundary points of comment ranges and Parsoid transclusion ranges don't line up + * exactly, even when to a human it's obvious that they cover the same content, making this more + * complicated. + * + * @return string|bool `false` if this item is not transcluded. A string if it's transcluded + * from a single page (the page title, in text form with spaces). `true` if it's transcluded, but + * we can't determine the source. + */ + public function computeTranscludedFrom( ImmutableRange $commentRange ) { + // Collapsed ranges should otherwise be impossible, but they're not (T299583) + // TODO: See if we can fix the root cause, and remove this? + if ( $commentRange->collapsed ) { + return false; + } + + // General approach: + // + // Compare the comment range to each transclusion range on the page, and if it overlaps any of + // them, examine the overlap. There are a few cases: + // + // * Comment and transclusion do not overlap: + // → Not transcluded. + // * Comment contains the transclusion: + // → Not transcluded (just a template). + // * Comment is contained within the transclusion: + // → Transcluded, we can determine the source page (unless it's a complex transclusion). + // * Comment and transclusion overlap partially: + // → Transcluded, but we can't determine the source page. + // * Comment (almost) exactly matches the transclusion: + // → Maybe transcluded (it could be that the source page only contains that single comment), + // maybe not transcluded (it could be a wrapper template that covers a single comment). + // This is very sad, and we decide based on the namespace. + // + // Most transclusion ranges on the page trivially fall in the "do not overlap" or "contains" + // cases, and we only have to carefully examine the two transclusion ranges that contain the + // first and last node of the comment range. + // + // To check for almost exact matches, we walk between the relevant boundary points, and if we + // only find uninteresting nodes (that would be ignored when detecting comments), we treat them + // like exact matches. + + $startTransclNode = CommentUtils::getTranscludedFromElement( + CommentUtils::getRangeFirstNode( $commentRange ) + ); + $endTransclNode = CommentUtils::getTranscludedFromElement( + CommentUtils::getRangeLastNode( $commentRange ) + ); + + // We only have to examine the two transclusion ranges that contain the first/last node of the + // comment range (if they exist). Ignore ranges outside the comment or in the middle of it. + $transclNodes = []; + if ( $startTransclNode ) { + $transclNodes[] = $startTransclNode; + } + if ( $endTransclNode && $endTransclNode !== $startTransclNode ) { + $transclNodes[] = $endTransclNode; + } + + foreach ( $transclNodes as $transclNode ) { + $transclRange = static::getTransclusionRange( $transclNode ); + $compared = CommentUtils::compareRanges( $commentRange, $transclRange ); + $transclTitles = $this->getTransclusionTitles( $transclNode ); + $simpleTransclTitle = count( $transclTitles ) === 1 ? $transclTitles[0] : null; + + switch ( $compared ) { + case 'equal': + // Comment (almost) exactly matches the transclusion + if ( $simpleTransclTitle === null ) { + // Allow replying to some accidental complex transclusions consisting of only templates + // and wikitext (T313093) + if ( count( $transclTitles ) > 1 ) { + foreach ( $transclTitles as $transclTitle ) { + if ( $transclTitle && !$transclTitle->inNamespace( NS_TEMPLATE ) ) { + return true; + } + } + // Continue examining the other ranges. + break; + } + // Multi-template transclusion, or a parser function call, or template-affected wikitext outside + // of a template call, or a mix of the above + return true; + + } elseif ( $simpleTransclTitle->inNamespace( NS_TEMPLATE ) ) { + // Is that a subpage transclusion with a single comment, or a wrapper template + // transclusion on this page? We don't know, but let's guess based on the namespace. + // (T289873) + // Continue examining the other ranges. + break; + } elseif ( !$simpleTransclTitle->canExist() ) { + // Special page transclusion, probably accidental (T344622). Don't return the title, + // since it's useless for replying, and can't be stored in the permalink database. + return true; + } else { + return $simpleTransclTitle->getPrefixedText(); + } + + case 'contains': + // Comment contains the transclusion + + // If the entire transclusion is contained within the comment range, that's just a + // template. This is the same as a transclusion in the middle of the comment, which we + // ignored earlier, it just takes us longer to get here in this case. + + // Continue examining the other ranges. + break; + + case 'contained': + // Comment is contained within the transclusion + if ( $simpleTransclTitle === null ) { + return true; + } elseif ( !$simpleTransclTitle->canExist() ) { + // Special page transclusion, probably accidental (T344622). Don't return the title, + // since it's useless for replying, and can't be stored in the permalink database. + return true; + } else { + return $simpleTransclTitle->getPrefixedText(); + } + + case 'after': + case 'before': + // Comment and transclusion do not overlap + + // This should be impossible, because we ignored these ranges earlier. + throw new LogicException( 'Unexpected transclusion or comment range' ); + + case 'overlapstart': + case 'overlapend': + // Comment and transclusion overlap partially + return true; + + default: + throw new LogicException( 'Unexpected return value from compareRanges()' ); + } + } + + // If we got here, the comment range was not contained by or overlapping any of the transclusion + // ranges. Comment is not transcluded. + return false; + } + + /** + * Return the page titles for each part of the transclusion, or nulls for each part that isn't + * transcluded from another page. + * + * If the node represents a single-page transclusion, this will return an array containing a + * single Title object. + * + * @param Element $node + * @return (?Title)[] + */ + private function getTransclusionTitles( Element $node ): array { + $dataMw = json_decode( $node->getAttribute( 'data-mw' ) ?? '', true ); + $out = []; + + foreach ( $dataMw['parts'] ?? [] as $part ) { + if ( + !is_string( $part ) && + // 'href' will be unset if this is a parser function rather than a template + isset( $part['template']['target']['href'] ) + ) { + $parsoidHref = $part['template']['target']['href']; + Assert::precondition( substr( $parsoidHref, 0, 2 ) === './', "href has valid format" ); + $out[] = Title::newFromText( urldecode( substr( $parsoidHref, 2 ) ) ); + } else { + $out[] = null; + } + } + + return $out; + } + + /** + * Given a transclusion's first node (e.g. returned by CommentUtils::getTranscludedFromElement()), + * return a range starting before the node and ending after the transclusion's last node. + * + * @param Element $startNode + * @return ImmutableRange + */ + private function getTransclusionRange( Element $startNode ): ImmutableRange { + $endNode = $startNode; + while ( + // Phan doesn't realize that the conditions on $nextSibling can terminate the loop + // @phan-suppress-next-line PhanInfiniteLoop + $endNode && + ( $nextSibling = $endNode->nextSibling ) && + $nextSibling instanceof Element && + $nextSibling->getAttribute( 'about' ) === $endNode->getAttribute( 'about' ) + ) { + $endNode = $nextSibling; + } + + $range = new ImmutableRange( + $startNode->parentNode, + CommentUtils::childIndexOf( $startNode ), + $endNode->parentNode, + CommentUtils::childIndexOf( $endNode ) + 1 + ); + + return $range; + } + /** * Truncate user generated parts of IDs so full ID always fits within a database field of length 255 * diff --git a/includes/ThreadItem/ContentCommentItem.php b/includes/ThreadItem/ContentCommentItem.php index eff57b86a..cc73fa340 100644 --- a/includes/ThreadItem/ContentCommentItem.php +++ b/includes/ThreadItem/ContentCommentItem.php @@ -32,6 +32,7 @@ class ContentCommentItem extends ContentThreadItem implements CommentItem { /** * @param int $level * @param ImmutableRange $range + * @param bool|string $transcludedFrom * @param ImmutableRange[] $signatureRanges Objects describing the extent of signatures (plus * timestamps) for this comment. There is always at least one signature, but there may be * multiple. The author and timestamp of the comment is determined from the first signature. @@ -42,12 +43,12 @@ class ContentCommentItem extends ContentThreadItem implements CommentItem { * @param ?string $displayName Comment author's display name */ public function __construct( - int $level, ImmutableRange $range, + int $level, ImmutableRange $range, $transcludedFrom, array $signatureRanges, array $timestampRanges, DateTimeImmutable $timestamp, string $author, ?string $displayName = null ) { - parent::__construct( 'comment', $level, $range ); + parent::__construct( 'comment', $level, $range, $transcludedFrom ); $this->signatureRanges = $signatureRanges; $this->timestampRanges = $timestampRanges; $this->timestamp = $timestamp; diff --git a/includes/ThreadItem/ContentHeadingItem.php b/includes/ThreadItem/ContentHeadingItem.php index 5581295e2..b01e5acad 100644 --- a/includes/ThreadItem/ContentHeadingItem.php +++ b/includes/ThreadItem/ContentHeadingItem.php @@ -20,12 +20,13 @@ class ContentHeadingItem extends ContentThreadItem implements HeadingItem { /** * @param ImmutableRange $range + * @param bool|string $transcludedFrom * @param ?int $headingLevel Heading level (1-6). Use null for a placeholder heading. */ public function __construct( - ImmutableRange $range, ?int $headingLevel + ImmutableRange $range, $transcludedFrom, ?int $headingLevel ) { - parent::__construct( 'heading', 0, $range ); + parent::__construct( 'heading', 0, $range, $transcludedFrom ); $this->placeholderHeading = $headingLevel === null; $this->headingLevel = $this->placeholderHeading ? static::PLACEHOLDER_HEADING_LEVEL : $headingLevel; } @@ -97,22 +98,6 @@ class ContentHeadingItem extends ContentThreadItem implements HeadingItem { $this->placeholderHeading = $placeholderHeading; } - /** - * @inheritDoc - */ - public function getTranscludedFrom() { - // Placeholder headings break the usual logic, because their ranges are collapsed - if ( $this->isPlaceholderHeading() ) { - return false; - } - // Collapsed ranges should otherwise be impossible, but they're not (T299583) - // TODO: See if we can fix the root cause, and remove this? - if ( $this->getRange()->collapsed ) { - return false; - } - return parent::getTranscludedFrom(); - } - /** * @inheritDoc */ diff --git a/includes/ThreadItem/ContentThreadItem.php b/includes/ThreadItem/ContentThreadItem.php index 9fb90c292..6bd639e94 100644 --- a/includes/ThreadItem/ContentThreadItem.php +++ b/includes/ThreadItem/ContentThreadItem.php @@ -3,13 +3,9 @@ namespace MediaWiki\Extension\DiscussionTools\ThreadItem; use JsonSerializable; -use LogicException; use MediaWiki\Extension\DiscussionTools\CommentModifier; -use MediaWiki\Extension\DiscussionTools\CommentUtils; use MediaWiki\Extension\DiscussionTools\ImmutableRange; use MediaWiki\Parser\Sanitizer; -use MediaWiki\Title\Title; -use Wikimedia\Assert\Assert; use Wikimedia\Parsoid\DOM\Element; use Wikimedia\Parsoid\Utils\DOMUtils; @@ -31,6 +27,8 @@ abstract class ContentThreadItem implements JsonSerializable, ThreadItem { protected string $id; /** @var ContentThreadItem[] */ protected array $replies = []; + /** @var string|bool */ + private $transcludedFrom; /** @var ?array[] */ protected ?array $authors = null; @@ -43,13 +41,15 @@ abstract class ContentThreadItem implements JsonSerializable, ThreadItem { * @param int $level Indentation level * @param ImmutableRange $range Object describing the extent of the comment, including the * signature and timestamp. + * @param bool|string $transcludedFrom */ public function __construct( - string $type, int $level, ImmutableRange $range + string $type, int $level, ImmutableRange $range, $transcludedFrom ) { $this->type = $type; $this->level = $level; $this->range = $range; + $this->transcludedFrom = $transcludedFrom; } /** @@ -172,209 +172,10 @@ abstract class ContentThreadItem implements JsonSerializable, ThreadItem { } /** - * Get the name of the page from which this thread item is transcluded (if any). Replies to - * transcluded items must be posted on that page, instead of the current one. - * - * This is tricky, because we don't want to mark items as trancluded when they're just using a - * template (e.g. {{ping|…}} or a non-substituted signature template). Sometimes the whole comment - * can be template-generated (e.g. when using some wrapper templates), but as long as a reply can - * be added outside of that template, we should not treat it as transcluded. - * - * The start/end boundary points of comment ranges and Parsoid transclusion ranges don't line up - * exactly, even when to a human it's obvious that they cover the same content, making this more - * complicated. - * - * @return string|bool `false` if this item is not transcluded. A string if it's transcluded - * from a single page (the page title, in text form with spaces). `true` if it's transcluded, but - * we can't determine the source. + * @inheritDoc */ public function getTranscludedFrom() { - // General approach: - // - // Compare the comment range to each transclusion range on the page, and if it overlaps any of - // them, examine the overlap. There are a few cases: - // - // * Comment and transclusion do not overlap: - // → Not transcluded. - // * Comment contains the transclusion: - // → Not transcluded (just a template). - // * Comment is contained within the transclusion: - // → Transcluded, we can determine the source page (unless it's a complex transclusion). - // * Comment and transclusion overlap partially: - // → Transcluded, but we can't determine the source page. - // * Comment (almost) exactly matches the transclusion: - // → Maybe transcluded (it could be that the source page only contains that single comment), - // maybe not transcluded (it could be a wrapper template that covers a single comment). - // This is very sad, and we decide based on the namespace. - // - // Most transclusion ranges on the page trivially fall in the "do not overlap" or "contains" - // cases, and we only have to carefully examine the two transclusion ranges that contain the - // first and last node of the comment range. - // - // To check for almost exact matches, we walk between the relevant boundary points, and if we - // only find uninteresting nodes (that would be ignored when detecting comments), we treat them - // like exact matches. - - $commentRange = $this->getRange(); - $startTransclNode = CommentUtils::getTranscludedFromElement( - CommentUtils::getRangeFirstNode( $commentRange ) - ); - $endTransclNode = CommentUtils::getTranscludedFromElement( - CommentUtils::getRangeLastNode( $commentRange ) - ); - - // We only have to examine the two transclusion ranges that contain the first/last node of the - // comment range (if they exist). Ignore ranges outside the comment or in the middle of it. - $transclNodes = []; - if ( $startTransclNode ) { - $transclNodes[] = $startTransclNode; - } - if ( $endTransclNode && $endTransclNode !== $startTransclNode ) { - $transclNodes[] = $endTransclNode; - } - - foreach ( $transclNodes as $transclNode ) { - $transclRange = static::getTransclusionRange( $transclNode ); - $compared = CommentUtils::compareRanges( $commentRange, $transclRange ); - $transclTitles = $this->getTransclusionTitles( $transclNode ); - $simpleTransclTitle = count( $transclTitles ) === 1 ? $transclTitles[0] : null; - - switch ( $compared ) { - case 'equal': - // Comment (almost) exactly matches the transclusion - if ( $simpleTransclTitle === null ) { - // Allow replying to some accidental complex transclusions consisting of only templates - // and wikitext (T313093) - if ( count( $transclTitles ) > 1 ) { - foreach ( $transclTitles as $transclTitle ) { - if ( $transclTitle && !$transclTitle->inNamespace( NS_TEMPLATE ) ) { - return true; - } - } - // Continue examining the other ranges. - break; - } - // Multi-template transclusion, or a parser function call, or template-affected wikitext outside - // of a template call, or a mix of the above - return true; - - } elseif ( $simpleTransclTitle->inNamespace( NS_TEMPLATE ) ) { - // Is that a subpage transclusion with a single comment, or a wrapper template - // transclusion on this page? We don't know, but let's guess based on the namespace. - // (T289873) - // Continue examining the other ranges. - break; - } elseif ( !$simpleTransclTitle->canExist() ) { - // Special page transclusion, probably accidental (T344622). Don't return the title, - // since it's useless for replying, and can't be stored in the permalink database. - return true; - } else { - return $simpleTransclTitle->getPrefixedText(); - } - - case 'contains': - // Comment contains the transclusion - - // If the entire transclusion is contained within the comment range, that's just a - // template. This is the same as a transclusion in the middle of the comment, which we - // ignored earlier, it just takes us longer to get here in this case. - - // Continue examining the other ranges. - break; - - case 'contained': - // Comment is contained within the transclusion - if ( $simpleTransclTitle === null ) { - return true; - } elseif ( !$simpleTransclTitle->canExist() ) { - // Special page transclusion, probably accidental (T344622). Don't return the title, - // since it's useless for replying, and can't be stored in the permalink database. - return true; - } else { - return $simpleTransclTitle->getPrefixedText(); - } - - case 'after': - case 'before': - // Comment and transclusion do not overlap - - // This should be impossible, because we ignored these ranges earlier. - throw new LogicException( 'Unexpected transclusion or comment range' ); - - case 'overlapstart': - case 'overlapend': - // Comment and transclusion overlap partially - return true; - - default: - throw new LogicException( 'Unexpected return value from compareRanges()' ); - } - } - - // If we got here, the comment range was not contained by or overlapping any of the transclusion - // ranges. Comment is not transcluded. - return false; - } - - /** - * Return the page titles for each part of the transclusion, or nulls for each part that isn't - * transcluded from another page. - * - * If the node represents a single-page transclusion, this will return an array containing a - * single Title object. - * - * @param Element $node - * @return (?Title)[] - */ - private function getTransclusionTitles( Element $node ): array { - $dataMw = json_decode( $node->getAttribute( 'data-mw' ) ?? '', true ); - $out = []; - - foreach ( $dataMw['parts'] ?? [] as $part ) { - if ( - !is_string( $part ) && - // 'href' will be unset if this is a parser function rather than a template - isset( $part['template']['target']['href'] ) - ) { - $parsoidHref = $part['template']['target']['href']; - Assert::precondition( substr( $parsoidHref, 0, 2 ) === './', "href has valid format" ); - $out[] = Title::newFromText( urldecode( substr( $parsoidHref, 2 ) ) ); - } else { - $out[] = null; - } - } - - return $out; - } - - /** - * Given a transclusion's first node (e.g. returned by CommentUtils::getTranscludedFromElement()), - * return a range starting before the node and ending after the transclusion's last node. - * - * @param Element $startNode - * @return ImmutableRange - */ - private function getTransclusionRange( Element $startNode ): ImmutableRange { - $endNode = $startNode; - while ( - // Phan doesn't realize that the conditions on $nextSibling can terminate the loop - // @phan-suppress-next-line PhanInfiniteLoop - $endNode && - ( $nextSibling = $endNode->nextSibling ) && - $nextSibling instanceof Element && - $nextSibling->getAttribute( 'about' ) === $endNode->getAttribute( 'about' ) - ) { - $endNode = $nextSibling; - } - - $range = new ImmutableRange( - $startNode->parentNode, - CommentUtils::childIndexOf( $startNode ), - $endNode->parentNode, - CommentUtils::childIndexOf( $endNode ) + 1 - ); - - return $range; + return $this->transcludedFrom; } /** diff --git a/tests/phpunit/ContentThreadItemTest.php b/tests/phpunit/ContentThreadItemTest.php index b6c8b2934..ba8f3296e 100644 --- a/tests/phpunit/ContentThreadItemTest.php +++ b/tests/phpunit/ContentThreadItemTest.php @@ -32,12 +32,12 @@ class ContentThreadItemTest extends IntegrationTestCase { $makeThreadItem = static function ( array $arr ) use ( &$makeThreadItem, $range ): ContentThreadItem { if ( $arr['type'] === 'comment' ) { $item = new ContentCommentItem( - 1, $range, [], [], new DateTimeImmutable(), + 1, $range, false, [], [], new DateTimeImmutable(), $arr['author'], $arr['displayName'] ?? null ); } else { - $item = new ContentHeadingItem( $range, 2 ); + $item = new ContentHeadingItem( $range, false, 2 ); } $item->setId( $arr['id'] ); foreach ( $arr['replies'] as $reply ) {