Merge "Move finding transclusion source from ContentThreadItem to CommentParser"

This commit is contained in:
jenkins-bot 2023-12-15 16:17:13 +00:00 committed by Gerrit Code Review
commit bf9fc18120
6 changed files with 233 additions and 231 deletions

View file

@ -173,7 +173,7 @@ class ApiDiscussionToolsPageInfo extends ApiBase {
$closest = CommentUtils::closestElementWithSibling( $firstRange->startContainer, 'previous' );
if ( $closest && !$rootNode->isSameNode( $closest ) ) {
$range = new ImmutableRange( $rootNode, 0, $rootNode, 0 );
$fakeHeading = new ContentHeadingItem( $range, null );
$fakeHeading = new ContentHeadingItem( $range, false, null );
$fakeHeading->setRootNode( $rootNode );
$fakeHeading->setName( 'h-' );
$fakeHeading->setId( 'h-' );

View file

@ -15,6 +15,7 @@ use MediaWiki\Extension\DiscussionTools\ThreadItem\ContentHeadingItem;
use MediaWiki\Extension\DiscussionTools\ThreadItem\ContentThreadItem;
use MediaWiki\Languages\LanguageConverterFactory;
use MediaWiki\Title\MalformedTitleException;
use MediaWiki\Title\Title;
use MediaWiki\Title\TitleParser;
use MediaWiki\Title\TitleValue;
use MediaWiki\Utils\MWTimestamp;
@ -911,7 +912,8 @@ class CommentParser {
$range = new ImmutableRange(
$headingNode, $startOffset, $headingNode, $headingNode->childNodes->length
);
$curComment = new ContentHeadingItem( $range, (int)( $match[ 1 ] ) );
$transcludedFrom = $this->computeTranscludedFrom( $range );
$curComment = new ContentHeadingItem( $range, $transcludedFrom, (int)( $match[ 1 ] ) );
$curComment->setRootNode( $this->rootNode );
$result->addThreadItem( $curComment );
$curCommentEnd = $node;
@ -989,6 +991,7 @@ class CommentParser {
$endNode,
$length
);
$transcludedFrom = $this->computeTranscludedFrom( $range );
$startLevel = CommentUtils::getIndentLevel( $startNode, $this->rootNode ) + 1;
$endLevel = CommentUtils::getIndentLevel( $node, $this->rootNode ) + 1;
@ -1011,6 +1014,7 @@ class CommentParser {
$curComment = new ContentCommentItem(
$level,
$range,
$transcludedFrom,
$sigRanges,
$timestampRanges,
$dateTime,
@ -1025,7 +1029,7 @@ class CommentParser {
// Add a fake placeholder heading if there are any comments in the 0th section
// (before the first real heading)
$range = new ImmutableRange( $this->rootNode, 0, $this->rootNode, 0 );
$fakeHeading = new ContentHeadingItem( $range, null );
$fakeHeading = new ContentHeadingItem( $range, false, null );
$fakeHeading->setRootNode( $this->rootNode );
$result->addThreadItem( $fakeHeading );
}
@ -1037,6 +1041,217 @@ class CommentParser {
return $result;
}
/**
* Get the name of the page from which this thread item is transcluded (if any). Replies to
* transcluded items must be posted on that page, instead of the current one.
*
* This is tricky, because we don't want to mark items as trancluded when they're just using a
* template (e.g. {{ping|}} or a non-substituted signature template). Sometimes the whole comment
* can be template-generated (e.g. when using some wrapper templates), but as long as a reply can
* be added outside of that template, we should not treat it as transcluded.
*
* The start/end boundary points of comment ranges and Parsoid transclusion ranges don't line up
* exactly, even when to a human it's obvious that they cover the same content, making this more
* complicated.
*
* @return string|bool `false` if this item is not transcluded. A string if it's transcluded
* from a single page (the page title, in text form with spaces). `true` if it's transcluded, but
* we can't determine the source.
*/
public function computeTranscludedFrom( ImmutableRange $commentRange ) {
// Collapsed ranges should otherwise be impossible, but they're not (T299583)
// TODO: See if we can fix the root cause, and remove this?
if ( $commentRange->collapsed ) {
return false;
}
// General approach:
//
// Compare the comment range to each transclusion range on the page, and if it overlaps any of
// them, examine the overlap. There are a few cases:
//
// * Comment and transclusion do not overlap:
// → Not transcluded.
// * Comment contains the transclusion:
// → Not transcluded (just a template).
// * Comment is contained within the transclusion:
// → Transcluded, we can determine the source page (unless it's a complex transclusion).
// * Comment and transclusion overlap partially:
// → Transcluded, but we can't determine the source page.
// * Comment (almost) exactly matches the transclusion:
// → Maybe transcluded (it could be that the source page only contains that single comment),
// maybe not transcluded (it could be a wrapper template that covers a single comment).
// This is very sad, and we decide based on the namespace.
//
// Most transclusion ranges on the page trivially fall in the "do not overlap" or "contains"
// cases, and we only have to carefully examine the two transclusion ranges that contain the
// first and last node of the comment range.
//
// To check for almost exact matches, we walk between the relevant boundary points, and if we
// only find uninteresting nodes (that would be ignored when detecting comments), we treat them
// like exact matches.
$startTransclNode = CommentUtils::getTranscludedFromElement(
CommentUtils::getRangeFirstNode( $commentRange )
);
$endTransclNode = CommentUtils::getTranscludedFromElement(
CommentUtils::getRangeLastNode( $commentRange )
);
// We only have to examine the two transclusion ranges that contain the first/last node of the
// comment range (if they exist). Ignore ranges outside the comment or in the middle of it.
$transclNodes = [];
if ( $startTransclNode ) {
$transclNodes[] = $startTransclNode;
}
if ( $endTransclNode && $endTransclNode !== $startTransclNode ) {
$transclNodes[] = $endTransclNode;
}
foreach ( $transclNodes as $transclNode ) {
$transclRange = static::getTransclusionRange( $transclNode );
$compared = CommentUtils::compareRanges( $commentRange, $transclRange );
$transclTitles = $this->getTransclusionTitles( $transclNode );
$simpleTransclTitle = count( $transclTitles ) === 1 ? $transclTitles[0] : null;
switch ( $compared ) {
case 'equal':
// Comment (almost) exactly matches the transclusion
if ( $simpleTransclTitle === null ) {
// Allow replying to some accidental complex transclusions consisting of only templates
// and wikitext (T313093)
if ( count( $transclTitles ) > 1 ) {
foreach ( $transclTitles as $transclTitle ) {
if ( $transclTitle && !$transclTitle->inNamespace( NS_TEMPLATE ) ) {
return true;
}
}
// Continue examining the other ranges.
break;
}
// Multi-template transclusion, or a parser function call, or template-affected wikitext outside
// of a template call, or a mix of the above
return true;
} elseif ( $simpleTransclTitle->inNamespace( NS_TEMPLATE ) ) {
// Is that a subpage transclusion with a single comment, or a wrapper template
// transclusion on this page? We don't know, but let's guess based on the namespace.
// (T289873)
// Continue examining the other ranges.
break;
} elseif ( !$simpleTransclTitle->canExist() ) {
// Special page transclusion, probably accidental (T344622). Don't return the title,
// since it's useless for replying, and can't be stored in the permalink database.
return true;
} else {
return $simpleTransclTitle->getPrefixedText();
}
case 'contains':
// Comment contains the transclusion
// If the entire transclusion is contained within the comment range, that's just a
// template. This is the same as a transclusion in the middle of the comment, which we
// ignored earlier, it just takes us longer to get here in this case.
// Continue examining the other ranges.
break;
case 'contained':
// Comment is contained within the transclusion
if ( $simpleTransclTitle === null ) {
return true;
} elseif ( !$simpleTransclTitle->canExist() ) {
// Special page transclusion, probably accidental (T344622). Don't return the title,
// since it's useless for replying, and can't be stored in the permalink database.
return true;
} else {
return $simpleTransclTitle->getPrefixedText();
}
case 'after':
case 'before':
// Comment and transclusion do not overlap
// This should be impossible, because we ignored these ranges earlier.
throw new LogicException( 'Unexpected transclusion or comment range' );
case 'overlapstart':
case 'overlapend':
// Comment and transclusion overlap partially
return true;
default:
throw new LogicException( 'Unexpected return value from compareRanges()' );
}
}
// If we got here, the comment range was not contained by or overlapping any of the transclusion
// ranges. Comment is not transcluded.
return false;
}
/**
* Return the page titles for each part of the transclusion, or nulls for each part that isn't
* transcluded from another page.
*
* If the node represents a single-page transclusion, this will return an array containing a
* single Title object.
*
* @param Element $node
* @return (?Title)[]
*/
private function getTransclusionTitles( Element $node ): array {
$dataMw = json_decode( $node->getAttribute( 'data-mw' ) ?? '', true );
$out = [];
foreach ( $dataMw['parts'] ?? [] as $part ) {
if (
!is_string( $part ) &&
// 'href' will be unset if this is a parser function rather than a template
isset( $part['template']['target']['href'] )
) {
$parsoidHref = $part['template']['target']['href'];
Assert::precondition( substr( $parsoidHref, 0, 2 ) === './', "href has valid format" );
$out[] = Title::newFromText( urldecode( substr( $parsoidHref, 2 ) ) );
} else {
$out[] = null;
}
}
return $out;
}
/**
* Given a transclusion's first node (e.g. returned by CommentUtils::getTranscludedFromElement()),
* return a range starting before the node and ending after the transclusion's last node.
*
* @param Element $startNode
* @return ImmutableRange
*/
private function getTransclusionRange( Element $startNode ): ImmutableRange {
$endNode = $startNode;
while (
// Phan doesn't realize that the conditions on $nextSibling can terminate the loop
// @phan-suppress-next-line PhanInfiniteLoop
$endNode &&
( $nextSibling = $endNode->nextSibling ) &&
$nextSibling instanceof Element &&
$nextSibling->getAttribute( 'about' ) === $endNode->getAttribute( 'about' )
) {
$endNode = $nextSibling;
}
$range = new ImmutableRange(
$startNode->parentNode,
CommentUtils::childIndexOf( $startNode ),
$endNode->parentNode,
CommentUtils::childIndexOf( $endNode ) + 1
);
return $range;
}
/**
* Truncate user generated parts of IDs so full ID always fits within a database field of length 255
*

View file

@ -32,6 +32,7 @@ class ContentCommentItem extends ContentThreadItem implements CommentItem {
/**
* @param int $level
* @param ImmutableRange $range
* @param bool|string $transcludedFrom
* @param ImmutableRange[] $signatureRanges Objects describing the extent of signatures (plus
* timestamps) for this comment. There is always at least one signature, but there may be
* multiple. The author and timestamp of the comment is determined from the first signature.
@ -42,12 +43,12 @@ class ContentCommentItem extends ContentThreadItem implements CommentItem {
* @param ?string $displayName Comment author's display name
*/
public function __construct(
int $level, ImmutableRange $range,
int $level, ImmutableRange $range, $transcludedFrom,
array $signatureRanges, array $timestampRanges,
DateTimeImmutable $timestamp,
string $author, ?string $displayName = null
) {
parent::__construct( 'comment', $level, $range );
parent::__construct( 'comment', $level, $range, $transcludedFrom );
$this->signatureRanges = $signatureRanges;
$this->timestampRanges = $timestampRanges;
$this->timestamp = $timestamp;

View file

@ -20,12 +20,13 @@ class ContentHeadingItem extends ContentThreadItem implements HeadingItem {
/**
* @param ImmutableRange $range
* @param bool|string $transcludedFrom
* @param ?int $headingLevel Heading level (1-6). Use null for a placeholder heading.
*/
public function __construct(
ImmutableRange $range, ?int $headingLevel
ImmutableRange $range, $transcludedFrom, ?int $headingLevel
) {
parent::__construct( 'heading', 0, $range );
parent::__construct( 'heading', 0, $range, $transcludedFrom );
$this->placeholderHeading = $headingLevel === null;
$this->headingLevel = $this->placeholderHeading ? static::PLACEHOLDER_HEADING_LEVEL : $headingLevel;
}
@ -97,22 +98,6 @@ class ContentHeadingItem extends ContentThreadItem implements HeadingItem {
$this->placeholderHeading = $placeholderHeading;
}
/**
* @inheritDoc
*/
public function getTranscludedFrom() {
// Placeholder headings break the usual logic, because their ranges are collapsed
if ( $this->isPlaceholderHeading() ) {
return false;
}
// Collapsed ranges should otherwise be impossible, but they're not (T299583)
// TODO: See if we can fix the root cause, and remove this?
if ( $this->getRange()->collapsed ) {
return false;
}
return parent::getTranscludedFrom();
}
/**
* @inheritDoc
*/

View file

@ -3,13 +3,9 @@
namespace MediaWiki\Extension\DiscussionTools\ThreadItem;
use JsonSerializable;
use LogicException;
use MediaWiki\Extension\DiscussionTools\CommentModifier;
use MediaWiki\Extension\DiscussionTools\CommentUtils;
use MediaWiki\Extension\DiscussionTools\ImmutableRange;
use MediaWiki\Parser\Sanitizer;
use MediaWiki\Title\Title;
use Wikimedia\Assert\Assert;
use Wikimedia\Parsoid\DOM\Element;
use Wikimedia\Parsoid\Utils\DOMUtils;
@ -31,6 +27,8 @@ abstract class ContentThreadItem implements JsonSerializable, ThreadItem {
protected string $id;
/** @var ContentThreadItem[] */
protected array $replies = [];
/** @var string|bool */
private $transcludedFrom;
/** @var ?array[] */
protected ?array $authors = null;
@ -43,13 +41,15 @@ abstract class ContentThreadItem implements JsonSerializable, ThreadItem {
* @param int $level Indentation level
* @param ImmutableRange $range Object describing the extent of the comment, including the
* signature and timestamp.
* @param bool|string $transcludedFrom
*/
public function __construct(
string $type, int $level, ImmutableRange $range
string $type, int $level, ImmutableRange $range, $transcludedFrom
) {
$this->type = $type;
$this->level = $level;
$this->range = $range;
$this->transcludedFrom = $transcludedFrom;
}
/**
@ -172,209 +172,10 @@ abstract class ContentThreadItem implements JsonSerializable, ThreadItem {
}
/**
* Get the name of the page from which this thread item is transcluded (if any). Replies to
* transcluded items must be posted on that page, instead of the current one.
*
* This is tricky, because we don't want to mark items as trancluded when they're just using a
* template (e.g. {{ping|}} or a non-substituted signature template). Sometimes the whole comment
* can be template-generated (e.g. when using some wrapper templates), but as long as a reply can
* be added outside of that template, we should not treat it as transcluded.
*
* The start/end boundary points of comment ranges and Parsoid transclusion ranges don't line up
* exactly, even when to a human it's obvious that they cover the same content, making this more
* complicated.
*
* @return string|bool `false` if this item is not transcluded. A string if it's transcluded
* from a single page (the page title, in text form with spaces). `true` if it's transcluded, but
* we can't determine the source.
* @inheritDoc
*/
public function getTranscludedFrom() {
// General approach:
//
// Compare the comment range to each transclusion range on the page, and if it overlaps any of
// them, examine the overlap. There are a few cases:
//
// * Comment and transclusion do not overlap:
// → Not transcluded.
// * Comment contains the transclusion:
// → Not transcluded (just a template).
// * Comment is contained within the transclusion:
// → Transcluded, we can determine the source page (unless it's a complex transclusion).
// * Comment and transclusion overlap partially:
// → Transcluded, but we can't determine the source page.
// * Comment (almost) exactly matches the transclusion:
// → Maybe transcluded (it could be that the source page only contains that single comment),
// maybe not transcluded (it could be a wrapper template that covers a single comment).
// This is very sad, and we decide based on the namespace.
//
// Most transclusion ranges on the page trivially fall in the "do not overlap" or "contains"
// cases, and we only have to carefully examine the two transclusion ranges that contain the
// first and last node of the comment range.
//
// To check for almost exact matches, we walk between the relevant boundary points, and if we
// only find uninteresting nodes (that would be ignored when detecting comments), we treat them
// like exact matches.
$commentRange = $this->getRange();
$startTransclNode = CommentUtils::getTranscludedFromElement(
CommentUtils::getRangeFirstNode( $commentRange )
);
$endTransclNode = CommentUtils::getTranscludedFromElement(
CommentUtils::getRangeLastNode( $commentRange )
);
// We only have to examine the two transclusion ranges that contain the first/last node of the
// comment range (if they exist). Ignore ranges outside the comment or in the middle of it.
$transclNodes = [];
if ( $startTransclNode ) {
$transclNodes[] = $startTransclNode;
}
if ( $endTransclNode && $endTransclNode !== $startTransclNode ) {
$transclNodes[] = $endTransclNode;
}
foreach ( $transclNodes as $transclNode ) {
$transclRange = static::getTransclusionRange( $transclNode );
$compared = CommentUtils::compareRanges( $commentRange, $transclRange );
$transclTitles = $this->getTransclusionTitles( $transclNode );
$simpleTransclTitle = count( $transclTitles ) === 1 ? $transclTitles[0] : null;
switch ( $compared ) {
case 'equal':
// Comment (almost) exactly matches the transclusion
if ( $simpleTransclTitle === null ) {
// Allow replying to some accidental complex transclusions consisting of only templates
// and wikitext (T313093)
if ( count( $transclTitles ) > 1 ) {
foreach ( $transclTitles as $transclTitle ) {
if ( $transclTitle && !$transclTitle->inNamespace( NS_TEMPLATE ) ) {
return true;
}
}
// Continue examining the other ranges.
break;
}
// Multi-template transclusion, or a parser function call, or template-affected wikitext outside
// of a template call, or a mix of the above
return true;
} elseif ( $simpleTransclTitle->inNamespace( NS_TEMPLATE ) ) {
// Is that a subpage transclusion with a single comment, or a wrapper template
// transclusion on this page? We don't know, but let's guess based on the namespace.
// (T289873)
// Continue examining the other ranges.
break;
} elseif ( !$simpleTransclTitle->canExist() ) {
// Special page transclusion, probably accidental (T344622). Don't return the title,
// since it's useless for replying, and can't be stored in the permalink database.
return true;
} else {
return $simpleTransclTitle->getPrefixedText();
}
case 'contains':
// Comment contains the transclusion
// If the entire transclusion is contained within the comment range, that's just a
// template. This is the same as a transclusion in the middle of the comment, which we
// ignored earlier, it just takes us longer to get here in this case.
// Continue examining the other ranges.
break;
case 'contained':
// Comment is contained within the transclusion
if ( $simpleTransclTitle === null ) {
return true;
} elseif ( !$simpleTransclTitle->canExist() ) {
// Special page transclusion, probably accidental (T344622). Don't return the title,
// since it's useless for replying, and can't be stored in the permalink database.
return true;
} else {
return $simpleTransclTitle->getPrefixedText();
}
case 'after':
case 'before':
// Comment and transclusion do not overlap
// This should be impossible, because we ignored these ranges earlier.
throw new LogicException( 'Unexpected transclusion or comment range' );
case 'overlapstart':
case 'overlapend':
// Comment and transclusion overlap partially
return true;
default:
throw new LogicException( 'Unexpected return value from compareRanges()' );
}
}
// If we got here, the comment range was not contained by or overlapping any of the transclusion
// ranges. Comment is not transcluded.
return false;
}
/**
* Return the page titles for each part of the transclusion, or nulls for each part that isn't
* transcluded from another page.
*
* If the node represents a single-page transclusion, this will return an array containing a
* single Title object.
*
* @param Element $node
* @return (?Title)[]
*/
private function getTransclusionTitles( Element $node ): array {
$dataMw = json_decode( $node->getAttribute( 'data-mw' ) ?? '', true );
$out = [];
foreach ( $dataMw['parts'] ?? [] as $part ) {
if (
!is_string( $part ) &&
// 'href' will be unset if this is a parser function rather than a template
isset( $part['template']['target']['href'] )
) {
$parsoidHref = $part['template']['target']['href'];
Assert::precondition( substr( $parsoidHref, 0, 2 ) === './', "href has valid format" );
$out[] = Title::newFromText( urldecode( substr( $parsoidHref, 2 ) ) );
} else {
$out[] = null;
}
}
return $out;
}
/**
* Given a transclusion's first node (e.g. returned by CommentUtils::getTranscludedFromElement()),
* return a range starting before the node and ending after the transclusion's last node.
*
* @param Element $startNode
* @return ImmutableRange
*/
private function getTransclusionRange( Element $startNode ): ImmutableRange {
$endNode = $startNode;
while (
// Phan doesn't realize that the conditions on $nextSibling can terminate the loop
// @phan-suppress-next-line PhanInfiniteLoop
$endNode &&
( $nextSibling = $endNode->nextSibling ) &&
$nextSibling instanceof Element &&
$nextSibling->getAttribute( 'about' ) === $endNode->getAttribute( 'about' )
) {
$endNode = $nextSibling;
}
$range = new ImmutableRange(
$startNode->parentNode,
CommentUtils::childIndexOf( $startNode ),
$endNode->parentNode,
CommentUtils::childIndexOf( $endNode ) + 1
);
return $range;
return $this->transcludedFrom;
}
/**

View file

@ -32,12 +32,12 @@ class ContentThreadItemTest extends IntegrationTestCase {
$makeThreadItem = static function ( array $arr ) use ( &$makeThreadItem, $range ): ContentThreadItem {
if ( $arr['type'] === 'comment' ) {
$item = new ContentCommentItem(
1, $range, [], [], new DateTimeImmutable(),
1, $range, false, [], [], new DateTimeImmutable(),
$arr['author'],
$arr['displayName'] ?? null
);
} else {
$item = new ContentHeadingItem( $range, 2 );
$item = new ContentHeadingItem( $range, false, 2 );
}
$item->setId( $arr['id'] );
foreach ( $arr['replies'] as $reply ) {