mirror of
https://gerrit.wikimedia.org/r/mediawiki/extensions/Echo
synced 2024-12-04 20:28:49 +00:00
fdc0eb2bbc
Instead of turning the array keys into an array of strings and searching that we can access the original array by key. Change-Id: Id4fddd242cff66ee70fa1cc07cdcb6938482696c
1282 lines
39 KiB
PHP
1282 lines
39 KiB
PHP
<?php
|
|
|
|
use MediaWiki\Extension\Notifications\Model\Event;
|
|
use MediaWiki\MediaWikiServices;
|
|
use MediaWiki\Revision\RevisionRecord;
|
|
use MediaWiki\Revision\SlotRecord;
|
|
use MediaWiki\User\UserNameUtils;
|
|
|
|
abstract class EchoDiscussionParser {
|
|
private const HEADER_REGEX = '^(==+)\h*([^=].*)\h*\1$';
|
|
|
|
public const DEFAULT_SNIPPET_LENGTH = 150;
|
|
|
|
/** @var string|null */
|
|
protected static $timestampRegex;
|
|
/** @var array[][] */
|
|
protected static $revisionInterpretationCache = [];
|
|
/** @var EchoDiffParser|null */
|
|
protected static $diffParser;
|
|
|
|
/**
|
|
* Given a RevisionRecord object, generates Event objects for
|
|
* the discussion-related actions that occurred in that Revision.
|
|
*
|
|
* @param RevisionRecord $revision
|
|
* @param bool $isRevert
|
|
*/
|
|
public static function generateEventsForRevision( RevisionRecord $revision, $isRevert ) {
|
|
global $wgEchoMentionsOnMultipleSectionEdits;
|
|
global $wgEchoMentionOnChanges;
|
|
$store = MediaWikiServices::getInstance()->getRevisionStore();
|
|
|
|
// use replica database if there is a previous revision
|
|
if ( $store->getPreviousRevision( $revision ) ) {
|
|
$title = Title::newFromID( $revision->getPageId() );
|
|
// use primary database for new page
|
|
} else {
|
|
$title = Title::newFromID( $revision->getPageId(), Title::GAID_FOR_UPDATE );
|
|
}
|
|
|
|
// not a valid title
|
|
if ( !$title ) {
|
|
return;
|
|
}
|
|
|
|
$events = [];
|
|
|
|
$interpretation = self::getChangeInterpretationForRevision( $revision );
|
|
|
|
$userID = $revision->getUser()->getId();
|
|
$userName = $revision->getUser()->getName();
|
|
$user = $userID !== 0 ? User::newFromId( $userID ) : User::newFromName( $userName, false );
|
|
|
|
foreach ( $interpretation as $action ) {
|
|
if ( $action['type'] === 'add-comment' ) {
|
|
$fullSection = $action['full-section'];
|
|
$header = self::extractHeader( $fullSection );
|
|
$userLinks = self::getUserLinks( $action['content'], $title );
|
|
$events = array_merge(
|
|
$events,
|
|
self::collectMentionEvents( $header, $userLinks, $action['content'], $revision, $user )
|
|
);
|
|
} elseif ( $action['type'] === 'new-section-with-comment' ) {
|
|
$content = $action['content'];
|
|
$header = self::extractHeader( $content );
|
|
$userLinks = self::getUserLinks( $content, $title );
|
|
$events = array_merge(
|
|
$events,
|
|
self::collectMentionEvents( $header, $userLinks, $content, $revision, $user )
|
|
);
|
|
} elseif ( $action['type'] === 'add-section-multiple' && $wgEchoMentionsOnMultipleSectionEdits ) {
|
|
$content = self::stripHeader( $action['content'] );
|
|
$content = self::stripSignature( $content );
|
|
$userLinks = self::getUserLinks( $content, $title );
|
|
$events = array_merge(
|
|
$events,
|
|
self::collectMentionEvents( $action['header'], $userLinks, $content, $revision, $user )
|
|
);
|
|
} elseif ( $action['type'] === 'unknown-signed-change' ) {
|
|
$userLinks = array_diff_key(
|
|
self::getUserLinks( $action['new_content'], $title ),
|
|
self::getUserLinks( $action['old_content'], $title )
|
|
);
|
|
$header = self::extractHeader( $action['full-section'] );
|
|
|
|
if ( $wgEchoMentionOnChanges ) {
|
|
$events = array_merge(
|
|
$events,
|
|
self::collectMentionEvents( $header, $userLinks, $action['new_content'], $revision, $user )
|
|
);
|
|
}
|
|
}
|
|
}
|
|
|
|
if ( $title->getNamespace() === NS_USER_TALK ) {
|
|
$notifyUser = User::newFromName( $title->getText() );
|
|
// If the recipient is a valid non-anonymous user and hasn't turned
|
|
// off their notifications, generate a talk page post Echo notification.
|
|
if ( $notifyUser && $notifyUser->getId() ) {
|
|
$permManager = MediaWikiServices::getInstance()->getPermissionManager();
|
|
// If this is a minor edit, only notify if the agent doesn't have talk page minor
|
|
// edit notification blocked
|
|
if ( !$revision->isMinor() || !$permManager->userHasRight( $user, 'nominornewtalk' ) ) {
|
|
$section = self::detectSectionTitleAndText( $interpretation, $title );
|
|
if ( $section['section-text'] === '' ) {
|
|
$comment = $revision->getComment( RevisionRecord::FOR_PUBLIC, $notifyUser );
|
|
if ( $comment ) {
|
|
$section['section-text'] = $comment->text;
|
|
}
|
|
}
|
|
$events[] = [
|
|
'type' => 'edit-user-talk',
|
|
'title' => $title,
|
|
'extra' => [
|
|
'revid' => $revision->getId(),
|
|
'minoredit' => $revision->isMinor(),
|
|
'section-title' => $section['section-title'],
|
|
'section-text' => $section['section-text'],
|
|
'target-page' => $title->getArticleID(),
|
|
],
|
|
'agent' => $user,
|
|
];
|
|
}
|
|
}
|
|
}
|
|
|
|
// Notify users mentioned in edit summary
|
|
global $wgEchoMaxMentionsInEditSummary;
|
|
|
|
if ( $wgEchoMaxMentionsInEditSummary > 0 && !$user->isBot() && !$isRevert ) {
|
|
$summaryParser = new EchoSummaryParser();
|
|
$usersInSummary = $summaryParser->parse( $revision->getComment()->text );
|
|
|
|
// Don't allow pinging yourself
|
|
unset( $usersInSummary[$userName] );
|
|
|
|
$count = 0;
|
|
$mentionedUsers = [];
|
|
foreach ( $usersInSummary as $summaryUser ) {
|
|
if ( $summaryUser->getTalkPage()->equals( $title ) ) {
|
|
// Users already get a ping when their talk page is edited
|
|
continue;
|
|
}
|
|
if ( $count >= $wgEchoMaxMentionsInEditSummary ) {
|
|
break;
|
|
}
|
|
$mentionedUsers[$summaryUser->getId()] = $summaryUser->getId();
|
|
$count++;
|
|
}
|
|
|
|
if ( $mentionedUsers ) {
|
|
$events[] = [
|
|
'type' => 'mention-summary',
|
|
'title' => $title,
|
|
'extra' => [
|
|
'revid' => $revision->getId(),
|
|
'mentioned-users' => $mentionedUsers,
|
|
],
|
|
'agent' => $user,
|
|
];
|
|
}
|
|
}
|
|
|
|
// Allow extensions to generate more events for a revision, and de-duplicate
|
|
// against the standard events created above.
|
|
Hooks::run( 'EchoGetEventsForRevision', [ &$events, $revision, $isRevert ] );
|
|
|
|
// Create events
|
|
foreach ( $events as $event ) {
|
|
Event::create( $event );
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Attempts to determine what section title the edit was performed under (if any)
|
|
*
|
|
* @param array[] $interpretation Results of {@see getChangeInterpretationForRevision}
|
|
* @param Title|null $title
|
|
* @return string[] Array containing section title and text
|
|
*/
|
|
public static function detectSectionTitleAndText( array $interpretation, Title $title = null ) {
|
|
$header = $snippet = '';
|
|
$found = false;
|
|
|
|
foreach ( $interpretation as $action ) {
|
|
switch ( $action['type'] ) {
|
|
case 'add-comment':
|
|
$header = self::extractHeader( $action['full-section'] );
|
|
$snippet = self::getTextSnippet(
|
|
self::stripSignature( self::stripHeader( $action['content'] ), $title ),
|
|
RequestContext::getMain()->getLanguage(),
|
|
self::DEFAULT_SNIPPET_LENGTH,
|
|
$title );
|
|
break;
|
|
case 'new-section-with-comment':
|
|
$header = self::extractHeader( $action['content'] );
|
|
$snippet = self::getTextSnippet(
|
|
self::stripSignature( self::stripHeader( $action['content'] ), $title ),
|
|
RequestContext::getMain()->getLanguage(),
|
|
self::DEFAULT_SNIPPET_LENGTH,
|
|
$title );
|
|
break;
|
|
}
|
|
if ( $header ) {
|
|
// If we find a second header within the same change interpretation then
|
|
// we cannot choose just 1 to link to
|
|
if ( $found ) {
|
|
$found = false;
|
|
break;
|
|
}
|
|
$found = true;
|
|
}
|
|
}
|
|
if ( !$found ) {
|
|
return [ 'section-title' => '', 'section-text' => '' ];
|
|
}
|
|
|
|
return [ 'section-title' => $header, 'section-text' => $snippet ];
|
|
}
|
|
|
|
/**
|
|
* For an action taken on a talk page, notify users whose user pages
|
|
* are linked.
|
|
* @param string $header The subject line for the discussion.
|
|
* @param int[] $userLinks
|
|
* @param string $content The content of the post, as a wikitext string.
|
|
* @param RevisionRecord $revision
|
|
* @param User $agent The user who made the comment.
|
|
*/
|
|
public static function generateMentionEvents(
|
|
$header,
|
|
array $userLinks,
|
|
$content,
|
|
RevisionRecord $revision,
|
|
User $agent
|
|
) {
|
|
$events = self::collectMentionEvents( $header, $userLinks, $content, $revision, $agent );
|
|
foreach ( $events as $event ) {
|
|
Event::create( $event );
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Generate mention event data for a talk page action
|
|
* @param string $header The subject line for the discussion.
|
|
* @param int[] $userLinks
|
|
* @param string $content The content of the post, as a wikitext string.
|
|
* @param RevisionRecord $revision
|
|
* @param User $agent The user who made the comment.
|
|
* @return array List of event info arrays
|
|
*/
|
|
protected static function collectMentionEvents(
|
|
$header,
|
|
array $userLinks,
|
|
$content,
|
|
RevisionRecord $revision,
|
|
User $agent
|
|
) {
|
|
global $wgEchoMaxMentionsCount, $wgEchoMentionStatusNotifications;
|
|
|
|
$title = Title::newFromLinkTarget( $revision->getPageAsLinkTarget() );
|
|
if ( !$title ) {
|
|
return [];
|
|
}
|
|
$content = self::stripHeader( $content );
|
|
$content = self::stripSignature( $content, $title );
|
|
|
|
if ( !$userLinks ) {
|
|
return [];
|
|
}
|
|
|
|
$userMentions = self::getUserMentions(
|
|
$title, $revision->getUser( RevisionRecord::RAW )->getId(), $userLinks
|
|
);
|
|
$overallMentionsCount = self::getOverallUserMentionsCount( $userMentions );
|
|
if ( $overallMentionsCount === 0 ) {
|
|
return [];
|
|
}
|
|
|
|
$events = [];
|
|
$stats = MediaWikiServices::getInstance()->getStatsdDataFactory();
|
|
|
|
if ( $overallMentionsCount > $wgEchoMaxMentionsCount ) {
|
|
if ( $wgEchoMentionStatusNotifications ) {
|
|
$events[] = [
|
|
'type' => 'mention-failure-too-many',
|
|
'title' => $title,
|
|
'extra' => [
|
|
'max-mentions' => $wgEchoMaxMentionsCount,
|
|
'section-title' => $header,
|
|
],
|
|
'agent' => $agent,
|
|
];
|
|
$stats->increment( 'echo.event.mention.notification.failure-too-many' );
|
|
}
|
|
return $events;
|
|
}
|
|
|
|
if ( $userMentions['validMentions'] ) {
|
|
$events[] = [
|
|
'type' => 'mention',
|
|
'title' => $title,
|
|
'extra' => [
|
|
'content' => $content,
|
|
'section-title' => $header,
|
|
'revid' => $revision->getId(),
|
|
'mentioned-users' => $userMentions['validMentions'],
|
|
],
|
|
'agent' => $agent,
|
|
];
|
|
}
|
|
|
|
if ( $wgEchoMentionStatusNotifications ) {
|
|
// TODO batch?
|
|
foreach ( $userMentions['validMentions'] as $mentionedUserId ) {
|
|
$events[] = [
|
|
'type' => 'mention-success',
|
|
'title' => $title,
|
|
'extra' => [
|
|
'subject-name' => User::newFromId( $mentionedUserId )->getName(),
|
|
'section-title' => $header,
|
|
'revid' => $revision->getId(),
|
|
],
|
|
'agent' => $agent,
|
|
];
|
|
$stats->increment( 'echo.event.mention.notification.success' );
|
|
}
|
|
|
|
// TODO batch?
|
|
foreach ( $userMentions['anonymousUsers'] as $anonymousUser ) {
|
|
$events[] = [
|
|
'type' => 'mention-failure',
|
|
'title' => $title,
|
|
'extra' => [
|
|
'failure-type' => 'user-anonymous',
|
|
'subject-name' => $anonymousUser,
|
|
'section-title' => $header,
|
|
'revid' => $revision->getId(),
|
|
],
|
|
'agent' => $agent,
|
|
];
|
|
$stats->increment( 'echo.event.mention.notification.failure-user-anonymous' );
|
|
}
|
|
|
|
// TODO batch?
|
|
foreach ( $userMentions['unknownUsers'] as $unknownUser ) {
|
|
$events[] = [
|
|
'type' => 'mention-failure',
|
|
'title' => $title,
|
|
'extra' => [
|
|
'failure-type' => 'user-unknown',
|
|
'subject-name' => $unknownUser,
|
|
'section-title' => $header,
|
|
'revid' => $revision->getId(),
|
|
],
|
|
'agent' => $agent,
|
|
];
|
|
$stats->increment( 'echo.event.mention.notification.failure-user-unknown' );
|
|
}
|
|
}
|
|
|
|
return $events;
|
|
}
|
|
|
|
private static function getOverallUserMentionsCount( array $userMentions ) {
|
|
return count( $userMentions, COUNT_RECURSIVE ) - count( $userMentions );
|
|
}
|
|
|
|
/**
|
|
* @param Title $title
|
|
* @param int $revisionUserId
|
|
* @param int[] $userLinks
|
|
* @return array[]
|
|
* Set of arrays containing valid mentions and possible intended but failed mentions.
|
|
* - [validMentions]: An array of valid users to mention with ID => ID.
|
|
* - [unknownUsers]: An array of DBKey strings representing unknown users.
|
|
* - [anonymousUsers]: An array of DBKey strings representing anonymous IP users.
|
|
*/
|
|
public static function getUserMentions( Title $title, $revisionUserId, array $userLinks ) {
|
|
global $wgEchoMaxMentionsCount;
|
|
|
|
$userMentions = [
|
|
'validMentions' => [],
|
|
'unknownUsers' => [],
|
|
'anonymousUsers' => [],
|
|
];
|
|
|
|
$count = 0;
|
|
$stats = MediaWikiServices::getInstance()->getStatsdDataFactory();
|
|
$userNameUtils = MediaWikiServices::getInstance()->getUserNameUtils();
|
|
|
|
foreach ( $userLinks as $dbk => $page_id ) {
|
|
// If more users are being pinged this is likely a spam/attack vector
|
|
// Don't send any mention notifications.
|
|
if ( $count > $wgEchoMaxMentionsCount ) {
|
|
$stats->increment( 'echo.event.mention.error.tooMany' );
|
|
break;
|
|
}
|
|
|
|
// we should not add user to 'mention' notification list if
|
|
// 1. the user link links to a subpage
|
|
if ( self::hasSubpage( $dbk ) ) {
|
|
continue;
|
|
}
|
|
|
|
// 2. user is an anonymous IP
|
|
if ( $userNameUtils->isIP( $dbk ) ) {
|
|
$userMentions['anonymousUsers'][] = $dbk;
|
|
$count++;
|
|
$stats->increment( 'echo.event.mention.error.anonUser' );
|
|
continue;
|
|
}
|
|
|
|
$user = User::newFromName( $dbk );
|
|
// 3. the user name is not valid
|
|
if ( !$user ) {
|
|
$userMentions['unknownUsers'][] = str_replace( '_', ' ', $dbk );
|
|
$count++;
|
|
$stats->increment( 'echo.event.mention.error.invalidUser' );
|
|
continue;
|
|
}
|
|
|
|
// 4. the user mentions themselves
|
|
if ( $user->getId() === $revisionUserId ) {
|
|
$stats->increment( 'echo.event.mention.error.sameUser' );
|
|
continue;
|
|
}
|
|
|
|
// 5. the user is the owner of the talk page
|
|
if ( $title->getNamespace() === NS_USER_TALK && $title->getDBkey() === $dbk ) {
|
|
$stats->increment( 'echo.event.mention.error.ownPage' );
|
|
continue;
|
|
}
|
|
|
|
// 6. user does not exist
|
|
if ( $user->getId() === 0 ) {
|
|
$userMentions['unknownUsers'][] = str_replace( '_', ' ', $dbk );
|
|
$count++;
|
|
$stats->increment( 'echo.event.mention.error.unknownUser' );
|
|
continue;
|
|
}
|
|
|
|
$userMentions['validMentions'][$user->getId()] = $user->getId();
|
|
$count++;
|
|
}
|
|
|
|
return $userMentions;
|
|
}
|
|
|
|
/**
|
|
* @param string $content
|
|
* @param Title $title
|
|
* @return int[]
|
|
* Array of links in the user namespace with DBKey => ID.
|
|
*/
|
|
public static function getUserLinks( $content, Title $title ) {
|
|
$output = self::parseNonEditWikitext( $content, new Article( $title ) );
|
|
$links = $output->getLinks();
|
|
|
|
if ( !isset( $links[NS_USER] ) || !is_array( $links[NS_USER] ) ) {
|
|
return [];
|
|
}
|
|
|
|
return $links[NS_USER];
|
|
}
|
|
|
|
private static function hasSubpage( $dbk ) {
|
|
return strpos( $dbk, '/' ) !== false;
|
|
}
|
|
|
|
/**
|
|
* It's like Article::prepareTextForEdit,
|
|
* but not for editing (old wikitext usually)
|
|
* Stolen from AbuseFilter's VariableHolder
|
|
*
|
|
* @param string $wikitext
|
|
* @param Article $article
|
|
*
|
|
* @return ParserOutput
|
|
*/
|
|
public static function parseNonEditWikitext( $wikitext, Article $article ) {
|
|
static $cache = [];
|
|
|
|
$cacheKey = md5( $wikitext ) . ':' . $article->getTitle()->getPrefixedText();
|
|
|
|
if ( isset( $cache[$cacheKey] ) ) {
|
|
return $cache[$cacheKey];
|
|
}
|
|
|
|
$parser = MediaWikiServices::getInstance()->getParser();
|
|
|
|
$options = new ParserOptions( $article->getContext()->getUser() );
|
|
$output = $parser->parse( $wikitext, $article->getTitle(), $options );
|
|
$cache[$cacheKey] = $output;
|
|
|
|
return $output;
|
|
}
|
|
|
|
/**
|
|
* Given a Revision object, returns a talk-page-centric interpretation
|
|
* of the changes made in it.
|
|
*
|
|
* @param RevisionRecord $revision
|
|
* @see EchoDiscussionParser::interpretDiff
|
|
* @return array[] See {@see interpretDiff} for details.
|
|
*/
|
|
private static function getChangeInterpretationForRevision( RevisionRecord $revision ) {
|
|
if ( $revision->getId() && isset( self::$revisionInterpretationCache[$revision->getId()] ) ) {
|
|
return self::$revisionInterpretationCache[$revision->getId()];
|
|
}
|
|
|
|
$userIdentity = $revision->getUser();
|
|
|
|
$prevText = '';
|
|
if ( $revision->getParentId() ) {
|
|
$store = MediaWikiServices::getInstance()->getRevisionStore();
|
|
$prevRevision = $store->getRevisionById( $revision->getParentId() );
|
|
if ( $prevRevision ) {
|
|
$prevContent = $prevRevision->getContent( SlotRecord::MAIN );
|
|
$prevText = ( $prevContent instanceof TextContent ) ? $prevContent->getText() : '';
|
|
}
|
|
}
|
|
|
|
$content = $revision->getContent( SlotRecord::MAIN );
|
|
$changes = self::getMachineReadableDiff(
|
|
$prevText,
|
|
( $content instanceof TextContent ) ? $content->getText() : ''
|
|
);
|
|
$output = self::interpretDiff(
|
|
$changes,
|
|
$userIdentity ? $userIdentity->getName() : '',
|
|
Title::newFromLinkTarget( $revision->getPageAsLinkTarget() )
|
|
);
|
|
|
|
self::$revisionInterpretationCache[$revision->getId()] = $output;
|
|
|
|
return $output;
|
|
}
|
|
|
|
/**
|
|
* Given a machine-readable diff, interprets the changes
|
|
* in terms of discussion page actions
|
|
*
|
|
* @todo Expand recognisable actions.
|
|
*
|
|
* @param array[] $changes Output of Event::getMachineReadableDiff
|
|
* @param string $username
|
|
* @param Title|null $title
|
|
* @return array[] Array of associative arrays.
|
|
*
|
|
* Each entry represents an action, which is classified in the 'action' field.
|
|
* All types contain a 'content' field except 'unknown'
|
|
* (which instead passes through the machine-readable diff in 'details')
|
|
* and 'unknown-change' (which provides 'new_content' and 'old_content')
|
|
* action may be:
|
|
* - add-comment: A comment signed by the user is added to an
|
|
* existing section.
|
|
* - new-section-with-comment: A new section is added, containing
|
|
* a single comment signed by the user in question.
|
|
* - add-section-multiple: A new section or additions to a section
|
|
* while editing multiple sections at once.
|
|
* - unknown-multi-signed-addition: Some signed content is added,
|
|
* but it contains multiple signatures.
|
|
* - unknown-unsigned-addition: Some content is added, but it is
|
|
* unsigned.
|
|
* - unknown-subtraction: Some content was removed. These actions are
|
|
* not currently analysed.
|
|
* - unknown-change: Some content was replaced with other content.
|
|
* - unknown-signed-change: Same as unknown-change, but signed.
|
|
* - unknown-multi-signed-change: Same as unknown-change,
|
|
* but it contains multiple signatures.
|
|
* - unknown: Unrecognised change type.
|
|
*/
|
|
public static function interpretDiff( array $changes, $username, Title $title = null ) {
|
|
// One extra item in $changes for _info
|
|
$actions = [];
|
|
$signedSections = [];
|
|
|
|
foreach ( $changes as $index => $change ) {
|
|
if ( !is_numeric( $index ) ) {
|
|
continue;
|
|
}
|
|
|
|
if ( !$change['action'] ) {
|
|
// Unknown action; skip
|
|
continue;
|
|
}
|
|
|
|
if ( $change['action'] === 'add' ) {
|
|
$content = trim( $change['content'] );
|
|
// The \A means the regex must match at the beginning of the string.
|
|
// This is slightly different than ^ which matches beginning of each
|
|
// line in multiline mode.
|
|
$startSection = preg_match( '/\A' . self::HEADER_REGEX . '/um', $content );
|
|
$sectionCount = self::getSectionCount( $content );
|
|
$signedUsers = self::extractSignatures( $content, $title );
|
|
|
|
if (
|
|
count( $signedUsers ) === 1 &&
|
|
isset( $signedUsers[$username] )
|
|
) {
|
|
if ( $sectionCount === 0 ) {
|
|
$signedSections[] = self::getSectionSpan( $change['right-pos'], $changes['_info']['rhs'] );
|
|
$fullSection = self::getFullSection( $changes['_info']['rhs'], $change['right-pos'] );
|
|
$actions[] = [
|
|
'type' => 'add-comment',
|
|
'content' => $content,
|
|
'full-section' => $fullSection,
|
|
];
|
|
} elseif ( $startSection && $sectionCount === 1 ) {
|
|
$signedSections[] = self::getSectionSpan( $change['right-pos'], $changes['_info']['rhs'] );
|
|
$actions[] = [
|
|
'type' => 'new-section-with-comment',
|
|
'content' => $content,
|
|
];
|
|
} else {
|
|
$nextSectionStart = $change['right-pos'];
|
|
$sectionData = self::extractSections( $content );
|
|
foreach ( $sectionData as $section ) {
|
|
$sectionSpan = self::getSectionSpan( $nextSectionStart, $changes['_info']['rhs'] );
|
|
$nextSectionStart = $sectionSpan[1] + 1;
|
|
$sectionSignedUsers = self::extractSignatures( $section['content'], $title );
|
|
if ( !empty( $sectionSignedUsers ) ) {
|
|
$signedSections[] = $sectionSpan;
|
|
if ( !$section['header'] ) {
|
|
$fullSection = self::getFullSection(
|
|
$changes['_info']['rhs'],
|
|
$change['right-pos']
|
|
);
|
|
$section['header'] = self::extractHeader( $fullSection );
|
|
}
|
|
$actions[] = [
|
|
'type' => 'add-section-multiple',
|
|
'content' => $section['content'],
|
|
'header' => $section['header'],
|
|
];
|
|
} else {
|
|
$actions[] = [
|
|
'type' => 'unknown-unsigned-addition',
|
|
'content' => $section['content'],
|
|
];
|
|
}
|
|
}
|
|
}
|
|
} elseif ( $signedUsers !== [] ) {
|
|
$actions[] = [
|
|
'type' => 'unknown-multi-signed-addition',
|
|
'content' => $content,
|
|
];
|
|
} else {
|
|
$actions[] = [
|
|
'type' => 'unknown-unsigned-addition',
|
|
'content' => $content,
|
|
];
|
|
}
|
|
} elseif ( $change['action'] === 'subtract' ) {
|
|
$actions[] = [
|
|
'type' => 'unknown-subtraction',
|
|
'content' => $change['content'],
|
|
];
|
|
} elseif ( $change['action'] === 'change' ) {
|
|
$actions[] = [
|
|
'type' => 'unknown-change',
|
|
'old_content' => $change['old_content'],
|
|
'new_content' => $change['new_content'],
|
|
'right-pos' => $change['right-pos'],
|
|
'full-section' => self::getFullSection( $changes['_info']['rhs'], $change['right-pos'] ),
|
|
];
|
|
|
|
if ( self::hasNewSignature(
|
|
$change['old_content'],
|
|
$change['new_content'],
|
|
$username,
|
|
$title
|
|
) ) {
|
|
$signedSections[] = self::getSectionSpan( $change['right-pos'], $changes['_info']['rhs'] );
|
|
}
|
|
} else {
|
|
$actions[] = [
|
|
'type' => 'unknown',
|
|
'details' => $change,
|
|
];
|
|
}
|
|
}
|
|
|
|
if ( !empty( $signedSections ) ) {
|
|
$actions = self::convertToUnknownSignedChanges( $signedSections, $actions );
|
|
}
|
|
|
|
return $actions;
|
|
}
|
|
|
|
private static function hasNewSignature( $oldContent, $newContent, $username, $title ) {
|
|
$oldSignedUsers = self::extractSignatures( $oldContent, $title );
|
|
$newSignedUsers = self::extractSignatures( $newContent, $title );
|
|
|
|
return !isset( $oldSignedUsers[$username] ) && isset( $newSignedUsers[$username] );
|
|
}
|
|
|
|
/**
|
|
* Converts actions of type "unknown-change" to "unknown-signed-change" if the change is in a signed section.
|
|
*
|
|
* @param array[] $signedSections Array of arrays containing first and last line number of signed sections
|
|
* @param array[] $actions
|
|
* @return array[] Converted actions
|
|
*/
|
|
private static function convertToUnknownSignedChanges( array $signedSections, array $actions ) {
|
|
return array_map( function ( $action ) use( $signedSections ) {
|
|
if (
|
|
$action['type'] === 'unknown-change' &&
|
|
self::isInSignedSection( $action['right-pos'], $signedSections )
|
|
) {
|
|
$signedUsers = self::extractSignatures( $action['new_content'], null );
|
|
if ( count( $signedUsers ) === 1 ) {
|
|
$action['type'] = 'unknown-signed-change';
|
|
} else {
|
|
$action['type'] = 'unknown-multi-signed-change';
|
|
}
|
|
}
|
|
|
|
return $action;
|
|
}, $actions );
|
|
}
|
|
|
|
/**
|
|
* @param int $line
|
|
* @param array[] $signedSections
|
|
* @return bool
|
|
*/
|
|
private static function isInSignedSection( $line, array $signedSections ) {
|
|
foreach ( $signedSections as $section ) {
|
|
if ( $line > $section[0] && $line <= $section[1] ) {
|
|
return true;
|
|
}
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
/**
|
|
* Finds the section that a given line is in.
|
|
*
|
|
* @param array $lines of lines in the page.
|
|
* @param int $offset The line to find the full section for.
|
|
* @return string Content of the section.
|
|
*/
|
|
public static function getFullSection( array $lines, $offset ) {
|
|
$start = self::getSectionStartIndex( $offset, $lines );
|
|
$end = self::getSectionEndIndex( $offset, $lines );
|
|
$content = implode( "\n", array_slice( $lines, $start, $end - $start ) );
|
|
|
|
return trim( $content, "\n" );
|
|
}
|
|
|
|
/**
|
|
* Given a line number and a text, find the first and last line of the section the line number is in.
|
|
* If there are subsections, the last line index will be the line before the beginning of the first subsection.
|
|
* @param int $offset line number
|
|
* @param string[] $lines
|
|
* @return int[] Tuple [$firstLine, $lastLine]
|
|
*/
|
|
private static function getSectionSpan( $offset, array $lines ) {
|
|
return [
|
|
self::getSectionStartIndex( $offset, $lines ),
|
|
self::getSectionEndIndex( $offset, $lines )
|
|
];
|
|
}
|
|
|
|
/**
|
|
* Finds the line number of the start of the section that $offset is in.
|
|
* @param int $offset
|
|
* @param string[] $lines
|
|
* @return int
|
|
*/
|
|
private static function getSectionStartIndex( $offset, array $lines ) {
|
|
for ( $i = $offset - 1; $i >= 0; $i-- ) {
|
|
if ( self::getSectionCount( $lines[$i] ) ) {
|
|
break;
|
|
}
|
|
}
|
|
|
|
return $i;
|
|
}
|
|
|
|
/**
|
|
* Finds the line number of the end of the section that $offset is in.
|
|
* @param int $offset
|
|
* @param array $lines
|
|
* @return int
|
|
*/
|
|
private static function getSectionEndIndex( $offset, array $lines ) {
|
|
$lastLine = count( $lines );
|
|
for ( $i = $offset; $i < $lastLine; $i++ ) {
|
|
if ( self::getSectionCount( $lines[$i] ) ) {
|
|
break;
|
|
}
|
|
}
|
|
|
|
return $i;
|
|
}
|
|
|
|
/**
|
|
* Gets the number of section headers in a string.
|
|
*
|
|
* @param string $text
|
|
* @return int Number of section headers found.
|
|
*/
|
|
public static function getSectionCount( $text ) {
|
|
$text = trim( $text );
|
|
|
|
return (int)preg_match_all( '/' . self::HEADER_REGEX . '/um', $text );
|
|
}
|
|
|
|
/**
|
|
* Gets the title of a section or sub section
|
|
*
|
|
* @param string $text The text of the section.
|
|
* @return string|false The title of the section or false if not found
|
|
*/
|
|
public static function extractHeader( $text ) {
|
|
$text = trim( $text );
|
|
|
|
$matches = [];
|
|
|
|
if ( !preg_match_all( '/' . self::HEADER_REGEX . '/um', $text, $matches ) ) {
|
|
return false;
|
|
}
|
|
|
|
return trim( end( $matches[2] ) );
|
|
}
|
|
|
|
/**
|
|
* Extracts sections and their contents from text.
|
|
*
|
|
* @param string $text The text to parse.
|
|
* @return array[]
|
|
* Array of arrays containing sections with header and content.
|
|
* - [header]: The full header string of the section or false if there is preceding text without header.
|
|
* - [content]: The content of the section including the header string.
|
|
*/
|
|
private static function extractSections( $text ) {
|
|
$matches = [];
|
|
|
|
if ( !preg_match_all( '/' . self::HEADER_REGEX . '/um', $text, $matches, PREG_OFFSET_CAPTURE ) ) {
|
|
return [ [
|
|
'header' => false,
|
|
'content' => $text
|
|
] ];
|
|
}
|
|
|
|
$sectionNum = count( $matches[0] );
|
|
$sections = [];
|
|
|
|
// is there text before the first headline?
|
|
if ( $matches[0][0][1] > 1 ) {
|
|
$sections[] = [
|
|
'header' => false,
|
|
'content' => substr( $text, 0, $matches[0][0][1] - 1 )
|
|
];
|
|
}
|
|
for ( $i = 0; $i < $sectionNum; $i++ ) {
|
|
if ( $i + 1 < $sectionNum ) {
|
|
$content = substr( $text, $matches[0][$i][1], $matches[0][$i + 1][1] - $matches[0][$i][1] );
|
|
} else {
|
|
$content = substr( $text, $matches[0][$i][1] );
|
|
}
|
|
$sections[] = [
|
|
'header' => self::extractHeader( $matches[0][$i][0] ),
|
|
'content' => trim( $content )
|
|
];
|
|
}
|
|
|
|
return $sections;
|
|
}
|
|
|
|
/**
|
|
* Strips out a signature if possible.
|
|
*
|
|
* @param string $text The wikitext to strip
|
|
* @param Title|null $title
|
|
* @return string
|
|
*/
|
|
private static function stripSignature( $text, Title $title = null ) {
|
|
$output = self::getUserFromLine( $text, $title );
|
|
if ( $output === false ) {
|
|
$timestampPos = self::getTimestampPosition( $text );
|
|
|
|
return substr( $text, 0, $timestampPos );
|
|
}
|
|
|
|
// Use truncateForDatabase() instead of truncateHTML() because
|
|
// truncateHTML() would not strip signature if the text contains
|
|
// < or &. (And we can't use truncateForVisual() because
|
|
// self::getUserFromLine() returns byte offsets, not character
|
|
// offsets.)
|
|
return MediaWikiServices::getInstance()->getContentLanguage()
|
|
->truncateForDatabase( $text, $output[0], '' );
|
|
}
|
|
|
|
/**
|
|
* Strips out a section header
|
|
* @param string $text The text to strip out the section header from.
|
|
* @return string The same text, with the section header stripped out.
|
|
*/
|
|
private static function stripHeader( $text ) {
|
|
$text = preg_replace( '/' . self::HEADER_REGEX . '/um', '', $text );
|
|
|
|
return $text;
|
|
}
|
|
|
|
/**
|
|
* Determines whether the input is a signed comment.
|
|
*
|
|
* @param string $text The text to check.
|
|
* @param User|bool $user If set, will only return true if the comment is
|
|
* signed by this user.
|
|
* @param Title|null $title
|
|
* @return bool
|
|
*/
|
|
public static function isSignedComment( $text, $user = false, Title $title = null ) {
|
|
$userData = self::getUserFromLine( $text, $title );
|
|
|
|
if ( $userData === false ) {
|
|
return false;
|
|
} elseif ( $user === false ) {
|
|
return true;
|
|
}
|
|
|
|
[ , $foundUser ] = $userData;
|
|
$userNameUtils = MediaWikiServices::getInstance()->getUserNameUtils();
|
|
|
|
return $userNameUtils->getCanonical( $foundUser, UserNameUtils::RIGOR_NONE ) ===
|
|
$userNameUtils->getCanonical( $user, UserNameUtils::RIGOR_NONE );
|
|
}
|
|
|
|
/**
|
|
* Finds the start position, if any, of the timestamp on a line
|
|
*
|
|
* @param string $line The line to search for a signature on
|
|
* @return int|false Integer position
|
|
*/
|
|
public static function getTimestampPosition( $line ) {
|
|
$timestampRegex = self::getTimestampRegex();
|
|
$tsMatches = [];
|
|
if ( !preg_match(
|
|
"/$timestampRegex/mu",
|
|
$line,
|
|
$tsMatches,
|
|
PREG_OFFSET_CAPTURE
|
|
) ) {
|
|
return false;
|
|
}
|
|
|
|
return $tsMatches[0][1];
|
|
}
|
|
|
|
/**
|
|
* Finds differences between $oldText and $newText
|
|
* and returns the result in a machine-readable format.
|
|
*
|
|
* @param string $oldText The "left hand side" of the diff.
|
|
* @param string $newText The "right hand side" of the diff.
|
|
* @throws MWException
|
|
* @return array[] Array of changes.
|
|
* Each change consists of:
|
|
* * An 'action', one of:
|
|
* - add
|
|
* - subtract
|
|
* - change
|
|
* * 'content' that was added or removed, or in the case
|
|
* of a change, 'old_content' and 'new_content'
|
|
* * 'left_pos' and 'right_pos' (in lines) of the change.
|
|
*/
|
|
public static function getMachineReadableDiff( $oldText, $newText ) {
|
|
if ( !isset( self::$diffParser ) ) {
|
|
self::$diffParser = new EchoDiffParser;
|
|
}
|
|
|
|
return self::$diffParser->getChangeSet( $oldText, $newText );
|
|
}
|
|
|
|
/**
|
|
* Finds and extracts signatures in $text
|
|
*
|
|
* @param string $text The text in which to look for signed comments.
|
|
* @param Title|null $title
|
|
* @return array<string,string> Associative array, the key is the username, the value
|
|
* is the last signature that was found.
|
|
*/
|
|
private static function extractSignatures( $text, Title $title = null ) {
|
|
$lines = explode( "\n", $text );
|
|
|
|
$output = [];
|
|
|
|
$lineNumber = 0;
|
|
|
|
foreach ( $lines as $line ) {
|
|
++$lineNumber;
|
|
|
|
// Look for the last user link on the line.
|
|
$userData = self::getUserFromLine( $line, $title );
|
|
if ( $userData === false ) {
|
|
continue;
|
|
}
|
|
|
|
[ $signaturePos, $user ] = $userData;
|
|
|
|
$signature = substr( $line, $signaturePos );
|
|
$output[$user] = $signature;
|
|
}
|
|
|
|
return $output;
|
|
}
|
|
|
|
/**
|
|
* From a line in the signature, extract all the users linked to
|
|
*
|
|
* @param string $line Line of text potentially including linked user, user talk,
|
|
* and contribution pages
|
|
* @return string[] array of usernames, empty array for none detected
|
|
*/
|
|
public static function extractUsersFromLine( $line ) {
|
|
/*
|
|
* Signatures can look like anything (as defined by i18n messages
|
|
* "signature" & "signature-anon").
|
|
* A signature can, e.g., be both a link to user & user-talk page.
|
|
*/
|
|
// match all title-like excerpts in this line
|
|
if ( !preg_match_all( '/\[\[([^\[]+)\]\]/', $line, $matches ) ) {
|
|
return [];
|
|
}
|
|
|
|
$matches = $matches[1];
|
|
|
|
$usernames = [];
|
|
|
|
foreach ( $matches as $match ) {
|
|
/*
|
|
* Create an object out of the link title.
|
|
* In theory, links can be [[text]], [[text|text]] or pipe tricks
|
|
* [[text|]] or [[|text]].
|
|
* In the case of reverse pipe trick, the value we use *could* be
|
|
* empty, but Parser::pstPass2 should have normalized that for us
|
|
* already.
|
|
*/
|
|
$match = explode( '|', $match, 2 );
|
|
$title = Title::newFromText( $match[0] );
|
|
|
|
// figure out if we the link is related to a user
|
|
if (
|
|
$title &&
|
|
( $title->getNamespace() === NS_USER || $title->getNamespace() === NS_USER_TALK )
|
|
) {
|
|
$usernames[] = $title->getText();
|
|
} elseif ( $title && $title->isSpecial( 'Contributions' ) ) {
|
|
$parts = explode( '/', $title->getText(), 2 );
|
|
$usernames[] = end( $parts );
|
|
} else {
|
|
// move on to next matched title-like excerpt
|
|
continue;
|
|
}
|
|
}
|
|
|
|
return $usernames;
|
|
}
|
|
|
|
/**
|
|
* From a line in a wiki page, determine which user, if any,
|
|
* has signed it.
|
|
*
|
|
* @param string $line
|
|
* @param Title|null $title
|
|
* @return array|false False for none, array for success.
|
|
* - First element is the position of the signature.
|
|
* - Second element is the normalised user name.
|
|
*/
|
|
public static function getUserFromLine( $line, Title $title = null ) {
|
|
$parser = MediaWikiServices::getInstance()->getParser();
|
|
|
|
/*
|
|
* First we call extractUsersFromLine to get all the potential usernames
|
|
* from the line. Then, we loop backwards through them, figure out which
|
|
* match to a user, regenerate the signature based on that user, and
|
|
* see if it matches!
|
|
*/
|
|
$usernames = self::extractUsersFromLine( $line );
|
|
$usernames = array_reverse( $usernames );
|
|
foreach ( $usernames as $username ) {
|
|
// generate (dateless) signature from the user we think we've
|
|
// discovered the signature from
|
|
// don't validate the username - anon (IP) is fine!
|
|
$user = User::newFromName( $username, false );
|
|
$sig = $parser->preSaveTransform(
|
|
'~~~',
|
|
$title ?: Title::newMainPage(),
|
|
$user,
|
|
new ParserOptions( $user )
|
|
);
|
|
|
|
// see if we can find this user's generated signature in the content
|
|
$pos = strrpos( $line, $sig );
|
|
if ( $pos !== false ) {
|
|
return [ $pos, $username ];
|
|
}
|
|
// couldn't find sig, move on to next link excerpt and try there
|
|
}
|
|
|
|
// couldn't find any matching signature
|
|
return false;
|
|
}
|
|
|
|
/**
|
|
* Find the last link beginning with a given prefix on a line.
|
|
*
|
|
* @param string $line The line to search.
|
|
* @param string $linkPrefix The prefix to search for.
|
|
* @param int|false $failureOffset
|
|
* @return array|false False for failure, array for success.
|
|
* - First element is the string offset of the link.
|
|
* - Second element is the user the link refers to.
|
|
*/
|
|
private static function getLinkFromLine( $line, $linkPrefix, $failureOffset = false ) {
|
|
$offset = 0;
|
|
|
|
// If extraction failed at another offset, try again.
|
|
if ( $failureOffset !== false ) {
|
|
$offset = $failureOffset - strlen( $line ) - 1;
|
|
}
|
|
|
|
// Avoid PHP warning: Offset is greater than the length of haystack string
|
|
if ( abs( $offset ) > strlen( $line ) ) {
|
|
return false;
|
|
}
|
|
|
|
$linkPos = strripos( $line, $linkPrefix, $offset );
|
|
|
|
if ( $linkPos === false ) {
|
|
return false;
|
|
}
|
|
|
|
$linkUser = self::extractUserFromLink( $line, $linkPrefix, $linkPos );
|
|
|
|
if ( $linkUser === false ) {
|
|
// Look for another place.
|
|
return self::getLinkFromLine( $line, $linkPrefix, $linkPos );
|
|
} else {
|
|
return [ $linkPos, $linkUser ];
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Given text including a link, gives the user that that link refers to
|
|
*
|
|
* @param string $text The text to extract from.
|
|
* @param string $prefix The link prefix that was used to find the link.
|
|
* @param int $offset Optionally, the offset of the start of the link.
|
|
* @return bool|string Type description
|
|
*/
|
|
private static function extractUserFromLink( $text, $prefix, $offset = 0 ) {
|
|
$userPart = substr( $text, strlen( $prefix ) + $offset );
|
|
|
|
$userMatches = [];
|
|
if ( !preg_match(
|
|
'/^[^\|\]\#]+/u',
|
|
$userPart,
|
|
$userMatches
|
|
) ) {
|
|
// user link is invalid
|
|
return false;
|
|
}
|
|
|
|
$user = $userMatches[0];
|
|
$userNameUtils = MediaWikiServices::getInstance()->getUserNameUtils();
|
|
if (
|
|
!$userNameUtils->isIP( $user ) &&
|
|
$userNameUtils->getCanonical( $user ) === false
|
|
) {
|
|
// Not a real username
|
|
return false;
|
|
}
|
|
|
|
return $userNameUtils->getCanonical( $userMatches[0], UserNameUtils::RIGOR_NONE );
|
|
}
|
|
|
|
/**
|
|
* Gets a regular expression that will match this wiki's
|
|
* timestamps as given by ~~~~.
|
|
*
|
|
* @throws MWException
|
|
* @return string regular expression fragment.
|
|
*/
|
|
public static function getTimestampRegex() {
|
|
if ( self::$timestampRegex !== null ) {
|
|
return self::$timestampRegex;
|
|
}
|
|
|
|
// Step 1: Get an exemplar timestamp
|
|
$title = Title::newMainPage();
|
|
$user = User::newFromName( 'Test' );
|
|
$options = new ParserOptions( $user );
|
|
|
|
$parser = MediaWikiServices::getInstance()->getParser();
|
|
$exemplarTimestamp =
|
|
$parser->preSaveTransform( '~~~~~', $title, $user, $options );
|
|
|
|
// Step 2: Generalise it
|
|
// Trim off the timezone to replace at the end
|
|
$output = $exemplarTimestamp;
|
|
$tzRegex = '/\h*\(\w+\)\h*$/u';
|
|
$tzMatches = [];
|
|
if ( preg_match( $tzRegex, $output, $tzMatches, PREG_OFFSET_CAPTURE ) ) {
|
|
$output = substr( $output, 0, $tzMatches[0][1] );
|
|
}
|
|
$output = preg_quote( $output, '/' );
|
|
$output = preg_replace( '/[^\d\W]+/u', '[^\d\W]+', $output );
|
|
$output = preg_replace( '/\d+/u', '\d+', $output );
|
|
|
|
if ( $tzMatches ) {
|
|
$output .= preg_quote( $tzMatches[0][0] );
|
|
}
|
|
|
|
if ( !preg_match( "/$output/u", $exemplarTimestamp ) ) {
|
|
throw new MWException( "Timestamp regex does not match exemplar" );
|
|
}
|
|
|
|
self::$timestampRegex = $output;
|
|
|
|
return $output;
|
|
}
|
|
|
|
/**
|
|
* Parse wikitext into truncated plain text.
|
|
* @param string $text
|
|
* @param Language $lang
|
|
* @param int $length Length in characters (not bytes); default DEFAULT_SNIPPET_LENGTH
|
|
* @param Title|null $title Page from which the text snippet is being extracted
|
|
* @param bool $linestart Whether or not this is at the start of a line
|
|
* @return string
|
|
*/
|
|
public static function getTextSnippet(
|
|
$text, Language $lang, $length = self::DEFAULT_SNIPPET_LENGTH, $title = null, $linestart = true
|
|
) {
|
|
// Parse wikitext
|
|
$html = MediaWikiServices::getInstance()->getMessageCache()->parse( $text, $title, $linestart )->getText( [
|
|
'enableSectionEditLinks' => false
|
|
] );
|
|
$plaintext = trim( Sanitizer::stripAllTags( $html ) );
|
|
return $lang->truncateForVisual( $plaintext, $length );
|
|
}
|
|
|
|
/**
|
|
* Parse an edit summary into truncated plain text.
|
|
* @param string $text
|
|
* @param Language $lang
|
|
* @param int $length Length in characters (not bytes); default DEFAULT_SNIPPET_LENGTH
|
|
* @return string
|
|
*/
|
|
public static function getTextSnippetFromSummary( $text, Language $lang, $length = self::DEFAULT_SNIPPET_LENGTH ) {
|
|
// Parse wikitext with summary parser
|
|
$html = MediaWikiServices::getInstance()->getCommentFormatter()
|
|
->formatLinks( Sanitizer::escapeHtmlAllowEntities( $text ) );
|
|
$plaintext = trim( Sanitizer::stripAllTags( $html ) );
|
|
return $lang->truncateForVisual( $plaintext, $length );
|
|
}
|
|
|
|
/**
|
|
* Extract an edit excerpt from a revision
|
|
*
|
|
* @param RevisionRecord $revision
|
|
* @param Language $lang
|
|
* @param int $length Length in characters (not bytes); default DEFAULT_SNIPPET_LENGTH
|
|
* @return string
|
|
*/
|
|
public static function getEditExcerpt(
|
|
RevisionRecord $revision, Language $lang, $length = self::DEFAULT_SNIPPET_LENGTH
|
|
) {
|
|
$interpretation = self::getChangeInterpretationForRevision( $revision );
|
|
$section = self::detectSectionTitleAndText( $interpretation );
|
|
return $lang->truncateForVisual( $section['section-title'] . ' ' . $section['section-text'], $length );
|
|
}
|
|
}
|