2012-07-27 22:16:19 +00:00
|
|
|
<?php
|
|
|
|
|
|
|
|
abstract class EchoDiscussionParser {
|
|
|
|
/**
 * Memoised result of getTimestampRegex(); null until that method has run.
 */
public static $timestampRegex;

/**
 * Delimiter-less regular expression fragment for a level-2 wikitext
 * heading ("== Title =="). Callers wrap it in delimiters and modifiers.
 */
public static $headerRegex = '^\=\=\s*([^=].*)\s*\=\=$';

/**
 * Cache of interpretDiff() results, keyed by revision ID.
 */
public static $revisionInterpretationCache = array();
|
|
|
|
|
|
|
|
/**
 * Creates the EchoEvent objects for the discussion-related actions that
 * took place in a revision.
 *
 * The per-comment event types are currently disabled, so the only event
 * this method can create is 'edit-user-talk', fired when the edited page
 * is in the user talk namespace and belongs to a registered user.
 *
 * @param $revision Revision object
 * @return null
 */
public static function generateEventsForRevision( $revision ) {
	$interpretation = self::getChangeInterpretationForRevision( $revision );
	$createdEvents = false;

	// A page with earlier history is looked up with the default flags
	// (slave database); a brand-new page is read from the master database.
	$hasPrevious = (bool)$revision->getPrevious();
	$pageId = $revision->getPage();
	if ( $hasPrevious ) {
		$title = Title::newFromID( $pageId );
	} else {
		$title = Title::newFromID( $pageId, Title::GAID_FOR_UPDATE );
	}

	// Nothing can be done without a valid title
	if ( !$title ) {
		return;
	}

	$userID = $revision->getUser();
	$userName = $revision->getUserText();
	if ( $userID != 0 ) {
		$user = User::newFromId( $userID );
	} else {
		$user = User::newFromName( $userName, false );
	}

	foreach ( $interpretation as $action ) {
		// These two event types are disabled temporarily, there is no need
		// to process them. The original handling is kept for reference:
		/*
		if ( $action['type'] == 'add-comment' ) {
			$fullSection = $action['full-section'];
			$header = self::extractHeader( $fullSection );

			EchoEvent::create( array(
				'type' => 'add-comment',
				'title' => $title,
				'extra' => array(
					'revid' => $revision->getID(),
					'section-title' => $header,
					'content' => $action['content'],
				),
				'agent' => $user,
			) );
			$createdEvents = true;
		} elseif ( $action['type'] == 'new-section-with-comment' ) {
			$content = $action['content'];
			$header = self::extractHeader( $content );
			EchoEvent::create( array(
				'type' => 'add-talkpage-topic',
				'title' => $title,
				'extra' => array(
					'revid' => $revision->getID(),
					'section-title' => $header,
					'content' => $content,
				),
				'agent' => $user,
			) );
			$createdEvents = true;
		}
		*/
	}

	// Only fall back to the generic user-talk event when nothing more
	// specific was created and the page is a user talk page.
	if ( $createdEvents || $title->getNamespace() != NS_USER_TALK ) {
		return;
	}

	$notifyUser = User::newFromName( $title->getText() );
	if ( !$notifyUser || !$notifyUser->getID() ) {
		// Invalid name or no such registered user: nobody to notify
		return;
	}

	EchoEvent::create( array(
		'type' => 'edit-user-talk',
		'title' => $title,
		'extra' => array( 'revid' => $revision->getID() ),
		'agent' => $user,
	) );
}
|
|
|
|
|
|
|
|
/**
 * Given a Revision object, determines which users are interested
 * in related EchoEvents.
 *
 * Interested users are everybody who has signed in a section that
 * received a new comment, plus the owner of the page when the page is a
 * user talk page. Only users with a non-zero ID are returned.
 *
 * @param $revision Revision object.
 * @return Array of User objects, keyed by user ID
 */
public static function getNotifiedUsersForComment( $revision ) {
	$interpretation = self::getChangeInterpretationForRevision( $revision );
	$users = array();

	foreach ( $interpretation as $action ) {
		if ( $action['type'] != 'add-comment' ) {
			continue;
		}

		$fullSection = $action['full-section'];
		$interestedUsers = array_keys( self::extractSignatures( $fullSection ) );

		foreach ( $interestedUsers as $userName ) {
			$user = User::newFromName( $userName );

			// Deliberately ignoring anonymous users
			if ( $user && $user->getID() ) {
				$users[$user->getID()] = $user;
			}
		}
	}

	$title = $revision->getTitle();
	if ( $title && $title->getNamespace() == NS_USER_TALK ) {
		$user = User::newFromName( $title->getText() );

		// Same rule as above: a user with ID 0 would otherwise be stored
		// under key 0 even though there is no account to notify.
		if ( $user && $user->getID() ) {
			$users[$user->getID()] = $user;
		}
	}

	return $users;
}
|
|
|
|
|
|
|
|
/**
 * Given a Revision object, returns a talk-page-centric interpretation
 * of the changes made in it.
 *
 * The text of the parent revision (or the empty string when there is no
 * loadable parent) is diffed against the text of this revision and the
 * diff is handed to interpretDiff(). Results are cached per revision ID.
 *
 * @param $revision Revision object
 * @see EchoDiscussionParser::interpretDiff
 * @return Array, see interpretDiff for details.
 */
public static function getChangeInterpretationForRevision( $revision ) {
	$revId = $revision->getID();
	if ( $revId && isset( self::$revisionInterpretationCache[$revId] ) ) {
		return self::$revisionInterpretationCache[$revId];
	}

	$userID = $revision->getUser();
	$userName = $revision->getUserText();
	$user = $userID != 0 ? User::newFromId( $userID ) : User::newFromName( $userName, false );

	// User::newFromName() may hand back false; use the raw revision user
	// text in that case instead of calling a method on a non-object.
	$authorName = $user ? $user->getName() : $userName;

	$prevText = '';
	if ( $revision->getParentId() ) {
		$prevRevision = Revision::newFromId( $revision->getParentId() );
		if ( $prevRevision ) {
			$prevText = $prevRevision->getText();
		}
	}

	$changes = self::getMachineReadableDiff( $prevText, $revision->getText() );
	$output = self::interpretDiff( $changes, $authorName );

	// Mirror the guard used when reading: a revision without an ID must
	// not occupy (and later be served from) a shared empty cache slot.
	if ( $revId ) {
		self::$revisionInterpretationCache[$revId] = $output;
	}

	return $output;
}
|
|
|
|
|
|
|
|
/**
 * Given a machine-readable diff, interprets the changes
 * in terms of discussion page actions
 *
 * @todo Expand recognisable actions.
 * @param $changes array Output of EchoDiscussionParser::getMachineReadableDiff
 * @param $user User name
 * @return Array of associative arrays.
 *
 * Each entry represents an action, which is classified in the 'type' field.
 * All types contain a 'content' field except 'unknown'
 *  (which instead passes through the machine-readable diff in 'details')
 *  and 'unknown-change' (which provides 'new_content' and 'old_content').
 *  'add-comment' additionally carries 'full-section', the text of the
 *  whole section the comment was added to.
 * type may be:
 * - add-comment: A comment signed by the user is added to an
 *    existing section.
 * - new-section-with-comment: A new section is added, containing
 *    a single comment signed by the user in question.
 * - unknown-signed-addition: Content with exactly one signature, that
 *    of the user, is added, but its section headers do not fit either
 *    of the two cases above.
 * - unknown-multi-signed-addition: Some signed content is added, and
 *    it either contains multiple signatures or is not signed (only)
 *    by the user.
 * - unknown-unsigned-addition: Some content is added, but it is
 *    unsigned.
 * - unknown-subtraction: Some content was removed. These actions are
 *    not currently analysed.
 * - unknown-change: Some content was replaced with other content.
 *    These actions are not currently analysed.
 * - unknown: Unrecognised change type.
 */
public static function interpretDiff( $changes, $user ) {
	$actions = array();

	foreach ( $changes as $index => $change ) {
		// $changes holds one extra, non-numeric item ('_info'); skip it
		if ( !is_numeric( $index ) ) {
			continue;
		}

		// No action given; skip. empty() also covers a missing key.
		if ( empty( $change['action'] ) ) {
			continue;
		}

		if ( $change['action'] == 'add' ) {
			$content = trim( $change['content'] );
			$startSection = preg_match( "/\A" . self::$headerRegex . '/um', $content );
			$sectionCount = self::getSectionCount( $content );

			// PHP turns numeric-looking array keys into integers. Cast
			// them back so user names can be compared strictly as strings.
			$signedUsers = array_map( 'strval', array_keys( self::extractSignatures( $content ) ) );

			if (
				count( $signedUsers ) == 1 &&
				in_array( (string)$user, $signedUsers, true )
			) {
				if ( $sectionCount === 0 ) {
					$fullSection = self::getFullSection( $changes['_info']['rhs'], $change['right-pos'] );
					$actions[] = array(
						'type' => 'add-comment',
						'content' => $content,
						'full-section' => $fullSection,
					);
				} elseif ( $startSection && $sectionCount === 1 ) {
					$actions[] = array(
						'type' => 'new-section-with-comment',
						'content' => $content,
					);
				} else {
					$actions[] = array(
						'type' => 'unknown-signed-addition',
						'content' => $content,
					);
				}
			} elseif ( count( $signedUsers ) >= 1 ) {
				$actions[] = array(
					'type' => 'unknown-multi-signed-addition',
					'content' => $content,
				);
			} else {
				$actions[] = array(
					'type' => 'unknown-unsigned-addition',
					'content' => $content,
				);
			}
		} elseif ( $change['action'] == 'subtract' ) {
			$actions[] = array(
				'type' => 'unknown-subtraction',
				'content' => $change['content'],
			);
		} elseif ( $change['action'] == 'change' ) {
			$actions[] = array(
				'type' => 'unknown-change',
				'old_content' => $change['old_content'],
				'new_content' => $change['new_content'],
			);
		} else {
			$actions[] = array(
				'type' => 'unknown',
				'details' => $change,
			);
		}
	}

	return $actions;
}
|
|
|
|
|
|
|
|
/**
 * Returns the text of the section that contains a given line.
 *
 * Starting from the given line, the text is grown upwards until a
 * section header has been included (or the first line is reached) and
 * downwards until the next section header (which is excluded) or the
 * last line.
 *
 * @param $lines Array of lines in the page.
 * @param $offset int The 1-based line to find the full section for.
 * @return string Content of the section.
 */
public static function getFullSection( $lines, $offset ) {
	$pattern = '/' . self::$headerRegex . '/um';
	$start = $offset - 1;
	$content = $lines[$start];

	// Expand backwards, unless the starting line is itself the header
	if ( !preg_match( $pattern, $content ) ) {
		for ( $i = $start - 1; $i >= 0; --$i ) {
			$content = $lines[$i] . "\n" . $content;
			if ( preg_match( $pattern, $lines[$i] ) ) {
				break;
			}
		}
	}

	// And then forwards, stopping in front of the next header
	$lineCount = count( $lines );
	for ( $i = $start + 1; $i < $lineCount; ++$i ) {
		if ( preg_match( $pattern, $lines[$i] ) ) {
			break;
		}
		$content .= "\n" . $lines[$i];
	}

	return $content;
}
|
|
|
|
|
|
|
|
/**
 * Counts the section headers contained in a piece of text.
 *
 * @param $text string The text.
 * @return int Number of section headers found.
 */
public static function getSectionCount( $text ) {
	$pattern = '/' . self::$headerRegex . '/um';
	$headers = array();

	preg_match_all( $pattern, trim( $text ), $headers );

	// Index 0 holds the full-pattern matches, one per header
	return count( $headers[0] );
}
|
|
|
|
|
|
|
|
/**
 * Extracts the title of the first section header in a piece of text.
 *
 * @param $text string The text of the section.
 * @return string|bool The trimmed title of the section, or false when
 *  the text contains no section header.
 */
public static function extractHeader( $text ) {
	$found = array();
	$hasHeader = preg_match( '/' . self::$headerRegex . '/um', trim( $text ), $found );

	return $hasHeader ? trim( $found[1] ) : false;
}
|
|
|
|
|
2012-07-31 00:29:49 +00:00
|
|
|
/**
 * Removes the signature from a piece of wikitext where one can be found.
 *
 * Without a timestamp the text is returned untouched. With a timestamp
 * but no recognisable user link, everything from the timestamp position
 * onwards is cut off. Otherwise the text is cut at the position of the
 * user link.
 *
 * @param $text string The wikitext to strip
 * @return string
 */
public static function stripSignature( $text ) {
	global $wgContLang;

	$timestampPos = self::getTimestampPosition( $text );
	if ( $timestampPos === false ) {
		return $text;
	}

	$signature = self::getUserFromLine( $text, $timestampPos );
	if ( $signature === false ) {
		return substr( $text, 0, $timestampPos );
	}

	// Cut with the HTML-aware truncation method so that tags which were
	// opened before the signature are closed again.
	return $wgContLang->truncateHtml( $text, $signature[0], '' );
}
|
|
|
|
|
|
|
|
/**
 * Strips unnecessary indentation and so on from comments
 *
 * Leading colons are removed from every line. A list marker ("*" or "#")
 * is removed as well, but only when the text contains exactly one list
 * item; texts with several list items keep their markers.
 *
 * @param $text string The text to strip from
 * @return string Stripped wikitext
 */
public static function stripIndents( $text ) {
	// First strip all indentation from the beginning of lines
	$text = preg_replace( '/^\s*\:+/m', '', $text );

	// Now if there is only one list item, strip that too.
	// preg_match_all() returns the number of full-pattern matches, i.e.
	// the number of list items. (Counting the $matches array itself
	// would count capture groups, which is always 1 for this pattern.)
	$listRegex = '/^\s*(?:[\:#*]\s*)*[#*]/m';
	$matches = array();
	if ( preg_match_all( $listRegex, $text, $matches ) === 1 ) {
		$text = preg_replace( $listRegex, '', $text );
	}

	return $text;
}
|
|
|
|
|
2012-07-27 22:16:19 +00:00
|
|
|
/**
 * Removes every section header line's content from a piece of text.
 *
 * @param $text string The text to strip out the section header from.
 * @return string: The same text, with the section header stripped out.
 */
public static function stripHeader( $text ) {
	$pattern = '/' . self::$headerRegex . '/um';

	return preg_replace( $pattern, '', $text );
}
|
|
|
|
|
|
|
|
/**
 * Tells whether a piece of text is a signed comment.
 *
 * @param $text string The text to check.
 * @param $user string|bool A user name; if given, true is only returned
 *  when the comment is signed by this user. The name is compared after
 *  being passed through User::getCanonicalName().
 * @return bool: true or false.
 */
public static function isSignedComment( $text, $user = false ) {
	$timestampPos = self::getTimestampPosition( $text );
	if ( $timestampPos === false ) {
		// No timestamp, hence no signature
		return false;
	}

	$userData = self::getUserFromLine( $text, $timestampPos );
	if ( $userData === false ) {
		return false;
	}

	if ( $user === false ) {
		// Any signature will do
		return true;
	}

	// Second element of the result is the name found in the signature
	$signedBy = User::getCanonicalName( $userData[1], false );
	$expected = User::getCanonicalName( $user, false );

	return $signedBy === $expected;
}
|
|
|
|
|
2012-07-31 00:29:49 +00:00
|
|
|
/**
 * Finds the start position, if any, of the timestamp on a line
 *
 * @param $line string The line to search for a signature on
 * @return int|bool Byte offset of the start of the timestamp, or false
 *  when the line contains no timestamp.
 */
public static function getTimestampPosition( $line ) {
	$timestampRegex = self::getTimestampRegex();
	$endOfLine = self::getLineEndingRegex();
	$tsMatches = array();

	if ( !preg_match(
		"/$timestampRegex$endOfLine/mu",
		$line,
		$tsMatches,
		PREG_OFFSET_CAPTURE
	) ) {
		return false;
	}

	// With PREG_OFFSET_CAPTURE every match is array( text, offset ).
	// Index 1 is the offset; index 0 would be the matched text itself.
	return $tsMatches[0][1];
}
|
|
|
|
|
2012-07-27 22:16:19 +00:00
|
|
|
/**
 * Finds differences between $oldText and $newText
 * and returns the result in a machine-readable format.
 *
 * Both texts are trimmed and given a single trailing newline before
 * they are compared with wfDiff() (unified format, whitespace ignored).
 *
 * @param $oldText string The "left hand side" of the diff.
 * @param $newText string The "right hand side" of the diff.
 * @throws MWException When a line reported by the diff does not match
 *  the line found at the tracked position in the corresponding text.
 * @return Array of changes, with numeric keys, plus one '_info' entry.
 *   Each change consists of:
 *   * An 'action', one of:
 *     - add
 *     - subtract
 *     - change
 *   * 'content' that was added or removed, or in the case
 *      of a change, 'old_content' and 'new_content'
 *   * 'left-pos' and 'right-pos' (in lines, 1-based) of the change.
 *   The '_info' entry holds 'lhs' and 'rhs' (the lines of the old and
 *   the new text) and their counts in 'lhs-length' and 'rhs-length'.
 */
public static function getMachineReadableDiff( $oldText, $newText ) {
	$oldText = trim( $oldText ) . "\n";
	$newText = trim( $newText ) . "\n";
	$diff = wfDiff( $oldText, $newText, '-u -w' );

	$old_lines = explode( "\n", $oldText );
	$new_lines = explode( "\n", $newText );

	// First break down the diff into additions and subtractions
	$diff_lines = explode( "\n", $diff );
	$left_pos = 0;
	$right_pos = 0;
	$changes = array();
	// Index into $changes of the run being extended, or false for none.
	// Index 0 is a valid run, so always compare against false explicitly.
	$change_run = false;
	$sub_lines = 0;

	foreach ( $diff_lines as $line ) {
		if ( strlen( $line ) == 0 ) {
			continue;
		}

		$line_type = $line[0];

		if ( $line_type == ' ' ) {
			// Context line: present on both sides.
			// NOTE(review): a context line does not end the current run,
			// so changes separated only by context inside one hunk are
			// merged - confirm that this is intended.
			++$left_pos;
			++$right_pos;
		} elseif ( $line_type == '@' ) {
			// Hunk header, e.g. "@@ -12,7 +12,9 @@": take both start lines
			$parts = explode( ' ', $line );
			$lhs_pos = substr( $parts[1], 1 );
			$rhs_pos = substr( $parts[2], 1 );
			list( $left_pos ) = explode( ',', $lhs_pos );
			list( $right_pos ) = explode( ',', $rhs_pos );
			$left_pos = (int)$left_pos;
			$right_pos = (int)$right_pos;
			$change_run = false;
		} elseif ( $line_type == '-' ) {
			$subtracted_line = substr( $line, 1 );

			// Removed blank lines are not recorded as changes
			if ( trim( $subtracted_line ) === '' ) {
				++$left_pos;
				continue;
			}

			if ( $change_run !== false && $changes[$change_run]['action'] == 'subtract' ) {
				++$sub_lines;
				$changes[$change_run]['content'] .= "\n" . $subtracted_line;
			} else {
				$sub_lines = 1;
				$changes[] = array(
					'action' => 'subtract',
					'left-pos' => $left_pos,
					'right-pos' => $right_pos,
					'content' => $subtracted_line,
				);
				$change_run = count( $changes ) - 1;
			}

			// Consistency check
			if ( $old_lines[$left_pos - 1] != $subtracted_line ) {
				throw new MWException( "Left offset consistency error.\nOffset: $left_pos\nExpected: {$old_lines[$left_pos-1]}\nActual: $subtracted_line" );
			}
			++$left_pos;
		} elseif ( $line_type == '+' ) {
			$added_line = substr( $line, 1 );

			if ( $change_run !== false && $changes[$change_run]['action'] == 'add' ) {
				$changes[$change_run]['content'] .= "\n" . $added_line;
			} elseif ( $change_run !== false && $changes[$change_run]['action'] == 'subtract' ) {
				// An addition directly after a removal turns the run
				// into a 'change'
				$changes[$change_run]['action'] = 'change';
				$changes[$change_run]['old_content'] = $changes[$change_run]['content'];
				$changes[$change_run]['new_content'] = $added_line;
				--$sub_lines;
				unset( $changes[$change_run]['content'] );
			} elseif ( $change_run !== false && $changes[$change_run]['action'] == 'change' && $sub_lines > 0 ) {
				--$sub_lines;
				$changes[$change_run]['new_content'] .= "\n" . $added_line;
			} else {
				$changes[] = array(
					'action' => 'add',
					'left-pos' => $left_pos,
					'right-pos' => $right_pos,
					'content' => $added_line,
				);
				$change_run = count( $changes ) - 1;
			}

			// Consistency check
			if ( $new_lines[$right_pos - 1] != $added_line ) {
				throw new MWException( "Right offset consistency error.\nOffset: $right_pos\nExpected: {$new_lines[$right_pos-1]}\nActual: $added_line\n" );
			}
			++$right_pos;
		}
	}

	$changes['_info'] = array(
		'lhs-length' => count( $old_lines ),
		'rhs-length' => count( $new_lines ),
		'lhs' => $old_lines,
		'rhs' => $new_lines,
	);

	return $changes;
}
|
|
|
|
|
|
|
|
/**
 * Finds and extracts signatures in $text
 *
 * The text is examined line by line. A line counts as signed when it
 * contains a timestamp and getUserFromLine() finds a user link for it.
 *
 * @param $text string The text in which to look for signed comments.
 * @return array. Associative array, the key is the username, the value
 *  is the last signature that was found.
 */
public static function extractSignatures( $text ) {
	$lines = explode( "\n", $text );
	$timestampRegex = self::getTimestampRegex();
	$endOfLine = self::getLineEndingRegex();

	$output = array();

	foreach ( $lines as $line ) {
		$tsMatches = array();
		if ( !preg_match(
			"/$timestampRegex$endOfLine/mu",
			$line,
			$tsMatches,
			PREG_OFFSET_CAPTURE
		) ) {
			// Ignore lines without a timestamp
			continue;
		}

		// Now that we know we have a timestamp, look for
		// the last user link on the line. With PREG_OFFSET_CAPTURE the
		// offset of the match is element 1; element 0 is the matched text.
		$userData = self::getUserFromLine( $line, $tsMatches[0][1] );
		if ( $userData === false ) {
			continue;
		}

		list( $signaturePos, $user ) = $userData;

		// The signature is everything from the user link to the line end
		$output[$user] = substr( $line, $signaturePos );
	}

	return $output;
}
|
|
|
|
|
|
|
|
/**
 * Works out which user, if any, has signed a line of a wiki page.
 *
 * Links to user pages, user talk pages and contribution lists are
 * considered. Links that start more than 255 positions before the
 * timestamp are not regarded as part of a signature.
 *
 * @param $line string The line.
 * @param $timestampPos int The offset of the start of the timestamp.
 * @return bool|array false for none, Array for success.
 * - First element is the position of the signature.
 * - Second element is the normalised user name.
 */
public static function getUserFromLine( $line, $timestampPos ) {
	global $wgContLang;

	// Later entries have a higher precedence
	$possiblePrefixes = array(
		'[[' . $wgContLang->getNsText( NS_USER ) . ':',
		'[[' . $wgContLang->getNsText( NS_USER_TALK ) . ':',
		'[[' . SpecialPage::getTitleFor( 'Contributions' )->getPrefixedText() . '/',
	);

	// Prefixes written with underscores may also appear with spaces
	$spacedPrefixes = array();
	foreach ( $possiblePrefixes as $prefix ) {
		if ( strpos( $prefix, '_' ) !== false ) {
			$spacedPrefixes[] = str_replace( '_', ' ', $prefix );
		}
	}
	$possiblePrefixes = array_merge( $possiblePrefixes, $spacedPrefixes );

	$winningUser = false;
	$winningPos = false;

	// Look for the leftmost link to the rightmost user
	foreach ( $possiblePrefixes as $prefix ) {
		$link = self::getLinkFromLine( $line, $prefix );
		if ( $link === false ) {
			continue;
		}
		list( $pos, $user ) = $link;

		// Too far away from the timestamp to be a signature
		if ( ( $timestampPos - $pos ) > 255 ) {
			continue;
		}

		$takesOver =
			$winningPos === false ||
			( $pos > $winningPos && $user !== $winningUser ) ||
			( $pos < $winningPos && $user === $winningUser );

		if ( $takesOver ) {
			$winningPos = $pos;
			$winningUser = ucfirst( trim( $user ) );
		}
	}

	return $winningUser === false
		? false
		: array( $winningPos, $winningUser );
}
|
|
|
|
|
|
|
|
/**
 * Find the last link beginning with a given prefix on a line.
 *
 * The search runs from the end of the line towards the start. When a
 * candidate link does not yield a user name, the search continues with
 * the next occurrence to the left of it.
 *
 * @param $line string The line to search.
 * @param $linkPrefix string The prefix to search for.
 * @param $failureOffset int|bool Position of a link at which extraction
 *  already failed; only links starting before it are considered.
 *  false (the default) searches the whole line.
 * @return bool|array false for failure, array for success.
 * - First element is the string offset of the link.
 * - Second element is the user the link refers to.
 */
public static function getLinkFromLine( $line, $linkPrefix, $failureOffset = false ) {
	$lineLength = strlen( $line );

	while ( true ) {
		$offset = 0;

		// If extraction failed at another offset, only look left of it.
		if ( $failureOffset !== false ) {
			if ( $failureOffset <= 0 ) {
				// Nothing lies to the left of position 0. The negative
				// offset computed below would also be out of range for
				// strripos() here.
				return false;
			}
			// A negative offset makes strripos() ignore matches that
			// start at or after $failureOffset.
			$offset = $failureOffset - $lineLength - 1;
		}

		$linkPos = strripos( $line, $linkPrefix, $offset );
		if ( $linkPos === false ) {
			return false;
		}

		$linkUser = self::extractUserFromLink( $line, $linkPrefix, $linkPos );
		if ( $linkUser !== false ) {
			return array( $linkPos, $linkUser );
		}

		// Extraction failed: look for another place further left.
		$failureOffset = $linkPos;
	}
}
|
|
|
|
|
|
|
|
/**
 * Given text including a link, gives the user that that link refers to
 *
 * The user name is taken to be everything after the prefix up to the
 * first "|", "]" or "#".
 *
 * @param $text string The text to extract from.
 * @param $prefix string The link prefix that was used to find the link.
 * @param $offset int Optionally, the offset of the start of the link.
 * @return bool|string The canonical user name, or false when the link
 *  holds no name or the name is neither an IP address nor a valid
 *  user name.
 */
public static function extractUserFromLink( $text, $prefix, $offset = 0 ) {
	$afterPrefix = substr( $text, $offset + strlen( $prefix ) );

	$found = array();
	if ( !preg_match( '/^[^\|\]\#]+/u', $afterPrefix, $found ) ) {
		// The link target is empty, so the user link is invalid
		return false;
	}

	$candidate = $found[0];

	// IP addresses are accepted as they are; anything else has to pass
	// user name validation.
	if ( User::isIP( $candidate ) || User::getCanonicalName( $candidate ) !== false ) {
		return User::getCanonicalName( $candidate, false );
	}

	// Not a real username
	return false;
}
|
|
|
|
|
|
|
|
/**
 * Gets a regular expression fragment matching characters that
 * can appear in a line after the signature.
 *
 * The fragment accepts any sequence of whitespace, single curly
 * braces, tags in angle brackets and "{{...}}" blocks.
 *
 * @return String regular expression fragment.
 */
public static function getLineEndingRegex() {
	$openBrace = preg_quote( '{' );
	$closeBrace = preg_quote( '}' );
	$template = preg_quote( '{{' ) . '[^}]+' . preg_quote( '}}' );

	$alternatives = implode( '|', array(
		'\s*',
		$closeBrace,
		$openBrace,
		'\<[^\>]+\>',
		$template,
	) );

	return '(?:' . $alternatives . ')*';
}
|
|
|
|
|
|
|
|
/**
 * Gets a regular expression that will match this wiki's
 * timestamps as given by ~~~~.
 *
 * An exemplar timestamp is produced by running "~~~~~" through the
 * parser's pre-save transform. It is then generalised: runs of letters
 * and runs of digits become character classes, while a trailing
 * "(timezone)" part, if present, is kept literally. The result is
 * cached in self::$timestampRegex.
 *
 * @throws MWException When the generated expression does not match the
 *  exemplar it was derived from.
 * @return String regular expression fragment.
 */
public static function getTimestampRegex() {
	if ( self::$timestampRegex !== null ) {
		return self::$timestampRegex;
	}

	// Step 1: Get an exemplar timestamp
	global $wgParser;
	$title = Title::newMainPage();
	$user = User::newFromName( 'Test' );
	$options = new ParserOptions;
	$exemplarTimestamp =
		$wgParser->preSaveTransform( '~~~~~', $title, $user, $options );

	// Step 2: Generalise it
	// Trim off the timezone to replace at the end. The exemplar may not
	// end in a "(word)" timezone at all, in which case nothing is kept.
	$tzRegex = '/\s*\(\w+\)\s*$/';
	$tzMatches = array();
	$timezone = preg_match( $tzRegex, $exemplarTimestamp, $tzMatches )
		? $tzMatches[0]
		: '';

	$output = preg_replace( $tzRegex, '', $exemplarTimestamp );
	$output = preg_quote( $output, '/' );
	$output = preg_replace( '/[^\d\W]+/u', '[^\d\W]+', $output );
	$output = preg_replace( '/\d+/u', '\d+', $output );

	// Quote with the same delimiter as above; the fragment is embedded
	// in "/.../"-delimited expressions.
	$output .= preg_quote( $timezone, '/' );

	if ( !preg_match( "/$output/u", $exemplarTimestamp ) ) {
		throw new MWException( "Timestamp regex does not match exemplar" );
	}

	self::$timestampRegex = $output;

	return $output;
}
|
2012-11-07 01:41:06 +00:00
|
|
|
|
|
|
|
/**
 * Produces a plain-text snippet from text content, with HTML tags and
 * templates removed.
 *
 * @param $text string
 * @param $length int default 150
 * @return string
 */
public static function getTextSnippet( $text, $length = 150 ) {
	global $wgLang;

	$text = strip_tags( $text );

	// At most 10 rounds: locate the first "}}" and remove everything
	// from its matching "{{" (the last one in front of it) onwards.
	for ( $attempt = 0; $attempt < 10; $attempt++ ) {
		$closeCurPos = strpos( $text, '}}' );
		if ( $closeCurPos === false ) {
			break;
		}

		$upToClose = substr( $text, 0, $closeCurPos + 2 );
		$openCurPos = strrpos( $upToClose, '{{' );

		if ( $openCurPos === false ) {
			// Stray closing braces: drop just those
			$cutStart = $closeCurPos;
			$cutLength = 2;
		} else {
			$cutStart = $openCurPos;
			$cutLength = $closeCurPos - $openCurPos + 2;
		}
		$text = substr_replace( $text, '', $cutStart, $cutLength );
	}

	$parsed = MessageCache::singleton()->parse( $text )->getText();
	$text = trim( strip_tags( htmlspecialchars_decode( $parsed ) ) );

	// strip out non-useful data for snippet
	$text = str_replace( array( '{', '}' ), '', $text );

	return $wgLang->truncate( $text, $length );
}
|
2012-08-30 16:04:39 +00:00
|
|
|
}
|