From 7b7a2cd69cab98997f08a5cccde603d3bde4de52 Mon Sep 17 00:00:00 2001 From: Roan Kattouw Date: Mon, 9 Dec 2019 18:38:17 -0800 Subject: [PATCH] The Great Parser JS to PHP port of 2020!* * Not to be confused with the Parsing Team's "Great Parser JS to PHP port of 2019" Gasp as OR hacks are changed to null coalescing operators. Applaud as variable declarations are dropped. Cheer as parameters and return values are type-hinted. Shudder as DomNodeLists have no indexOf method. Moving discussion parsing to the server should allow us to implement much cleaner APIs for commenting. Bug: T252252 Co-authored-by: Ed Sanders Change-Id: Ic1438d516e223db462cb227f6668e856672f538c --- .phpcs.xml | 4 +- extension.json | 1 + includes/DiscussionToolsCommentParser.php | 1103 +++++++++++++++++ .../DiscussionToolsCommentParserTest.php | 124 ++ 4 files changed, 1231 insertions(+), 1 deletion(-) create mode 100644 includes/DiscussionToolsCommentParser.php create mode 100644 tests/phpunit/DiscussionToolsCommentParserTest.php diff --git a/.phpcs.xml b/.phpcs.xml index 3839d7845..9402981e3 100644 --- a/.phpcs.xml +++ b/.phpcs.xml @@ -1,6 +1,8 @@ - + + + . 
diff --git a/extension.json b/extension.json index b6ab3afba..3f9e2ede0 100644 --- a/extension.json +++ b/extension.json @@ -225,6 +225,7 @@ ] }, "AutoloadClasses": { + "DiscussionToolsCommentParser": "includes/DiscussionToolsCommentParser.php", "DiscussionToolsData": "includes/DiscussionToolsData.php", "DiscussionToolsHooks": "includes/DiscussionToolsHooks.php" }, diff --git a/includes/DiscussionToolsCommentParser.php b/includes/DiscussionToolsCommentParser.php new file mode 100644 index 000000000..e727eb7e7 --- /dev/null +++ b/includes/DiscussionToolsCommentParser.php @@ -0,0 +1,1103 @@ +language = $language; + $this->config = $config; + $this->dateFormat = $this->language->getDateFormatString( + 'both', + $this->language->dateFormat( false ) + ); + // TODO: We probably shouldn't assume that each digit can be represented by a single BMP + // codepoint in every language (although it seems to be true right now). + $this->digits = $this->config->get( 'TranslateNumerals' ) ? + $this->language->formatNum( '0123456789', true ) : + null; + $this->digitsRegexp = $this->config->get( 'TranslateNumerals' ) ? + '[' . $this->language->formatNum( '0123456789', true ) . ']' : + '\\d'; + // TODO: Instead of passing data used for mocking, mock the methods that fetch the data. + $this->data = $data; + $this->localTimezone = $this->config->get( 'Localtimezone' ); + $this->timezoneAbbrs = $this->computeTimezoneAbbrs(); + } + + public static function newFromGlobalState() : DiscussionToolsCommentParser { + return new static( + MediaWikiServices::getInstance()->getContentLanguage(), + MediaWikiServices::getInstance()->getMainConfig() + ); + } + + /** + * Find closest ancestor element using one of the given tag names. 
+ * + * @param DOMNode $el + * @param string[] $tagNames + * @return DOMNode|null + */ + private static function closestElement( DOMNode $el, array $tagNames ) : ?DOMNode { + do { + if ( + $el->nodeType === XML_ELEMENT_NODE && + in_array( strtolower( $el->nodeName ), $tagNames ) + ) { + return $el; + } + $el = $el->parentNode; + } while ( $el ); + return null; + } + + /** + * Build the timezone abbreviations map for the local timezone. + * @return array Associative array mapping localized timezone abbreviations to IANA abbreviations + */ + private function computeTimezoneAbbrs() : array { + // Return only timezone abbreviations for the local timezone (there will often be two, for + // non-DST and DST timestamps, and sometimes more due to historical data, but that's okay). + $timezoneAbbrs = array_keys( array_filter( + DateTimeZone::listAbbreviations(), + function ( $timezones ) { + foreach ( $timezones as $tz ) { + if ( $tz['timezone_id'] === $this->localTimezone ) { + return true; + } + } + return false; + } + ) ); + return array_combine( + array_map( function ( $tzMsg ) { + // MWTimestamp::getTimezoneMessage() + // Parser::pstPass2() + // Messages used here: 'timezone-utc' and so on + $key = 'timezone-' . strtolower( trim( $tzMsg ) ); + $msg = wfMessage( $key )->inLanguage( $this->language ); + // TODO: This probably causes a similar issue to https://phabricator.wikimedia.org/T221294, + // but we *must* check the message existence in the database, because the messages are not + // actually defined by MediaWiki core for any timezone other than UTC... 
+ if ( $msg->exists() ) { + return $this->getMessages( [ $key ] )[0]; + } + return strtoupper( $tzMsg ); + }, $timezoneAbbrs ), + array_map( 'strtoupper', $timezoneAbbrs ) + ); + } + + /** + * Check whether a DOMNode contains (is an ancestor of) another DOMNode + * @param DOMNode $ancestor + * @param DOMNode $descendant + * @return bool + */ + private function contains( DOMNode $ancestor, DOMNode $descendant ) : bool { + // TODO can we use DOMNode->compareDocumentPosition() here maybe? + $node = $descendant; + while ( $node && $node !== $ancestor ) { + $node = $node->parentNode; + } + return $node === $ancestor; + } + + /** + * Get the index of $child in its parent + * @param DOMNode $child + * @return int + */ + public static function childIndexOf( DOMNode $child ) : int { + $parent = $child->parentNode; + for ( $i = 0; $i < $parent->childNodes->length; $i++ ) { + if ( $parent->childNodes->item( $i ) === $child ) { + return $i; + } + } + throw new MWException( 'Node was not inside its parent, this should never happen' ); + } + + /** + * Get a MediaWiki page title from a URL + * @param string $url + * @return Title|null + */ + private function getTitleFromUrl( string $url ) : ?Title { + // TODO: Set the correct base in the document? + if ( strpos( $url, './' ) === 0 ) { + $url = 'https://local/wiki/' . substr( $url, 2 ); + } elseif ( strpos( $url, '://' ) === false ) { + $url = 'https://local' . $url; + } + $bits = wfParseUrl( $url ); + $query = wfCgiToArray( $bits['query'] ?? '' ); + if ( isset( $query['title'] ) ) { + return Title::newFromText( $query['title'] ); + } + + $articlePathRegexp = '/' . str_replace( + preg_quote( '$1', '/' ), + '(.*)', + preg_quote( $this->config->get( 'ArticlePath' ), '/' ) + ) . 
'/'; + $matches = null; + if ( preg_match( $articlePathRegexp, $url, $matches ) ) { + return Title::newFromText( urldecode( $matches[1] ) ); + } + return null; + } + + /** + * Return the next leaf node in the tree order that is not an empty or whitespace-only text node. + * + * In other words, this returns a text node with content other than whitespace, or an element node + * with no children, that follows the given node. + * + * @param DOMNode $node Node to start searching at. This node's children are ignored. + * @param DOMNode $rootNode Node to stop searching at (ancestor of $startNode not to break out of) + * @return DOMNode|null + */ + private function nextInterestingLeafNode( DOMNode $node, DOMNode $rootNode ) : ?DOMNode { + $n = $node; + do { + if ( $n->firstChild && ( $node === $rootNode || $n !== $node ) ) { + $n = $n->firstChild; + } elseif ( $n->nextSibling ) { + $n = $n->nextSibling; + } else { + while ( $n && $n !== $rootNode && !$n->nextSibling ) { + $n = $n->parentNode; + } + $n = $n->nextSibling; + } + + if ( + $n && ( + ( $n->nodeType === XML_TEXT_NODE && trim( $n->nodeValue ) !== '' ) || + ( $n->nodeType === XML_ELEMENT_NODE && !$n->firstChild ) + ) + ) { + return $n; + } + } while ( $n && $n !== $rootNode ); + return null; + } + + /** + * @param string[] $values Values to match + * @return string Regular expression + */ + private static function regexpAlternateGroup( array $values ) : string { + return '(' . implode( '|', array_map( function ( $x ) { + return preg_quote( $x, '/' ); + }, $values ) ) . ')'; + } + + /** + * @param string[] $messageKeys Message keys + * @return string[] Message values + */ + private function getMessages( array $messageKeys ) : array { + return array_map( function ( $key ) { + return isset( $this->data['contLangMessages'][$key] ) ? 
+ $this->data['contLangMessages'][$key] : + wfMessage( $key )->inLanguage( $this->language )->text(); + }, $messageKeys ); + } + + /** + * Get a regexp that matches timestamps generated using the given date format. + * + * This only supports format characters that are used by the default date format in any of + * MediaWiki's languages, namely: D, d, F, G, H, i, j, l, M, n, Y, xg, xkY (and escape characters), + * and only dates when MediaWiki existed, let's say 2000 onwards (Thai dates before 1941 are + * complicated). + * + * @param string $format Date format + * @param string $digitsRegexp Regular expression matching a single localized digit, e.g. '[0-9]' + * @param array $tzAbbrs Associative array mapping localized timezone abbreviations to + * IANA abbreviations, for the local timezone, e.g. [ 'EDT' => 'EDT', 'EST' => 'EST' ] + * @return string Regular expression + */ + private function getTimestampRegexp( + string $format, string $digitsRegexp, array $tzAbbrs + ) : string { + $formatLength = strlen( $format ); + $s = ''; + // Adapted from Language::sprintfDate() + for ( $p = 0; $p < $formatLength; $p++ ) { + $num = false; + $code = $format[ $p ]; + if ( $code === 'x' && $p < $formatLength - 1 ) { + $code .= $format[++$p]; + } + if ( $code === 'xk' && $p < $formatLength - 1 ) { + $code .= $format[++$p]; + } + + switch ( $code ) { + case 'xx' : + $s .= 'x'; + break; + case 'xg': + $s .= self::regexpAlternateGroup( + $this->getMessages( Language::MONTH_GENITIVE_MESSAGES ) + ); + break; + case 'd': + $num = '2'; + break; + case 'D': + $s .= self::regexpAlternateGroup( + $this->getMessages( Language::WEEKDAY_ABBREVIATED_MESSAGES ) + ); + break; + case 'j': + $num = '1,2'; + break; + case 'l': + $s .= self::regexpAlternateGroup( + $this->getMessages( Language::WEEKDAY_MESSAGES ) + ); + break; + case 'F': + $s .= self::regexpAlternateGroup( + $this->getMessages( Language::MONTH_MESSAGES ) + ); + break; + case 'M': + $s .= self::regexpAlternateGroup( 
$this->getMessages( Language::MONTH_ABBREVIATED_MESSAGES ) + ); + break; + case 'n': + $num = '1,2'; + break; + case 'Y': + $num = '4'; + break; + case 'xkY': + $num = '4'; + break; + case 'G': + $num = '1,2'; + break; + case 'H': + $num = '2'; + break; + case 'i': + $num = '2'; + break; + case '\\': + // Backslash escaping + if ( $p < $formatLength - 1 ) { + $s .= preg_quote( $format[++$p], '/' ); + } else { + $s .= preg_quote( '\\', '/' ); + } + break; + case '"': + // Quoted literal + if ( $p < $formatLength - 1 ) { + $endQuote = strpos( $format, '"', $p + 1 ); + if ( $endQuote === false ) { + // No terminating quote, assume literal " + $s .= '"'; + } else { + $s .= preg_quote( substr( $format, $p + 1, $endQuote - $p - 1 ), '/' ); + $p = $endQuote; + } + } else { + # Quote at end of string, assume literal " + $s .= '"'; + } + break; + default: + $s .= preg_quote( $format[$p], '/' ); + } + if ( $num !== false ) { + $s .= '(' . $digitsRegexp . '{' . $num . '})'; + } + } + + $tzRegexp = self::regexpAlternateGroup( array_keys( $tzAbbrs ) ); + + // Hardcoded parentheses and space like in Parser::pstPass2 + // Ignore some invisible Unicode characters that often sneak into copypasted timestamps (T245784) + // \uNNNN syntax can only be used from PHP 7.3 + return '/' . $s . '[\\x{200E}\\x{200F}]? [\\x{200E}\\x{200F}]?\\(' . $tzRegexp . '\\)/u'; + } + + /** + * Get a function that parses timestamps generated using the given date format, based on the result + * of matching the regexp returned by getTimestampRegexp() + * + * @param string $format Date format, as used by MediaWiki + * @param string|null $digits Localised digits from 0 to 9, e.g. `0123456789` + * @param string $localTimezone Local timezone IANA name, e.g. `America/New_York` + * @param array $tzAbbrs Map of localised timezone abbreviations to IANA abbreviations + * for the local timezone, e.g. 
[ 'EDT' => 'EDT', 'EST' => 'EST' ] + * @return callable Parser function + */ + private function getTimestampParser( + string $format, ?string $digits, string $localTimezone, array $tzAbbrs + ) : callable { + $untransformDigits = function ( string $text ) use ( $digits ) { + if ( !$digits ) { + return $text; + } + // Use the 'u' modifier and mb_strpos() so that multi-byte localized digits are matched as + // whole characters and mapped to their character index (0-9) rather than to a byte offset + return preg_replace_callback( + '/[' . $digits . ']/u', + function ( $m ) use ( $digits ) { + return (string)mb_strpos( $digits, $m[0] ); + }, + $text + ); + }; + + $formatLength = strlen( $format ); + $matchingGroups = []; + for ( $p = 0; $p < $formatLength; $p++ ) { + $code = $format[$p]; + if ( $code === 'x' && $p < $formatLength - 1 ) { + $code .= $format[++$p]; + } + if ( $code === 'xk' && $p < $formatLength - 1 ) { + $code .= $format[++$p]; + } + + switch ( $code ) { + case 'xx': + break; + case 'xg': + case 'd': + case 'j': + case 'D': + case 'l': + case 'F': + case 'M': + case 'n': + case 'Y': + case 'xkY': + case 'G': + case 'H': + case 'i': + $matchingGroups[] = $code; + break; + case '\\': + // Backslash escaping + if ( $p < $formatLength - 1 ) { + $p++; + } + break; + case '"': + // Quoted literal + if ( $p < $formatLength - 1 ) { + $endQuote = strpos( $format, '"', $p + 1 ); + if ( $endQuote !== false ) { + $p = $endQuote; + } + } + break; + default: + break; + } + } + + return function ( array $match ) use ( + $matchingGroups, $untransformDigits, $localTimezone, $tzAbbrs + ) { + if ( is_array( $match[0] ) ) { + // Strip PREG_OFFSET_CAPTURE data + $match = array_map( function ( $tuple ) { + return $tuple[0]; + }, $match ); + } + $year = 0; + $monthIdx = 0; + $day = 0; + $hour = 0; + $minute = 0; + foreach ( $matchingGroups as $i => $code ) { + $text = $match[$i + 1]; + switch ( $code ) { + case 'xg': + $monthIdx = array_search( $text, $this->getMessages( Language::MONTH_GENITIVE_MESSAGES ) ); + break; + case 'd': + case 'j': + $day = intval( $untransformDigits( $text ) ); + break; + case 'D': + case 'l': + // Day of the week - unused + break; + case 'F': 
+ $monthIdx = array_search( $text, $this->getMessages( Language::MONTH_MESSAGES ) ); + break; + case 'M': + $monthIdx = array_search( $text, $this->getMessages( Language::MONTH_ABBREVIATED_MESSAGES ) ); + break; + case 'n': + $monthIdx = intval( $untransformDigits( $text ) ) - 1; + break; + case 'Y': + $year = intval( $untransformDigits( $text ) ); + break; + case 'xkY': + // Thai year + $year = intval( $untransformDigits( $text ) ) - 543; + break; + case 'G': + case 'H': + $hour = intval( $untransformDigits( $text ) ); + break; + case 'i': + $minute = intval( $untransformDigits( $text ) ); + break; + default: + // TODO throw NotImplementedException or whatever it's called + throw new MWException( 'Not implemented' ); + } + } + + // The last matching group is the timezone abbreviation + $tzAbbr = $tzAbbrs[ end( $match ) ]; + + // Most of the time, the timezone abbreviation is not necessary to parse the date, since we + // can assume all times are in the wiki's local timezone. + $date = new DateTime(); + // setTimezone must be called before setDate/setTime + $date->setTimezone( new DateTimeZone( $localTimezone ) ); + $date->setDate( $year, $monthIdx + 1, $day ); + $date->setTime( $hour, $minute, 0 ); + + // But during the "fall back" at the end of DST, some times will happen twice. + // Since the timezone abbreviation disambiguates the DST/non-DST times, we can detect + // when PHP chose the wrong one, and then try the other one. It appears that PHP always + // uses the later (non-DST) hour, but that behavior isn't documented, so we account for both. 
+ if ( $date->format( 'T' ) !== $tzAbbr ) { + $altDate = clone $date; + if ( $date->format( 'I' ) ) { + // Parsed time is DST, try non-DST by advancing one hour + $altDate->add( new DateInterval( 'PT1H' ) ); + } else { + // Parsed time is non-DST, try DST by going back one hour + $altDate->sub( new DateInterval( 'PT1H' ) ); + } + if ( $altDate->format( 'T' ) === $tzAbbr ) { + $date = $altDate; + } + // else: neither DST nor non-DST gives us the expected timezone + // TODO log a warning in this case? + } + + // Now set the timezone back to UTC for formatting + $date->setTimezone( new DateTimeZone( 'UTC' ) ); + $date = DateTimeImmutable::createFromMutable( $date ); + + return $date; + }; + } + + /** + * Get a regular expression that matches timestamps in the local date format. + * + * This calls getTimestampRegexp() with predefined data for the current wiki. + * + * @return string Regular expression + */ + public function getLocalTimestampRegexp() : string { + return $this->getTimestampRegexp( + $this->dateFormat, + $this->digitsRegexp, + $this->timezoneAbbrs + ); + } + + /** + * Get a function that parses timestamps in the local date format, based on the result + * of matching the regexp returned by getLocalTimestampRegexp(). + * + * This calls getTimestampParser() with predefined data for the current wiki. + * + * @return callable Parser function + */ + private function getLocalTimestampParser() : callable { + return $this->getTimestampParser( + $this->dateFormat, + $this->digits, + $this->localTimezone, + $this->timezoneAbbrs + ); + } + + /** + * Get the indent level of $node, relative to $rootNode. + * + * The indent level is the number of lists inside of which it is nested. 
+ * + * @param DOMNode $node + * @param DOMNode $rootNode + * @return int + */ + private function getIndentLevel( DOMNode $node, DOMNode $rootNode ) : int { + $indent = 0; + while ( $node ) { + if ( $node === $rootNode ) { + break; + } + $nodeName = strtolower( $node->nodeName ); + if ( $nodeName === 'li' || $nodeName === 'dd' ) { + $indent++; + } + $node = $node->parentNode; + } + return $indent; + } + + /** + * Find a user signature preceding a timestamp. + * + * The signature includes the timestamp node. + * + * A signature must contain at least one link to the user's userpage, discussion page or + * contributions (and may contain other links). The link may be nested in other elements. + * + * This function returns a two-element array. The first element is an array of sibling nodes + * comprising the signature, in reverse document order with $timestampNode as the first element. The second element + * is the username (null for unsigned comments). + * + * @param DOMText $timestampNode Text node + * @param DOMNode|null $until Node to stop searching at + * @return array [ nodes, username ] + */ + private function findSignature( DOMText $timestampNode, DOMNode $until = null ) : array { + $node = $timestampNode; + $sigNodes = [ $node ]; + $sigUsername = null; + $length = 0; + $lastLinkNode = $timestampNode; + while ( + ( $node = $node->previousSibling ) && $length < self::SIGNATURE_SCAN_LIMIT && $node !== $until + ) { + $sigNodes[] = $node; + $length += $node->textContent ? strlen( $node->textContent ) : 0; + if ( $node->nodeType === XML_TEXT_NODE ) { + // FIXME use proper constant, or proper isText check + continue; + } + + $links = []; + if ( strtolower( $node->nodeName ) === 'a' ) { + $links = [ $node ]; + } elseif ( $node->nodeType === XML_ELEMENT_NODE ) { + // Handle links nested in formatting elements. + // Helpful accidental feature: users whose signature is not detected in full (due to + // text formatting) can just wrap it in an element (e.g. a <span>) to fix that. 
+ // "Ten Pound Hammer • (What did I screw up now?)" + // "« Saper // dyskusja »" + $links = $node->getElementsByTagName( 'a' ); + } + if ( !count( $links ) ) { + continue; + } + + // Find the earliest link that links to the user's user page + foreach ( $links as $link ) { + $username = null; + $title = $this->getTitleFromUrl( $link->getAttribute( 'href' ) ); + if ( !$title ) { + continue; + } + if ( $title->getNamespace() === NS_USER || $title->getNamespace() === NS_USER_TALK ) { + $username = $title->getText(); + if ( strpos( $username, '/' ) !== false ) { + continue; + } + } elseif ( $title->isSpecial( 'Contributions' ) ) { + $parts = explode( '/', $title->getText(), 2 ); + if ( !isset( $parts[1] ) ) { + continue; + } + // Normalize the username: users may link to their contributions with an unnormalized name + $userpage = Title::makeTitleSafe( NS_USER, $parts[1] ); + if ( !$userpage ) { + continue; + } + $username = $userpage->getText(); + } + if ( !$username ) { + continue; + } + if ( IP::isIPv6( $username ) ) { + // Bot-generated links "Preceding unsigned comment added by" have non-standard case + $username = strtoupper( $username ); + } + + // Accept the first link to the user namespace, then only accept links to that user + if ( $sigUsername === null ) { + $sigUsername = $username; + } + if ( $username === $sigUsername ) { + $lastLinkNode = $node; + break; + } + } + // Keep looking if a node with links wasn't a link to a user page + // "Doc James (talk · contribs · email)" + } + // Pop excess text nodes + while ( end( $sigNodes ) !== $lastLinkNode ) { + array_pop( $sigNodes ); + } + return [ $sigNodes, $sigUsername ]; + } + + /** + * Find all timestamps within a DOM subtree. 
+ * + * @param DOMNode $rootNode + * @return array Array of [node, matchData] pairs + */ + public function findTimestamps( DOMNode $rootNode ) : array { + $xpath = new DOMXPath( $rootNode->ownerDocument ); + $textNodes = $xpath->query( '//text()', $rootNode ); + $matches = []; + $timestampRegex = self::getLocalTimestampRegexp(); + foreach ( $textNodes as $node ) { + $startNode = $node; + $nodeText = ''; + + while ( $node ) { + $nodeText .= $node->nodeValue; + + // In Parsoid HTML, entities are represented as a 'mw:Entity' node, rather than normal HTML + // entities. On Arabic Wikipedia, the "UTC" timezone name contains some non-breaking spaces, + // which apparently are often turned into   entities by buggy editing tools. To handle + // this, we must piece together the text, so that our regexp can match those timestamps. + if ( + $node->nextSibling && + $node->nextSibling->nodeType === XML_ELEMENT_NODE && + $node->nextSibling->getAttribute( 'typeof' ) === 'mw:Entity' + ) { + $nodeText .= $node->nextSibling->firstChild->nodeValue; + + // If the entity is followed by more text, do this again + if ( + $node->nextSibling->nextSibling && + $node->nextSibling->nextSibling->nodeType === XML_TEXT_NODE + ) { + $node = $node->nextSibling->nextSibling; + } else { + $node = null; + } + } else { + $node = null; + } + } + + $matchData = null; + // Allows us to mimic match.index in #getComments + if ( preg_match( $timestampRegex, $nodeText, $matchData, PREG_OFFSET_CAPTURE ) ) { + $matches[] = [ $startNode, $matchData ]; + } + } + return $matches; + } + + /** + * Get all discussion comments (and headings) within a DOM subtree. + * + * This returns a flat list, use groupThreads() to associate replies to original messages and + * get a tree structure starting at section headings. + * + * For example, for a MediaWiki discussion like this (we're dealing with HTML DOM here, + * the wikitext syntax is just for illustration): + * + * == A == + * B. ~~~~ + * : C. + * : C. ~~~~ + * :: D. 
~~~~ + * ::: E. ~~~~ + * ::: F. ~~~~ + * : G. ~~~~ + * H. ~~~~ + * : I. ~~~~ + * + * This function would return a structure like: + * + * [ + * [ 'type' => 'heading', 'level' => 0, 'range' => (h2: A) }, + * [ 'type' => 'comment', 'level' => 1, 'range' => (p: B) }, + * [ 'type' => 'comment', 'level' => 2, 'range' => (li: C, li: C) }, + * [ 'type' => 'comment', 'level' => 3, 'range' => (li: D) }, + * [ 'type' => 'comment', 'level' => 4, 'range' => (li: E) }, + * [ 'type' => 'comment', 'level' => 4, 'range' => (li: F) }, + * [ 'type' => 'comment', 'level' => 2, 'range' => (li: G) }, + * [ 'type' => 'comment', 'level' => 1, 'range' => (p: H) }, + * [ 'type' => 'comment', 'level' => 2, 'range' => (li: I) } + * ] + * + * The elements of the array are stdClass objects with the following fields: + * - 'type' (string): 'heading' or 'comment' + * - 'range' (array): The extent of the comment, including the signature and timestamp. + * Comments can start or end in the middle of a DOM node. + * Keys: 'startContainer', 'startOffset', 'endContainer' and 'endOffset' + * - 'level' (int): Indentation level of the comment. Headings are 0, comments start at 1. + * - 'timestamp' (string): Timestamp (TODO in what format?). Not set for headings. + * - 'author' (string|null): Comment author's username, null for unsigned comments. + * Not set for headings. + * + * @param DOMNode $rootNode + * @return stdClass[] Results. Each result is an object. 
+ */ + public function getComments( DOMNode $rootNode ) : array { + $timestamps = $this->findTimestamps( $rootNode ); + + $xpath = new DOMXPath( $rootNode->ownerDocument ); + $allNodes = $xpath->query( '//text()|//node()', $rootNode ); + $tocNode = $rootNode->ownerDocument->getElementById( 'toc' ); + $comments = []; + $dfParser = $this->getLocalTimestampParser(); + + // Placeholder heading in case there are comments in the 0th section + $range = (object)[ + 'startContainer' => $rootNode, + 'startOffset' => 0, + 'endContainer' => $rootNode, + 'endOffset' => 0 + ]; + $fakeHeading = (object)[ + 'placeholderHeading' => true, + 'type' => 'heading', + 'range' => $range, + 'level' => 0 + ]; + + $curComment = $fakeHeading; + + $nextTimestamp = 0; + foreach ( $allNodes as $node ) { + // Skip nodes inside
+ if ( $tocNode && $this->contains( $tocNode, $node ) ) { + continue; + } + + if ( $node->nodeType === XML_ELEMENT_NODE && preg_match( '/^h[1-6]$/i', $node->nodeName ) ) { + $range = (object)[ + 'startContainer' => $node, + 'startOffset' => 0, + 'endContainer' => $node, + 'endOffset' => $node->childNodes->length + ]; + $curComment = (object)[ + 'type' => 'heading', + 'range' => $range, + 'level' => 0 + ]; + $comments[] = $curComment; + } elseif ( isset( $timestamps[$nextTimestamp] ) && $node === $timestamps[$nextTimestamp][0] ) { + $foundSignature = $this->findSignature( $node, $curComment->range->endContainer ); + $author = $foundSignature[1]; + $firstSigNode = end( $foundSignature[0] ); + + if ( !$author ) { + // Ignore timestamps for which we couldn't find a signature. It's probably not a real + // comment, but just a false match due to a copypasted timestamp. + $nextTimestamp++; + continue; + } + + // Everything from the last comment up to here is the next comment + $startNode = $this->nextInterestingLeafNode( $curComment->range->endContainer, $rootNode ); + $match = $timestamps[$nextTimestamp][1]; + $range = (object)[ + 'startContainer' => $startNode->parentNode, + 'startOffset' => self::childIndexOf( $startNode ), + 'endContainer' => $node, + 'endOffset' => $match[0][1] + mb_strlen( $match[0][0] ) + ]; + $sigRange = (object)[ + 'startContainer' => $firstSigNode->parentNode, + 'startOffset' => self::childIndexOf( $firstSigNode ), + 'endContainer' => $node, + 'endOffset' => $match[0][1] + mb_strlen( $match[0][0] ) + ]; + + $startLevel = $this->getIndentLevel( $startNode, $rootNode ) + 1; + $endLevel = $this->getIndentLevel( $node, $rootNode ) + 1; + if ( $startLevel !== $endLevel ) { + // TODO warn: 'Comment starts and ends with different indentation' + } + + // Avoid generating multiple comments when there is more than one signature on a single "line". + // Often this is done when someone edits their comment later and wants to add a note about that. 
+ // (Or when another person corrects a typo, or strikes out a comment, etc.) Multiple comments + // within one paragraph/listitem result in a confusing double "Reply" button, and we also have + // no way to indicate which one you're replying to (this might matter in the future for + // notifications or something). + if ( + $curComment->type === 'comment' && + ( self::closestElement( $node, [ 'li', 'dd', 'p' ] ) ?? $node->parentNode ) === + ( + self::closestElement( $curComment->range->endContainer, [ 'li', 'dd', 'p' ] ) ?? + $curComment->range->endContainer->parentNode + ) + ) { + // Merge this with the previous comment. Use that comment's author and timestamp. + $curComment->range->endContainer = $range->endContainer; + $curComment->range->endOffset = $range->endOffset; + $curComment->signatureRanges[] = $sigRange; + $curComment->level = min( min( $startLevel, $endLevel ), $curComment->level ); + + $nextTimestamp++; + continue; + } + + $curComment = (object)[ + 'type' => 'comment', + // Almost DateTimeInterface::RFC3339_EXTENDED + 'timestamp' => $dfParser( $match )->format( 'Y-m-d\TH:i:s.v\Z' ), + 'author' => $author, + 'range' => $range, + 'signatureRanges' => [ $sigRange ], + // Should this use the indent level of $startNode or $node? + 'level' => min( $startLevel, $endLevel ) + ]; + $comments[] = $curComment; + $nextTimestamp++; + } + } + + // Insert the fake placeholder heading if there are any comments in the 0th section + // (before the first real heading) + if ( count( $comments ) && $comments[ 0 ]->type !== 'heading' ) { + array_unshift( $comments, $fakeHeading ); + } + + return $comments; + } + + /** + * Group discussion comments into threads and associate replies to original messages. + * + * Each thread must begin with a heading. Original messages in the thread are treated as replies to + * its heading. Other replies are associated based on the order and indentation level. 
+ * + * Note that the objects in `comments` are extended in-place with the additional data. + * + * For example, for a MediaWiki discussion like this (we're dealing with HTML DOM here, + * the wikitext syntax is just for illustration): + * + * == A == + * B. ~~~~ + * : C. + * : C. ~~~~ + * :: D. ~~~~ + * ::: E. ~~~~ + * ::: F. ~~~~ + * : G. ~~~~ + * H. ~~~~ + * : I. ~~~~ + * + * This function would return a structure like: + * + * [ + * [ 'type' => 'heading', 'level' => 0, 'range' => (h2: A), 'replies' => [ + * [ 'type' => 'comment', 'level' => 1, 'range' => (p: B), 'replies' => [ + * [ 'type' => 'comment', 'level' => 2, 'range' => (li: C, li: C), 'replies' => [ + * [ 'type' => 'comment', 'level' => 3, 'range' => (li: D), 'replies' => [ + * [ 'type' => 'comment', 'level' => 4, 'range' => (li: E), 'replies' => [] ], + * [ 'type' => 'comment', 'level' => 4, 'range' => (li: F), 'replies': [] ], + * ] ], + * ] ], + * [ 'type' => 'comment', 'level' => 2, 'range' => (li: G), 'replies' => [] ], + * ] ], + * [ 'type' => 'comment', 'level' => 1, 'range' => (p: H), 'replies' => [ + * [ 'type' => 'comment', 'level' => 2, 'range' => (li: I), 'replies' => [] ], + * ] ], + * ] ], + * ] + * + * @param stdClass[] &$comments Result of #getComments, will be modified to add more properties + * @return stdClass[] Tree structure of comments, using the same objects as `comments`. Top-level + * items are the headings. 
The following properties are added:
+	 * - id: Unique ID (within the page) for this comment, intended to be used to
+	 *   find this comment in other revisions of the same page
+	 * - replies: Comment objects which are replies to this comment
+	 * - parent: Comment object which this is a reply to (null for headings)
+	 */
+	public function groupThreads( array &$comments ) : array {
+		$threads = [];
+		// Stack of the most recent comment seen at each indentation level (index = level)
+		$replies = [];
+		$commentsById = [];
+
+		// Comments are objects, so iterating by value still modifies the originals, and it avoids
+		// leaving a dangling reference to the last element of $comments after the loop.
+		foreach ( $comments as $comment ) {
+			if ( $comment->level === 0 ) {
+				// We don't need ids for section headings right now, but we might in the future
+				// e.g. if we allow replying directly to sections (adding top-level comments)
+				$id = null;
+			} else {
+				$id = ( $comment->author ?? '' ) . '|' . $comment->timestamp;
+
+				// If there would be multiple comments with the same ID (i.e. the user left multiple comments
+				// in one edit, or within a minute), append sequential numbers
+				$number = 0;
+				while ( isset( $commentsById["$id|$number"] ) ) {
+					$number++;
+				}
+				$id = "$id|$number";
+				$commentsById[$id] = $comment;
+			}
+
+			// This modifies the original objects in $comments!
+			$comment->id = $id;
+			$comment->replies = [];
+			$comment->parent = null;
+
+			if ( count( $replies ) < $comment->level ) {
+				// Someone skipped an indentation level (or several). Pretend that the previous reply
+				// covers multiple indentation levels, so that following comments get connected to it.
+				// TODO warn: 'Comment skips indentation level'
+				//
+				// Objects are assigned by handle in PHP, so the padding entries share the previous
+				// reply rather than cloning it. If there is no previous reply at all, pad with null
+				// (end() would return false, which passes isset() below and would then be used as
+				// the parent), so that the comment is simply left unattached.
+				$filler = $replies ? end( $replies ) : null;
+				$replies = array_pad( $replies, $comment->level, $filler );
+			}
+
+			if ( $comment->level === 0 ) {
+				// New root (thread)
+				$threads[] = $comment;
+			} elseif ( isset( $replies[ $comment->level - 1 ] ) ) {
+				// Add as a reply to the closest less-nested comment
+				$comment->parent = $replies[ $comment->level - 1 ];
+				$comment->parent->replies[] = $comment;
+			} else {
+				// TODO warn: 'Comment could not be connected to a thread'
+			}
+
+			$replies[ $comment->level ] = $comment;
+			// Cut off more deeply nested replies (omitting the length removes everything to the end)
+			array_splice( $replies, $comment->level + 1 );
+		}
+
+		return $threads;
+	}
+
+	/**
+	 * Get the list of authors involved in a comment and its replies.
+	 *
+	 * You probably want to pass a thread root here (a heading).
+	 *
+	 * @param stdClass $comment Comment object, as returned by #groupThreads
+	 * @return string[] Author usernames, without duplicates, sorted as strings
+	 */
+	public function getAuthors( $comment ) {
+		// Used as a set: author name => true
+		$authors = [];
+		$collectAuthors = function ( $comment ) use ( &$authors, &$collectAuthors ) {
+			// Check for a non-empty string instead of truthiness, so that a name like "0"
+			// (which is falsy in PHP) is not silently dropped
+			$author = $comment->author ?? null;
+			if ( is_string( $author ) && $author !== '' ) {
+				$authors[ $author ] = true;
+			}
+			// Collect the authors of each reply into the same set
+			foreach ( $comment->replies ?? [] as $reply ) {
+				$collectAuthors( $reply );
+			}
+		};
+
+		$collectAuthors( $comment );
+
+		// PHP converts numeric-looking array keys to integers; cast them back so that the result
+		// really is string[], and sort as strings so that all names are ordered consistently.
+		$names = array_map( 'strval', array_keys( $authors ) );
+		sort( $names, SORT_STRING );
+		return $names;
+	}
+
+	/**
+	 * Get the name of the page from which this comment is transcluded (if any).
+	 *
+	 * @param stdClass $comment Comment object, as returned by #groupThreads
+	 * @return string|bool `false` if this comment is not transcluded. A string if it's transcluded
+	 *   from a single page (currently the raw `href` of the template target from data-mw; see the
+	 *   TODO below about converting it to the page title in text form). `true` if it's transcluded,
+	 *   but we can't determine the source.
+	 */
+	public function getTranscludedFrom( $comment ) {
+		// If some template is used within the comment (e.g. {{ping|…}} or {{tl|…}}), that *does not* mean
+		// the comment is transcluded. We only want to consider comments to be transcluded if the wrapper
+		// element (usually a list item or paragraph) is marked as part of a transclusion.
+		// TODO: This seems to work fine but I'm having a hard time explaining why it is correct...
+		$node = $comment->range->endContainer;
+
+		// Find the node containing information about the transclusion:
+		// 1. Find the closest ancestor with an 'about' attribute
+		// 2. Find the main node of the about-group (first sibling with the same 'about' attribute)
+		// 3. If this is an mw:Transclusion node, return it; otherwise, go to step 1
+		while ( $node ) {
+			// 1.
+			if (
+				$node->nodeType === XML_ELEMENT_NODE &&
+				preg_match( '/^#mwt\d+$/', $node->getAttribute( 'about' ) )
+			) {
+				$about = $node->getAttribute( 'about' );
+
+				// 2.
+				while (
+					$node->previousSibling &&
+					$node->previousSibling->nodeType === XML_ELEMENT_NODE &&
+					$node->previousSibling->getAttribute( 'about' ) === $about
+				) {
+					$node = $node->previousSibling;
+				}
+
+				// 3.
+				// 'typeof' is a space-separated list of types; a missing attribute yields [ '' ]
+				$types = explode( ' ', $node->getAttribute( 'typeof' ) );
+				if ( in_array( 'mw:Transclusion', $types, true ) ) {
+					break;
+				}
+			}
+
+			$node = $node->parentNode;
+		}
+
+		if ( !$node ) {
+			// No mw:Transclusion node found, this comment is not transcluded
+			return false;
+		}
+
+		// json_decode() returns null for a missing or invalid attribute; the null-coalescing
+		// lookups below treat that the same as "source can't be determined"
+		$dataMw = json_decode( $node->getAttribute( 'data-mw' ), true );
+		$parts = $dataMw['parts'] ?? null;
+
+		// Only return a page name if this is a simple single-template transclusion.
+		if ( is_array( $parts ) && count( $parts ) === 1 ) {
+			// The single part may be a plain string, or a call whose target has no 'href'.
+			// ?? makes those cases fall through instead of raising errors.
+			$href = $parts[0]['template']['target']['href'] ?? null;
+			if ( is_string( $href ) && $href !== '' ) {
+				// TODO: Slice off the './' prefix and convert to text form (underscores to spaces, URL-decoded)
+				return $href;
+			}
+		}
+
+		// Multi-template transclusion, or a parser function call, or template-affected wikitext outside
+		// of a template call, or a mix of the above
+		return true;
+	}
+}
diff --git a/tests/phpunit/DiscussionToolsCommentParserTest.php b/tests/phpunit/DiscussionToolsCommentParserTest.php
new file mode 100644
index 000000000..4f307d58e
--- /dev/null
+++ b/tests/phpunit/DiscussionToolsCommentParserTest.php
@@ -0,0 +1,124 @@
+getTimestampRegexp( $format, '\\d', [ 'UTC' => 'UTC' ] );
+		self::assertSame( $expected, $result, $message );
+	}
+
+	public function provideTimestampRegexps() {
+		return self::getJson( './cases/timestamp-regex.json' );
+	}
+
+	/**
+	 * @dataProvider provideTimestampParser
+	 * @covers ::getTimestampParser
+	 */
+	public function testGetTimestampParser( $format, $data, $expected, $message ) {
+		$parser = TestingAccessWrapper::newFromObject(
+			DiscussionToolsCommentParser::newFromGlobalState()
+		);
+
+		$expected = new DateTimeImmutable( $expected );
+
+		$tsParser = $parser->getTimestampParser( $format, null, 'UTC', [ 'UTC' => 'UTC' ] );
+		self::assertEquals( $expected, $tsParser( $data ), $message );
+	}
+
+	public function provideTimestampParser() {
+		return self::getJson( './cases/timestamp-parser.json' );
+	}
+
+	/**
+	 * @dataProvider provideTimestampParserDST
+	 * @covers ::getTimestampParser
+	 */
+	public function testGetTimestampParserDST(
+		$sample, $expected, $expectedUtc, $format, $timezone, $timezoneAbbrs, $message
+	) {
+		$parser = TestingAccessWrapper::newFromObject(
+			DiscussionToolsCommentParser::newFromGlobalState()
+		);
+
+		$regexp = $parser->getTimestampRegexp( $format, '\\d', $timezoneAbbrs );
+		$tsParser = $parser->getTimestampParser( $format, null, $timezone, $timezoneAbbrs );
+
+		$expected = new DateTimeImmutable( $expected );
+		$expectedUtc = new DateTimeImmutable( $expectedUtc );
+
+		preg_match( $regexp, $sample, $match, PREG_OFFSET_CAPTURE );
+		$date = $tsParser( $match );
+
+		self::assertEquals( $expected, $date, $message );
+		self::assertEquals( $expectedUtc, $date, $message );
+	}
+
+	public function provideTimestampParserDST() {
+		return self::getJson( './cases/timestamp-parser-dst.json' );
+	}
+
+	/**
+	 * @dataProvider provideAuthors
+	 * @covers ::getAuthors
+	 */
+	public function testGetAuthors( $thread, $expected ) {
+		$parser = DiscussionToolsCommentParser::newFromGlobalState();
+
+		self::assertEquals( $expected, $parser->getAuthors( $thread ) );
+	}
+
+	public function provideAuthors() {
+		return [
+			[
+				'thread' => (object)[
+					'replies' => [
+						(object)[
+							'author' => 'Eve',
+							'replies' => []
+						],
+						(object)[
+							'author' => 'Bob',
+							'replies' => [
+								(object)[
+									'author' => 'Alice',
+									'replies' => []
+								]
+							]
+						]
+
+					]
+				],
+				'expected' => [ 'Alice', 'Bob', 'Eve' ]
+			]
+		];
+	}
+}