From 896a9a32740554e688c377f0745cceec0a01b023 Mon Sep 17 00:00:00 2001 From: Thomas Gries Date: Mon, 27 Feb 2012 01:10:57 +0000 Subject: [PATCH] fix for ultra bug 30028. The RSS extension can parse RSS and ATOM feeds of different flavours. The PHP XML DOM XPath query now uses a namespace-safe method to find all elements like item (RSS, RDF) or entry (ATOM). Also fixed a hidden problem when the feed URL was redirecting: this threw the "Cannot parse RSS for XML" error, which is now history. Introduced a new parameter $wgRSSUrlNumberOfAllowedRedirects which defaults to zero, i.e. no redirects are allowed by default. See the manual page --- RELEASE-NOTES | 11 ++++++++-- RSS.i18n.php | 1 + RSS.php | 13 ++++++++---- RSSData.php | 29 ++++---------------------- RSSHooks.php | 13 ++++++++---- RSSParser.php | 57 +++++++++++++++++++++++++++++++++++++++++++-------- 6 files changed, 80 insertions(+), 44 deletions(-) diff --git a/RELEASE-NOTES b/RELEASE-NOTES index 0d277d9..07d54eb 100644 --- a/RELEASE-NOTES +++ b/RELEASE-NOTES @@ -10,13 +10,20 @@ http://www.mediawiki.org/wiki/Extension:RSS coming in. Then you could abort cleanly once it's gotten too much (otherwise using the defaults - PHP will abort the entire program when your memory usage gets too high) -* bug 30028 "Error parsing XML for RSS" - improve and harden Extension:RSS when - parsing differently flavoured RSS feeds and ATOM feeds + +=== Version 2.10 2012-02-27 === +* final solution of bug 30028 "Error parsing XML for RSS" - improve and harden + Extension:RSS when parsing differently flavoured RSS feeds and ATOM feeds +* new parameter $wgRSSUrlNumberOfAllowedRedirects (default = 0) + Some feed urls redirect. The new RSS version can deal with redirects, + but it must be expressly enabled. For example, you can set + $wgRSSUrlNumberOfAllowedRedirects = 1; === Version 2.01 2012-02-24 === * "summary" element of ATOM feed items are shown which is handled like "description" element of RSS * handling of basic HTML layout tags


in item description + === Version 2.00 2012-02-24 === * first version which can parse RSS and at least some ATOM feeds partial solution of bug 30028 "Error parsing XML for RSS" - improve and harden diff --git a/RSS.i18n.php b/RSS.i18n.php index bef4363..042fa98 100644 --- a/RSS.i18n.php +++ b/RSS.i18n.php @@ -22,6 +22,7 @@ $messages['en'] = array( 'rss-ns-permission' => 'RSS is not allowed in this namespace', 'rss-url-is-not-whitelisted' => '"$1" is not in the whitelist of allowed feeds. {{PLURAL:$3|$2 is the only allowed feed|The allowed feeds are as follows: $2}}.', 'rss-empty-whitelist' => '"$1" is not in the whitelist of allowed feeds. There are no allowed feed URLs in the whitelist.', + 'rss-deprecated-wgrssallowedfeeds-found' => 'The deprecated variable $wgRSSAllowedFeeds has been detected. Since RSS version 2.0 this variable has to be replaced by $wgRSSUrlWhitelist as described in the manual page Extension:RSS.', 'rss-item' => '{{$1 | title = {{{title}}} | link = {{{link}}} | date = {{{date}}} | author = {{{author}}} | description = {{{description}}} }}', 'rss-feed' => "; '''[{{{link}}} {{{title}}}]''' : {{{description}}} diff --git a/RSS.php b/RSS.php index 3894f0a..b3f32d7 100644 --- a/RSS.php +++ b/RSS.php @@ -4,7 +4,7 @@ * * @file * @ingroup Extensions - * @version 2.01 + * @version 2.10 * @author mutante, Daniel Kinzler, Rdb, Mafs, Thomas Gries, Alxndr, Chris Reigrut, K001 * @author Kellan Elliott-McCrea -- author of MagpieRSS * @author Jeroen De Dauw @@ -14,7 +14,7 @@ * @link http://www.mediawiki.org/wiki/Extension:RSS Documentation */ -define( "EXTENSION_RSS_VERSION", "2.01 20120224" ); +define( "EXTENSION_RSS_VERSION", "2.10 20120227" ); if ( !defined( 'MEDIAWIKI' ) ) { die( "This is not a valid entry point.\n" ); @@ -52,7 +52,7 @@ $wgRSSCacheAge = 3600; // for debugging set $wgRSSCacheCompare = 1; $wgRSSCacheCompare = false; -// 5 second timeout +// 15 second timeout $wgRSSFetchTimeout = 15; // Ignore the RSS tag in all but the namespaces listed 
here. @@ -77,6 +77,11 @@ $wgRSSUrlWhitelist = array(); // include "*" if you expressly want to allow all urls (you should not do this) // $wgRSSUrlWhitelist = array( "*" ); +// Maximum number of redirects to follow (defaults to 0) +// Note: this should only be used when the target URLs are trusted, +// to avoid attacks on intranet services accessible by HTTP. +$wgRSSUrlNumberOfAllowedRedirects = 0; + // Agent to use for fetching feeds $wgRSSUserAgent = "MediaWikiRSS/" . strtok( EXTENSION_RSS_VERSION, " " ) . " (+http://www.mediawiki.org/wiki/Extension:RSS) / MediaWiki RSS extension"; @@ -89,4 +94,4 @@ $wgRSSDateDefaultFormat = "(Y-m-d H:i:s)"; // limit the number of characters in the item description // or set to false for unlimited length. // $wgRSSItemMaxLength = false; -// $wgRSSItemMaxLength = 100; +$wgRSSItemMaxLength = 200; diff --git a/RSSData.php b/RSSData.php index 87d275b..6e51a0c 100644 --- a/RSSData.php +++ b/RSSData.php @@ -16,24 +16,11 @@ class RSSData { } $xpath = new DOMXPath( $xml ); - // register namespace as below, and apply a regex to the expression - // http://de3.php.net/manual/en/domxpath.query.php#103461 - $namespaceURI = $xml->lookupnamespaceURI( NULL ); + // namespace-safe method to find all elements + $items = $xpath->query( "//*[local-name() = 'item']" ); - if ( ( null !== $namespaceURI ) ) { - $defaultNS = "defaultNS"; - $xpath->registerNamespace( $defaultNS, $namespaceURI ); - $defaultNS = "defaultNS:"; - } else { - $defaultNS = ""; - } - - // is it an RSS feed ? - $items = $xpath->query( $this->namespacePrefixedQuery( "/rss/channel/item", $defaultNS ) ); - - if ( $items->length === 0 ) { - // or is it an ATOM feed ? 
- $items = $xpath->query( $this->namespacePrefixedQuery( "/feed/entry", $defaultNS ) ); + if ( $items->length == 0 ) { + $items = $xpath->query( "//*[local-name() = 'entry']" ); } if( $items->length !== 0 ) { @@ -61,14 +48,6 @@ class RSSData { } } - protected function namespacePrefixedQuery( $query, $namespace = "" ) { - if ( $namespace !== "" ) { - $ret = preg_replace( '#(::|/\s*|\A)(?![/@].+?|[a-z\-]+::)#', '$1' . $namespace . '$2', $query ); - } else { - $ret = $query; - } - return $ret; - } /** * Return a string that will be used to map RSS elements that * contain similar data (e.g. dc:date, date, and pubDate) to the diff --git a/RSSHooks.php b/RSSHooks.php index 966d6d4..00fcb48 100644 --- a/RSSHooks.php +++ b/RSSHooks.php @@ -21,19 +21,24 @@ class RSSHooks { * @param $frame PPFrame parser context */ static function renderRss( $input, $args, $parser, $frame ) { - global $wgRSSCacheAge, $wgRSSCacheCompare, $wgRSSNamespaces, $wgRSSUrlWhitelist; + global $wgRSSCacheAge, $wgRSSCacheCompare, $wgRSSNamespaces, + $wgRSSUrlWhitelist,$wgRSSAllowedFeeds; if ( is_array( $wgRSSNamespaces ) && count( $wgRSSNamespaces ) ) { $ns = $parser->getTitle()->getNamespace(); $checkNS = array_flip( $wgRSSNamespaces ); if( !isset( $checkNS[$ns] ) ) { - return wfMsg( 'rss-ns-permission' ); + return RSSUtils::RSSError( 'rss-ns-permission' ); } } switch ( true ) { + case ( isset( $wgRSSAllowedFeeds ) ): + return RSSUtils::RSSError( 'rss-deprecated-wgrssallowedfeeds-found' ); + break; + # disallow because there is no whitelist or empty whitelist case ( !isset( $wgRSSUrlWhitelist ) || !is_array( $wgRSSUrlWhitelist ) @@ -59,7 +64,7 @@ class RSSHooks { } if ( !Http::isValidURI( $input ) ) { - return wfMsg( 'rss-invalid-url', htmlspecialchars( $input ) ); + return RSSutils::RSSError( 'rss-invalid-url', htmlspecialchars( $input ) ); } if ( $wgRSSCacheCompare ) { $timeout = $wgRSSCacheCompare; @@ -79,7 +84,7 @@ class RSSHooks { } if ( !is_object( $rss->rss ) || !is_array( $rss->rss->items ) ) { 
- return wfMsg( 'rss-empty', htmlspecialchars( $input ) ); + return RSSUtils::RSSError( 'rss-empty', htmlspecialchars( $input ) ); } return $rss->renderFeed( $parser, $frame ); diff --git a/RSSParser.php b/RSSParser.php index 3a8fe28..d7b312b 100644 --- a/RSSParser.php +++ b/RSSParser.php @@ -218,7 +218,8 @@ class RSSParser { * @return Status object */ protected function fetchRemote( $key, array $headers = array()) { - global $wgRSSFetchTimeout, $wgRSSUserAgent, $wgRSSProxy; + global $wgRSSFetchTimeout, $wgRSSUserAgent, $wgRSSProxy, + $wgRSSUrlNumberOfAllowedRedirects; if ( $this->etag ) { wfDebugLog( 'RSS', 'Used etag: ' . $this->etag ); @@ -244,16 +245,54 @@ class RSSParser { */ $url = $this->url; - $noProxy = false; + $noProxy = !isset( $wgRSSProxy ); // Example for disabling proxy use for certain urls // $noProxy = preg_match( '!\.internal\.example\.com$!i', parse_url( $url, PHP_URL_HOST ) ); - + + /** + * Copied from HttpFunctions.php + * Perform an HTTP request + * + * @param $method String: HTTP method. Usually GET/POST + * @param $url String: full URL to act on. If protocol-relative, will be expanded to an http:// URL + * @param $options Array: options to pass to MWHttpRequest object. + * Possible keys for the array: + * - timeout Timeout length in seconds + * - postData An array of key-value pairs or a url-encoded form data + * - proxy The proxy to use. + * Otherwise it will use $wgHTTPProxy (if set) + * Otherwise it will use the environment variable "http_proxy" (if set) + * - noProxy Don't use any proxy at all. Takes precedence over proxy value(s). + * - sslVerifyHost (curl only) Verify hostname against certificate + * - sslVerifyCert (curl only) Verify SSL certificate + * - caInfo (curl only) Provide CA information + * - maxRedirects Maximum number of redirects to follow (defaults to 5) + * - followRedirects Whether to follow redirects (defaults to false). 
+ * Note: this should only be used when the target URL is trusted, + * to avoid attacks on intranet services accessible by HTTP. + * - userAgent A user agent, if you want to override the default + * MediaWiki/$wgVersion + * @return Mixed: (bool)false on failure or a string on success + */ + + if ( isset( $wgRSSUrlNumberOfAllowedRedirects ) + && is_numeric( $wgRSSUrlNumberOfAllowedRedirects ) ) { + $maxRedirects = $wgRSSUrlNumberOfAllowedRedirects; + } else { + $maxRedirects = 0; + } + + // we set followRedirects intentionally to true to see error messages + // in cases where the maximum number of redirects is reached $client = HttpRequest::factory( $url, array( - 'timeout' => $wgRSSFetchTimeout, - 'proxy' => $wgRSSProxy, - 'noProxy' => $noProxy, + 'timeout' => $wgRSSFetchTimeout, + 'followRedirects' => true, + 'maxRedirects' => $maxRedirects, + 'proxy' => $wgRSSProxy, + 'noProxy' => $noProxy, + 'userAgent' => $wgRSSUserAgent, ) ); @@ -506,8 +545,8 @@ class RSSParser { * * @param $text String: the text to examine * @param $filterType String: "filterOut" to check for matches in the - * filterOut member list. - * Otherwise, uses the filter member list. + * filterOut member list. + * Otherwise, uses the filter member list. * @return Boolean: decision to filter or not. */ protected function filter( $text, $filterType ) { @@ -591,7 +630,7 @@ class RSSUtils { * @param String|Array $param Error parameter (or parameters) * @return String Html that is the error. */ - public static function RSSError( $errorMessageName, $param ) { + public static function RSSError( $errorMessageName, $param = false ) { // Anything from a parser tag should use Content lang for message, // since the cache doesn't vary by user language: do not use wfMsgForContent but wfMsgForContent