mediawiki-extensions-SpamBl.../SpamBlacklist_body.php

<?php

// Not a valid standalone entry point: stop immediately unless the
// MEDIAWIKI constant has been defined by whatever loaded this file.
if ( defined( 'MEDIAWIKI' ) === false ) {
	exit;
}

/**
 * Edit filter that rejects edits adding external links which match a set of
 * blacklist regular expressions.
 *
 * Blacklists come from the local 'spam-blacklist' message plus the shared
 * sources listed in $files; links matching the 'spam-whitelist' message are
 * removed from consideration before the blacklist is applied.
 */
class SpamBlacklist {
	/** Merged blacklist regexes; false until getBlacklists() has loaded them. */
	var $regexes = false;

	/**
	 * Shared blacklist sources. Each entry is one of:
	 *  - "DB: <dbname> <page title>" for a page in a local database,
	 *  - an http:// or https:// URL,
	 *  - anything else, which is handed to file_get_contents().
	 */
	var $files = array( "http://meta.wikimedia.org/w/index.php?title=Spam_blacklist&action=raw&sb_ver=1" );

	/** Expiry passed when storing the per-source "warning" key (presumably seconds -- confirm against the cache API). */
	var $warningTime = 600;

	/** Expiry passed when storing fetched HTTP text and the built shared regexes (presumably seconds -- confirm). */
	var $expiryTime = 900;

	/** Upper bound N for mt_rand( 0, N ); an early refresh is attempted when the draw is 0. */
	var $warningChance = 100;

	/** When true, the edit summary is not checked against the blacklist. */
	var $ignoreEditSummary = false;

	/** Title passed to the most recent filter() call. */
	var $title;

	/** Text passed to the most recent filter() call (before any normalisation). */
	var $text;

	/** Section passed to the most recent filter() call. */
	var $section;

	/**
	 * @param array $settings Map of property name => value; every entry is
	 *                        assigned onto this object, overriding the defaults above.
	 */
	function __construct( $settings = array() ) {
		foreach ( $settings as $name => $value ) {
			$this->$name = $value;
		}
	}

	/**
	 * Check if the given local page title is a spam regex source.
	 *
	 * A title counts as a source when it is MediaWiki:Spam-blacklist or
	 * MediaWiki:Spam-whitelist, or when one of the entries in $files refers
	 * to it (as a "DB:" entry for the current database, or as this page's
	 * action=raw URL, optionally followed by further query parameters).
	 *
	 * @param Title $title
	 * @return bool
	 */
	function isLocalSource( $title ) {
		global $wgDBname;

		if( $title->getNamespace() == NS_MEDIAWIKI ) {
			$sources = array(
				"Spam-blacklist",
				"Spam-whitelist" );
			if( in_array( $title->getDBkey(), $sources ) ) {
				return true;
			}
		}

		$thisHttp = wfExpandUrl( $title->getFullUrl( 'action=raw' ), PROTO_HTTP );
		$thisHttpRegex = '/^' . preg_quote( $thisHttp, '/' ) . '(?:&.*)?$/';

		foreach( $this->files as $fileName ) {
			$matches = array();
			// The database-name pattern must accept the same characters as the
			// one in buildSharedBlacklists(), otherwise a source that is loaded
			// from a hyphenated database would never be recognised here.
			if ( preg_match( '/^DB: ([\w-]*) (.*)$/', $fileName, $matches ) ) {
				if ( $wgDBname == $matches[1] ) {
					if( $matches[2] == $title->getPrefixedDbKey() ) {
						// Local DB fetch of this page...
						return true;
					}
				}
			} elseif( preg_match( $thisHttpRegex, $fileName ) ) {
				// Raw view of this page
				return true;
			}
		}

		return false;
	}

	/**
	 * Fetch local and (possibly cached) remote blacklists.
	 * The merged result is kept in $this->regexes, so later calls on the
	 * same object return it without reloading.
	 * @return array set of regular expressions, potentially empty.
	 */
	function getBlacklists() {
		if( $this->regexes === false ) {
			$this->regexes = array_merge(
				$this->getLocalBlacklists(),
				$this->getSharedBlacklists() );
		}
		return $this->regexes;
	}

	/**
	 * Fetch (possibly cached) remote blacklists.
	 *
	 * Returns the array stored under the per-database cache key when there
	 * is one; otherwise builds the regexes from $files and stores them with
	 * an expiry of $expiryTime.
	 *
	 * @return array
	 */
	function getSharedBlacklists() {
		global $wgMemc, $wgDBname;
		$fname = 'SpamBlacklist::getRegex';
		wfProfileIn( $fname );

		wfDebugLog( 'SpamBlacklist', "Loading spam regex..." );

		if ( count( $this->files ) == 0 ){
			# No lists
			wfDebugLog( 'SpamBlacklist', "no files specified\n" );
			wfProfileOut( $fname );
			return array();
		}

		// This used to be cached per-site, but that could be bad on a shared
		// server where not all wikis have the same configuration.
		$cacheKey = "$wgDBname:spam_blacklist_regexes";
		$cachedRegexes = $wgMemc->get( $cacheKey );
		if( is_array( $cachedRegexes ) ) {
			wfDebugLog( 'SpamBlacklist', "Got shared spam regexes from cache\n" );
			wfProfileOut( $fname );
			return $cachedRegexes;
		}

		$regexes = $this->buildSharedBlacklists();
		$wgMemc->set( $cacheKey, $regexes, $this->expiryTime );

		// Every return path has to close the profiling section opened above.
		wfProfileOut( $fname );
		return $regexes;
	}

	/**
	 * Drop the cached shared regexes for the current database so the next
	 * getSharedBlacklists() call rebuilds them.
	 */
	function clearCache() {
		global $wgMemc, $wgDBname;
		$wgMemc->delete( "$wgDBname:spam_blacklist_regexes" );
		wfDebugLog( 'SpamBlacklist', "Spam blacklist local cache cleared.\n" );
	}

	/**
	 * Load every source in $files and turn each one into regexes.
	 *
	 * A source that cannot be read contributes no regexes; the remaining
	 * sources are still processed.
	 *
	 * @return array of regular expressions, potentially empty
	 */
	function buildSharedBlacklists() {
		$regexes = array();
		# Load lists
		wfDebugLog( 'SpamBlacklist', "Constructing spam blacklist\n" );
		foreach ( $this->files as $fileName ) {
			$matches = array();
			if ( preg_match( '/^DB: ([\w-]*) (.*)$/', $fileName, $matches ) ) {
				$text = $this->getArticleText( $matches[1], $matches[2] );
			} elseif ( preg_match( '/^https?:\/\//', $fileName ) ) {
				// Both URL schemes go through getHttpText() so that the
				// fetched text is cached rather than re-read on every rebuild.
				$text = $this->getHttpText( $fileName );
			} else {
				$text = file_get_contents( $fileName );
				if ( $text === false ) {
					wfDebugLog( 'SpamBlacklist', "Error reading blacklist file $fileName\n" );
				} else {
					wfDebugLog( 'SpamBlacklist', "got from file $fileName\n" );
				}
			}

			if ( !is_string( $text ) ) {
				// A failed load (false) is treated as an empty source.
				$text = '';
			}

			// Build a separate batch of regexes from each source.
			// While in theory we could squeeze a little efficiency
			// out of combining multiple sources in one regex, if
			// there's a bad line in one of them we'll gain more
			// from only having to break that set into smaller pieces.
			$regexes = array_merge( $regexes,
				SpamRegexBatch::regexesFromText( $text, $fileName ) );
		}

		return $regexes;
	}

	/**
	 * Fetch the text of an HTTP blacklist source, using $messageMemc as a cache.
	 *
	 * Two keys are used: the text itself and a shorter-lived "warning" key.
	 * Once the warning key is gone, a request refreshes the text only when
	 * mt_rand( 0, $warningChance ) yields 0, which keeps the number of
	 * simultaneous remote fetches low. If a refresh fails while a cached
	 * copy is still available, the cached copy is kept and returned instead
	 * of being overwritten with the failure.
	 *
	 * @param string $fileName URL to fetch
	 * @return string|bool Text of the source, or false if it could not be
	 *                     fetched and no cached copy exists
	 */
	function getHttpText( $fileName ) {
		global $wgDBname, $messageMemc;

		$key = "spam_blacklist_file:$fileName";
		$warningKey = "$wgDBname:spamfilewarning:$fileName";
		$httpText = $messageMemc->get( $key );
		$warning = $messageMemc->get( $warningKey );

		$haveCached = is_string( $httpText );
		if ( $haveCached && ( $warning || mt_rand( 0, $this->warningChance ) ) ) {
			wfDebugLog( 'SpamBlacklist', "Got spam blacklist from HTTP cache for $fileName\n" );
			return $httpText;
		}

		wfDebugLog( 'SpamBlacklist', "Loading spam blacklist from $fileName\n" );
		$fetched = Http::get( $fileName );

		// Reset the warning key whether or not the fetch worked, so a failing
		// remote is not retried by every request in the meantime.
		$messageMemc->set( $warningKey, 1, $this->warningTime );

		if( $fetched === false ) {
			wfDebugLog( 'SpamBlacklist', "Error loading blacklist from $fileName\n" );
			// Keep serving the previous copy if there is one.
			return $haveCached ? $httpText : false;
		}

		$messageMemc->set( $key, $fetched, $this->expiryTime );
		return $fetched;
	}

	/**
	 * @return array regexes built from the 'spam-blacklist' message
	 */
	static function getLocalBlacklists() {
		return SpamRegexBatch::regexesFromMessage( 'spam-blacklist' );
	}

	/**
	 * @return array regexes built from the 'spam-whitelist' message
	 */
	static function getWhitelists() {
		return SpamRegexBatch::regexesFromMessage( 'spam-whitelist' );
	}

	/**
	 * Check an edit for newly added blacklisted links.
	 *
	 * Only links that are in the new text but not already recorded for the
	 * page are checked; the edit summary is checked as well unless
	 * $ignoreEditSummary is set.
	 *
	 * @param Title $title
	 * @param string $text Text of section, or entire text if $editPage!=false
	 * @param string $section Section number or name
	 * @param string $editsummary Edit summary if one exists, some people use urls there too
	 * @param EditPage $editPage EditPage if EditFilterMerged was called, null otherwise
	 * @return string|bool Matched text if the edit should not be allowed, false otherwise
	 */
	function filter( &$title, $text, $section, $editsummary = '', EditPage &$editPage = null ) {
		global $wgParser, $wgUser;

		$fname = 'wfSpamBlacklistFilter';
		wfProfileIn( $fname );

		$this->title = $title;
		$this->text = $text;
		$this->section = $section;
		$text = str_replace( '．', '.', $text ); //@bug 12896

		$blacklists = $this->getBlacklists();
		$whitelists = $this->getWhitelists();

		$retVal = false;
		if ( count( $blacklists ) ) {
			# Run parser to strip SGML comments and such out of the markup
			# This was being used to circumvent the filter (see bug 5185)
			if ( $editPage ) {
				$editInfo = $editPage->mArticle->prepareTextForEdit( $text );
				$out = $editInfo->output;
			} else {
				$options = new ParserOptions();
				$text = $wgParser->preSaveTransform( $text, $title, $wgUser, $options );
				$out = $wgParser->parse( $text, $title, $options );
			}
			$newLinks = array_keys( $out->getExternalLinks() );
			$oldLinks = $this->getCurrentLinks( $title );
			$addedLinks = array_diff( $newLinks, $oldLinks );

			// We add the edit summary if one exists
			if ( !$this->ignoreEditSummary && !empty( $editsummary ) ) {
				$addedLinks[] = $editsummary;
			}

			wfDebugLog( 'SpamBlacklist', "Old URLs: " . implode( ', ', $oldLinks ) );
			wfDebugLog( 'SpamBlacklist', "New URLs: " . implode( ', ', $newLinks ) );
			wfDebugLog( 'SpamBlacklist', "Added URLs: " . implode( ', ', $addedLinks ) );

			// One link per line, so that the regexes (which use the 'm'
			// modifier) can treat each link separately.
			$links = implode( "\n", $addedLinks );

			# Strip whitelisted URLs from the match
			if( is_array( $whitelists ) ) {
				wfDebugLog( 'SpamBlacklist', "Excluding whitelisted URLs from " . count( $whitelists ) .
					" regexes: " . implode( ', ', $whitelists ) . "\n" );
				foreach( $whitelists as $regex ) {
					wfSuppressWarnings();
					$stripped = preg_replace( $regex, '', $links );
					wfRestoreWarnings();
					if( is_string( $stripped ) ) {
						// If there wasn't a regex error, strip the matching URLs
						$links = $stripped;
					}
				}
			}

			# Do the match
			wfDebugLog( 'SpamBlacklist', "Checking text against " . count( $blacklists ) .
				" regexes: " . implode( ', ', $blacklists ) . "\n" );
			foreach( $blacklists as $regex ) {
				wfSuppressWarnings();
				$matches = array();
				$check = preg_match( $regex, $links, $matches );
				wfRestoreWarnings();
				if( $check ) {
					wfDebugLog( 'SpamBlacklist', "Match!\n" );
					$ip = wfGetIP();
					wfDebugLog( 'SpamBlacklistHit', "$ip caught submitting spam: {$matches[0]}\n" );
					// The first hit is enough to reject the edit.
					$retVal = $matches[0];
					break;
				}
			}
		}

		wfProfileOut( $fname );
		return $retVal;
	}

	/**
	 * Look up the links currently in the article, so we can
	 * ignore them on a second run.
	 *
	 * WARNING: I can add more *of the same link* with no problem here.
	 *
	 * @param Title $title
	 * @return array of el_to values from the externallinks table for this page
	 */
	function getCurrentLinks( $title ) {
		$dbr = wfGetDB( DB_SLAVE );
		$id = $title->getArticleId(); // should be zero queries
		$res = $dbr->select( 'externallinks', array( 'el_to' ),
			array( 'el_from' => $id ), __METHOD__ );
		$links = array();
		foreach ( $res as $row ) {
			$links[] = $row->el_to;
		}
		return $links;
	}

	/**
	 * Fetch an article from this or another local MediaWiki database.
	 * This is probably *very* fragile, and shouldn't be used perhaps.
	 *
	 * The connection is switched to $db for the duration of the lookup and
	 * switched back to $wgDBname afterwards.
	 *
	 * @param string $db
	 * @param string $article
	 * @return string Article text, or an empty string if the title is
	 *                invalid or no text was found
	 */
	function getArticleText( $db, $article ) {
		global $wgDBname;
		wfDebugLog( 'SpamBlacklist', "Fetching local spam blacklist from '$article' on '$db'...\n" );

		$title = Title::newFromText( $article );
		if ( !$title ) {
			// Nothing can be looked up for an unparseable title; bail out
			// before touching the database selection at all.
			wfDebugLog( 'SpamBlacklist', "Invalid title '$article' for spam blacklist on '$db'\n" );
			return '';
		}

		$dbr = wfGetDB( DB_SLAVE );
		$dbr->selectDB( $db );
		$text = false;
		if ( $dbr->tableExists( 'page' ) ) {
			// 1.5 schema
			$revision = Revision::newFromTitle( $title );
			if ( $revision ) {
				$text = $revision->getText();
			}
		} else {
			// 1.4 schema
			$text = $dbr->selectField( 'cur', 'cur_text', array( 'cur_namespace' => $title->getNamespace(),
				'cur_title' => $title->getDBkey() ), 'SpamBlacklist::getArticleText' );
		}
		$dbr->selectDB( $wgDBname );
		return strval( $text );
	}

	/**
	 * Confirm that a local blacklist page being saved is valid,
	 * and toss back a warning to the user if it isn't.
	 * This is an EditFilter hook.
	 *
	 * @param EditPage $editPage
	 * @param string $text Text being saved
	 * @param string $section Unused here
	 * @param string $hookError Set to an HTML error box when bad lines are found
	 * @return bool Always true
	 */
	function validate( $editPage, $text, $section, &$hookError ) {
		$thisPageName = $editPage->mTitle->getPrefixedDBkey();

		if( !$this->isLocalSource( $editPage->mTitle ) ) {
			wfDebugLog( 'SpamBlacklist', "Spam blacklist validator: [[$thisPageName]] not a local blacklist\n" );
			return true;
		}

		$badLines = SpamRegexBatch::getBadLines( explode( "\n", $text ) );
		if( !$badLines ) {
			wfDebugLog( 'SpamBlacklist', "Spam blacklist validator: [[$thisPageName]] ok or empty blacklist\n" );
			return true;
		}

		wfDebugLog( 'SpamBlacklist', "Spam blacklist validator: [[$thisPageName]] given invalid input lines: " .
			implode( ', ', $badLines ) . "\n" );

		// One wikitext list item per offending line, escaped so that the
		// line is shown literally.
		$badList = "*<tt>" .
			implode( "</tt>\n*<tt>",
				array_map( 'wfEscapeWikiText', $badLines ) ) .
			"</tt>\n";
		$hookError =
			"<div class='errorbox'>" .
			wfMsgExt( 'spam-invalid-lines', array( 'parsemag' ), count( $badLines ) ) . "<br />" .
			$badList .
			"</div>\n" .
			"<br clear='all' />\n";
		return true;
	}

	/**
	 * Save hook: when the saved page is one of the blacklist sources, clear
	 * the cached shared regexes so they are rebuilt on next use.
	 *
	 * @return bool Always true
	 */
	function onArticleSave( &$article, &$user, $text, $summary, $isminor, $iswatch, $section ) {
		if( $this->isLocalSource( $article->getTitle() ) ) {
			$this->clearCache();
		}
		return true;
	}
}


/**
 * Static helpers that turn lists of regex fragments (one per line) into
 * complete, batched URL-matching regular expressions, and that check such
 * lists for lines which do not compile.
 */
class SpamRegexBatch {
	/**
	 * Build a set of regular expressions matching URLs with the list of regex fragments.
	 * Returns an empty list if the input list is empty.
	 *
	 * Fragments are joined with '|' inside a single group until adding the
	 * next one would push the accumulated length past $batchSize, at which
	 * point a new regex is started. Lines ending in a backslash are skipped.
	 *
	 * @param array $lines list of fragments which will match in URLs
	 * @param int $batchSize largest allowed batch regex;
	 *                       if 0, will produce one regex per line
	 * @return array
	 * @private
	 * @static
	 */
	static function buildRegexes( $lines, $batchSize=4096 ) {
		# Make regex
		# It's faster using the S modifier even though it will usually only be run once
		$regexes = array();
		$regexStart = '/https?:\/\/+[a-z0-9_\-.]*(';
		$regexEnd = ($batchSize > 0 ) ? ')/Sim' : ')/im';
		$build = false;
		foreach( $lines as $line ) {
			if( substr( $line, -1, 1 ) == "\\" ) {
				// Final \ will break silently on the batched regexes.
				// Skip it here to avoid breaking the next line;
				// warnings from getBadLines() will still trigger on
				// edit to keep new ones from floating in.
				continue;
			}
			// FIXME: not very robust size check, but should work. :)
			if( $build === false ) {
				$build = $line;
			} elseif( strlen( $build ) + strlen( $line ) > $batchSize ) {
				$regexes[] = SpamRegexBatch::wrapBatch( $build, $regexStart, $regexEnd );
				$build = $line;
			} else {
				$build .= '|' . $line;
			}
		}
		if( $build !== false ) {
			$regexes[] = SpamRegexBatch::wrapBatch( $build, $regexStart, $regexEnd );
		}
		return $regexes;
	}

	/**
	 * Turn one batch of '|'-joined fragments into a complete regex.
	 *
	 * Any run of backslashes directly in front of a '/' is removed first, and
	 * then every '/' is escaped exactly once, so that a fragment can never
	 * terminate the '/'-delimited pattern early.
	 *
	 * @param string $build '|'-joined fragments
	 * @param string $regexStart pattern text placed before the fragments
	 * @param string $regexEnd pattern text and modifiers placed after them
	 * @return string
	 */
	private static function wrapBatch( $build, $regexStart, $regexEnd ) {
		$normalized = preg_replace( '|\\\*/|u', '/', $build );
		if( !is_string( $normalized ) ) {
			// With the 'u' modifier preg_replace() gives null for a subject
			// that is not valid UTF-8. Passing that on would produce an
			// empty group, i.e. a regex that matches every URL, so redo
			// the replacement byte-wise instead.
			$normalized = preg_replace( '|\\\*/|', '/', $build );
		}
		if( !is_string( $normalized ) ) {
			// Still failing: use the fragments untouched. At worst the
			// result does not compile, which validateRegexes() reports and
			// which matches nothing, rather than matching everything.
			$normalized = $build;
		}
		return $regexStart . str_replace( '/', '\/', $normalized ) . $regexEnd;
	}

	/**
	 * Confirm that a set of regexes is either empty or valid.
	 * Each regex is run against an empty string with warnings suppressed;
	 * a false result from preg_match() marks it as invalid.
	 *
	 * @param array $regexes set of regexes
	 * @return bool true if ok, false if contains invalid lines
	 * @private
	 * @static
	 */
	static function validateRegexes( $regexes ) {
		foreach( $regexes as $regex ) {
			wfSuppressWarnings();
			$ok = preg_match( $regex, '' );
			wfRestoreWarnings();

			if( $ok === false ) {
				return false;
			}
		}
		return true;
	}

	/**
	 * Strip comments and whitespace, then remove blanks.
	 * A comment runs from '#' to the end of the line. Original array keys
	 * are kept, and array_filter() without a callback drops every falsy
	 * entry, not only empty strings.
	 *
	 * @param array $lines
	 * @return array
	 * @private
	 * @static
	 */
	static function stripLines( $lines ) {
		$uncommented = preg_replace( '/#.*$/', '', $lines );
		$trimmed = array_map( 'trim', $uncommented );
		return array_filter( $trimmed );
	}

	/**
	 * Do a sanity check on the batch regex.
	 * If the batched regexes do not all compile, the lines are rebuilt as
	 * one regex per line so that a single bad line only affects itself.
	 *
	 * @param array $lines unsanitized input lines
	 * @param string $fileName optional for debug reporting
	 * @return array of regexes
	 * @private
	 * @static
	 */
	static function buildSafeRegexes( $lines, $fileName=false ) {
		$lines = SpamRegexBatch::stripLines( $lines );
		$regexes = SpamRegexBatch::buildRegexes( $lines );
		if( SpamRegexBatch::validateRegexes( $regexes ) ) {
			return $regexes;
		}

		// _Something_ broke... rebuild line-by-line; it'll be
		// slower if there's a lot of blacklist lines, but one
		// broken line won't take out hundreds of its brothers.
		if( $fileName ) {
			wfDebugLog( 'SpamBlacklist', "Spam blacklist warning: bogus line in $fileName\n" );
		}
		return SpamRegexBatch::buildRegexes( $lines, 0 );
	}

	/**
	 * Find the input lines that cannot be used as blacklist entries.
	 *
	 * Lines ending in a backslash are always reported. The remaining lines
	 * are only tested individually when the batched regexes fail to compile.
	 *
	 * @param array $lines
	 * @return array of input lines which produce invalid input, or empty array if no problems
	 * @static
	 */
	static function getBadLines( $lines ) {
		$lines = SpamRegexBatch::stripLines( $lines );

		$badLines = array();
		foreach( $lines as $line ) {
			if( substr( $line, -1, 1 ) == "\\" ) {
				// Final \ will break silently on the batched regexes.
				$badLines[] = $line;
			}
		}

		$regexes = SpamRegexBatch::buildRegexes( $lines );
		if( SpamRegexBatch::validateRegexes( $regexes ) ) {
			// No other problems!
			return $badLines;
		}

		// Something failed in the batch, so check them one by one.
		// Trailing-backslash lines yield no regex from buildRegexes() and so
		// are not reported a second time here.
		foreach( $lines as $line ) {
			$regexes = SpamRegexBatch::buildRegexes( array( $line ) );
			if( !SpamRegexBatch::validateRegexes( $regexes ) ) {
				$badLines[] = $line;
			}
		}
		return $badLines;
	}

	/**
	 * Build a set of regular expressions from the given multiline input text,
	 * with empty lines and comments stripped.
	 *
	 * @param string $source
	 * @param string $fileName optional, for reporting of bad files
	 * @return array of regular expressions, potentially empty
	 * @static
	 */
	static function regexesFromText( $source, $fileName=false ) {
		// strval() so that a failed load (false) is handled as empty text.
		$lines = explode( "\n", strval( $source ) );
		return SpamRegexBatch::buildSafeRegexes( $lines, $fileName );
	}

	/**
	 * Build a set of regular expressions from a MediaWiki message.
	 * Will be correctly empty if the message isn't present.
	 * @param string $message message key
	 * @return array of regular expressions, potentially empty
	 * @static
	 */
	static function regexesFromMessage( $message ) {
		$source = wfMsgForContent( $message );
		if( $source && !wfEmptyMsg( $message, $source ) ) {
			return SpamRegexBatch::regexesFromText( $source );
		}
		return array();
	}
}
-												from phase3/extensions

											
										
										
											2004-12-11 09:59:06 +00:00
+								<?php
-												* Optimised startup
* Use the new EditFilterMerged hook if available, for faster link finding
* Random bits of code were leaking out of the body file into the loader, poked them back in.

											
										
										
											2007-11-12 07:44:17 +00:00
+								if ( !defined( 'MEDIAWIKI' ) ) {
 									exit;
 								}
-												from phase3/extensions

											
										
										
											2004-12-11 09:59:06 +00:00
 								class SpamBlacklist {
-												Split giant regexes so PCRE stops screaming about them.
Haven't tested cleanup.php

											
										
										
											2006-09-18 09:56:57 +00:00
+									var $regexes = false;
-												Some polishing and refactoring on this monstrosity, it's been allowed to grow without some good snipping in a while. :)
* Handle bad regexes more gracefully:
 - The batched regexes are tested for validity, and if one is bad, the lines from that source are broken out line-by-line. This is slower, but the other lines in that source will still be applied correctly.
 - Suppress warnings and be more verbose in the debug log.
 - Check for bad regexes when a local blacklist page is edited, and prompt the user to fix the bad lines.
* Caching issues:
 - Cache the full regexes per-DB instead of per-site; this should be friendlier to shared environments where not every wiki has the same configuration.
 - Hopefully improve the recaching of local pages, which looked like it would preemptively apply the being-edited text to the cache during the filter callback, even though something else might stop the page from being saved. Now just clearing the cache after save is complete, letting it re-load later.
* Split out some of the regex batch functions for clarity.

There are probably still issues with caching of HTTP bits, and in general the local DB loading looks verrrry fragile.
Test this a bit more before syncing. :)

											
										
										
											2007-07-20 21:13:26 +00:00
+									var $files = array( "http://meta.wikimedia.org/w/index.php?title=Spam_blacklist&action=raw&sb_ver=1" );
-												Support for HTTP, including working default, to load text from meta once per hour. Special attention paid to reducing load on meta, of course.

											
										
										
											2005-06-25 15:49:21 +00:00
+									var $warningTime = 600;
 									var $expiryTime = 900;
 									var $warningChance = 100;
-												PLEASE TEST: Bug #26332 — Patch that I think should fix the problem
  according to the comments, but needs more testing

* Also, a one line w/s fix up

											
										
										
											2011-05-03 20:23:35 +00:00
+									var $ignoreEditSummary = false;
-												remove some ending whitespaces

											
										
										
											2007-01-06 20:56:46 +00:00
-												More php4-style constructors. I think thats most of them

											
										
										
											2010-08-30 17:11:45 +00:00
+									function __construct( $settings = array() ) {
-												More configuration settings, fixed URL

											
										
										
											2005-07-08 16:29:22 +00:00
+										foreach ( $settings as $name => $value ) {
 											$this->$name = $value;
 										}
-												from phase3/extensions

											
										
										
											2004-12-11 09:59:06 +00:00
+									}
-												More configuration settings, fixed URL

											
										
										
											2005-07-08 16:29:22 +00:00
-												Some polishing and refactoring on this monstrosity, it's been allowed to grow without some good snipping in a while. :)
* Handle bad regexes more gracefully:
 - The batched regexes are tested for validity, and if one is bad, the lines from that source are broken out line-by-line. This is slower, but the other lines in that source will still be applied correctly.
 - Suppress warnings and be more verbose in the debug log.
 - Check for bad regexes when a local blacklist page is edited, and prompt the user to fix the bad lines.
* Caching issues:
 - Cache the full regexes per-DB instead of per-site; this should be friendlier to shared environments where not every wiki has the same configuration.
 - Hopefully improve the recaching of local pages, which looked like it would preemptively apply the being-edited text to the cache during the filter callback, even though something else might stop the page from being saved. Now just clearing the cache after save is complete, letting it re-load later.
* Split out some of the regex batch functions for clarity.

There are probably still issues with caching of HTTP bits, and in general the local DB loading looks verrrry fragile.
Test this a bit more before syncing. :)

											
										
										
											2007-07-20 21:13:26 +00:00
+									/**
 									 * Check if the given local page title is a spam regex source.
 									 * @param Title $title
 									 * @return bool
 									 */
 									function isLocalSource( $title ) {
 										global $wgDBname;
-												(bug 16120) Prevent death on Spam Blacklist trigger using API. Patch by Brad Jorsch.

An API edit attempt with Spam Blacklist firing will now output something instead of crashing:

<?xml version="1.0"?><api><edit spamblacklist="http://blacklistme.example.com"
result="Failure" /></api>

											
										
										
											2008-11-02 22:40:02 +00:00
-												Some polishing and refactoring on this monstrosity, it's been allowed to grow without some good snipping in a while. :)
* Handle bad regexes more gracefully:
 - The batched regexes are tested for validity, and if one is bad, the lines from that source are broken out line-by-line. This is slower, but the other lines in that source will still be applied correctly.
 - Suppress warnings and be more verbose in the debug log.
 - Check for bad regexes when a local blacklist page is edited, and prompt the user to fix the bad lines.
* Caching issues:
 - Cache the full regexes per-DB instead of per-site; this should be friendlier to shared environments where not every wiki has the same configuration.
 - Hopefully improve the recaching of local pages, which looked like it would preemptively apply the being-edited text to the cache during the filter callback, even though something else might stop the page from being saved. Now just clearing the cache after save is complete, letting it re-load later.
* Split out some of the regex batch functions for clarity.

There are probably still issues with caching of HTTP bits, and in general the local DB loading looks verrrry fragile.
Test this a bit more before syncing. :)

											
										
										
											2007-07-20 21:13:26 +00:00
+										if( $title->getNamespace() == NS_MEDIAWIKI ) {
 											$sources = array(
 												"Spam-blacklist",
 												"Spam-whitelist" );
-												(bug 12608) Unifying the spelling of getDBkey() in the extension code.

											
										
										
											2008-01-14 10:09:08 +00:00
+											if( in_array( $title->getDBkey(), $sources ) ) {
-												Some polishing and refactoring on this monstrosity, it's been allowed to grow without some good snipping in a while. :)
* Handle bad regexes more gracefully:
 - The batched regexes are tested for validity, and if one is bad, the lines from that source are broken out line-by-line. This is slower, but the other lines in that source will still be applied correctly.
 - Suppress warnings and be more verbose in the debug log.
 - Check for bad regexes when a local blacklist page is edited, and prompt the user to fix the bad lines.
* Caching issues:
 - Cache the full regexes per-DB instead of per-site; this should be friendlier to shared environments where not every wiki has the same configuration.
 - Hopefully improve the recaching of local pages, which looked like it would preemptively apply the being-edited text to the cache during the filter callback, even though something else might stop the page from being saved. Now just clearing the cache after save is complete, letting it re-load later.
* Split out some of the regex batch functions for clarity.

There are probably still issues with caching of HTTP bits, and in general the local DB loading looks verrrry fragile.
Test this a bit more before syncing. :)

											
										
										
											2007-07-20 21:13:26 +00:00
+												return true;
 											}
 										}
-												(bug 16120) Prevent death on Spam Blacklist trigger using API. Patch by Brad Jorsch.

An API edit attempt with Spam Blacklist firing will now output something instead of crashing:

<?xml version="1.0"?><api><edit spamblacklist="http://blacklistme.example.com"
result="Failure" /></api>

											
										
										
											2008-11-02 22:40:02 +00:00
-												Fix misspelled constant in r95663

											
										
										
											2011-08-31 19:07:44 +00:00
+										$thisHttp = wfExpandUrl( $title->getFullUrl( 'action=raw' ), PROTO_HTTP );
-												Some polishing and refactoring on this monstrosity, it's been allowed to grow without some good snipping in a while. :)
* Handle bad regexes more gracefully:
 - The batched regexes are tested for validity, and if one is bad, the lines from that source are broken out line-by-line. This is slower, but the other lines in that source will still be applied correctly.
 - Suppress warnings and be more verbose in the debug log.
 - Check for bad regexes when a local blacklist page is edited, and prompt the user to fix the bad lines.
* Caching issues:
 - Cache the full regexes per-DB instead of per-site; this should be friendlier to shared environments where not every wiki has the same configuration.
 - Hopefully improve the recaching of local pages, which looked like it would preemptively apply the being-edited text to the cache during the filter callback, even though something else might stop the page from being saved. Now just clearing the cache after save is complete, letting it re-load later.
* Split out some of the regex batch functions for clarity.

There are probably still issues with caching of HTTP bits, and in general the local DB loading looks verrrry fragile.
Test this a bit more before syncing. :)

											
										
										
											2007-07-20 21:13:26 +00:00
+										$thisHttpRegex = '/^' . preg_quote( $thisHttp, '/' ) . '(?:&.*)?$/';
-												(bug 16120) Prevent death on Spam Blacklist trigger using API. Patch by Brad Jorsch.

An API edit attempt with Spam Blacklist firing will now output something instead of crashing:

<?xml version="1.0"?><api><edit spamblacklist="http://blacklistme.example.com"
result="Failure" /></api>

											
										
										
											2008-11-02 22:40:02 +00:00
-												Some polishing and refactoring on this monstrosity, it's been allowed to grow without some good snipping in a while. :)
* Handle bad regexes more gracefully:
 - The batched regexes are tested for validity, and if one is bad, the lines from that source are broken out line-by-line. This is slower, but the other lines in that source will still be applied correctly.
 - Suppress warnings and be more verbose in the debug log.
 - Check for bad regexes when a local blacklist page is edited, and prompt the user to fix the bad lines.
* Caching issues:
 - Cache the full regexes per-DB instead of per-site; this should be friendlier to shared environments where not every wiki has the same configuration.
 - Hopefully improve the recaching of local pages, which looked like it would preemptively apply the being-edited text to the cache during the filter callback, even though something else might stop the page from being saved. Now just clearing the cache after save is complete, letting it re-load later.
* Split out some of the regex batch functions for clarity.

There are probably still issues with caching of HTTP bits, and in general the local DB loading looks verrrry fragile.
Test this a bit more before syncing. :)

											
										
										
											2007-07-20 21:13:26 +00:00
+										foreach( $this->files as $fileName ) {
-												More undefined variables

											
										
										
											2011-01-23 10:34:56 +00:00
+											$matches = array();
-												Some polishing and refactoring on this monstrosity, it's been allowed to grow without some good snipping in a while. :)
* Handle bad regexes more gracefully:
 - The batched regexes are tested for validity, and if one is bad, the lines from that source are broken out line-by-line. This is slower, but the other lines in that source will still be applied correctly.
 - Suppress warnings and be more verbose in the debug log.
 - Check for bad regexes when a local blacklist page is edited, and prompt the user to fix the bad lines.
* Caching issues:
 - Cache the full regexes per-DB instead of per-site; this should be friendlier to shared environments where not every wiki has the same configuration.
 - Hopefully improve the recaching of local pages, which looked like it would preemptively apply the being-edited text to the cache during the filter callback, even though something else might stop the page from being saved. Now just clearing the cache after save is complete, letting it re-load later.
* Split out some of the regex batch functions for clarity.

There are probably still issues with caching of HTTP bits, and in general the local DB loading looks verrrry fragile.
Test this a bit more before syncing. :)

											
										
										
											2007-07-20 21:13:26 +00:00
+											if ( preg_match( '/^DB: (\w*) (.*)$/', $fileName, $matches ) ) {
 												if ( $wgDBname == $matches[1] ) {
 													if( $matches[2] == $title->getPrefixedDbKey() ) {
 														// Local DB fetch of this page...
 														return true;
 													}
 												}
 											} elseif( preg_match( $thisHttpRegex, $fileName ) ) {
 												// Raw view of this page
 												return true;
 											}
 										}
-												(bug 16120) Prevent death on Spam Blacklist trigger using API. Patch by Brad Jorsch.

An API edit attempt with Spam Blacklist firing will now output something instead of crashing:

<?xml version="1.0"?><api><edit spamblacklist="http://blacklistme.example.com"
result="Failure" /></api>

											
										
										
											2008-11-02 22:40:02 +00:00
-												Some polishing and refactoring on this monstrosity, it's been allowed to grow without some good snipping in a while. :)
* Handle bad regexes more gracefully:
 - The batched regexes are tested for validity, and if one is bad, the lines from that source are broken out line-by-line. This is slower, but the other lines in that source will still be applied correctly.
 - Suppress warnings and be more verbose in the debug log.
 - Check for bad regexes when a local blacklist page is edited, and prompt the user to fix the bad lines.
* Caching issues:
 - Cache the full regexes per-DB instead of per-site; this should be friendlier to shared environments where not every wiki has the same configuration.
 - Hopefully improve the recaching of local pages, which looked like it would preemptively apply the being-edited text to the cache during the filter callback, even though something else might stop the page from being saved. Now just clearing the cache after save is complete, letting it re-load later.
* Split out some of the regex batch functions for clarity.

There are probably still issues with caching of HTTP bits, and in general the local DB loading looks verrrry fragile.
Test this a bit more before syncing. :)

											
										
										
											2007-07-20 21:13:26 +00:00
+										return false;
 									}
-												(bug 16120) Prevent death on Spam Blacklist trigger using API. Patch by Brad Jorsch.

An API edit attempt with Spam Blacklist firing will now output something instead of crashing:

<?xml version="1.0"?><api><edit spamblacklist="http://blacklistme.example.com"
result="Failure" /></api>

											
										
										
											2008-11-02 22:40:02 +00:00
-												Some polishing and refactoring on this monstrosity, it's been allowed to grow without some good snipping in a while. :)
* Handle bad regexes more gracefully:
 - The batched regexes are tested for validity, and if one is bad, the lines from that source are broken out line-by-line. This is slower, but the other lines in that source will still be applied correctly.
 - Suppress warnings and be more verbose in the debug log.
 - Check for bad regexes when a local blacklist page is edited, and prompt the user to fix the bad lines.
* Caching issues:
 - Cache the full regexes per-DB instead of per-site; this should be friendlier to shared environments where not every wiki has the same configuration.
 - Hopefully improve the recaching of local pages, which looked like it would preemptively apply the being-edited text to the cache during the filter callback, even though something else might stop the page from being saved. Now just clearing the cache after save is complete, letting it re-load later.
* Split out some of the regex batch functions for clarity.

There are probably still issues with caching of HTTP bits, and in general the local DB loading looks verrrry fragile.
Test this a bit more before syncing. :)

											
										
										
											2007-07-20 21:13:26 +00:00
+									/**
 									 * Fetch the combined blacklist: the local lists followed by the
 									 * (possibly cached) remote lists.
 									 *
 									 * The merged array is stored in $this->regexes on the first call and
 									 * returned as-is on later calls, so the sources are only consulted
 									 * while that property still holds its initial value of false.
 									 *
 									 * @return array set of regular expressions, potentially empty.
 									 */
-												Add a local blacklist at MediaWiki:Spam-blacklist which can always be used, just as the local whitelist at MediaWiki:Spam-whitelist.
Should save some trouble for annoyed people. :)
The regular message cache behavior is used for this message, so it'll also update immediately, without waiting for the shared caches to time out.
Additionally, added a fix for configurations which don't hardcode the PHP include_path by using $IP in an include for HttpFunctions.php.

											
										
										
											2007-07-07 17:21:49 +00:00
+									function getBlacklists() {
-												Some polishing and refactoring on this monstrosity, it's been allowed to grow without some good snipping in a while. :)
* Handle bad regexes more gracefully:
 - The batched regexes are tested for validity, and if one is bad, the lines from that source are broken out line-by-line. This is slower, but the other lines in that source will still be applied correctly.
 - Suppress warnings and be more verbose in the debug log.
 - Check for bad regexes when a local blacklist page is edited, and prompt the user to fix the bad lines.
* Caching issues:
 - Cache the full regexes per-DB instead of per-site; this should be friendlier to shared environments where not every wiki has the same configuration.
 - Hopefully improve the recaching of local pages, which looked like it would preemptively apply the being-edited text to the cache during the filter callback, even though something else might stop the page from being saved. Now just clearing the cache after save is complete, letting it re-load later.
* Split out some of the regex batch functions for clarity.

There are probably still issues with caching of HTTP bits, and in general the local DB loading looks verrrry fragile.
Test this a bit more before syncing. :)

											
										
										
											2007-07-20 21:13:26 +00:00
+										if( $this->regexes === false ) {
 											// Strict comparison: an empty array is a valid loaded result and
 											// must not trigger another load.
 											$this->regexes = array_merge(
 												$this->getLocalBlacklists(),
 												$this->getSharedBlacklists() );
 										}
 										return $this->regexes;
-												Add a local blacklist at MediaWiki:Spam-blacklist which can always be used, just as the local whitelist at MediaWiki:Spam-whitelist.
Should save some trouble for annoyed people. :)
The regular message cache behavior is used for this message, so it'll also update immediately, without waiting for the shared caches to time out.
Additionally, added a fix for configurations which don't hardcode the PHP include_path by using $IP in an include for HttpFunctions.php.

											
										
										
											2007-07-07 17:21:49 +00:00
+									}
-												(bug 16120) Prevent death on Spam Blacklist trigger using API. Patch by Brad Jorsch.

An API edit attempt with Spam Blacklist firing will now output something instead of crashing:

<?xml version="1.0"?><api><edit spamblacklist="http://blacklistme.example.com"
result="Failure" /></api>

											
										
										
											2008-11-02 22:40:02 +00:00
-												Some polishing and refactoring on this monstrosity, it's been allowed to grow without some good snipping in a while. :)
* Handle bad regexes more gracefully:
 - The batched regexes are tested for validity, and if one is bad, the lines from that source are broken out line-by-line. This is slower, but the other lines in that source will still be applied correctly.
 - Suppress warnings and be more verbose in the debug log.
 - Check for bad regexes when a local blacklist page is edited, and prompt the user to fix the bad lines.
* Caching issues:
 - Cache the full regexes per-DB instead of per-site; this should be friendlier to shared environments where not every wiki has the same configuration.
 - Hopefully improve the recaching of local pages, which looked like it would preemptively apply the being-edited text to the cache during the filter callback, even though something else might stop the page from being saved. Now just clearing the cache after save is complete, letting it re-load later.
* Split out some of the regex batch functions for clarity.

There are probably still issues with caching of HTTP bits, and in general the local DB loading looks verrrry fragile.
Test this a bit more before syncing. :)

											
										
										
											2007-07-20 21:13:26 +00:00
+									/**
 									 * Fetch the remote (shared) blacklists, using the cached copy when
 									 * there is one.
 									 *
 									 * Returns an empty array when $this->files lists no sources. Otherwise
 									 * the per-database key "$wgDBname:spam_blacklist_regexes" is looked up
 									 * in $wgMemc first; on a miss the lists are rebuilt with
 									 * buildSharedBlacklists() and stored under that key with
 									 * $this->expiryTime as the expiry.
 									 *
 									 * Every return path closes the profiling section opened at the top.
 									 *
 									 * @return array
 									 */
-												Add a local blacklist at MediaWiki:Spam-blacklist which can always be used, just as the local whitelist at MediaWiki:Spam-whitelist.
Should save some trouble for annoyed people. :)
The regular message cache behavior is used for this message, so it'll also update immediately, without waiting for the shared caches to time out.
Additionally, added a fix for configurations which don't hardcode the PHP include_path by using $IP in an include for HttpFunctions.php.

											
										
										
											2007-07-07 17:21:49 +00:00
+									function getSharedBlacklists() {
-												Some polishing and refactoring on this monstrosity, it's been allowed to grow without some good snipping in a while. :)
* Handle bad regexes more gracefully:
 - The batched regexes are tested for validity, and if one is bad, the lines from that source are broken out line-by-line. This is slower, but the other lines in that source will still be applied correctly.
 - Suppress warnings and be more verbose in the debug log.
 - Check for bad regexes when a local blacklist page is edited, and prompt the user to fix the bad lines.
* Caching issues:
 - Cache the full regexes per-DB instead of per-site; this should be friendlier to shared environments where not every wiki has the same configuration.
 - Hopefully improve the recaching of local pages, which looked like it would preemptively apply the being-edited text to the cache during the filter callback, even though something else might stop the page from being saved. Now just clearing the cache after save is complete, letting it re-load later.
* Split out some of the regex batch functions for clarity.

There are probably still issues with caching of HTTP bits, and in general the local DB loading looks verrrry fragile.
Test this a bit more before syncing. :)

											
										
										
											2007-07-20 21:13:26 +00:00
+										global $wgMemc, $wgDBname;
-												split the regex fetching part of the filter into its own function

											
										
										
											2006-01-19 17:14:10 +00:00
+										$fname = 'SpamBlacklist::getRegex';
-												from phase3/extensions

											
										
										
											2004-12-11 09:59:06 +00:00
+										wfProfileIn( $fname );
-												Break spam blacklist log info out to a sep file

											
										
										
											2007-10-03 00:19:36 +00:00
+										wfDebugLog( 'SpamBlacklist', "Loading spam regex...\n" );
-												remove some ending whitespaces

											
										
										
											2007-01-06 20:56:46 +00:00
 										if ( count( $this->files ) == 0 ){
-												from phase3/extensions

											
										
										
											2004-12-11 09:59:06 +00:00
+											# No lists
-												Break spam blacklist log info out to a sep file

											
										
										
											2007-10-03 00:19:36 +00:00
+											wfDebugLog( 'SpamBlacklist', "no files specified\n" );
-												from phase3/extensions

											
										
										
											2004-12-11 09:59:06 +00:00
+											wfProfileOut( $fname );
-												Add a local blacklist at MediaWiki:Spam-blacklist which can always be used, just as the local whitelist at MediaWiki:Spam-whitelist.
Should save some trouble for annoyed people. :)
The regular message cache behavior is used for this message, so it'll also update immediately, without waiting for the shared caches to time out.
Additionally, added a fix for configurations which don't hardcode the PHP include_path by using $IP in an include for HttpFunctions.php.

											
										
										
											2007-07-07 17:21:49 +00:00
+											return array();
-												from phase3/extensions

											
										
										
											2004-12-11 09:59:06 +00:00
+										}
-												Some polishing and refactoring on this monstrosity, it's been allowed to grow without some good snipping in a while. :)
* Handle bad regexes more gracefully:
 - The batched regexes are tested for validity, and if one is bad, the lines from that source are broken out line-by-line. This is slower, but the other lines in that source will still be applied correctly.
 - Suppress warnings and be more verbose in the debug log.
 - Check for bad regexes when a local blacklist page is edited, and prompt the user to fix the bad lines.
* Caching issues:
 - Cache the full regexes per-DB instead of per-site; this should be friendlier to shared environments where not every wiki has the same configuration.
 - Hopefully improve the recaching of local pages, which looked like it would preemptively apply the being-edited text to the cache during the filter callback, even though something else might stop the page from being saved. Now just clearing the cache after save is complete, letting it re-load later.
* Split out some of the regex batch functions for clarity.

There are probably still issues with caching of HTTP bits, and in general the local DB loading looks verrrry fragile.
Test this a bit more before syncing. :)

											
										
										
											2007-07-20 21:13:26 +00:00
+										// This used to be cached per-site, but that could be bad on a shared
 										// server where not all wikis have the same configuration.
 										$cachedRegexes = $wgMemc->get( "$wgDBname:spam_blacklist_regexes" );
 										if( is_array( $cachedRegexes ) ) {
-												Break spam blacklist log info out to a sep file

											
										
										
											2007-10-03 00:19:36 +00:00
+											wfDebugLog( 'SpamBlacklist', "Got shared spam regexes from cache\n" );
-												Some polishing and refactoring on this monstrosity, it's been allowed to grow without some good snipping in a while. :)
* Handle bad regexes more gracefully:
 - The batched regexes are tested for validity, and if one is bad, the lines from that source are broken out line-by-line. This is slower, but the other lines in that source will still be applied correctly.
 - Suppress warnings and be more verbose in the debug log.
 - Check for bad regexes when a local blacklist page is edited, and prompt the user to fix the bad lines.
* Caching issues:
 - Cache the full regexes per-DB instead of per-site; this should be friendlier to shared environments where not every wiki has the same configuration.
 - Hopefully improve the recaching of local pages, which looked like it would preemptively apply the being-edited text to the cache during the filter callback, even though something else might stop the page from being saved. Now just clearing the cache after save is complete, letting it re-load later.
* Split out some of the regex batch functions for clarity.

There are probably still issues with caching of HTTP bits, and in general the local DB loading looks verrrry fragile.
Test this a bit more before syncing. :)

											
										
										
											2007-07-20 21:13:26 +00:00
+											wfProfileOut( $fname );
 											return $cachedRegexes;
 										}
-												(bug 16120) Prevent death on Spam Blacklist trigger using API. Patch by Brad Jorsch.

An API edit attempt with Spam Blacklist firing will now output something instead of crashing:

<?xml version="1.0"?><api><edit spamblacklist="http://blacklistme.example.com"
result="Failure" /></api>

											
										
										
											2008-11-02 22:40:02 +00:00
-												Some polishing and refactoring on this monstrosity, it's been allowed to grow without some good snipping in a while. :)
* Handle bad regexes more gracefully:
 - The batched regexes are tested for validity, and if one is bad, the lines from that source are broken out line-by-line. This is slower, but the other lines in that source will still be applied correctly.
 - Suppress warnings and be more verbose in the debug log.
 - Check for bad regexes when a local blacklist page is edited, and prompt the user to fix the bad lines.
* Caching issues:
 - Cache the full regexes per-DB instead of per-site; this should be friendlier to shared environments where not every wiki has the same configuration.
 - Hopefully improve the recaching of local pages, which looked like it would preemptively apply the being-edited text to the cache during the filter callback, even though something else might stop the page from being saved. Now just clearing the cache after save is complete, letting it re-load later.
* Split out some of the regex batch functions for clarity.

There are probably still issues with caching of HTTP bits, and in general the local DB loading looks verrrry fragile.
Test this a bit more before syncing. :)

											
										
										
											2007-07-20 21:13:26 +00:00
+										$regexes = $this->buildSharedBlacklists();
 										$wgMemc->set( "$wgDBname:spam_blacklist_regexes", $regexes, $this->expiryTime );
 										// Close the profiling section on the cache-miss path as well; both
 										// early returns above already do so.
 										wfProfileOut( $fname );
-												(bug 16120) Prevent death on Spam Blacklist trigger using API. Patch by Brad Jorsch.

An API edit attempt with Spam Blacklist firing will now output something instead of crashing:

<?xml version="1.0"?><api><edit spamblacklist="http://blacklistme.example.com"
result="Failure" /></api>

											
										
										
											2008-11-02 22:40:02 +00:00
-												Some polishing and refactoring on this monstrosity, it's been allowed to grow without some good snipping in a while. :)
* Handle bad regexes more gracefully:
 - The batched regexes are tested for validity, and if one is bad, the lines from that source are broken out line-by-line. This is slower, but the other lines in that source will still be applied correctly.
 - Suppress warnings and be more verbose in the debug log.
 - Check for bad regexes when a local blacklist page is edited, and prompt the user to fix the bad lines.
* Caching issues:
 - Cache the full regexes per-DB instead of per-site; this should be friendlier to shared environments where not every wiki has the same configuration.
 - Hopefully improve the recaching of local pages, which looked like it would preemptively apply the being-edited text to the cache during the filter callback, even though something else might stop the page from being saved. Now just clearing the cache after save is complete, letting it re-load later.
* Split out some of the regex batch functions for clarity.

There are probably still issues with caching of HTTP bits, and in general the local DB loading looks verrrry fragile.
Test this a bit more before syncing. :)

											
										
										
											2007-07-20 21:13:26 +00:00
+										return $regexes;
 									}
-												(bug 16120) Prevent death on Spam Blacklist trigger using API. Patch by Brad Jorsch.

An API edit attempt with Spam Blacklist firing will now output something instead of crashing:

<?xml version="1.0"?><api><edit spamblacklist="http://blacklistme.example.com"
result="Failure" /></api>

											
										
										
											2008-11-02 22:40:02 +00:00
-												Some polishing and refactoring on this monstrosity, it's been allowed to grow without some good snipping in a while. :)
* Handle bad regexes more gracefully:
 - The batched regexes are tested for validity, and if one is bad, the lines from that source are broken out line-by-line. This is slower, but the other lines in that source will still be applied correctly.
 - Suppress warnings and be more verbose in the debug log.
 - Check for bad regexes when a local blacklist page is edited, and prompt the user to fix the bad lines.
* Caching issues:
 - Cache the full regexes per-DB instead of per-site; this should be friendlier to shared environments where not every wiki has the same configuration.
 - Hopefully improve the recaching of local pages, which looked like it would preemptively apply the being-edited text to the cache during the filter callback, even though something else might stop the page from being saved. Now just clearing the cache after save is complete, letting it re-load later.
* Split out some of the regex batch functions for clarity.

There are probably still issues with caching of HTTP bits, and in general the local DB loading looks verrrry fragile.
Test this a bit more before syncing. :)

											
										
										
											2007-07-20 21:13:26 +00:00
+									function clearCache() {
 										// Deletes the per-database shared-regex entry from $wgMemc, i.e. the
 										// same key that getSharedBlacklists() reads and writes, and logs it.
 										// Only that cache entry is removed; $this->regexes is not reset here.
 										global $wgMemc, $wgDBname;
 										$wgMemc->delete( "$wgDBname:spam_blacklist_regexes" );
-												Break spam blacklist log info out to a sep file

											
										
										
											2007-10-03 00:19:36 +00:00
+										wfDebugLog( 'SpamBlacklist', "Spam blacklist local cache cleared.\n" );
-												Some polishing and refactoring on this monstrosity, it's been allowed to grow without some good snipping in a while. :)
* Handle bad regexes more gracefully:
 - The batched regexes are tested for validity, and if one is bad, the lines from that source are broken out line-by-line. This is slower, but the other lines in that source will still be applied correctly.
 - Suppress warnings and be more verbose in the debug log.
 - Check for bad regexes when a local blacklist page is edited, and prompt the user to fix the bad lines.
* Caching issues:
 - Cache the full regexes per-DB instead of per-site; this should be friendlier to shared environments where not every wiki has the same configuration.
 - Hopefully improve the recaching of local pages, which looked like it would preemptively apply the being-edited text to the cache during the filter callback, even though something else might stop the page from being saved. Now just clearing the cache after save is complete, letting it re-load later.
* Split out some of the regex batch functions for clarity.

There are probably still issues with caching of HTTP bits, and in general the local DB loading looks verrrry fragile.
Test this a bit more before syncing. :)

											
										
										
											2007-07-20 21:13:26 +00:00
+									}
-												(bug 16120) Prevent death on Spam Blacklist trigger using API. Patch by Brad Jorsch.

An API edit attempt with Spam Blacklist firing will now output something instead of crashing:

<?xml version="1.0"?><api><edit spamblacklist="http://blacklistme.example.com"
result="Failure" /></api>

											
										
										
											2008-11-02 22:40:02 +00:00
-												Some polishing and refactoring on this monstrosity, it's been allowed to grow without some good snipping in a while. :)
* Handle bad regexes more gracefully:
 - The batched regexes are tested for validity, and if one is bad, the lines from that source are broken out line-by-line. This is slower, but the other lines in that source will still be applied correctly.
 - Suppress warnings and be more verbose in the debug log.
 - Check for bad regexes when a local blacklist page is edited, and prompt the user to fix the bad lines.
* Caching issues:
 - Cache the full regexes per-DB instead of per-site; this should be friendlier to shared environments where not every wiki has the same configuration.
 - Hopefully improve the recaching of local pages, which looked like it would preemptively apply the being-edited text to the cache during the filter callback, even though something else might stop the page from being saved. Now just clearing the cache after save is complete, letting it re-load later.
* Split out some of the regex batch functions for clarity.

There are probably still issues with caching of HTTP bits, and in general the local DB loading looks verrrry fragile.
Test this a bit more before syncing. :)

											
										
										
											2007-07-20 21:13:26 +00:00
+									function buildSharedBlacklists() {
 										// Builds the uncached list of shared regexes by reading every entry of
 										// $this->files in order. An entry is one of:
 										//  - "DB: <dbname> <title>", read through getArticleText();
 										//  - an http:// or https:// URL, read through getHttpText();
 										//  - anything else, which is treated as a path for file_get_contents().
 										// A file that cannot be read is logged and skipped.
 										// Returns the regex batches of all sources merged into one array.
 										$regexes = array();
 										# Load lists
-												Break spam blacklist log info out to a sep file

											
										
										
											2007-10-03 00:19:36 +00:00
+										wfDebugLog( 'SpamBlacklist', "Constructing spam blacklist\n" );
-												from phase3/extensions

											
										
										
											2004-12-11 09:59:06 +00:00
+										foreach ( $this->files as $fileName ) {
-												More undefined variables

											
										
										
											2011-01-23 10:34:56 +00:00
+											$matches = array();
-												Some polishing and refactoring on this monstrosity, it's been allowed to grow without some good snipping in a while. :)
* Handle bad regexes more gracefully:
 - The batched regexes are tested for validity, and if one is bad, the lines from that source are broken out line-by-line. This is slower, but the other lines in that source will still be applied correctly.
 - Suppress warnings and be more verbose in the debug log.
 - Check for bad regexes when a local blacklist page is edited, and prompt the user to fix the bad lines.
* Caching issues:
 - Cache the full regexes per-DB instead of per-site; this should be friendlier to shared environments where not every wiki has the same configuration.
 - Hopefully improve the recaching of local pages, which looked like it would preemptively apply the being-edited text to the cache during the filter callback, even though something else might stop the page from being saved. Now just clearing the cache after save is complete, letting it re-load later.
* Split out some of the regex batch functions for clarity.

There are probably still issues with caching of HTTP bits, and in general the local DB loading looks verrrry fragile.
Test this a bit more before syncing. :)

											
										
										
											2007-07-20 21:13:26 +00:00
+											if ( preg_match( '/^DB: ([\w-]*) (.*)$/', $fileName, $matches ) ) {
 												$text = $this->getArticleText( $matches[1], $matches[2] );
 											} elseif ( preg_match( '/^https?:\/\//', $fileName ) ) {
 												// Both URL schemes go through the HTTP fetch helper rather than
 												// falling through to a plain file read.
 												$text = $this->getHttpText( $fileName );
 											} else {
 												$text = file_get_contents( $fileName );
 												if ( $text === false ) {
 													// file_get_contents() returns false on failure; skip this
 													// source instead of handing false to the regex parser.
 													wfDebugLog( 'SpamBlacklist', "failed to read file $fileName\n" );
 													continue;
 												}
-												Break spam blacklist log info out to a sep file

											
										
										
											2007-10-03 00:19:36 +00:00
+												wfDebugLog( 'SpamBlacklist', "got from file $fileName\n" );
-												from phase3/extensions

											
										
										
											2004-12-11 09:59:06 +00:00
+											}
-												(bug 16120) Prevent death on Spam Blacklist trigger using API. Patch by Brad Jorsch.

An API edit attempt with Spam Blacklist firing will now output something instead of crashing:

<?xml version="1.0"?><api><edit spamblacklist="http://blacklistme.example.com"
result="Failure" /></api>

											
										
										
											2008-11-02 22:40:02 +00:00
-												Some polishing and refactoring on this monstrosity, it's been allowed to grow without some good snipping in a while. :)
* Handle bad regexes more gracefully:
 - The batched regexes are tested for validity, and if one is bad, the lines from that source are broken out line-by-line. This is slower, but the other lines in that source will still be applied correctly.
 - Suppress warnings and be more verbose in the debug log.
 - Check for bad regexes when a local blacklist page is edited, and prompt the user to fix the bad lines.
* Caching issues:
 - Cache the full regexes per-DB instead of per-site; this should be friendlier to shared environments where not every wiki has the same configuration.
 - Hopefully improve the recaching of local pages, which looked like it would preemptively apply the being-edited text to the cache during the filter callback, even though something else might stop the page from being saved. Now just clearing the cache after save is complete, letting it re-load later.
* Split out some of the regex batch functions for clarity.

There are probably still issues with caching of HTTP bits, and in general the local DB loading looks verrrry fragile.
Test this a bit more before syncing. :)

											
										
										
											2007-07-20 21:13:26 +00:00
+											// Build a separate batch of regexes from each source.
 											// While in theory we could squeeze a little efficiency
 											// out of combining multiple sources in one regex, if
 											// there's a bad line in one of them we'll gain more
 											// from only having to break that set into smaller pieces.
 											$regexes = array_merge( $regexes,
 												SpamRegexBatch::regexesFromText( $text, $fileName ) );
-												from phase3/extensions

											
										
										
											2004-12-11 09:59:06 +00:00
+										}
-												(bug 16120) Prevent death on Spam Blacklist trigger using API. Patch by Brad Jorsch.

An API edit attempt with Spam Blacklist firing will now output something instead of crashing:

<?xml version="1.0"?><api><edit spamblacklist="http://blacklistme.example.com"
result="Failure" /></api>

											
										
										
											2008-11-02 22:40:02 +00:00
-												Some polishing and refactoring on this monstrosity, it's been allowed to grow without some good snipping in a while. :)
* Handle bad regexes more gracefully:
 - The batched regexes are tested for validity, and if one is bad, the lines from that source are broken out line-by-line. This is slower, but the other lines in that source will still be applied correctly.
 - Suppress warnings and be more verbose in the debug log.
 - Check for bad regexes when a local blacklist page is edited, and prompt the user to fix the bad lines.
* Caching issues:
 - Cache the full regexes per-DB instead of per-site; this should be friendlier to shared environments where not every wiki has the same configuration.
 - Hopefully improve the recaching of local pages, which looked like it would preemptively apply the being-edited text to the cache during the filter callback, even though something else might stop the page from being saved. Now just clearing the cache after save is complete, letting it re-load later.
* Split out some of the regex batch functions for clarity.

There are probably still issues with caching of HTTP bits, and in general the local DB loading looks verrrry fragile.
Test this a bit more before syncing. :)

											
										
										
											2007-07-20 21:13:26 +00:00
+										return $regexes;
 									}
-												(bug 16120) Prevent death on Spam Blacklist trigger using API. Patch by Brad Jorsch.

An API edit attempt with Spam Blacklist firing will now output something instead of crashing:

<?xml version="1.0"?><api><edit spamblacklist="http://blacklistme.example.com"
result="Failure" /></api>

											
										
										
											2008-11-02 22:40:02 +00:00
-												Some polishing and refactoring on this monstrosity, it's been allowed to grow without some good snipping in a while. :)
* Handle bad regexes more gracefully:
 - The batched regexes are tested for validity, and if one is bad, the lines from that source are broken out line-by-line. This is slower, but the other lines in that source will still be applied correctly.
 - Suppress warnings and be more verbose in the debug log.
 - Check for bad regexes when a local blacklist page is edited, and prompt the user to fix the bad lines.
* Caching issues:
 - Cache the full regexes per-DB instead of per-site; this should be friendlier to shared environments where not every wiki has the same configuration.
 - Hopefully improve the recaching of local pages, which looked like it would preemptively apply the being-edited text to the cache during the filter callback, even though something else might stop the page from being saved. Now just clearing the cache after save is complete, letting it re-load later.
* Split out some of the regex batch functions for clarity.

There are probably still issues with caching of HTTP bits, and in general the local DB loading looks verrrry fragile.
Test this a bit more before syncing. :)

											
										
										
											2007-07-20 21:13:26 +00:00
+									function getHttpText( $fileName ) {
 										global $wgDBname, $messageMemc;
-												(bug 16120) Prevent death on Spam Blacklist trigger using API. Patch by Brad Jorsch.

An API edit attempt with Spam Blacklist firing will now output something instead of crashing:

<?xml version="1.0"?><api><edit spamblacklist="http://blacklistme.example.com"
result="Failure" /></api>

											
										
										
											2008-11-02 22:40:02 +00:00
-												Some polishing and refactoring on this monstrosity, it's been allowed to grow without some good snipping in a while. :)
* Handle bad regexes more gracefully:
 - The batched regexes are tested for validity, and if one is bad, the lines from that source are broken out line-by-line. This is slower, but the other lines in that source will still be applied correctly.
 - Suppress warnings and be more verbose in the debug log.
 - Check for bad regexes when a local blacklist page is edited, and prompt the user to fix the bad lines.
* Caching issues:
 - Cache the full regexes per-DB instead of per-site; this should be friendlier to shared environments where not every wiki has the same configuration.
 - Hopefully improve the recaching of local pages, which looked like it would preemptively apply the being-edited text to the cache during the filter callback, even though something else might stop the page from being saved. Now just clearing the cache after save is complete, letting it re-load later.
* Split out some of the regex batch functions for clarity.

There are probably still issues with caching of HTTP bits, and in general the local DB loading looks verrrry fragile.
Test this a bit more before syncing. :)

											
										
										
											2007-07-20 21:13:26 +00:00
+										# HTTP request
 										# To keep requests to a minimum, we save results into $messageMemc, which is
 										# similar to $wgMemc except almost certain to exist. By default, it is stored
 										# in the database
 										#
 										# There are two keys, when the warning key expires, a random thread will refresh
-												(bug 16120) Prevent death on Spam Blacklist trigger using API. Patch by Brad Jorsch.

An API edit attempt with Spam Blacklist firing will now output something instead of crashing:

<?xml version="1.0"?><api><edit spamblacklist="http://blacklistme.example.com"
result="Failure" /></api>

											
										
										
											2008-11-02 22:40:02 +00:00
+										# the real key. This reduces the chance of multiple requests under high traffic
-												Some polishing and refactoring on this monstrosity, it's been allowed to grow without some good snipping in a while. :)
* Handle bad regexes more gracefully:
 - The batched regexes are tested for validity, and if one is bad, the lines from that source are broken out line-by-line. This is slower, but the other lines in that source will still be applied correctly.
 - Suppress warnings and be more verbose in the debug log.
 - Check for bad regexes when a local blacklist page is edited, and prompt the user to fix the bad lines.
* Caching issues:
 - Cache the full regexes per-DB instead of per-site; this should be friendlier to shared environments where not every wiki has the same configuration.
 - Hopefully improve the recaching of local pages, which looked like it would preemptively apply the being-edited text to the cache during the filter callback, even though something else might stop the page from being saved. Now just clearing the cache after save is complete, letting it re-load later.
* Split out some of the regex batch functions for clarity.

There are probably still issues with caching of HTTP bits, and in general the local DB loading looks verrrry fragile.
Test this a bit more before syncing. :)

											
										
										
											2007-07-20 21:13:26 +00:00
+										# conditions.
 										$key = "spam_blacklist_file:$fileName";
 										$warningKey = "$wgDBname:spamfilewarning:$fileName";
 										$httpText = $messageMemc->get( $key );
 										$warning = $messageMemc->get( $warningKey );
-												remove some ending whitespaces

											
										
										
											2007-01-06 20:56:46 +00:00
-												Some polishing and refactoring on this monstrosity, it's been allowed to grow without some good snipping in a while. :)
* Handle bad regexes more gracefully:
 - The batched regexes are tested for validity, and if one is bad, the lines from that source are broken out line-by-line. This is slower, but the other lines in that source will still be applied correctly.
 - Suppress warnings and be more verbose in the debug log.
 - Check for bad regexes when a local blacklist page is edited, and prompt the user to fix the bad lines.
* Caching issues:
 - Cache the full regexes per-DB instead of per-site; this should be friendlier to shared environments where not every wiki has the same configuration.
 - Hopefully improve the recaching of local pages, which looked like it would preemptively apply the being-edited text to the cache during the filter callback, even though something else might stop the page from being saved. Now just clearing the cache after save is complete, letting it re-load later.
* Split out some of the regex batch functions for clarity.

There are probably still issues with caching of HTTP bits, and in general the local DB loading looks verrrry fragile.
Test this a bit more before syncing. :)

											
										
										
											2007-07-20 21:13:26 +00:00
+										if ( !is_string( $httpText ) || ( !$warning && !mt_rand( 0, $this->warningChance ) ) ) {
-												Break spam blacklist log info out to a sep file

											
										
										
											2007-10-03 00:19:36 +00:00
+											wfDebugLog( 'SpamBlacklist', "Loading spam blacklist from $fileName\n" );
-												Remove getHttp() method and just call Http::get() directly.

											
										
										
											2009-05-18 00:48:07 +00:00
+											$httpText = Http::get( $fileName );
-												Some polishing and refactoring on this monstrosity, it's been allowed to grow without some good snipping in a while. :)
* Handle bad regexes more gracefully:
 - The batched regexes are tested for validity, and if one is bad, the lines from that source are broken out line-by-line. This is slower, but the other lines in that source will still be applied correctly.
 - Suppress warnings and be more verbose in the debug log.
 - Check for bad regexes when a local blacklist page is edited, and prompt the user to fix the bad lines.
* Caching issues:
 - Cache the full regexes per-DB instead of per-site; this should be friendlier to shared environments where not every wiki has the same configuration.
 - Hopefully improve the recaching of local pages, which looked like it would preemptively apply the being-edited text to the cache during the filter callback, even though something else might stop the page from being saved. Now just clearing the cache after save is complete, letting it re-load later.
* Split out some of the regex batch functions for clarity.

There are probably still issues with caching of HTTP bits, and in general the local DB loading looks verrrry fragile.
Test this a bit more before syncing. :)

											
										
										
											2007-07-20 21:13:26 +00:00
+											if( $httpText === false ) {
-												Break spam blacklist log info out to a sep file

											
										
										
											2007-10-03 00:19:36 +00:00
+												wfDebugLog( 'SpamBlacklist', "Error loading blacklist from $fileName\n" );
-												split the regex fetching part of the filter into its own function

											
										
										
											2006-01-19 17:14:10 +00:00
+											}
-												Some polishing and refactoring on this monstrosity, it's been allowed to grow without some good snipping in a while. :)
* Handle bad regexes more gracefully:
 - The batched regexes are tested for validity, and if one is bad, the lines from that source are broken out line-by-line. This is slower, but the other lines in that source will still be applied correctly.
 - Suppress warnings and be more verbose in the debug log.
 - Check for bad regexes when a local blacklist page is edited, and prompt the user to fix the bad lines.
* Caching issues:
 - Cache the full regexes per-DB instead of per-site; this should be friendlier to shared environments where not every wiki has the same configuration.
 - Hopefully improve the recaching of local pages, which looked like it would preemptively apply the being-edited text to the cache during the filter callback, even though something else might stop the page from being saved. Now just clearing the cache after save is complete, letting it re-load later.
* Split out some of the regex batch functions for clarity.

There are probably still issues with caching of HTTP bits, and in general the local DB loading looks verrrry fragile.
Test this a bit more before syncing. :)

											
										
										
											2007-07-20 21:13:26 +00:00
+											$messageMemc->set( $warningKey, 1, $this->warningTime );
 											$messageMemc->set( $key, $httpText, $this->expiryTime );
 										} else {
-												Break spam blacklist log info out to a sep file

											
										
										
											2007-10-03 00:19:36 +00:00
+											wfDebugLog( 'SpamBlacklist', "Got spam blacklist from HTTP cache for $fileName\n" );
-												remove some ending whitespaces

											
										
										
											2007-01-06 20:56:46 +00:00
+										}
-												Some polishing and refactoring on this monstrosity, it's been allowed to grow without some good snipping in a while. :)
* Handle bad regexes more gracefully:
 - The batched regexes are tested for validity, and if one is bad, the lines from that source are broken out line-by-line. This is slower, but the other lines in that source will still be applied correctly.
 - Suppress warnings and be more verbose in the debug log.
 - Check for bad regexes when a local blacklist page is edited, and prompt the user to fix the bad lines.
* Caching issues:
 - Cache the full regexes per-DB instead of per-site; this should be friendlier to shared environments where not every wiki has the same configuration.
 - Hopefully improve the recaching of local pages, which looked like it would preemptively apply the being-edited text to the cache during the filter callback, even though something else might stop the page from being saved. Now just clearing the cache after save is complete, letting it re-load later.
* Split out some of the regex batch functions for clarity.

There are probably still issues with caching of HTTP bits, and in general the local DB loading looks verrrry fragile.
Test this a bit more before syncing. :)

											
										
										
											2007-07-20 21:13:26 +00:00
+										return $httpText;
-												split the regex fetching part of the filter into its own function

											
										
										
											2006-01-19 17:14:10 +00:00
+									}
-												(bug 16120) Prevent death on Spam Blacklist trigger using API. Patch by Brad Jorsch.

An API edit attempt with Spam Blacklist firing will now output something instead of crashing:

<?xml version="1.0"?><api><edit spamblacklist="http://blacklistme.example.com"
result="Failure" /></api>

											
										
										
											2008-11-02 22:40:02 +00:00
-												Fix E_STRICT errors and pass-by-ref error. Blank pages getting thrown around.

											
										
										
											2008-05-14 12:44:34 +00:00
+									static function getLocalBlacklists() {
-												Some polishing and refactoring on this monstrosity, it's been allowed to grow without some good snipping in a while. :)
* Handle bad regexes more gracefully:
 - The batched regexes are tested for validity, and if one is bad, the lines from that source are broken out line-by-line. This is slower, but the other lines in that source will still be applied correctly.
 - Suppress warnings and be more verbose in the debug log.
 - Check for bad regexes when a local blacklist page is edited, and prompt the user to fix the bad lines.
* Caching issues:
 - Cache the full regexes per-DB instead of per-site; this should be friendlier to shared environments where not every wiki has the same configuration.
 - Hopefully improve the recaching of local pages, which looked like it would preemptively apply the being-edited text to the cache during the filter callback, even though something else might stop the page from being saved. Now just clearing the cache after save is complete, letting it re-load later.
* Split out some of the regex batch functions for clarity.

There are probably still issues with caching of HTTP bits, and in general the local DB loading looks verrrry fragile.
Test this a bit more before syncing. :)

											
										
										
											2007-07-20 21:13:26 +00:00
+										return SpamRegexBatch::regexesFromMessage( 'spam-blacklist' );
-												Add a local blacklist at MediaWiki:Spam-blacklist which can always be used, just as the local whitelist at MediaWiki:Spam-whitelist.
Should save some trouble for annoyed people. :)
The regular message cache behavior is used for this message, so it'll also update immediately, without waiting for the shared caches to time out.
Additionally, added a fix for configurations which don't hardcode the PHP include_path by using $IP in an include for HttpFunctions.php.

											
										
										
											2007-07-07 17:21:49 +00:00
+									}
-												(bug 16120) Prevent death on Spam Blacklist trigger using API. Patch by Brad Jorsch.

An API edit attempt with Spam Blacklist firing will now output something instead of crashing:

<?xml version="1.0"?><api><edit spamblacklist="http://blacklistme.example.com"
result="Failure" /></api>

											
										
										
											2008-11-02 22:40:02 +00:00
-												Fix E_STRICT errors and pass-by-ref error. Blank pages getting thrown around.

											
										
										
											2008-05-14 12:44:34 +00:00
+									static function getWhitelists() {
-												Some polishing and refactoring on this monstrosity, it's been allowed to grow without some good snipping in a while. :)
* Handle bad regexes more gracefully:
 - The batched regexes are tested for validity, and if one is bad, the lines from that source are broken out line-by-line. This is slower, but the other lines in that source will still be applied correctly.
 - Suppress warnings and be more verbose in the debug log.
 - Check for bad regexes when a local blacklist page is edited, and prompt the user to fix the bad lines.
* Caching issues:
 - Cache the full regexes per-DB instead of per-site; this should be friendlier to shared environments where not every wiki has the same configuration.
 - Hopefully improve the recaching of local pages, which looked like it would preemptively apply the being-edited text to the cache during the filter callback, even though something else might stop the page from being saved. Now just clearing the cache after save is complete, letting it re-load later.
* Split out some of the regex batch functions for clarity.

There are probably still issues with caching of HTTP bits, and in general the local DB loading looks verrrry fragile.
Test this a bit more before syncing. :)

											
										
										
											2007-07-20 21:13:26 +00:00
+										return SpamRegexBatch::regexesFromMessage( 'spam-whitelist' );
-												Add a local whitelist, editable by admins at [[MediaWiki:Spam-whitelist]]

											
										
										
											2006-06-22 19:59:43 +00:00
+									}
-												remove some ending whitespaces

											
										
										
											2007-01-06 20:56:46 +00:00
-												* Optimised startup
* Use the new EditFilterMerged hook if available, for faster link finding
* Random bits of code were leaking out of the body file into the loader, poked them back in.

											
										
										
											2007-11-12 07:44:17 +00:00
+									/**
 									 * @param Title $title
 									 * @param string $text Text of section, or entire text if $editPage!=false
 									 * @param string $section Section number or name
-												And now SpamBlacklist checks the edit summary field.

											
										
										
											2008-06-19 03:14:34 +00:00
+									 * @param string $editsummary Edit summary, if one exists (some people put URLs there too)
-												Fix mixed up params

											
										
										
											2008-08-16 21:40:30 +00:00
+									 * @param EditPage $editPage EditPage if EditFilterMerged was called, null otherwise
-												(bug 16120) Prevent death on Spam Blacklist trigger using API. Patch by Brad Jorsch.

An API edit attempt with Spam Blacklist firing will now output something instead of crashing:

<?xml version="1.0"?><api><edit spamblacklist="http://blacklistme.example.com"
result="Failure" /></api>

											
										
										
											2008-11-02 22:40:02 +00:00
+									 * @return string|bool Matched text if the edit should not be allowed, false otherwise
-												* Optimised startup
* Use the new EditFilterMerged hook if available, for faster link finding
* Random bits of code were leaking out of the body file into the loader, poked them back in.

											
										
										
											2007-11-12 07:44:17 +00:00
+									 */
-												Fix mixed up params

											
										
										
											2008-08-16 21:40:30 +00:00
+									function filter( &$title, $text, $section, $editsummary = '', EditPage &$editPage = null ) {
-												Remove some more unused globals

Kill a couple of other unused variables

											
										
										
											2010-07-25 17:12:50 +00:00
+										global $wgParser, $wgUser;
-												split the regex fetching part of the filter into its own function

											
										
										
											2006-01-19 17:14:10 +00:00
 										$fname = 'wfSpamBlacklistFilter';
 										wfProfileIn( $fname );
-												Updated DB: for the 1.5 schema, fixed a few bugs

											
										
										
											2006-01-23 01:35:39 +00:00
+										$this->title = $title;
 										$this->text = $text;
 										$this->section = $section;
-												* (bug 12896) A way to bypass Spam Blacklist

											
										
										
											2008-02-03 18:58:27 +00:00
+										$text = str_replace( '．', '.', $text ); //@bug 12896
-												Updated DB: for the 1.5 schema, fixed a few bugs

											
										
										
											2006-01-23 01:35:39 +00:00
-												Add a local blacklist at MediaWiki:Spam-blacklist which can always be used, just as the local whitelist at MediaWiki:Spam-whitelist.
Should save some trouble for annoyed people. :)
The regular message cache behavior is used for this message, so it'll also update immediately, without waiting for the shared caches to time out.
Additionally, added a fix for configurations which don't hardcode the PHP include_path by using $IP in an include for HttpFunctions.php.

											
										
										
											2007-07-07 17:21:49 +00:00
+										$blacklists = $this->getBlacklists();
-												Split giant regexes so PCRE stops screaming about them.
Haven't tested cleanup.php

											
										
										
											2006-09-18 09:56:57 +00:00
+										$whitelists = $this->getWhitelists();
-												remove some ending whitespaces

											
										
										
											2007-01-06 20:56:46 +00:00
-												Add a local blacklist at MediaWiki:Spam-blacklist which can always be used, just as the local whitelist at MediaWiki:Spam-whitelist.
Should save some trouble for annoyed people. :)
The regular message cache behavior is used for this message, so it'll also update immediately, without waiting for the shared caches to time out.
Additionally, added a fix for configurations which don't hardcode the PHP include_path by using $IP in an include for HttpFunctions.php.

											
										
										
											2007-07-07 17:21:49 +00:00
+										if ( count( $blacklists ) ) {
-												Run text through the parser and get the actual links recorded instead of trying to second-guess behavior

											
										
										
											2006-06-22 20:35:49 +00:00
+											# Run parser to strip SGML comments and such out of the markup
-												(bug 5185) Strip out SGML comments before scanning the text for matches so some nutter can't circumvent the lot with a well placed <!-- -->

											
										
										
											2006-04-12 04:59:27 +00:00
+											# This was being used to circumvent the filter (see bug 5185)
-												* Optimised startup
* Use the new EditFilterMerged hook if available, for faster link finding
* Random bits of code were leaking out of the body file into the loader, poked them back in.

											
										
										
											2007-11-12 07:44:17 +00:00
+											if ( $editPage ) {
 												$editInfo = $editPage->mArticle->prepareTextForEdit( $text );
 												$out = $editInfo->output;
 											} else {
 												$options = new ParserOptions();
 												$text = $wgParser->preSaveTransform( $text, $title, $wgUser, $options );
 												$out = $wgParser->parse( $text, $title, $options );
 											}
-												* (bug 1505) Limit spam blacklist checks to new URLs to reduce disruption of existing pages being legitimately edited by legitimate people which happen to already have some spam on them.

Steals the load-existing-links function out of ConfirmEdit.

											
										
										
											2008-05-13 23:31:33 +00:00
+											$newLinks = array_keys( $out->getExternalLinks() );
 											$oldLinks = $this->getCurrentLinks( $title );
 											$addedLinks = array_diff( $newLinks, $oldLinks );
-												(bug 16120) Prevent death on Spam Blacklist trigger using API. Patch by Brad Jorsch.

An API edit attempt with Spam Blacklist firing will now output something instead of crashing:

<?xml version="1.0"?><api><edit spamblacklist="http://blacklistme.example.com"
result="Failure" /></api>

											
										
										
											2008-11-02 22:40:02 +00:00
-												And now SpamBlacklist checks the edit summary field.

											
										
										
											2008-06-19 03:14:34 +00:00
+											// We add the edit summary if one exists
-												More undefined variables

											
										
										
											2011-01-23 10:34:56 +00:00
+											if ( !$this->ignoreEditSummary && !empty( $editsummary ) ) {
 												$addedLinks[] = $editsummary;
 											}
-												(bug 16120) Prevent death on Spam Blacklist trigger using API. Patch by Brad Jorsch.

An API edit attempt with Spam Blacklist firing will now output something instead of crashing:

<?xml version="1.0"?><api><edit spamblacklist="http://blacklistme.example.com"
result="Failure" /></api>

											
										
										
											2008-11-02 22:40:02 +00:00
-												* (bug 1505) Limit spam blacklist checks to new URLs to reduce disruption of existing pages being legitimately edited by legitimate people which happen to already have some spam on them.

Steals the load-existing-links function out of ConfirmEdit.

											
										
										
											2008-05-13 23:31:33 +00:00
+											wfDebugLog( 'SpamBlacklist', "Old URLs: " . implode( ', ', $oldLinks ) );
 											wfDebugLog( 'SpamBlacklist', "New URLs: " . implode( ', ', $newLinks ) );
 											wfDebugLog( 'SpamBlacklist', "Added URLs: " . implode( ', ', $addedLinks ) );
-												(bug 16120) Prevent death on Spam Blacklist trigger using API. Patch by Brad Jorsch.

An API edit attempt with Spam Blacklist firing will now output something instead of crashing:

<?xml version="1.0"?><api><edit spamblacklist="http://blacklistme.example.com"
result="Failure" /></api>

											
										
										
											2008-11-02 22:40:02 +00:00
-												* (bug 1505) Limit spam blacklist checks to new URLs to reduce disruption of existing pages being legitimately edited by legitimate people which happen to already have some spam on them.

Steals the load-existing-links function out of ConfirmEdit.

											
										
										
											2008-05-13 23:31:33 +00:00
+											$links = implode( "\n", $addedLinks );
-												remove some ending whitespaces

											
										
										
											2007-01-06 20:56:46 +00:00
-												Add a local whitelist, editable by admins at [[MediaWiki:Spam-whitelist]]

											
										
										
											2006-06-22 19:59:43 +00:00
+											# Strip whitelisted URLs from the match
-												Split giant regexes so PCRE stops screaming about them.
Haven't tested cleanup.php

											
										
										
											2006-09-18 09:56:57 +00:00
+											if( is_array( $whitelists ) ) {
-												Break spam blacklist log info out to a sep file

											
										
										
											2007-10-03 00:19:36 +00:00
+												wfDebugLog( 'SpamBlacklist', "Excluding whitelisted URLs from " . count( $whitelists ) .
-												Split giant regexes so PCRE stops screaming about them.
Haven't tested cleanup.php

											
										
										
											2006-09-18 09:56:57 +00:00
+													" regexes: " . implode( ', ', $whitelists ) . "\n" );
 												foreach( $whitelists as $regex ) {
-												suppress warnings

											
										
										
											2007-08-08 15:42:36 +00:00
+													wfSuppressWarnings();
-												* (bug 11545) Don't let everything through if there's a bogus whitelist entry

											
										
										
											2007-10-03 00:48:57 +00:00
+													$newLinks = preg_replace( $regex, '', $links );
-												suppress warnings

											
										
										
											2007-08-08 15:42:36 +00:00
+													wfRestoreWarnings();
-												* (bug 11545) Don't let everything through if there's a bogus whitelist entry

											
										
										
											2007-10-03 00:48:57 +00:00
+													if( is_string( $newLinks ) ) {
 														// If there wasn't a regex error, strip the matching URLs
 														$links = $newLinks;
 													}
-												Split giant regexes so PCRE stops screaming about them.
Haven't tested cleanup.php

											
										
										
											2006-09-18 09:56:57 +00:00
+												}
-												Add a local whitelist, editable by admins at [[MediaWiki:Spam-whitelist]]

											
										
										
											2006-06-22 19:59:43 +00:00
+											}
-												(bug 5185) Strip out SGML comments before scanning the text for matches so some nutter can't circumvent the lot with a well placed <!-- -->

											
										
										
											2006-04-12 04:59:27 +00:00
-												from phase3/extensions

											
										
										
											2004-12-11 09:59:06 +00:00
+											# Do the match
-												Break spam blacklist log info out to a sep file

											
										
										
											2007-10-03 00:19:36 +00:00
+											wfDebugLog( 'SpamBlacklist', "Checking text against " . count( $blacklists ) .
-												Add a local blacklist at MediaWiki:Spam-blacklist which can always be used, just as the local whitelist at MediaWiki:Spam-whitelist.
Should save some trouble for annoyed people. :)
The regular message cache behavior is used for this message, so it'll also update immediately, without waiting for the shared caches to time out.
Additionally, added a fix for configurations which don't hardcode the PHP include_path by using $IP in an include for HttpFunctions.php.

											
										
										
											2007-07-07 17:21:49 +00:00
+												" regexes: " . implode( ', ', $blacklists ) . "\n" );
-												Split giant regexes so PCRE stops screaming about them.
Haven't tested cleanup.php

											
										
										
											2006-09-18 09:56:57 +00:00
+											$retVal = false;
-												Add a local blacklist at MediaWiki:Spam-blacklist which can always be used, just as the local whitelist at MediaWiki:Spam-whitelist.
Should save some trouble for annoyed people. :)
The regular message cache behavior is used for this message, so it'll also update immediately, without waiting for the shared caches to time out.
Additionally, added a fix for configurations which don't hardcode the PHP include_path by using $IP in an include for HttpFunctions.php.

											
										
										
											2007-07-07 17:21:49 +00:00
+											foreach( $blacklists as $regex ) {
-												Some polishing and refactoring on this monstrosity, it's been allowed to grow without some good snipping in a while. :)
* Handle bad regexes more gracefully:
 - The batched regexes are tested for validity, and if one is bad, the lines from that source are broken out line-by-line. This is slower, but the other lines in that source will still be applied correctly.
 - Suppress warnings and be more verbose in the debug log.
 - Check for bad regexes when a local blacklist page is edited, and prompt the user to fix the bad lines.
* Caching issues:
 - Cache the full regexes per-DB instead of per-site; this should be friendlier to shared environments where not every wiki has the same configuration.
 - Hopefully improve the recaching of local pages, which looked like it would preemptively apply the being-edited text to the cache during the filter callback, even though something else might stop the page from being saved. Now just clearing the cache after save is complete, letting it re-load later.
* Split out some of the regex batch functions for clarity.

There are probably still issues with caching of HTTP bits, and in general the local DB loading looks verrrry fragile.
Test this a bit more before syncing. :)

											
										
										
											2007-07-20 21:13:26 +00:00
+												wfSuppressWarnings();
-												More undefined variables

											
										
										
											2011-01-23 10:34:56 +00:00
+												$matches = array();
-												Some polishing and refactoring on this monstrosity, it's been allowed to grow without some good snipping in a while. :)
* Handle bad regexes more gracefully:
 - The batched regexes are tested for validity, and if one is bad, the lines from that source are broken out line-by-line. This is slower, but the other lines in that source will still be applied correctly.
 - Suppress warnings and be more verbose in the debug log.
 - Check for bad regexes when a local blacklist page is edited, and prompt the user to fix the bad lines.
* Caching issues:
 - Cache the full regexes per-DB instead of per-site; this should be friendlier to shared environments where not every wiki has the same configuration.
 - Hopefully improve the recaching of local pages, which looked like it would preemptively apply the being-edited text to the cache during the filter callback, even though something else might stop the page from being saved. Now just clearing the cache after save is complete, letting it re-load later.
* Split out some of the regex batch functions for clarity.

There are probably still issues with caching of HTTP bits, and in general the local DB loading looks verrrry fragile.
Test this a bit more before syncing. :)

											
										
										
											2007-07-20 21:13:26 +00:00
+												$check = preg_match( $regex, $links, $matches );
 												wfRestoreWarnings();
 												if( $check ) {
-												Break spam blacklist log info out to a sep file

											
										
										
											2007-10-03 00:19:36 +00:00
+													wfDebugLog( 'SpamBlacklist', "Match!\n" );
-												apply live hacks: debug logging for spam regex & blacklist hits

											
										
										
											2008-06-19 23:33:45 +00:00
+													$ip = wfGetIP();
 													wfDebugLog( 'SpamBlacklistHit', "$ip caught submitting spam: {$matches[0]}\n" );
-												(bug 16120) Prevent death on Spam Blacklist trigger using API. Patch by Brad Jorsch.

An API edit attempt with Spam Blacklist firing will now output something instead of crashing:

<?xml version="1.0"?><api><edit spamblacklist="http://blacklistme.example.com"
result="Failure" /></api>

											
										
										
											2008-11-02 22:40:02 +00:00
+													$retVal = $matches[0];
-												Split giant regexes so PCRE stops screaming about them.
Haven't tested cleanup.php

											
										
										
											2006-09-18 09:56:57 +00:00
+													break;
 												}
-												from phase3/extensions

											
										
										
											2004-12-11 09:59:06 +00:00
+											}
 										} else {
 											$retVal = false;
 										}
 										wfProfileOut( $fname );
 										return $retVal;
 									}
-												(bug 16120) Prevent death on Spam Blacklist trigger using API. Patch by Brad Jorsch.

An API edit attempt with Spam Blacklist firing will now output something instead of crashing:

<?xml version="1.0"?><api><edit spamblacklist="http://blacklistme.example.com"
result="Failure" /></api>

											
										
										
											2008-11-02 22:40:02 +00:00
-												* (bug 1505) Limit spam blacklist checks to new URLs to reduce disruption of existing pages being legitimately edited by legitimate people which happen to already have some spam on them.

Steals the load-existing-links function out of ConfirmEdit.

											
										
										
											2008-05-13 23:31:33 +00:00
+									/**
 									 * Look up the links currently in the article, so we can
 									 * ignore them on a second run.
 									 *
 									 * WARNING: I can add more *of the same link* with no problem here.
 									 */
 									function getCurrentLinks( $title ) {
 										$id = $title->getArticleId(); // should be zero queries
 										if( !$id ) {
 											// NOTE(review): an id of 0 presumably means the page has not been
 											// created yet; no externallinks row can reference it, so skip the query.
 											return array();
 										}
 										$dbr = wfGetDB( DB_SLAVE );
 										// Fetch every external link target recorded for this page id.
 										$res = $dbr->select( 'externallinks', array( 'el_to' ),
 											array( 'el_from' => $id ), __METHOD__ );
 										$links = array();
 										foreach ( $res as $row ) {
 											$links[] = $row->el_to;
 										}
 										return $links;
 									}
-												from phase3/extensions

											
										
										
											2004-12-11 09:59:06 +00:00
-												Some polishing and refactoring on this monstrosity, it's been allowed to grow without some good snipping in a while. :)
* Handle bad regexes more gracefully:
 - The batched regexes are tested for validity, and if one is bad, the lines from that source are broken out line-by-line. This is slower, but the other lines in that source will still be applied correctly.
 - Suppress warnings and be more verbose in the debug log.
 - Check for bad regexes when a local blacklist page is edited, and prompt the user to fix the bad lines.
* Caching issues:
 - Cache the full regexes per-DB instead of per-site; this should be friendlier to shared environments where not every wiki has the same configuration.
 - Hopefully improve the recaching of local pages, which looked like it would preemptively apply the being-edited text to the cache during the filter callback, even though something else might stop the page from being saved. Now just clearing the cache after save is complete, letting it re-load later.
* Split out some of the regex batch functions for clarity.

There are probably still issues with caching of HTTP bits, and in general the local DB loading looks verrrry fragile.
Test this a bit more before syncing. :)

											
										
										
											2007-07-20 21:13:26 +00:00
+									/**
 									 * Fetch an article from this or another local MediaWiki database.
 									 * This is probably *very* fragile, and shouldn't be used perhaps.
 									 * @param string $db
 									 * @param string $article
 									 */
 									function getArticleText( $db, $article ) {
 										global $wgDBname;
 										wfDebugLog( 'SpamBlacklist', "Fetching local spam blacklist from '$article' on '$db'...\n" );
 										// Resolve the title once, before touching the connection, so that an
 										// unparseable page name cannot leave the connection pointed at $db.
 										$title = Title::newFromText( $article );
 										if ( !$title ) {
 											wfDebugLog( 'SpamBlacklist', "Invalid title '$article'; cannot fetch local spam blacklist\n" );
 											return '';
 										}
 										// DB_SLAVE matches the constant used by getCurrentLinks().
 										// NOTE(review): a single handle is used here on the assumption that
 										// repeated wfGetDB() calls for the same index share one connection,
 										// which the Revision lookup below already relies on -- confirm.
 										$dbr = wfGetDB( DB_SLAVE );
 										$dbr->selectDB( $db );
 										$text = false;
 										if ( $dbr->tableExists( 'page' ) ) {
 											// 1.5 schema
 											$revision = Revision::newFromTitle( $title );
 											if ( $revision ) {
 												$text = $revision->getText();
 											}
 										} else {
 											// 1.4 schema
 											$text = $dbr->selectField( 'cur', 'cur_text', array( 'cur_namespace' => $title->getNamespace(),
 												'cur_title' => $title->getDBkey() ), 'SpamBlacklist::getArticleText' );
 										}
 										// Switch the connection back to this wiki's own database.
 										$dbr->selectDB( $wgDBname );
 										// A missing page leaves $text === false, which becomes ''.
 										return strval( $text );
 									}
-												forgot to commit this

											
										
										
											2005-07-02 09:03:53 +00:00
-												* Optimised startup
* Use the new EditFilterMerged hook if available, for faster link finding
* Random bits of code were leaking out of the body file into the loader, poked them back in.

											
										
										
											2007-11-12 07:44:17 +00:00
+									/**
 									 * Confirm that a local blacklist page being saved is valid,
 									 * and toss back a warning to the user if it isn't.
 									 * This is an EditFilter hook.
 									 */
 									function validate( $editPage, $text, $section, &$hookError ) {
 										$thisPageName = $editPage->mTitle->getPrefixedDBkey();
 										// Pages that are not a blacklist source need no line checking.
 										if( !$this->isLocalSource( $editPage->mTitle ) ) {
 											wfDebugLog( 'SpamBlacklist', "Spam blacklist validator: [[$thisPageName]] not a local blacklist\n" );
 											return true;
 										}
 										$badLines = SpamRegexBatch::getBadLines( explode( "\n", $text ) );
 										if( !$badLines ) {
 											wfDebugLog( 'SpamBlacklist', "Spam blacklist validator: [[$thisPageName]] ok or empty blacklist\n" );
 											return true;
 										}
 										wfDebugLog( 'SpamBlacklist', "Spam blacklist validator: [[$thisPageName]] given invalid input lines: " .
 											implode( ', ', $badLines ) . "\n" );
 										// One bullet per offending line, escaped as wikitext.
 										$badList = "*<tt>" .
 											implode( "</tt>\n*<tt>",
 												array_map( 'wfEscapeWikiText', $badLines ) ) .
 											"</tt>\n";
 										// The problem is reported through $hookError (passed by reference);
 										// the hook itself still returns true.
 										$hookError =
 											"<div class='errorbox'>" .
 											wfMsgExt( 'spam-invalid-lines', array( 'parsemag' ), count( $badLines ) ) . "<br />" .
 											$badList .
 											"</div>\n" .
 											"<br clear='all' />\n";
 										return true;
 									}
 									/**
 									 * Save callback: when the saved page is one that isLocalSource()
 									 * recognises, call clearCache().
 									 * @return bool always true
 									 */
 									function onArticleSave( &$article, &$user, $text, $summary, $isminor, $iswatch, $section ) {
 										$savedTitle = $article->getTitle();
 										if( $this->isLocalSource( $savedTitle ) ) {
 											$this->clearCache();
 										}
 										// The return value is true whether or not clearCache() ran.
 										return true;
 									}
-												from phase3/extensions

											
										
										
											2004-12-11 09:59:06 +00:00
+								}
-												remove some ending whitespaces

											
										
										
											2007-01-06 20:56:46 +00:00
-												Some polishing and refactoring on this monstrosity, it's been allowed to grow without some good snipping in a while. :)
* Handle bad regexes more gracefully:
 - The batched regexes are tested for validity, and if one is bad, the lines from that source are broken out line-by-line. This is slower, but the other lines in that source will still be applied correctly.
 - Suppress warnings and be more verbose in the debug log.
 - Check for bad regexes when a local blacklist page is edited, and prompt the user to fix the bad lines.
* Caching issues:
 - Cache the full regexes per-DB instead of per-site; this should be friendlier to shared environments where not every wiki has the same configuration.
 - Hopefully improve the recaching of local pages, which looked like it would preemptively apply the being-edited text to the cache during the filter callback, even though something else might stop the page from being saved. Now just clearing the cache after save is complete, letting it re-load later.
* Split out some of the regex batch functions for clarity.

There are probably still issues with caching of HTTP bits, and in general the local DB loading looks verrrry fragile.
Test this a bit more before syncing. :)

											
										
										
											2007-07-20 21:13:26 +00:00
+								class SpamRegexBatch {
 									/**
 									 * Build a set of regular expressions matching URLs with the list of regex fragments.
 									 * Returns an empty list if the input list is empty.
 									 *
 									 * Fragments are joined with '|' into batches. Each batch is wrapped in a
 									 * pattern that matches "http://" or "https://" plus an optional host prefix,
 									 * with every '/' in the batch escaped exactly once. Fragments ending in a
 									 * backslash are skipped.
 									 *
 									 * @param array $lines list of fragments which will match in URLs
 									 * @param int $batchSize largest allowed batch regex;
 									 *                       if 0, will produce one regex per line
 									 * @return array
 									 * @private
 									 * @static
 									 */
 									static function buildRegexes( $lines, $batchSize=4096 ) {
 										$regexStart = '/https?:\/\/+[a-z0-9_\-.]*(';
 										# It's faster using the S modifier even though it will usually only be run once
 										$regexEnd = ($batchSize > 0 ) ? ')/Sim' : ')/im';
 										// First pass: group the raw fragments into '|'-joined batches.
 										$batches = array();
 										$build = false;
 										foreach( $lines as $line ) {
 											if( substr( $line, -1, 1 ) == "\\" ) {
 												// Final \ will break silently on the batched regexes.
 												// Skip it here to avoid breaking the next line;
 												// warnings from getBadLines() will still trigger on
 												// edit to keep new ones from floating in.
 												continue;
 											}
 											// FIXME: not very robust size check, but should work. :)
 											if( $build === false ) {
 												$build = $line;
 											} elseif( strlen( $build ) + strlen( $line ) > $batchSize ) {
 												$batches[] = $build;
 												$build = $line;
 											} else {
 												$build .= '|' . $line;
 											}
 										}
 										if( $build !== false ) {
 											$batches[] = $build;
 										}
 										// Second pass: collapse any run of backslashes before a '/' and then
 										// escape every '/' once, so it cannot end the '/'-delimited pattern.
 										$regexes = array();
 										foreach( $batches as $batch ) {
 											$collapsed = preg_replace( '|\\\*/|u', '/', $batch );
 											if( $collapsed === null ) {
 												// preg_replace() yields null when the /u pass rejects the input
 												// (e.g. invalid UTF-8). Without this fallback the batch would
 												// become an empty group "()", which matches every http(s) URL.
 												$collapsed = preg_replace( '|\\\*/|', '/', $batch );
 											}
 											$regexes[] = $regexStart . str_replace( '/', '\/', $collapsed ) . $regexEnd;
 										}
 										return $regexes;
 									}
-												(bug 16120) Prevent death on Spam Blacklist trigger using API. Patch by Brad Jorsch.

An API edit attempt with Spam Blacklist firing will now output something instead of crashing:

<?xml version="1.0"?><api><edit spamblacklist="http://blacklistme.example.com"
result="Failure" /></api>

											
										
										
											2008-11-02 22:40:02 +00:00
-												Some polishing and refactoring on this monstrosity, it's been allowed to grow without some good snipping in a while. :)
* Handle bad regexes more gracefully:
 - The batched regexes are tested for validity, and if one is bad, the lines from that source are broken out line-by-line. This is slower, but the other lines in that source will still be applied correctly.
 - Suppress warnings and be more verbose in the debug log.
 - Check for bad regexes when a local blacklist page is edited, and prompt the user to fix the bad lines.
* Caching issues:
 - Cache the full regexes per-DB instead of per-site; this should be friendlier to shared environments where not every wiki has the same configuration.
 - Hopefully improve the recaching of local pages, which looked like it would preemptively apply the being-edited text to the cache during the filter callback, even though something else might stop the page from being saved. Now just clearing the cache after save is complete, letting it re-load later.
* Split out some of the regex batch functions for clarity.

There are probably still issues with caching of HTTP bits, and in general the local DB loading looks verrrry fragile.
Test this a bit more before syncing. :)

											
										
										
											2007-07-20 21:13:26 +00:00
+									/**
 									 * Confirm that a set of regexes is either empty or valid.
 									 * @param array $lines set of regexes
 									 * @return bool true if ok, false if contains invalid lines
 									 * @private
 									 * @static
 									 */
-												Fix E_SCRICT errors and pass-by-ref error. Blank pages getting thrown around.

											
										
										
											2008-05-14 12:44:34 +00:00
+									static function validateRegexes( $regexes ) {
-												Some polishing and refactoring on this monstrosity, it's been allowed to grow without some good snipping in a while. :)
* Handle bad regexes more gracefully:
 - The batched regexes are tested for validity, and if one is bad, the lines from that source are broken out line-by-line. This is slower, but the other lines in that source will still be applied correctly.
 - Suppress warnings and be more verbose in the debug log.
 - Check for bad regexes when a local blacklist page is edited, and prompt the user to fix the bad lines.
* Caching issues:
 - Cache the full regexes per-DB instead of per-site; this should be friendlier to shared environments where not every wiki has the same configuration.
 - Hopefully improve the recaching of local pages, which looked like it would preemptively apply the being-edited text to the cache during the filter callback, even though something else might stop the page from being saved. Now just clearing the cache after save is complete, letting it re-load later.
* Split out some of the regex batch functions for clarity.

There are probably still issues with caching of HTTP bits, and in general the local DB loading looks verrrry fragile.
Test this a bit more before syncing. :)

											
										
										
											2007-07-20 21:13:26 +00:00
+										foreach( $regexes as $regex ) {
 											wfSuppressWarnings();
 											$ok = preg_match( $regex, '' );
 											wfRestoreWarnings();
-												(bug 16120) Prevent death on Spam Blacklist trigger using API. Patch by Brad Jorsch.

An API edit attempt with Spam Blacklist firing will now output something instead of crashing:

<?xml version="1.0"?><api><edit spamblacklist="http://blacklistme.example.com"
result="Failure" /></api>

											
										
										
											2008-11-02 22:40:02 +00:00
-												Some polishing and refactoring on this monstrosity, it's been allowed to grow without some good snipping in a while. :)
* Handle bad regexes more gracefully:
 - The batched regexes are tested for validity, and if one is bad, the lines from that source are broken out line-by-line. This is slower, but the other lines in that source will still be applied correctly.
 - Suppress warnings and be more verbose in the debug log.
 - Check for bad regexes when a local blacklist page is edited, and prompt the user to fix the bad lines.
* Caching issues:
 - Cache the full regexes per-DB instead of per-site; this should be friendlier to shared environments where not every wiki has the same configuration.
 - Hopefully improve the recaching of local pages, which looked like it would preemptively apply the being-edited text to the cache during the filter callback, even though something else might stop the page from being saved. Now just clearing the cache after save is complete, letting it re-load later.
* Split out some of the regex batch functions for clarity.

There are probably still issues with caching of HTTP bits, and in general the local DB loading looks verrrry fragile.
Test this a bit more before syncing. :)

											
										
										
											2007-07-20 21:13:26 +00:00
+											if( $ok === false ) {
 												return false;
 											}
 										}
 										return true;
 									}
-												(bug 16120) Prevent death on Spam Blacklist trigger using API. Patch by Brad Jorsch.

An API edit attempt with Spam Blacklist firing will now output something instead of crashing:

<?xml version="1.0"?><api><edit spamblacklist="http://blacklistme.example.com"
result="Failure" /></api>

											
										
										
											2008-11-02 22:40:02 +00:00
-												Some polishing and refactoring on this monstrosity, it's been allowed to grow without some good snipping in a while. :)
* Handle bad regexes more gracefully:
 - The batched regexes are tested for validity, and if one is bad, the lines from that source are broken out line-by-line. This is slower, but the other lines in that source will still be applied correctly.
 - Suppress warnings and be more verbose in the debug log.
 - Check for bad regexes when a local blacklist page is edited, and prompt the user to fix the bad lines.
* Caching issues:
 - Cache the full regexes per-DB instead of per-site; this should be friendlier to shared environments where not every wiki has the same configuration.
 - Hopefully improve the recaching of local pages, which looked like it would preemptively apply the being-edited text to the cache during the filter callback, even though something else might stop the page from being saved. Now just clearing the cache after save is complete, letting it re-load later.
* Split out some of the regex batch functions for clarity.

There are probably still issues with caching of HTTP bits, and in general the local DB loading looks verrrry fragile.
Test this a bit more before syncing. :)

											
										
										
											2007-07-20 21:13:26 +00:00
+									/**
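 									 * Strip '#' comments and surrounding whitespace from each line, then remove blanks.
 									 * @param array $lines raw input lines
 									 * @return array of the remaining non-blank lines
 									 * @private
 									 * @static
 									 */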
-												Fix E_STRICT errors and pass-by-ref error. Blank pages getting thrown around.

											
										
										
											2008-05-14 12:44:34 +00:00
+									static function stripLines( $lines ) {
-												Some polishing and refactoring on this monstrosity, it's been allowed to grow without some good snipping in a while. :)
* Handle bad regexes more gracefully:
 - The batched regexes are tested for validity, and if one is bad, the lines from that source are broken out line-by-line. This is slower, but the other lines in that source will still be applied correctly.
 - Suppress warnings and be more verbose in the debug log.
 - Check for bad regexes when a local blacklist page is edited, and prompt the user to fix the bad lines.
* Caching issues:
 - Cache the full regexes per-DB instead of per-site; this should be friendlier to shared environments where not every wiki has the same configuration.
 - Hopefully improve the recaching of local pages, which looked like it would preemptively apply the being-edited text to the cache during the filter callback, even though something else might stop the page from being saved. Now just clearing the cache after save is complete, letting it re-load later.
* Split out some of the regex batch functions for clarity.

There are probably still issues with caching of HTTP bits, and in general the local DB loading looks verrrry fragile.
Test this a bit more before syncing. :)

											
										
										
											2007-07-20 21:13:26 +00:00
+										return array_filter(
 											array_map( 'trim',
-												* (bug 12896) A way to bypass Spam Blacklist

											
										
										
											2008-02-03 18:58:27 +00:00
+													preg_replace( '/#.*$/', '',
 														$lines ) ) );
-												Some polishing and refactoring on this monstrosity, it's been allowed to grow without some good snipping in a while. :)
* Handle bad regexes more gracefully:
 - The batched regexes are tested for validity, and if one is bad, the lines from that source are broken out line-by-line. This is slower, but the other lines in that source will still be applied correctly.
 - Suppress warnings and be more verbose in the debug log.
 - Check for bad regexes when a local blacklist page is edited, and prompt the user to fix the bad lines.
* Caching issues:
 - Cache the full regexes per-DB instead of per-site; this should be friendlier to shared environments where not every wiki has the same configuration.
 - Hopefully improve the recaching of local pages, which looked like it would preemptively apply the being-edited text to the cache during the filter callback, even though something else might stop the page from being saved. Now just clearing the cache after save is complete, letting it re-load later.
* Split out some of the regex batch functions for clarity.

There are probably still issues with caching of HTTP bits, and in general the local DB loading looks verrrry fragile.
Test this a bit more before syncing. :)

											
										
										
											2007-07-20 21:13:26 +00:00
+									}
-												(bug 16120) Prevent death on Spam Blacklist trigger using API. Patch by Brad Jorsch.

An API edit attempt with Spam Blacklist firing will now output something instead of crashing:

<?xml version="1.0"?><api><edit spamblacklist="http://blacklistme.example.com"
result="Failure" /></api>

											
										
										
											2008-11-02 22:40:02 +00:00
-												Some polishing and refactoring on this monstrosity, it's been allowed to grow without some good snipping in a while. :)
* Handle bad regexes more gracefully:
 - The batched regexes are tested for validity, and if one is bad, the lines from that source are broken out line-by-line. This is slower, but the other lines in that source will still be applied correctly.
 - Suppress warnings and be more verbose in the debug log.
 - Check for bad regexes when a local blacklist page is edited, and prompt the user to fix the bad lines.
* Caching issues:
 - Cache the full regexes per-DB instead of per-site; this should be friendlier to shared environments where not every wiki has the same configuration.
 - Hopefully improve the recaching of local pages, which looked like it would preemptively apply the being-edited text to the cache during the filter callback, even though something else might stop the page from being saved. Now just clearing the cache after save is complete, letting it re-load later.
* Split out some of the regex batch functions for clarity.

There are probably still issues with caching of HTTP bits, and in general the local DB loading looks verrrry fragile.
Test this a bit more before syncing. :)

											
										
										
											2007-07-20 21:13:26 +00:00
+									/**
 									 * Do a sanity check on the batch regex.
 									 * @param array $lines unsanitized input lines
 									 * @param string $fileName optional for debug reporting
 									 * @return array of regexes
 									 * @private
 									 * @static
 									 */
-												Fix E_STRICT errors and pass-by-ref error. Blank pages getting thrown around.

											
										
										
											2008-05-14 12:44:34 +00:00
+									static function buildSafeRegexes( $lines, $fileName=false ) {
-												Some polishing and refactoring on this monstrosity, it's been allowed to grow without some good snipping in a while. :)
* Handle bad regexes more gracefully:
 - The batched regexes are tested for validity, and if one is bad, the lines from that source are broken out line-by-line. This is slower, but the other lines in that source will still be applied correctly.
 - Suppress warnings and be more verbose in the debug log.
 - Check for bad regexes when a local blacklist page is edited, and prompt the user to fix the bad lines.
* Caching issues:
 - Cache the full regexes per-DB instead of per-site; this should be friendlier to shared environments where not every wiki has the same configuration.
 - Hopefully improve the recaching of local pages, which looked like it would preemptively apply the being-edited text to the cache during the filter callback, even though something else might stop the page from being saved. Now just clearing the cache after save is complete, letting it re-load later.
* Split out some of the regex batch functions for clarity.

There are probably still issues with caching of HTTP bits, and in general the local DB loading looks verrrry fragile.
Test this a bit more before syncing. :)

											
										
										
											2007-07-20 21:13:26 +00:00
+										$lines = SpamRegexBatch::stripLines( $lines );
 										$regexes = SpamRegexBatch::buildRegexes( $lines );
 										if( SpamRegexBatch::validateRegexes( $regexes ) ) {
 											return $regexes;
 										} else {
 											// _Something_ broke... rebuild line-by-line; it'll be
 											// slower if there's a lot of blacklist lines, but one
 											// broken line won't take out hundreds of its brothers.
 											if( $fileName ) {
-												Break spam blacklist log info out to a sep file

											
										
										
											2007-10-03 00:19:36 +00:00
+												wfDebugLog( 'SpamBlacklist', "Spam blacklist warning: bogus line in $fileName\n" );
-												Some polishing and refactoring on this monstrosity, it's been allowed to grow without some good snipping in a while. :)
* Handle bad regexes more gracefully:
 - The batched regexes are tested for validity, and if one is bad, the lines from that source are broken out line-by-line. This is slower, but the other lines in that source will still be applied correctly.
 - Suppress warnings and be more verbose in the debug log.
 - Check for bad regexes when a local blacklist page is edited, and prompt the user to fix the bad lines.
* Caching issues:
 - Cache the full regexes per-DB instead of per-site; this should be friendlier to shared environments where not every wiki has the same configuration.
 - Hopefully improve the recaching of local pages, which looked like it would preemptively apply the being-edited text to the cache during the filter callback, even though something else might stop the page from being saved. Now just clearing the cache after save is complete, letting it re-load later.
* Split out some of the regex batch functions for clarity.

There are probably still issues with caching of HTTP bits, and in general the local DB loading looks verrrry fragile.
Test this a bit more before syncing. :)

											
										
										
											2007-07-20 21:13:26 +00:00
+											}
 											return SpamRegexBatch::buildRegexes( $lines, 0 );
 										}
 									}
-												(bug 16120) Prevent death on Spam Blacklist trigger using API. Patch by Brad Jorsch.

An API edit attempt with Spam Blacklist firing will now output something instead of crashing:

<?xml version="1.0"?><api><edit spamblacklist="http://blacklistme.example.com"
result="Failure" /></api>

											
										
										
											2008-11-02 22:40:02 +00:00
-												Some polishing and refactoring on this monstrosity, it's been allowed to grow without some good snipping in a while. :)
* Handle bad regexes more gracefully:
 - The batched regexes are tested for validity, and if one is bad, the lines from that source are broken out line-by-line. This is slower, but the other lines in that source will still be applied correctly.
 - Suppress warnings and be more verbose in the debug log.
 - Check for bad regexes when a local blacklist page is edited, and prompt the user to fix the bad lines.
* Caching issues:
 - Cache the full regexes per-DB instead of per-site; this should be friendlier to shared environments where not every wiki has the same configuration.
 - Hopefully improve the recaching of local pages, which looked like it would preemptively apply the being-edited text to the cache during the filter callback, even though something else might stop the page from being saved. Now just clearing the cache after save is complete, letting it re-load later.
* Split out some of the regex batch functions for clarity.

There are probably still issues with caching of HTTP bits, and in general the local DB loading looks verrrry fragile.
Test this a bit more before syncing. :)

											
										
										
											2007-07-20 21:13:26 +00:00
+									/**
 									 * @param array $lines
 									 * @return array of input lines which produce invalid regexes, or empty array if no problems
 									 * @static
 									 */
-												Fix E_STRICT errors and pass-by-ref error. Blank pages getting thrown around.

											
										
										
											2008-05-14 12:44:34 +00:00
+									static function getBadLines( $lines ) {
-												Some polishing and refactoring on this monstrosity, it's been allowed to grow without some good snipping in a while. :)
* Handle bad regexes more gracefully:
 - The batched regexes are tested for validity, and if one is bad, the lines from that source are broken out line-by-line. This is slower, but the other lines in that source will still be applied correctly.
 - Suppress warnings and be more verbose in the debug log.
 - Check for bad regexes when a local blacklist page is edited, and prompt the user to fix the bad lines.
* Caching issues:
 - Cache the full regexes per-DB instead of per-site; this should be friendlier to shared environments where not every wiki has the same configuration.
 - Hopefully improve the recaching of local pages, which looked like it would preemptively apply the being-edited text to the cache during the filter callback, even though something else might stop the page from being saved. Now just clearing the cache after save is complete, letting it re-load later.
* Split out some of the regex batch functions for clarity.

There are probably still issues with caching of HTTP bits, and in general the local DB loading looks verrrry fragile.
Test this a bit more before syncing. :)

											
										
										
											2007-07-20 21:13:26 +00:00
+										$lines = SpamRegexBatch::stripLines( $lines );
-												(bug 16120) Prevent death on Spam Blacklist trigger using API. Patch by Brad Jorsch.

An API edit attempt with Spam Blacklist firing will now output something instead of crashing:

<?xml version="1.0"?><api><edit spamblacklist="http://blacklistme.example.com"
result="Failure" /></api>

											
										
										
											2008-11-02 22:40:02 +00:00
-												* (bug 15099) Bad regexes make at least some of the blacklist get ignored
Lines with "\" at the end would silently break both that and the following line in the batch, without triggering the overall parse errors.
Added a specific check for this case to skip the bad lines when building, and to check for them and report a warning during editing.

											
										
										
											2008-08-11 01:32:38 +00:00
+										$badLines = array();
 										foreach( $lines as $line ) {
 											if( substr( $line, -1, 1 ) == "\\" ) {
 												// Final \ will break silently on the batched regexes.
 												$badLines[] = $line;
 											}
 										}
-												(bug 16120) Prevent death on Spam Blacklist trigger using API. Patch by Brad Jorsch.

An API edit attempt with Spam Blacklist firing will now output something instead of crashing:

<?xml version="1.0"?><api><edit spamblacklist="http://blacklistme.example.com"
result="Failure" /></api>

											
										
										
											2008-11-02 22:40:02 +00:00
-												Some polishing and refactoring on this monstrosity, it's been allowed to grow without some good snipping in a while. :)
* Handle bad regexes more gracefully:
 - The batched regexes are tested for validity, and if one is bad, the lines from that source are broken out line-by-line. This is slower, but the other lines in that source will still be applied correctly.
 - Suppress warnings and be more verbose in the debug log.
 - Check for bad regexes when a local blacklist page is edited, and prompt the user to fix the bad lines.
* Caching issues:
 - Cache the full regexes per-DB instead of per-site; this should be friendlier to shared environments where not every wiki has the same configuration.
 - Hopefully improve the recaching of local pages, which looked like it would preemptively apply the being-edited text to the cache during the filter callback, even though something else might stop the page from being saved. Now just clearing the cache after save is complete, letting it re-load later.
* Split out some of the regex batch functions for clarity.

There are probably still issues with caching of HTTP bits, and in general the local DB loading looks verrrry fragile.
Test this a bit more before syncing. :)

											
										
										
											2007-07-20 21:13:26 +00:00
+										$regexes = SpamRegexBatch::buildRegexes( $lines );
 										if( SpamRegexBatch::validateRegexes( $regexes ) ) {
-												* (bug 15099) Bad regexes make at least some of the blacklist get ignored
Lines with "\" at the end would silently break both that and the following line in the batch, without triggering the overall parse errors.
Added a specific check for this case to skip the bad lines when building, and to check for them and report a warning during editing.

											
										
										
											2008-08-11 01:32:38 +00:00
+											// No other problems!
 											return $badLines;
-												Some polishing and refactoring on this monstrosity, it's been allowed to grow without some good snipping in a while. :)
* Handle bad regexes more gracefully:
 - The batched regexes are tested for validity, and if one is bad, the lines from that source are broken out line-by-line. This is slower, but the other lines in that source will still be applied correctly.
 - Suppress warnings and be more verbose in the debug log.
 - Check for bad regexes when a local blacklist page is edited, and prompt the user to fix the bad lines.
* Caching issues:
 - Cache the full regexes per-DB instead of per-site; this should be friendlier to shared environments where not every wiki has the same configuration.
 - Hopefully improve the recaching of local pages, which looked like it would preemptively apply the being-edited text to the cache during the filter callback, even though something else might stop the page from being saved. Now just clearing the cache after save is complete, letting it re-load later.
* Split out some of the regex batch functions for clarity.

There are probably still issues with caching of HTTP bits, and in general the local DB loading looks verrrry fragile.
Test this a bit more before syncing. :)

											
										
										
											2007-07-20 21:13:26 +00:00
+										}
-												(bug 16120) Prevent death on Spam Blacklist trigger using API. Patch by Brad Jorsch.

An API edit attempt with Spam Blacklist firing will now output something instead of crashing:

<?xml version="1.0"?><api><edit spamblacklist="http://blacklistme.example.com"
result="Failure" /></api>

											
										
										
											2008-11-02 22:40:02 +00:00
-												* (bug 15099) Bad regexes make at least some of the blacklist get ignored
Lines with "\" at the end would silently break both that and the following line in the batch, without triggering the overall parse errors.
Added a specific check for this case to skip the bad lines when building, and to check for them and report a warning during editing.

											
										
										
											2008-08-11 01:32:38 +00:00
+										// Something failed in the batch, so check them one by one.
-												Some polishing and refactoring on this monstrosity, it's been allowed to grow without some good snipping in a while. :)
* Handle bad regexes more gracefully:
 - The batched regexes are tested for validity, and if one is bad, the lines from that source are broken out line-by-line. This is slower, but the other lines in that source will still be applied correctly.
 - Suppress warnings and be more verbose in the debug log.
 - Check for bad regexes when a local blacklist page is edited, and prompt the user to fix the bad lines.
* Caching issues:
 - Cache the full regexes per-DB instead of per-site; this should be friendlier to shared environments where not every wiki has the same configuration.
 - Hopefully improve the recaching of local pages, which looked like it would preemptively apply the being-edited text to the cache during the filter callback, even though something else might stop the page from being saved. Now just clearing the cache after save is complete, letting it re-load later.
* Split out some of the regex batch functions for clarity.

There are probably still issues with caching of HTTP bits, and in general the local DB loading looks verrrry fragile.
Test this a bit more before syncing. :)

											
										
										
											2007-07-20 21:13:26 +00:00
+										foreach( $lines as $line ) {
 											$regexes = SpamRegexBatch::buildRegexes( array( $line ) );
 											if( !SpamRegexBatch::validateRegexes( $regexes ) ) {
 												$badLines[] = $line;
 											}
 										}
 										return $badLines;
 									}
-												(bug 16120) Prevent death on Spam Blacklist trigger using API. Patch by Brad Jorsch.

An API edit attempt with Spam Blacklist firing will now output something instead of crashing:

<?xml version="1.0"?><api><edit spamblacklist="http://blacklistme.example.com"
result="Failure" /></api>

											
										
										
											2008-11-02 22:40:02 +00:00
-												Some polishing and refactoring on this monstrosity, it's been allowed to grow without some good snipping in a while. :)
* Handle bad regexes more gracefully:
 - The batched regexes are tested for validity, and if one is bad, the lines from that source are broken out line-by-line. This is slower, but the other lines in that source will still be applied correctly.
 - Suppress warnings and be more verbose in the debug log.
 - Check for bad regexes when a local blacklist page is edited, and prompt the user to fix the bad lines.
* Caching issues:
 - Cache the full regexes per-DB instead of per-site; this should be friendlier to shared environments where not every wiki has the same configuration.
 - Hopefully improve the recaching of local pages, which looked like it would preemptively apply the being-edited text to the cache during the filter callback, even though something else might stop the page from being saved. Now just clearing the cache after save is complete, letting it re-load later.
* Split out some of the regex batch functions for clarity.

There are probably still issues with caching of HTTP bits, and in general the local DB loading looks verrrry fragile.
Test this a bit more before syncing. :)

											
										
										
											2007-07-20 21:13:26 +00:00
+									/**
 									 * Build a set of regular expressions from the given multiline input text,
 									 * with empty lines and comments stripped.
 									 *
 									 * @param string $source
 									 * @param string $fileName optional, for reporting of bad files
 									 * @return array of regular expressions, potentially empty
 									 * @static
 									 */
-												Fix E_STRICT errors and pass-by-ref error. Blank pages getting thrown around.

											
										
										
											2008-05-14 12:44:34 +00:00
+									static function regexesFromText( $source, $fileName=false ) {
-												Some polishing and refactoring on this monstrosity, it's been allowed to grow without some good snipping in a while. :)
* Handle bad regexes more gracefully:
 - The batched regexes are tested for validity, and if one is bad, the lines from that source are broken out line-by-line. This is slower, but the other lines in that source will still be applied correctly.
 - Suppress warnings and be more verbose in the debug log.
 - Check for bad regexes when a local blacklist page is edited, and prompt the user to fix the bad lines.
* Caching issues:
 - Cache the full regexes per-DB instead of per-site; this should be friendlier to shared environments where not every wiki has the same configuration.
 - Hopefully improve the recaching of local pages, which looked like it would preemptively apply the being-edited text to the cache during the filter callback, even though something else might stop the page from being saved. Now just clearing the cache after save is complete, letting it re-load later.
* Split out some of the regex batch functions for clarity.

There are probably still issues with caching of HTTP bits, and in general the local DB loading looks verrrry fragile.
Test this a bit more before syncing. :)

											
										
										
											2007-07-20 21:13:26 +00:00
+										$lines = explode( "\n", $source );
 										return SpamRegexBatch::buildSafeRegexes( $lines, $fileName );
 									}
-												(bug 16120) Prevent death on Spam Blacklist trigger using API. Patch by Brad Jorsch.

An API edit attempt with Spam Blacklist firing will now output something instead of crashing:

<?xml version="1.0"?><api><edit spamblacklist="http://blacklistme.example.com"
result="Failure" /></api>

											
										
										
											2008-11-02 22:40:02 +00:00
-												Some polishing and refactoring on this monstrosity, it's been allowed to grow without some good snipping in a while. :)
* Handle bad regexes more gracefully:
 - The batched regexes are tested for validity, and if one is bad, the lines from that source are broken out line-by-line. This is slower, but the other lines in that source will still be applied correctly.
 - Suppress warnings and be more verbose in the debug log.
 - Check for bad regexes when a local blacklist page is edited, and prompt the user to fix the bad lines.
* Caching issues:
 - Cache the full regexes per-DB instead of per-site; this should be friendlier to shared environments where not every wiki has the same configuration.
 - Hopefully improve the recaching of local pages, which looked like it would preemptively apply the being-edited text to the cache during the filter callback, even though something else might stop the page from being saved. Now just clearing the cache after save is complete, letting it re-load later.
* Split out some of the regex batch functions for clarity.

There are probably still issues with caching of HTTP bits, and in general the local DB loading looks verrrry fragile.
Test this a bit more before syncing. :)

											
										
										
											2007-07-20 21:13:26 +00:00
+									/**
 									 * Build a set of regular expressions from a MediaWiki message.
 									 * Will be correctly empty if the message isn't present.
 									 * @param string $message
 									 * @return array of regular expressions, potentially empty
 									 * @static
 									 */
-												Fix E_STRICT errors and pass-by-ref error. Blank pages getting thrown around.

											
										
										
											2008-05-14 12:44:34 +00:00
+									static function regexesFromMessage( $message ) {
-												Some polishing and refactoring on this monstrosity, it's been allowed to grow without some good snipping in a while. :)
* Handle bad regexes more gracefully:
 - The batched regexes are tested for validity, and if one is bad, the lines from that source are broken out line-by-line. This is slower, but the other lines in that source will still be applied correctly.
 - Suppress warnings and be more verbose in the debug log.
 - Check for bad regexes when a local blacklist page is edited, and prompt the user to fix the bad lines.
* Caching issues:
 - Cache the full regexes per-DB instead of per-site; this should be friendlier to shared environments where not every wiki has the same configuration.
 - Hopefully improve the recaching of local pages, which looked like it would preemptively apply the being-edited text to the cache during the filter callback, even though something else might stop the page from being saved. Now just clearing the cache after save is complete, letting it re-load later.
* Split out some of the regex batch functions for clarity.

There are probably still issues with caching of HTTP bits, and in general the local DB loading looks verrrry fragile.
Test this a bit more before syncing. :)

											
										
										
											2007-07-20 21:13:26 +00:00
+										$source = wfMsgForContent( $message );
 										if( $source && !wfEmptyMsg( $message, $source ) ) {
 											return SpamRegexBatch::regexesFromText( $source );
 										} else {
 											return array();
 										}
 									}
 								}