mediawiki-extensions-SpamBl.../SpamBlacklist_body.php

<?php

if ( defined( 'MEDIAWIKI' ) ) {

/**
 * Edit filter that rejects text containing URLs matched by a blacklist.
 *
 * The blacklist is assembled from one or more sources listed in $files:
 *   - "DB: <dbname> <title>"  a main-namespace page read from the cur table
 *   - "http://..."            a remote list, cached in $messageMemc
 *   - anything else           a local file readable with file()
 *
 * Every source line is treated as a regex fragment; "#" starts a comment.
 * The fragments are joined into one pattern which is cached in $wgMemc
 * under the key "spam_blacklist_regex".
 */
class SpamBlacklist {
	# Compiled pattern: false = not loaded yet (or unusable),
	# true = loaded but the blacklist is empty, string = PCRE pattern
	var $regex = false;
	# Optional callback invoked before this filter; a true result short-circuits
	var $previousFilter = false;
	# List of blacklist sources (a single non-array value is wrapped on first use)
	var $files = array();
	# Expiry passed to $messageMemc->set() for the "warning" key (presumably seconds)
	var $warningTime = 600;
	# Expiry passed to $messageMemc->set() for the cached HTTP text (presumably seconds)
	var $expiryTime = 900;
	# Once the warning key has expired, a request refreshes the HTTP text when
	# mt_rand( 0, $warningChance ) comes up 0
	var $warningChance = 100;

	/**
	 * Set the default source list, then apply the caller's overrides.
	 *
	 * @param array $settings Map of property name => value, copied onto the object
	 */
	function __construct( $settings = array() ) {
		$this->files = array( "http://meta.wikimedia.org/w/index.php?title=Spam_blacklist&action=raw&sb_ver=1" );

		foreach ( $settings as $name => $value ) {
			$this->$name = $value;
		}
	}

	/**
	 * Old-style constructor, kept so that PHP 4 (which only recognises a
	 * constructor named after the class) and explicit
	 * parent::SpamBlacklist() calls keep working.
	 *
	 * @param array $settings See __construct()
	 */
	function SpamBlacklist( $settings = array() ) {
		$this->__construct( $settings );
	}

	/**
	 * Return the blacklist pattern, building and caching it if necessary.
	 *
	 * When $title names one of the "DB:" sources on the current wiki, the
	 * shared cache is bypassed and $text is used in place of the stored
	 * page text, so that saving the blacklist page takes effect at once.
	 *
	 * @param object|bool $title Title being saved, or false outside of a save
	 * @param string $text Text being saved; only read when $title matches a source
	 * @return string|bool PCRE pattern, true if the blacklist is empty,
	 *   false if no sources are configured or the pattern is corrupt
	 */
	function &getRegex( $title = false, $text = '' ) {
		global $wgMemc;
		$fname = 'SpamBlacklist::getRegex';
		wfProfileIn( $fname );

		if ( $this->regex !== false ) {
			# Already resolved for this object
			wfProfileOut( $fname );
			return $this->regex;
		}

		wfDebug( "Loading spam regex..." );

		if ( !is_array( $this->files ) ) {
			$this->files = array( $this->files );
		}
		if ( count( $this->files ) == 0 ){
			# No lists
			wfDebug( "no files specified\n" );
			wfProfileOut( $fname );
			# $this->regex is still false here; return the variable rather than
			# a literal because this function returns by reference
			return $this->regex;
		}

		# Refresh cache if we are saving the blacklist
		$recache = $this->isSavingBlacklist( $title );
		if ( !$recache ) {
			$this->regex = $wgMemc->get( "spam_blacklist_regex" );
		}

		if ( !$this->regex ) {
			wfDebug( "Constructing spam blacklist\n" );
			$this->regex = $this->buildRegex( $this->loadLines( $title, $text ) );
			$wgMemc->set( "spam_blacklist_regex", $this->regex, 3600 );
		} else {
			wfDebug( "got from cache\n" );
		}

		# true is the legitimate "empty blacklist" marker; anything else must
		# look like a pattern produced by buildRegex()
		if ( $this->regex !== true && !$this->isValidRegex( $this->regex ) ) {
			// Corrupt regex
			wfDebug( "Corrupt regex\n" );
			$this->regex = false;
		}
		wfProfileOut( $fname );
		return $this->regex;
	}

	/**
	 * Split a "DB: <dbname> <title>" source specification.
	 *
	 * @access private
	 * @param string $fileName Entry from $files
	 * @return array|bool array( dbname, title ), or false for other source kinds
	 */
	function parseDBSource( $fileName ) {
		if ( preg_match( '/^DB: (\w*) (.*)$/', $fileName, $matches ) ) {
			return array( $matches[1], $matches[2] );
		}
		return false;
	}

	/**
	 * Tell whether a DB source refers to the page identified by $title on
	 * the current wiki.
	 *
	 * @access private
	 * @param string $db Database name from the source specification
	 * @param string $pageName Page name from the source specification
	 * @param object|bool $title Object providing getPrefixedDBkey(), or false
	 * @return bool
	 */
	function isCurrentTitle( $db, $pageName, $title ) {
		global $wgDBname;
		return is_object( $title )
			&& $wgDBname == $db
			&& $title->getPrefixedDBkey() == $pageName;
	}

	/**
	 * Tell whether $title is one of the configured "DB:" sources.
	 *
	 * @access private
	 * @param object|bool $title Title being saved, or false
	 * @return bool
	 */
	function isSavingBlacklist( $title ) {
		if ( !is_object( $title ) ) {
			return false;
		}
		foreach ( $this->files as $fileName ) {
			$source = $this->parseDBSource( $fileName );
			if ( $source !== false && $this->isCurrentTitle( $source[0], $source[1], $title ) ) {
				return true;
			}
		}
		return false;
	}

	/**
	 * Collect the raw lines of every configured source.
	 *
	 * @access private
	 * @param object|bool $title Title being saved, or false
	 * @param string $text Replacement text for the source matching $title
	 * @return array Unprocessed lines (comments and whitespace still present)
	 */
	function loadLines( $title, $text ) {
		$lines = array();
		foreach ( $this->files as $fileName ) {
			$source = $this->parseDBSource( $fileName );
			if ( $source !== false ) {
				list( $db, $pageName ) = $source;
				if ( $this->isCurrentTitle( $db, $pageName, $title ) ) {
					# The page is being saved right now: use the new text
					$lines = array_merge( $lines, explode( "\n", $text ) );
				} else {
					$lines = array_merge( $lines, $this->getArticleLines( $db, $pageName ) );
				}
				wfDebug( "got from DB\n" );
			} elseif ( preg_match( '/^http:\/\//', $fileName ) ) {
				$lines = array_merge( $lines, $this->getHTTPLines( $fileName ) );
			} else {
				$fileLines = file( $fileName );
				if ( $fileLines === false ) {
					# Do not let one unreadable file discard the other sources
					wfDebug( "unable to read $fileName\n" );
				} else {
					$lines = array_merge( $lines, $fileLines );
					wfDebug( "got from file\n" );
				}
			}
		}
		return $lines;
	}

	/**
	 * Fetch the lines of a remote blacklist, going through $messageMemc.
	 *
	 * @access private
	 * @param string $fileName URL of the list
	 * @return array Lines of the list, or an empty array if no text is available
	 */
	function getHTTPLines( $fileName ) {
		global $wgDBname, $messageMemc;

		# HTTP request
		# To keep requests to a minimum, we save results into $messageMemc, which is
		# similar to $wgMemc except almost certain to exist. By default, it is stored
		# in the database
		#
		# There are two keys, when the warning key expires, a random thread will refresh
		# the real key. This reduces the chance of multiple requests under high traffic
		# conditions.
		$key = "spam_blacklist_file:$fileName";
		$warningKey = "$wgDBname:spamfilewarning:$fileName";
		$httpText = $messageMemc->get( $key );
		$warning = $messageMemc->get( $warningKey );

		if ( !is_string( $httpText ) || ( !$warning && !mt_rand( 0, $this->warningChance ) ) ) {
			wfDebug( "Loading spam blacklist from $fileName\n" );
			$httpText = $this->getHTTP( $fileName );
			$messageMemc->set( $warningKey, 1, $this->warningTime );
			$messageMemc->set( $key, $httpText, $this->expiryTime );
		} else {
			wfDebug( "got from HTTP cache\n" );
		}

		# A failed fetch yields a non-string; treat it as "no lines"
		if ( !is_string( $httpText ) ) {
			return array();
		}
		return explode( "\n", $httpText );
	}

	/**
	 * Turn raw blacklist lines into a single pattern.
	 *
	 * @access private
	 * @param array $lines Raw lines as returned by loadLines()
	 * @return string|bool The pattern, or true when no usable line remains
	 */
	function buildRegex( $lines ) {
		# Strip comments and whitespace, then remove blanks
		$lines = array_filter( array_map( 'trim', preg_replace( '/#.*$/', '', $lines ) ) );

		# No lines, don't make a regex which will match everything
		if ( count( $lines ) == 0 ) {
			wfDebug( "No lines\n" );
			return true;
		}

		# Make regex
		# It's faster using the S modifier even though it will usually only be run once
		$regex = 'http://[a-z0-9_\-.]*(' . implode( '|', $lines ) . ')';
		# Drop any backslashes already preceding a slash, then escape every
		# slash exactly once so it cannot end the delimited pattern early
		return '/' . str_replace( '/', '\/', preg_replace('|\\\*/|', '/', $regex) ) . '/Si';
	}

	/**
	 * Check that a value has the shape produced by buildRegex().
	 *
	 * @access private
	 * @param mixed $regex Value to check
	 * @return bool
	 */
	function isValidRegex( $regex ) {
		return is_string( $regex )
			&& $regex !== ''
			&& $regex[0] == '/'
			&& substr( $regex, -3 ) == '/Si';
	}

	/**
	 * Filter callback: decide whether an edit must be rejected as spam.
	 *
	 * @param object $title Title of the page being saved
	 * @param string $text Text being saved
	 * @param mixed $section Passed through to the previous filter; unused here
	 * @return bool True if the previous filter returned true or the text
	 *   matched the blacklist (EditPage::spamPage() has then been called),
	 *   false if the edit may proceed
	 */
	function filter( &$title, $text, $section ) {
		$fname = 'wfSpamBlacklistFilter';
		wfProfileIn( $fname );

		# Call the rest of the hook chain first
		if ( $this->previousFilter ) {
			$f = $this->previousFilter;
			if ( $f( $title, $text, $section ) ) {
				wfProfileOut( $fname );
				return true;
			}
		}

		$regex =& $this->getRegex( $title, $text );
		$retVal = false;

		# $regex may be true (empty blacklist) or false (unavailable)
		if ( is_string( $regex ) && $regex !== '' && $regex[0] == '/' ) {
			# Do the match
			wfDebug( "Checking text against regex: $regex\n" );
			if ( preg_match( $regex, $text, $matches ) ) {
				wfDebug( "Match!\n" );
				EditPage::spamPage( $matches[0] );
				$retVal = true;
			}
		}

		wfProfileOut( $fname );
		return $retVal;
	}

	/**
	 * Read a main-namespace page (cur_namespace=0) from the cur table of
	 * the given database.
	 *
	 * @param string $db Database name; callers in this class pass a \w* match
	 * @param string $article Page title as stored in cur_title
	 * @return array Lines of the page text, or an empty array if no row was found
	 */
	function getArticleLines( $db, $article ) {
		$dbr = wfGetDB( DB_READ );
		$cur = $dbr->tableName( 'cur' );
		# Quote the title through the database layer so that titles containing
		# quotes cannot break out of the string literal
		$quotedTitle = $dbr->addQuotes( $article );
		$res = $dbr->query( "SELECT cur_text FROM $db.$cur WHERE cur_namespace=0 AND cur_title=$quotedTitle" );
		$row = $dbr->fetchObject( $res );
		if ( $row ) {
			return explode( "\n", $row->cur_text );
		} else {
			return array();
		}
	}

	/**
	 * Fetch the body of a URL.
	 *
	 * @param string $url
	 * @return mixed Whatever wfGetHTTP() or file_get_contents() returned;
	 *   file_get_contents() returns false on failure
	 */
	function getHTTP( $url ) {
		// Use wfGetHTTP from MW 1.5 if it is available
		include_once( 'HttpFunctions.php' );
		if ( function_exists( 'wfGetHTTP' ) ) {
			$text = wfGetHTTP( $url );
		} else {
			// Temporarily request URL wrappers, then restore the previous setting
			$url_fopen = ini_set( 'allow_url_fopen', 1 );
			$text = file_get_contents( $url );
			ini_set( 'allow_url_fopen', $url_fopen );
		}
		return $text;
	}
}

	
} # End invocation guard
?>
from phase3/extensions 2004-12-11 09:59:06 +00:00			`<?php`

			`if ( defined( 'MEDIAWIKI' ) ) {`

			`class SpamBlacklist {`
			`var $regex = false;`
			`var $previousFilter = false;`
			`var $files = array();`
Support for HTTP, including working default, to load text from meta once per hour. Special attention paid to reducing load on meta, of course. 2005-06-25 15:49:21 +00:00			`var $warningTime = 600;`
			`var $expiryTime = 900;`
			`var $warningChance = 100;`
from phase3/extensions 2004-12-11 09:59:06 +00:00
More configuration settings, fixed URL 2005-07-08 16:29:22 +00:00			`function SpamBlacklist( $settings = array() ) {`
from phase3/extensions 2004-12-11 09:59:06 +00:00			`global $IP;`
More configuration settings, fixed URL 2005-07-08 16:29:22 +00:00			`$this->files = array( "http://meta.wikimedia.org/w/index.php?title=Spam_blacklist&action=raw&sb_ver=1" );`

			`foreach ( $settings as $name => $value ) {`
			`$this->$name = $value;`
			`}`
from phase3/extensions 2004-12-11 09:59:06 +00:00			`}`
More configuration settings, fixed URL 2005-07-08 16:29:22 +00:00
split the regex fetching part of the filter into its own function 2006-01-19 17:14:10 +00:00			`function &getRegex() {`
			`global $wgMemc, $wgDBname, $messageMemc;`
			`$fname = 'SpamBlacklist::getRegex';`
from phase3/extensions 2004-12-11 09:59:06 +00:00			`wfProfileIn( $fname );`

split the regex fetching part of the filter into its own function 2006-01-19 17:14:10 +00:00			`if ( $this->regex !== false ) {`
			`return $this->regex;`
from phase3/extensions 2004-12-11 09:59:06 +00:00			`}`

split the regex fetching part of the filter into its own function 2006-01-19 17:14:10 +00:00			`wfDebug( "Loading spam regex..." );`

from phase3/extensions 2004-12-11 09:59:06 +00:00			`if ( !is_array( $this->files ) ) {`
			`$this->files = array( $this->files );`
			`}`
			`if ( count( $this->files ) == 0 ){`
			`# No lists`
split the regex fetching part of the filter into its own function 2006-01-19 17:14:10 +00:00			`wfDebug( "no files specified\n" );`
from phase3/extensions 2004-12-11 09:59:06 +00:00			`wfProfileOut( $fname );`
			`return false;`
			`}`

			`# Refresh cache if we are saving the blacklist`
			`$recache = false;`
			`foreach ( $this->files as $fileName ) {`
			`if ( preg_match( '/^DB: (\w) (.)$/', $fileName, $matches ) ) {`
and here 2005-02-20 09:09:29 +00:00			`if ( $wgDBname == $matches[1] && $title->getPrefixedDBkey() == $matches[2] ) {`
from phase3/extensions 2004-12-11 09:59:06 +00:00			`$recache = true;`
			`break;`
			`}`
			`}`
			`}`

			`if ( $this->regex === false \|\| $recache ) {`
			`if ( !$recache ) {`
			`$this->regex = $wgMemc->get( "spam_blacklist_regex" );`
			`}`
			`if ( !$this->regex ) {`
			`# Load lists`
			`$lines = array();`
Support for HTTP, including working default, to load text from meta once per hour. Special attention paid to reducing load on meta, of course. 2005-06-25 15:49:21 +00:00			`wfDebug( "Constructing spam blacklist\n" );`
from phase3/extensions 2004-12-11 09:59:06 +00:00			`foreach ( $this->files as $fileName ) {`
			`if ( preg_match( '/^DB: (\w) (.)$/', $fileName, $matches ) ) {`
preg_match keys start at 1, not 0 2005-02-20 09:03:13 +00:00			`if ( $wgDBname == $matches[1] && $title->getPrefixedDBkey() == $matches[2] ) {`
bug #2598: only one blacklist file is parsed by SpamBlacklist extension 2005-11-01 07:25:33 +00:00			`$lines = array_merge( $lines, explode( "\n", $text ) );`
from phase3/extensions 2004-12-11 09:59:06 +00:00			`} else {`
bug #2598: only one blacklist file is parsed by SpamBlacklist extension 2005-11-01 07:25:33 +00:00			`$lines = array_merge( $lines, $this->getArticleLines( $matches[1], $matches[2] ) );`
from phase3/extensions 2004-12-11 09:59:06 +00:00			`}`
split the regex fetching part of the filter into its own function 2006-01-19 17:14:10 +00:00			`wfDebug( "got from DB\n" );`
Support for HTTP, including working default, to load text from meta once per hour. Special attention paid to reducing load on meta, of course. 2005-06-25 15:49:21 +00:00			`} elseif ( preg_match( '/^http:\/\//', $fileName ) ) {`
			`# HTTP request`
			`# To keep requests to a minimum, we save results into $messageMemc, which is`
			`# similar to $wgMemc except almost certain to exist. By default, it is stored`
			`# in the database`
			`#`
			`# There are two keys, when the warning key expires, a random thread will refresh`
			`# the real key. This reduces the chance of multiple requests under high traffic`
			`# conditions.`
			`$key = "spam_blacklist_file:$fileName";`
			`$warningKey = "$wgDBname:spamfilewarning:$fileName";`
			`$httpText = $messageMemc->get( $key );`
			`$warning = $messageMemc->get( $warningKey );`

			`if ( !is_string( $httpText ) \|\| ( !$warning && !mt_rand( 0, $this->warningChance ) ) ) {`
			`wfDebug( "Loading spam blacklist from $fileName\n" );`
forgot to commit this 2005-07-02 09:03:53 +00:00			`$httpText = $this->getHTTP( $fileName );`
Support for HTTP, including working default, to load text from meta once per hour. Special attention paid to reducing load on meta, of course. 2005-06-25 15:49:21 +00:00			`$messageMemc->set( $warningKey, 1, $this->warningTime );`
			`$messageMemc->set( $key, $httpText, $this->expiryTime );`
split the regex fetching part of the filter into its own function 2006-01-19 17:14:10 +00:00			`} else {`
			`wfDebug( "got from HTTP cache\n" );`
			`}`
bug #2598: only one blacklist file is parsed by SpamBlacklist extension 2005-11-01 07:25:33 +00:00			`$lines = array_merge( $lines, explode( "\n", $httpText ) );`
from phase3/extensions 2004-12-11 09:59:06 +00:00			`} else {`
bug #2598: only one blacklist file is parsed by SpamBlacklist extension 2005-11-01 07:25:33 +00:00			`$lines = array_merge( $lines, file( $fileName ) );`
split the regex fetching part of the filter into its own function 2006-01-19 17:14:10 +00:00			`wfDebug( "got from file\n" );`
from phase3/extensions 2004-12-11 09:59:06 +00:00			`}`
			`}`

			`# Strip comments and whitespace, then remove blanks`
			`$lines = array_filter( array_map( 'trim', preg_replace( '/#.*$/', '', $lines ) ) );`

			`# No lines, don't make a regex which will match everything`
			`if ( count( $lines ) == 0 ) {`
Support for HTTP, including working default, to load text from meta once per hour. Special attention paid to reducing load on meta, of course. 2005-06-25 15:49:21 +00:00			`wfDebug( "No lines\n" );`
from phase3/extensions 2004-12-11 09:59:06 +00:00			`$this->regex = true;`
			`} else {`
			`# Make regex`
			`# It's faster using the S modifier even though it will usually only be run once`
* (bug 3934) Check _ in hostname prefixes; it's illegal but seems to be accepted by browsers 2005-11-16 09:56:13 +00:00			`$this->regex = 'http://[a-z0-9_\-.]*(' . implode( '\|', $lines ) . ')';`
discard backslashes prior to slash 2005-02-20 08:02:18 +00:00			`$this->regex = '/' . str_replace( '/', '\/', preg_replace('\|\\\*/\|', '/', $this->regex) ) . '/Si';`
from phase3/extensions 2004-12-11 09:59:06 +00:00			`}`
typo 2004-12-11 11:12:00 +00:00			`$wgMemc->set( "spam_blacklist_regex", $this->regex, 3600 );`
split the regex fetching part of the filter into its own function 2006-01-19 17:14:10 +00:00			`} else {`
			`wfDebug( "got from cache\n" );`
			`}`
			`}`
some tweaks 2006-01-21 23:27:39 +00:00			`if ( $this->regex[0] != '/' \|\| substr( $this->regex, -3 ) != '/Si' ) {`
split the regex fetching part of the filter into its own function 2006-01-19 17:14:10 +00:00			`// Corrupt regex`
			`wfDebug( "Corrupt regex\n" );`
			`$this->regex = false;`
			`}`
			`wfProfileOut( $fname );`
			`return $this->regex;`
			`}`

			`function filter( &$title, $text, $section ) {`
			`global $wgArticle, $wgVersion, $wgOut;`

			`$fname = 'wfSpamBlacklistFilter';`
			`wfProfileIn( $fname );`

			`# Call the rest of the hook chain first`
			`if ( $this->previousFilter ) {`
			`$f = $this->previousFilter;`
			`if ( $f( $title, $text, $section ) ) {`
			`wfProfileOut( $fname );`
			`return true;`
from phase3/extensions 2004-12-11 09:59:06 +00:00			`}`
			`}`
split the regex fetching part of the filter into its own function 2006-01-19 17:14:10 +00:00
			`$regex =& $this->getRegex();`

			`if ( $regex && $regex[0] == '/' ) {`
from phase3/extensions 2004-12-11 09:59:06 +00:00			`# Do the match`
split the regex fetching part of the filter into its own function 2006-01-19 17:14:10 +00:00			`wfDebug( "Checking text against regex: $regex\n" );`
			`if ( preg_match( $regex, $text, $matches ) ) {`
Support for HTTP, including working default, to load text from meta once per hour. Special attention paid to reducing load on meta, of course. 2005-06-25 15:49:21 +00:00			`wfDebug( "Match!\n" );`
from phase3/extensions 2004-12-11 09:59:06 +00:00			`EditPage::spamPage( $matches[0] );`
			`$retVal = true;`
			`} else {`
			`$retVal = false;`
			`}`
			`} else {`
			`$retVal = false;`
			`}`

			`wfProfileOut( $fname );`
			`return $retVal;`
			`}`

			`function getArticleLines( $db, $article ) {`
			`$dbr = wfGetDB( DB_READ );`
fixed DB name and table name 2005-03-09 14:17:22 +00:00			`$cur = $dbr->tableName( 'cur' );`
and title 2005-03-09 14:20:35 +00:00			`$res = $dbr->query( "SELECT cur_text FROM $db.$cur WHERE cur_namespace=0 AND cur_title='$article'" );`
from phase3/extensions 2004-12-11 09:59:06 +00:00			`$row = $dbr->fetchObject( $res );`
			`if ( $row ) {`
			`return explode( "\n", $row->cur_text );`
			`} else {`
			`return array();`
			`}`
			`}`
forgot to commit this 2005-07-02 09:03:53 +00:00
			`function getHTTP( $url ) {`
			`// Use wfGetHTTP from MW 1.5 if it is available`
			`include_once( 'HttpFunctions.php' );`
			`if ( function_exists( 'wfGetHTTP' ) ) {`
			`$text = wfGetHTTP( $url );`
			`} else {`
			`$url_fopen = ini_set( 'allow_url_fopen', 1 );`
			`$text = file_get_contents( $url );`
			`ini_set( 'allow_url_fopen', $url_fopen );`
			`}`
			`return $text;`
			`}`
from phase3/extensions 2004-12-11 09:59:06 +00:00			`}`


			`} # End invocation guard`
			`?>`