From 2bc8873c300a8872185ec31c8b18c7a677d1102f Mon Sep 17 00:00:00 2001 From: Dayllan Maza Date: Tue, 19 Sep 2017 20:54:03 -0300 Subject: [PATCH] Add ccnorm_contains_any function Normalize and search a string for multiple substrings Bug: T65242 Change-Id: I4034c0054a6849babbf2d96ea13dc97d3660d5b4 --- i18n/en.json | 1 + i18n/qqq.json | 1 + includes/AbuseFilter.class.php | 3 +- includes/AbuseFilter.hooks.php | 1 - includes/parser/AbuseFilterParser.php | 80 ++++++++++++++++++++----- tests/parserTests/ccnorm-contains-any.r | 1 + tests/parserTests/ccnorm-contains-any.t | 1 + tests/parserTests/contains-any.r | 1 + tests/parserTests/contains-any.t | 1 + tests/phpunit/parserTest.php | 2 +- 10 files changed, 73 insertions(+), 19 deletions(-) create mode 100644 tests/parserTests/ccnorm-contains-any.r create mode 100644 tests/parserTests/ccnorm-contains-any.t create mode 100644 tests/parserTests/contains-any.r create mode 100644 tests/parserTests/contains-any.t diff --git a/i18n/en.json b/i18n/en.json index 0d32b9f37..df36d719b 100644 --- a/i18n/en.json +++ b/i18n/en.json @@ -225,6 +225,7 @@ "abusefilter-edit-builder-funcs-lcase": "To lower case (lcase)", "abusefilter-edit-builder-funcs-ucase": "To upper case (ucase)", "abusefilter-edit-builder-funcs-ccnorm": "Normalize confusable characters (ccnorm)", + "abusefilter-edit-builder-funcs-ccnorm-contains-any": "Normalize and search a string for multiple substrings (ccnorm_contains_any)", "abusefilter-edit-builder-funcs-rmdoubles": "Remove double-characters (rmdoubles)", "abusefilter-edit-builder-funcs-specialratio": "Special characters / total characters (specialratio)", "abusefilter-edit-builder-funcs-norm": "Normalize (norm)", diff --git a/i18n/qqq.json b/i18n/qqq.json index e2d5aec6f..cbb5edfa1 100644 --- a/i18n/qqq.json +++ b/i18n/qqq.json @@ -256,6 +256,7 @@ "abusefilter-edit-builder-funcs-lcase": "{{doc-important|Do not translate \"'''lcase'''\".}} Abuse filter syntax option in a dropdown from the group 
{{msg-mw|abusefilter-edit-builder-group-funcs}}.", "abusefilter-edit-builder-funcs-ucase": "{{doc-important|Do not translate \"'''ucase'''\".}} Abuse filter syntax option in a dropdown from the group {{msg-mw|abusefilter-edit-builder-group-funcs}}.", "abusefilter-edit-builder-funcs-ccnorm": "{{doc-important|Do not translate \"'''ccnorm'''\".}} Abuse filter syntax option in a dropdown from the group {{msg-mw|abusefilter-edit-builder-group-funcs}}.", + "abusefilter-edit-builder-funcs-ccnorm-contains-any": "{{doc-important|Do not translate \"'''ccnorm_contains_any'''\".}} Abuse filter syntax option in a dropdown from the group {{msg-mw|abusefilter-edit-builder-group-funcs}}.", "abusefilter-edit-builder-funcs-rmdoubles": "{{doc-important|Do not translate \"'''rmdoubles'''\".}}\nAbuse filter syntax option in a dropdown from the group {{msg-mw|abusefilter-edit-builder-group-funcs}}.\n\nFunctional explanation: rmdoubles removes repeated characters in the argument, and returns the result. For example: \"foobybboo\" will return \"fobybo\".", "abusefilter-edit-builder-funcs-specialratio": "{{doc-important|Do not translate \"'''specialratio'''\".}} Abuse filter syntax option in a dropdown from the group {{msg-mw|abusefilter-edit-builder-group-funcs}}.", "abusefilter-edit-builder-funcs-norm": "{{doc-important|Do not translate \"'''norm'''\".}} Abuse filter syntax option in a dropdown from the group {{msg-mw|abusefilter-edit-builder-group-funcs}}.", diff --git a/includes/AbuseFilter.class.php b/includes/AbuseFilter.class.php index 3ce5cdb44..776cdc415 100644 --- a/includes/AbuseFilter.class.php +++ b/includes/AbuseFilter.class.php @@ -69,6 +69,7 @@ class AbuseFilter { 'lcase(string)' => 'lcase', 'ucase(string)' => 'ucase', 'ccnorm(string)' => 'ccnorm', + 'ccnorm_contains_any(haystack,needle1,needle2,..)' => 'ccnorm-contains-any', 'rmdoubles(string)' => 'rmdoubles', 'specialratio(string)' => 'specialratio', 'norm(string)' => 'norm', @@ -77,7 +78,7 @@ class AbuseFilter { 
'rmwhitespace(text)' => 'rmwhitespace', 'rmspecials(text)' => 'rmspecials', 'ip_in_range(ip, range)' => 'ip_in_range', - 'contains_any(haystack,needle1,needle2,needle3)' => 'contains-any', + 'contains_any(haystack,needle1,needle2,...)' => 'contains-any', 'substr(subject, offset, length)' => 'substr', 'strpos(haystack, needle)' => 'strpos', 'str_replace(subject, search, replace)' => 'str_replace', diff --git a/includes/AbuseFilter.hooks.php b/includes/AbuseFilter.hooks.php index 93fe208f1..6e14e26e1 100644 --- a/includes/AbuseFilter.hooks.php +++ b/includes/AbuseFilter.hooks.php @@ -1,6 +1,5 @@ 'castBool', 'norm' => 'funcNorm', 'ccnorm' => 'funcCCNorm', + 'ccnorm_contains_any' => 'funcCCNormContainsAny', 'specialratio' => 'funcSpecialRatio', 'rmspecials' => 'funcRMSpecials', 'rmdoubles' => 'funcRMDoubles', @@ -48,7 +49,7 @@ class AbuseFilterParser { 'contains' => 'keywordContains', 'rlike' => 'keywordRegex', 'irlike' => 'keywordRegexInsensitive', - 'regex' => 'keywordRegex' + 'regex' => 'keywordRegex', ]; public static $funcCache = []; @@ -1074,38 +1075,85 @@ class AbuseFilterParser { } $s = array_shift( $args ); - $s = $s->toString(); - $searchStrings = []; + return new AFPData( AFPData::DBOOL, self::containsAny( $s, $args ) ); + } - foreach ( $args as $arg ) { - $searchStrings[] = $arg->toString(); + /** + * Normalize and search a string for multiple substrings + * + * @param $args array + * @return AFPData + * @throws AFPUserVisibleException + */ + protected function funcCCNormContainsAny( $args ) { + if ( count( $args ) < 2 ) { + throw new AFPUserVisibleException( + 'notenoughargs', + $this->mCur->pos, + [ 'ccnorm_contains_any', 2, count( $args ) ] + ); } - if ( function_exists( 'fss_prep_search' ) ) { - $fss = fss_prep_search( $searchStrings ); - $result = fss_exec_search( $fss, $s ); + $s = array_shift( $args ); - $ok = is_array( $result ); + return new AFPData( AFPData::DBOOL, self::containsAny( $s, $args, true ) ); + } + + /** + * Search for substrings in a 
string + * + * Use normalize = true to make use of ccnorm and + * normalize both sides of the search. + * + * @param AFPData $string + * @param AFPData[] $values + * @param bool $normalize + * + * @return bool + */ + protected static function containsAny( $string, $values, $normalize = false ) { + $string = $string->toString(); + + if ( $string == '' ) { + return false; + } + + if ( $normalize ) { + $string = self::ccnorm( $string ); + } + + $values = array_map( function ( $val ) use ( $normalize ) { + $str = $val->toString(); + if ( $normalize ) { + $str = AbuseFilterParser::ccnorm( $str ); + } + + return $str; + }, $values ); + + if ( function_exists( 'fss_prep_search' ) ) { + $fss = fss_prep_search( $values ); + $result = fss_exec_search( $fss, $string ); + + return ( $result !== false ); } else { - $ok = false; - foreach ( $searchStrings as $needle ) { + foreach ( $values as $needle ) { // Bug #60203: Keep empty parameters from causing PHP warnings - if ( $needle !== '' && strpos( $s, $needle ) !== false ) { - $ok = true; - break; + if ( $needle !== '' && strpos( $string, $needle ) !== false ) { + return true; } } } - return new AFPData( AFPData::DBOOL, $ok ); + return false; } /** * @param $s * @return mixed */ - protected function ccnorm( $s ) { + protected static function ccnorm( $s ) { if ( is_callable( 'AntiSpoof::normalizeString' ) ) { $s = AntiSpoof::normalizeString( $s ); } else { diff --git a/tests/parserTests/ccnorm-contains-any.r b/tests/parserTests/ccnorm-contains-any.r new file mode 100644 index 000000000..4736e0800 --- /dev/null +++ b/tests/parserTests/ccnorm-contains-any.r @@ -0,0 +1 @@ +MATCH diff --git a/tests/parserTests/ccnorm-contains-any.t b/tests/parserTests/ccnorm-contains-any.t new file mode 100644 index 000000000..6aeac35c9 --- /dev/null +++ b/tests/parserTests/ccnorm-contains-any.t @@ -0,0 +1 @@ +ccnorm_contains_any("like 4ny0ne else", "foo", "aNyon3") & ccnorm_contains_any("street f1ghter","F1ght") diff --git 
a/tests/parserTests/contains-any.r b/tests/parserTests/contains-any.r new file mode 100644 index 000000000..4736e0800 --- /dev/null +++ b/tests/parserTests/contains-any.r @@ -0,0 +1 @@ +MATCH diff --git a/tests/parserTests/contains-any.t b/tests/parserTests/contains-any.t new file mode 100644 index 000000000..6fee9e8d6 --- /dev/null +++ b/tests/parserTests/contains-any.t @@ -0,0 +1 @@ +contains_any("like anyone else", "else", "someone") & contains_any("street fighter", "fight") diff --git a/tests/phpunit/parserTest.php b/tests/phpunit/parserTest.php index b253e8584..6d7322acf 100644 --- a/tests/phpunit/parserTest.php +++ b/tests/phpunit/parserTest.php @@ -55,7 +55,7 @@ class AbuseFilterParserTest extends MediaWikiTestCase { * @dataProvider readTests */ public function testParser( $testName, $rule, $expected ) { - if ( !class_exists( 'AntiSpoof' ) && preg_match( '/(cc)?norm\(/i', $rule ) ) { + if ( !class_exists( 'AntiSpoof' ) && preg_match( '/(?:cc)?norm(?:\(|_)/i', $rule ) ) { // The norm and ccnorm parser functions aren't working correctly without AntiSpoof $this->markTestSkipped( 'Parser test ' . $testName . ' requires the AntiSpoof extension' ); }