mirror of
https://gerrit.wikimedia.org/r/mediawiki/extensions/AbuseFilter.git
synced 2024-11-27 15:30:42 +00:00
Unicode conversion: affunctions and filter_evaluator should be templated on character type
This commit is contained in:
parent
df6341487c
commit
04554a30de
|
@ -1,205 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2008 Andrew Garrett.
|
||||
* Copyright (c) 2008 River Tarnell <river@wikimedia.org>
|
||||
* Derived from public domain code contributed by Victor Vasiliev.
|
||||
*
|
||||
* Permission is granted to anyone to use this software for any purpose,
|
||||
* including commercial applications, and to alter it and redistribute it
|
||||
* freely. This software is provided 'as-is', without any express or
|
||||
* implied warranty.
|
||||
*/
|
||||
|
||||
#include <algorithm>
|
||||
#include <fstream>
|
||||
#include <sstream>
|
||||
#include <ios>
|
||||
#include <iostream>
|
||||
|
||||
#include <unicode/uchar.h>
|
||||
|
||||
#include <boost/format.hpp>
|
||||
|
||||
#include "utf8.h"
|
||||
#include "equiv.h"
|
||||
#include "affunctions.h"
|
||||
|
||||
namespace {
|
||||
|
||||
struct too_many_arguments_exception : afp::exception {
|
||||
too_many_arguments_exception(char const *what)
|
||||
: afp::exception(what) {}
|
||||
};
|
||||
|
||||
struct too_few_arguments_exception : afp::exception {
|
||||
too_few_arguments_exception(char const *what)
|
||||
: afp::exception(what) {}
|
||||
};
|
||||
|
||||
void
|
||||
check_args(std::string const &fname, int args, int min, int max = 0)
|
||||
{
|
||||
if (max == 0)
|
||||
max = min;
|
||||
if (args < min) {
|
||||
std::string s = str(boost::format(
|
||||
"too few arguments for function %s (got %d, expected %d)")
|
||||
% fname % args % min);
|
||||
throw too_few_arguments_exception(s.c_str());
|
||||
} else if (args > max) {
|
||||
std::string s = str(boost::format(
|
||||
"too many arguments for function %s (got %d, expected %d)")
|
||||
% fname % args % min);
|
||||
throw too_many_arguments_exception(s.c_str());
|
||||
}
|
||||
}
|
||||
|
||||
} // anonymous namespace
|
||||
|
||||
namespace afp {
|
||||
|
||||
/*
 * count(haystack)         -> number of comma-separated items in haystack
 * count(needle, haystack) -> number of non-overlapping occurrences of
 *                            needle in haystack
 *
 * Throws via check_args() unless one or two arguments are supplied.
 */
datum
af_count(std::vector<datum> const &args) {
	check_args("count", args.size(), 1, 2);

	bool const list_mode = args.size() < 2;
	std::string needle, haystack;

	if (list_mode) {
		needle = ",";
		haystack = args[0].toString();
	} else {
		needle = args[0].toString();
		haystack = args[1].toString();
	}

	unsigned int count = 0;

	// An empty needle matches at every offset and would never let the scan
	// advance, so it is treated as having no occurrences.
	if (!needle.empty()) {
		std::string::size_type pos = 0;
		while ((pos = haystack.find(needle, pos)) != std::string::npos) {
			++count;
			// Resume after this match; searching again from the same
			// offset would find the same match forever.
			pos += needle.size();
		}
	}

	// N separators delimit N + 1 list items.
	if (list_mode)
		++count;

	return datum::from_int(static_cast<long int>(count));
}
|
||||
|
||||
/*
 * norm(s): map each code point of s through the equivalence set, keep only
 * the alphanumeric results, and drop any result equal to the mapped code
 * point immediately preceding it.
 */
datum
af_norm(std::vector<datum> const &args) {
	typedef utf8::utf8_iterator<std::string::const_iterator> cp_iterator;

	check_args("norm", args.size(), 1);

	std::string input = args[0].toString();
	equiv_set const &table = equiv_set::instance();

	std::string normalised;
	int previous = 0;

	cp_iterator cur(input.begin(), input.end()), stop;
	while (cur != stop) {
		int mapped = table.get(*cur);

		if (u_isalnum(mapped) && mapped != previous)
			normalised.append(utf8::codepoint_to_utf8(mapped));

		previous = mapped;
		++cur;
	}

	return datum::from_string(normalised);
}
|
||||
|
||||
std::string
|
||||
rmdoubles(std::string const &orig) {
|
||||
int lastchr = 0;
|
||||
std::string result;
|
||||
|
||||
utf8::utf8_iterator<std::string::const_iterator> it(orig.begin(), orig.end()), end;
|
||||
for (; it != end; ++it) {
|
||||
if (*it != lastchr)
|
||||
result.append(utf8::codepoint_to_utf8(*it));
|
||||
|
||||
lastchr = *it;
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/*
 * specialratio(s): fraction of the code points in s that are not
 * alphanumeric (per u_isalnum). An empty string yields 0.
 */
datum
af_specialratio(std::vector<datum> const &args) {
	check_args("specialratio", args.size(), 1);

	std::string const orig = args[0].toString();
	int len = 0;
	int specialcount = 0;

	utf8::utf8_iterator<std::string::const_iterator> it(orig.begin(), orig.end()), end;
	for (; it != end; ++it) {
		++len;
		if (!u_isalnum(*it))
			++specialcount;
	}

	// Guard the division: 0 / 0 would otherwise produce NaN.
	if (len == 0)
		return datum::from_double(0.0);

	// Divide in double so no precision is lost before the value is stored.
	return datum::from_double(static_cast<double>(specialcount) / len);
}
|
||||
|
||||
/*
 * rmspecials(s): filter-function wrapper around rmspecials(); takes exactly
 * one argument.
 */
datum
af_rmspecials(std::vector<datum> const &args) {
	check_args("rmspecials", args.size(), 1);

	std::string const stripped = rmspecials(args[0].toString());
	return datum::from_string(stripped);
}
|
||||
|
||||
std::string
|
||||
rmspecials(std::string const &orig) {
|
||||
std::string result;
|
||||
|
||||
utf8::utf8_iterator<std::string::const_iterator> it(orig.begin(), orig.end()), end;
|
||||
for (; it != end; ++it) {
|
||||
if (u_isalnum(*it))
|
||||
result.append(utf8::codepoint_to_utf8(*it));
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/*
 * ccnorm(s): filter-function wrapper around
 * confusable_character_normalise(); takes exactly one argument.
 */
datum
af_ccnorm(std::vector<datum> const &args) {
	check_args("ccnorm", args.size(), 1);

	std::string const normalised =
		confusable_character_normalise(args[0].toString());
	return datum::from_string(normalised);
}
|
||||
|
||||
/*
 * rmdoubles(s): filter-function wrapper around rmdoubles(); takes exactly
 * one argument.
 */
datum
af_rmdoubles(std::vector<datum> const &args) {
	// Pass this function's own name so argument-count errors identify it.
	check_args("rmdoubles", args.size(), 1);
	return datum::from_string(rmdoubles(args[0].toString()));
}
|
||||
|
||||
/*
 * length(s): the length of s as reported by utf8::utf8_strlen(); takes
 * exactly one argument.
 */
datum
af_length(std::vector<datum> const &args) {
	// Pass this function's own name so argument-count errors identify it.
	check_args("length", args.size(), 1);
	return datum::from_int(
		static_cast<long int>(utf8::utf8_strlen(args[0].toString())));
}
|
||||
|
||||
/*
 * lcase(s): s converted by utf8::utf8_tolower(); takes exactly one
 * argument.
 */
datum
af_lcase(std::vector<datum> const &args) {
	// Pass this function's own name so argument-count errors identify it.
	check_args("lcase", args.size(), 1);
	return datum::from_string(utf8::utf8_tolower(args[0].toString()));
}
|
||||
|
||||
std::string
|
||||
confusable_character_normalise(std::string const &orig) {
|
||||
equiv_set const &equivs = equiv_set::instance();
|
||||
std::string result;
|
||||
|
||||
utf8::utf8_iterator<std::string::const_iterator> it(orig.begin(), orig.end()), end;
|
||||
for (; it != end; ++it)
|
||||
result += utf8::codepoint_to_utf8(equivs.get(*it));
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
} // namespace afp
|
|
@ -12,25 +12,249 @@
|
|||
#ifndef AFFUNCTIONS_H
|
||||
#define AFFUNCTIONS_H
|
||||
|
||||
#include <map>
|
||||
#include <vector>
|
||||
#include <map>
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
#include <fstream>
|
||||
#include <sstream>
|
||||
#include <ios>
|
||||
#include <iostream>
|
||||
|
||||
#include "aftypes.h"
|
||||
#include <unicode/uchar.h>
|
||||
|
||||
#include <boost/format.hpp>
|
||||
|
||||
#include "aftypes.h"
|
||||
#include "equiv.h"
|
||||
|
||||
namespace afp {
|
||||
|
||||
datum af_length (std::vector<datum> const &args);
|
||||
datum af_lcase (std::vector<datum> const &args);
|
||||
datum af_ccnorm (std::vector<datum> const &args);
|
||||
datum af_rmdoubles (std::vector<datum> const &args);
|
||||
datum af_specialratio (std::vector<datum> const &args);
|
||||
datum af_rmspecials (std::vector<datum> const &args);
|
||||
datum af_norm (std::vector<datum> const &args);
|
||||
datum af_count (std::vector<datum> const &args);
|
||||
template<typename charT>
|
||||
basic_datum<charT>
|
||||
af_length (std::vector<basic_datum<charT> > const &args);
|
||||
|
||||
std::string confusable_character_normalise(std::string const &orig);
|
||||
std::string rmdoubles(std::string const &orig);
|
||||
std::string rmspecials(std::string const &orig);
|
||||
template<typename charT>
|
||||
basic_datum<charT>
|
||||
af_ccnorm (std::vector<basic_datum<charT> > const &args);
|
||||
|
||||
template<typename charT>
|
||||
basic_datum<charT>
|
||||
af_rmdoubles (std::vector<basic_datum<charT> > const &args);
|
||||
|
||||
template<typename charT>
|
||||
basic_datum<charT>
|
||||
af_specialratio (std::vector<basic_datum<charT> > const &args);
|
||||
|
||||
template<typename charT>
|
||||
basic_datum<charT>
|
||||
af_rmspecials (std::vector<basic_datum<charT> > const &args);
|
||||
|
||||
template<typename charT>
|
||||
basic_datum<charT>
|
||||
af_norm (std::vector<basic_datum<charT> > const &args);
|
||||
|
||||
template<typename charT>
|
||||
basic_datum<charT>
|
||||
af_count (std::vector<basic_datum<charT> > const &args);
|
||||
|
||||
template<typename charT>
|
||||
std::basic_string<charT>
|
||||
confusable_character_normalise(std::basic_string<charT> const &orig);
|
||||
|
||||
template<typename charT>
|
||||
std::basic_string<charT>
|
||||
rmdoubles(std::basic_string<charT> const &orig);
|
||||
|
||||
template<typename charT>
|
||||
std::basic_string<charT>
|
||||
rmspecials(std::basic_string<charT> const &orig);
|
||||
|
||||
struct too_many_arguments_exception : afp::exception {
|
||||
too_many_arguments_exception(char const *what)
|
||||
: afp::exception(what) {}
|
||||
};
|
||||
|
||||
struct too_few_arguments_exception : afp::exception {
|
||||
too_few_arguments_exception(char const *what)
|
||||
: afp::exception(what) {}
|
||||
};
|
||||
|
||||
namespace {
|
||||
|
||||
void
|
||||
check_args(std::string const &fname, int args, int min, int max = 0)
|
||||
{
|
||||
if (max == 0)
|
||||
max = min;
|
||||
if (args < min) {
|
||||
std::string s = str(boost::format(
|
||||
"too few arguments for function %s (got %d, expected %d)")
|
||||
% fname % args % min);
|
||||
throw too_few_arguments_exception(s.c_str());
|
||||
} else if (args > max) {
|
||||
std::string s = str(boost::format(
|
||||
"too many arguments for function %s (got %d, expected %d)")
|
||||
% fname % args % min);
|
||||
throw too_many_arguments_exception(s.c_str());
|
||||
}
|
||||
}
|
||||
|
||||
} // anonymous namespace
|
||||
|
||||
template<typename charT>
|
||||
basic_datum<charT>
|
||||
af_count(std::vector<basic_datum<charT> > const &args) {
|
||||
check_args("count", args.size(), 1, 2);
|
||||
|
||||
std::basic_string<charT> needle, haystack;
|
||||
|
||||
if (args.size() < 2) {
|
||||
needle = ",";
|
||||
haystack = args[0].toString();
|
||||
} else {
|
||||
needle = args[0].toString();
|
||||
haystack = args[1].toString();
|
||||
}
|
||||
|
||||
size_t last_pos = 0;
|
||||
unsigned int count = 0;
|
||||
|
||||
while (last_pos != haystack.npos) {
|
||||
count++;
|
||||
last_pos = haystack.find(needle, last_pos);
|
||||
}
|
||||
|
||||
// One extra was added, but one extra is needed if only one arg was supplied.
|
||||
if (args.size() >= 2)
|
||||
count--;
|
||||
|
||||
return basic_datum<charT>::from_int((long int)count);
|
||||
}
|
||||
|
||||
template<typename charT>
|
||||
basic_datum<charT>
|
||||
af_norm(std::vector<basic_datum<charT> > const &args) {
|
||||
check_args("norm", args.size(), 1);
|
||||
|
||||
std::basic_string<charT> orig = args[0].toString();
|
||||
|
||||
int lastchr = 0;
|
||||
equiv_set const &equivs = equiv_set::instance();
|
||||
std::basic_string<charT> result;
|
||||
|
||||
for (std::size_t i = 0; i < orig.size(); ++i) {
|
||||
int chr = equivs.get(orig[i]);
|
||||
|
||||
if (chr != lastchr && u_isalnum(chr))
|
||||
result += chr;
|
||||
|
||||
lastchr = chr;
|
||||
}
|
||||
|
||||
return basic_datum<charT>::from_string(result);
|
||||
}
|
||||
|
||||
/*
 * Collapse every run of identical consecutive characters in orig to a
 * single character and return the result.
 */
template<typename charT>
std::basic_string<charT>
rmdoubles(std::basic_string<charT> const &orig) {
	std::basic_string<charT> result;
	result.reserve(orig.size());

	for (std::size_t i = 0; i < orig.size(); ++i) {
		// Compare against the actual previous character rather than a 0
		// sentinel, so a leading NUL character is not silently dropped.
		if (i == 0 || orig[i] != orig[i - 1])
			result += orig[i];
	}

	return result;
}
|
||||
|
||||
template<typename charT>
|
||||
basic_datum<charT>
|
||||
af_specialratio(std::vector<basic_datum<charT> > const &args) {
|
||||
check_args("specialratio", args.size(), 1);
|
||||
|
||||
std::basic_string<charT> orig = args[0].toString();
|
||||
int len = 0;
|
||||
int specialcount = 0;
|
||||
|
||||
for (std::size_t i = 0; i < orig.size(); ++i) {
|
||||
len++;
|
||||
if (!u_isalnum(orig[i]))
|
||||
specialcount++;
|
||||
}
|
||||
|
||||
double ratio = (float)specialcount / len;
|
||||
|
||||
return basic_datum<charT>::from_double(ratio);
|
||||
}
|
||||
|
||||
template<typename charT>
|
||||
basic_datum<charT>
|
||||
af_rmspecials(std::vector<basic_datum<charT> > const &args) {
|
||||
check_args("rmspecials", args.size(), 1);
|
||||
return basic_datum<charT>::from_string(rmspecials(args[0].toString()));
|
||||
}
|
||||
|
||||
/*
 * Return a copy of orig containing only the characters for which
 * u_isalnum() is true.
 */
template<typename charT>
std::basic_string<charT>
rmspecials(std::basic_string<charT> const &orig) {
	typedef typename std::basic_string<charT>::const_iterator char_iterator;

	std::basic_string<charT> kept;

	for (char_iterator p = orig.begin(); p != orig.end(); ++p) {
		if (!u_isalnum(*p))
			continue;
		kept += *p;
	}

	return kept;
}
|
||||
|
||||
template<typename charT>
|
||||
basic_datum<charT>
|
||||
af_ccnorm(std::vector<basic_datum<charT> > const &args) {
|
||||
check_args("ccnorm", args.size(), 1);
|
||||
return basic_datum<charT>::from_string(confusable_character_normalise(args[0].toString()));
|
||||
}
|
||||
|
||||
template<typename charT>
|
||||
basic_datum<charT>
|
||||
af_rmdoubles(std::vector<basic_datum<charT> > const &args) {
|
||||
check_args("ccnorm", args.size(), 1);
|
||||
return basic_datum<charT>::from_string(rmdoubles(args[0].toString()));
|
||||
}
|
||||
|
||||
template<typename charT>
|
||||
basic_datum<charT>
|
||||
af_length(std::vector<basic_datum<charT> > const &args) {
|
||||
check_args("ccnorm", args.size(), 1);
|
||||
return basic_datum<charT>::from_int(args[0].toString().size());
|
||||
}
|
||||
|
||||
template<typename charT>
|
||||
basic_datum<charT>
|
||||
af_lcase(std::vector<basic_datum<charT> > const &args) {
|
||||
check_args("ccnorm", args.size(), 1);
|
||||
std::basic_string<charT> result;
|
||||
std::basic_string<charT> const orig = args[0].toString();
|
||||
|
||||
for (std::size_t i = 0; i < orig.size(); ++i)
|
||||
result += u_tolower(orig[i]);
|
||||
|
||||
return basic_datum<charT>::from_string(result);
|
||||
}
|
||||
|
||||
template<typename charT>
|
||||
std::basic_string<charT>
|
||||
confusable_character_normalise(std::basic_string<charT> const &orig) {
|
||||
equiv_set const &equivs = equiv_set::instance();
|
||||
std::basic_string<charT> result;
|
||||
|
||||
for (std::size_t i = 0; i < orig.size(); ++i)
|
||||
result += equivs.get(orig[i]);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
} // namespace afp
|
||||
|
||||
|
|
|
@ -15,22 +15,66 @@
|
|||
#include <string>
|
||||
#include <map>
|
||||
|
||||
#include <unicode/uchar.h>
|
||||
|
||||
#include "aftypes.h"
|
||||
#include "parser.h"
|
||||
#include "affunctions.h"
|
||||
|
||||
namespace afp {
|
||||
|
||||
struct filter_evaluator {
|
||||
filter_evaluator();
|
||||
template<typename charT>
|
||||
struct basic_filter_evaluator {
|
||||
basic_filter_evaluator();
|
||||
|
||||
bool evaluate(std::string const &filter) const;
|
||||
bool evaluate(std::basic_string<charT> const &filter) const;
|
||||
|
||||
void add_variable(std::string const &key, datum value);
|
||||
void add_variable(
|
||||
std::basic_string<charT> const &key,
|
||||
basic_datum<charT> value);
|
||||
|
||||
private:
|
||||
expressor e;
|
||||
basic_expressor<charT> e;
|
||||
};
|
||||
|
||||
/* Evaluator instantiated on narrow (char) strings. */
typedef basic_filter_evaluator<char> filter_evaluator;
|
||||
/*
 * Evaluator instantiated on ICU's UChar.
 * NOTE(review): ICU documents UChar as a 16-bit UTF-16 code unit, while the
 * "u32" prefix suggests 32-bit code points (UChar32) -- confirm which was
 * intended.
 */
typedef basic_filter_evaluator<UChar> u32filter_evaluator;
|
||||
|
||||
template<typename charT>
|
||||
basic_filter_evaluator<charT>::basic_filter_evaluator()
|
||||
{
|
||||
e.add_function("length", af_length<charT>);
|
||||
e.add_function("lcase", af_lcase<charT>);
|
||||
e.add_function("ccnorm", af_ccnorm<charT>);
|
||||
e.add_function("rmdoubles", af_rmdoubles<charT>);
|
||||
e.add_function("specialratio", af_specialratio<charT>);
|
||||
e.add_function("rmspecials", af_rmspecials<charT>);
|
||||
e.add_function("norm", af_norm<charT>);
|
||||
e.add_function("count", af_count<charT>);
|
||||
}
|
||||
|
||||
template<typename charT>
|
||||
bool
|
||||
basic_filter_evaluator<charT>::evaluate(
|
||||
std::basic_string<charT> const &filter) const
|
||||
{
|
||||
try {
|
||||
return e.evaluate(filter).toBool();
|
||||
} catch (std::exception &e) {
|
||||
std::cerr << "can't evaluate filter: " << e.what() << '\n';
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
template<typename charT>
|
||||
void
|
||||
basic_filter_evaluator<charT>::add_variable(
|
||||
std::basic_string<charT> const &key,
|
||||
basic_datum<charT> value)
|
||||
{
|
||||
e.add_variable(key, value);
|
||||
}
|
||||
|
||||
} // namespace afp
|
||||
|
||||
#endif /* !FILTER_EVALUATOR_H */
|
||||
|
|
|
@ -17,31 +17,23 @@ LIBS = -lboost_regex$(BOOST_TAG) -licuuc -licui18n -licudata
|
|||
expr: CPPFLAGS+=-DTEST_PARSER
|
||||
|
||||
af_expr_objs = \
|
||||
af_expr-affunctions.o \
|
||||
af_expr-filter_evaluator.o \
|
||||
af_expr-eval.o \
|
||||
af_expr-utf8.o \
|
||||
af_expr-equiv.o \
|
||||
af_expr-request.o
|
||||
|
||||
af_parser_objs = \
|
||||
af_parser-affunctions.o \
|
||||
af_parser-main.o \
|
||||
af_parser-request.o \
|
||||
af_parser-utf8.o \
|
||||
af_parser-equiv.o \
|
||||
af_parser-filter_evaluator.o
|
||||
af_parser-equiv.o
|
||||
|
||||
check_objs = \
|
||||
check-affunctions.o \
|
||||
check-check.o \
|
||||
check-utf8.o \
|
||||
check-equiv.o \
|
||||
check-filter_evaluator.o
|
||||
check-equiv.o
|
||||
|
||||
syntax_check_objs = \
|
||||
syntax_check-affunctions.o \
|
||||
syntax_check-filter_evaluator.o \
|
||||
syntax_check-utf8.o \
|
||||
syntax_check-equiv.o \
|
||||
syntax_check-syntax_check.o
|
||||
|
|
Loading…
Reference in a new issue