mirror of
https://gerrit.wikimedia.org/r/mediawiki/extensions/AbuseFilter.git
synced 2024-11-23 21:53:35 +00:00
- add missing files from last commit
- convert next_utf8_char into an iterator
This commit is contained in:
parent
ff2465007f
commit
3b7ae936ce
|
@ -45,22 +45,20 @@ af_count(std::vector<datum> const &args) {
|
|||
|
||||
datum
|
||||
af_norm(std::vector<datum> const &args) {
|
||||
if (!args.size()) {
|
||||
if (!args.size())
|
||||
throw exception( "Not enough arguments to norm" );
|
||||
}
|
||||
|
||||
std::string orig = args[0].toString();
|
||||
|
||||
std::string::const_iterator p, charStart, end;
|
||||
int chr = 0, lastchr = 0;
|
||||
int lastchr = 0;
|
||||
equiv_set const &equivs = equiv_set::instance();
|
||||
std::string result;
|
||||
|
||||
p = orig.begin();
|
||||
end = orig.end();
|
||||
|
||||
while (chr = utf8::next_utf8_char( p, charStart, end )) {
|
||||
chr = equivs.get(chr);
|
||||
utf8::utf8_iterator<std::string::const_iterator>
|
||||
it(orig.begin(), orig.end()), end;
|
||||
|
||||
for (; it != end; ++it) {
|
||||
int chr = equivs.get(*it);
|
||||
|
||||
if (chr != lastchr && isalnum(chr))
|
||||
result.append(utf8::codepoint_to_utf8(chr));
|
||||
|
@ -73,18 +71,15 @@ af_norm(std::vector<datum> const &args) {
|
|||
|
||||
std::string
|
||||
rmdoubles(std::string const &orig) {
|
||||
std::string::const_iterator p, charStart, end;
|
||||
int chr,lastchr = 0;
|
||||
int lastchr = 0;
|
||||
std::string result;
|
||||
|
||||
p = orig.begin();
|
||||
end = orig.end();
|
||||
while (chr = utf8::next_utf8_char( p, charStart, end )) {
|
||||
if (chr != lastchr) {
|
||||
result.append(utf8::codepoint_to_utf8(chr));
|
||||
}
|
||||
utf8::utf8_iterator<std::string::const_iterator> it(orig.begin(), orig.end()), end;
|
||||
for (; it != end; ++it) {
|
||||
if (*it != lastchr)
|
||||
result.append(utf8::codepoint_to_utf8(*it));
|
||||
|
||||
lastchr = chr;
|
||||
lastchr = *it;
|
||||
}
|
||||
|
||||
return result;
|
||||
|
@ -92,24 +87,21 @@ rmdoubles(std::string const &orig) {
|
|||
|
||||
datum
|
||||
af_specialratio(std::vector<datum> const &args) {
|
||||
if (!args.size()) {
|
||||
if (!args.size())
|
||||
throw exception( "Not enough arguments to specialratio" );
|
||||
}
|
||||
|
||||
std::string orig = args[0].toString();
|
||||
std::string::const_iterator p, charStart, end;
|
||||
int chr;
|
||||
int len = 0;
|
||||
int specialcount = 0;
|
||||
|
||||
p = orig.begin();
|
||||
end = orig.end();
|
||||
while (chr = utf8::next_utf8_char( p, charStart, end )) {
|
||||
if (!isalnum(chr)) {
|
||||
utf8::utf8_iterator<std::string::const_iterator> it(orig.begin(), orig.end()), end;
|
||||
for (; it != end; ++it) {
|
||||
len++;
|
||||
if (!isalnum(*it))
|
||||
specialcount++;
|
||||
}
|
||||
}
|
||||
|
||||
double ratio = (float)(specialcount) / (float)(utf8::utf8_strlen(orig));
|
||||
double ratio = (float)specialcount / len;
|
||||
|
||||
return datum(ratio);
|
||||
}
|
||||
|
@ -125,16 +117,12 @@ af_rmspecials(std::vector<datum> const &args) {
|
|||
|
||||
std::string
|
||||
rmspecials(std::string const &orig) {
|
||||
std::string::const_iterator p, charStart, end;
|
||||
int chr = 0;
|
||||
std::string result;
|
||||
|
||||
p = orig.begin();
|
||||
end = orig.end();
|
||||
while (chr = utf8::next_utf8_char( p, charStart, end )) {
|
||||
if (isalnum(chr)) {
|
||||
result.append(utf8::codepoint_to_utf8(chr));
|
||||
}
|
||||
utf8::utf8_iterator<std::string::const_iterator> it(orig.begin(), orig.end()), end;
|
||||
for (; it != end; ++it) {
|
||||
if (isalnum(*it))
|
||||
result.append(utf8::codepoint_to_utf8(*it));
|
||||
}
|
||||
|
||||
return result;
|
||||
|
@ -178,16 +166,12 @@ af_lcase(std::vector<datum> const &args) {
|
|||
|
||||
std::string
|
||||
confusable_character_normalise(std::string const &orig) {
|
||||
std::string::const_iterator p, charStart, end;
|
||||
int chr;
|
||||
equiv_set const &equivs = equiv_set::instance();
|
||||
std::string result;
|
||||
|
||||
p = orig.begin();
|
||||
end = orig.end();
|
||||
|
||||
while (chr = utf8::next_utf8_char( p, charStart, end )) {
|
||||
chr = equivs.get(chr);
|
||||
utf8::utf8_iterator<std::string::const_iterator> it(orig.begin(), orig.end()), end;
|
||||
for (; it != end; ++it) {
|
||||
int chr = equivs.get(*it);
|
||||
result.append(utf8::codepoint_to_utf8(chr));
|
||||
}
|
||||
|
||||
|
|
57
parser_native/equiv.cpp
Normal file
57
parser_native/equiv.cpp
Normal file
|
@ -0,0 +1,57 @@
|
|||
#include <fstream>
|
||||
#include <string>
|
||||
|
||||
#include <boost/lexical_cast.hpp>
|
||||
|
||||
#include "equiv.h"
|
||||
#include "aftypes.h"
|
||||
|
||||
#define EQUIVSET_LOC "equivset.txt"
|
||||
|
||||
namespace afp {
|
||||
|
||||
/*
 * Build the equivalence table from the file named by EQUIVSET_LOC.
 *
 * Each line of the form "<int>:<int>" maps the first number onto the
 * second.  Lines without a ':', lines whose fields do not parse as
 * integers, and pairs where either number is zero are skipped.
 *
 * Throws exception if the file cannot be opened.
 */
equiv_set::equiv_set()
{
	std::ifstream table(EQUIVSET_LOC);

	if (!table)
		throw exception( "Unable to open equivalence sets!" );

	std::string entry;

	while (getline(table, entry)) {
		std::string::size_type const colon = entry.find(':');

		// Not a codepoint:codepoint line.
		if (colon == std::string::npos)
			continue;

		try {
			int const actual = boost::lexical_cast<int>(entry.substr(0, colon));
			int const canonical = boost::lexical_cast<int>(entry.substr(colon + 1));

			// Zero on either side is never stored.
			if (actual == 0 || canonical == 0)
				continue;

			equivs_[actual] = canonical;
		} catch (boost::bad_lexical_cast &) {
			// A field was not an integer; ignore this line.
		}
	}
}
|
||||
|
||||
int
|
||||
equiv_set::get(int c) const
|
||||
{
|
||||
std::map<int, int>::const_iterator it;
|
||||
|
||||
if ((it = equivs_.find(c)) == equivs_.end())
|
||||
return c;
|
||||
|
||||
return it->second;
|
||||
}
|
||||
|
||||
/*
 * Return the shared equiv_set.
 *
 * The object is a function-local static, so it is constructed (and the
 * equivalence file is read) the first time this function is called.
 */
equiv_set const &
equiv_set::instance()
{
	static equiv_set shared;

	return shared;
}
|
||||
|
||||
} // namespace afp
|
||||
|
23
parser_native/equiv.h
Normal file
23
parser_native/equiv.h
Normal file
|
@ -0,0 +1,23 @@
|
|||
#ifndef EQUIV_H
|
||||
#define EQUIV_H
|
||||
|
||||
#include <map>
|
||||
|
||||
#include <boost/noncopyable.hpp>
|
||||
|
||||
namespace afp {
|
||||
|
||||
struct equiv_set : boost::noncopyable {
|
||||
static equiv_set const &instance();
|
||||
|
||||
int get(int) const;
|
||||
|
||||
private:
|
||||
equiv_set();
|
||||
|
||||
std::map<int, int> equivs_;
|
||||
};
|
||||
|
||||
} // namespace afp
|
||||
|
||||
#endif /* !EQUIV_H */
|
94
parser_native/utf8.cpp
Normal file
94
parser_native/utf8.cpp
Normal file
|
@ -0,0 +1,94 @@
|
|||
#include "utf8.h"
|
||||
|
||||
#include <unicode/utf8.h>
|
||||
#include <unicode/ustring.h>
|
||||
|
||||
#include "aftypes.h"
|
||||
|
||||
namespace utf8 {
|
||||
|
||||
// Ported from MediaWiki core function in PHP.
|
||||
std::string
|
||||
codepoint_to_utf8(int codepoint) {
|
||||
std::string ret;
|
||||
|
||||
if(codepoint < 0x80) {
|
||||
ret.append(1, codepoint);
|
||||
return ret;
|
||||
}
|
||||
|
||||
if(codepoint < 0x800) {
|
||||
ret.append(1, codepoint >> 6 & 0x3f | 0xc0);
|
||||
ret.append(1, codepoint & 0x3f | 0x80);
|
||||
return ret;
|
||||
}
|
||||
|
||||
if(codepoint < 0x10000) {
|
||||
ret.append(1, codepoint >> 12 & 0x0f | 0xe0);
|
||||
ret.append(1, codepoint >> 6 & 0x3f | 0x80);
|
||||
ret.append(1, codepoint & 0x3f | 0x80);
|
||||
return ret;
|
||||
}
|
||||
|
||||
if(codepoint < 0x110000) {
|
||||
ret.append(1, codepoint >> 18 & 0x07 | 0xf0);
|
||||
ret.append(1, codepoint >> 12 & 0x3f | 0x80);
|
||||
ret.append(1, codepoint >> 6 & 0x3f | 0x80);
|
||||
ret.append(1, codepoint & 0x3f | 0x80);
|
||||
return ret;
|
||||
}
|
||||
|
||||
throw afp::exception("Asked for code outside of range ($codepoint)\n");
|
||||
}
|
||||
|
||||
/*
 * Count the characters in a UTF-8 encoded string.
 *
 * Every byte that is not a continuation byte (bit pattern 10xxxxxx)
 * starts a new character, so the result is the number of such bytes.
 * For well-formed UTF-8 this is the number of code points.  No
 * validation is performed on malformed input.
 *
 * s: the UTF-8 encoded string.
 *
 * Returns the character count; 0 for an empty string.
 */
std::size_t
utf8_strlen(std::string const &s)
{
	std::size_t ret = 0;

	for (std::string::const_iterator it = s.begin(), end = s.end();
	     it != end; ++it)
	{
		// Skip continuation bytes; count everything else.
		if ((static_cast<unsigned char>(*it) & 0xc0) != 0x80)
			++ret;
	}

	return ret;
}
|
||||
|
||||
/*
|
||||
* This could almost certainly be done in a nicer way.
|
||||
*/
|
||||
std::string
|
||||
utf8_tolower(std::string const &s)
|
||||
{
|
||||
std::vector<UChar> ustring;
|
||||
UErrorCode error = U_ZERO_ERROR;
|
||||
|
||||
for (int i = 0, end = s.size(); i < end; ) {
|
||||
UChar32 c;
|
||||
U8_NEXT(s.data(), i, end, c);
|
||||
ustring.push_back(c);
|
||||
}
|
||||
|
||||
std::vector<UChar> dest;
|
||||
u_strToLower(&dest[0], dest.size(), &ustring[0], ustring.size(),
|
||||
NULL, &error);
|
||||
|
||||
if (U_FAILURE(error))
|
||||
return s;
|
||||
|
||||
std::vector<unsigned char> u8string;
|
||||
int i, j, end;
|
||||
for (i = 0, j = 0, end = dest.size(); i < end; j++) {
|
||||
U8_APPEND_UNSAFE(&u8string[0], i, dest[j]);
|
||||
}
|
||||
return std::string(u8string.begin(), u8string.begin() + i);
|
||||
}
|
||||
|
||||
|
||||
} // namespace utf8
|
109
parser_native/utf8.h
Normal file
109
parser_native/utf8.h
Normal file
|
@ -0,0 +1,109 @@
|
|||
#ifndef UTF8_H
|
||||
#define UTF8_H
|
||||
|
||||
#include <string>
|
||||
#include <cstddef>
|
||||
|
||||
namespace utf8 {
|
||||
|
||||
// NOTE(review): no definition of next_utf8_char is visible next to this
// header; confirm one is still linked in before adding new callers.
int next_utf8_char(
	std::string::const_iterator &p,
	std::string::const_iterator &charStart,
	std::string::const_iterator end);

// Encodes one code point as UTF-8; defined in utf8.cpp.
std::string codepoint_to_utf8(int codepoint);

// Character-count helper for UTF-8 strings; defined in utf8.cpp.
std::size_t utf8_strlen(std::string const &s);

// Lower-casing helper for UTF-8 strings; defined in utf8.cpp.
std::string utf8_tolower(std::string const &s);
|
||||
|
||||
// Weak UTF-8 decoder
// Will return garbage on invalid input (overshort sequences, overlong sequences, etc.)
// Stolen from wikidiff2 extension by Tim Starling (no point in reinventing the wheel)
//
// Wraps a [begin, end) range of bytes and yields one decoded code point per
// step.  A default-constructed utf8_iterator is the end marker: an iterator
// over a range compares equal to it once the range has been exhausted.
template<typename InputIterator>
struct utf8_iterator {
	// Start decoding at begin; the first character is decoded right away
	// so that operator* is usable immediately.
	utf8_iterator(InputIterator begin, InputIterator end)
		: curval(0)
		, cur_(begin)
		, end_(end)
		, atend_(false)
	{
		advance();
	}

	// End marker.  All members are given defined values so that the
	// marker can be copied and dereferenced (yielding 0) safely.
	utf8_iterator()
		: curval(0)
		, cur_()
		, end_()
		, atend_(true)
	{
	}

	// The code point decoded by the most recent advance; 0 at the end.
	int operator* (void) const {
		return curval;
	}

	// Two iterators are equal when both are at the end, or when neither
	// is and their underlying positions match.
	bool operator==(utf8_iterator<InputIterator> const &other) const {
		if (atend_ || other.atend_)
			return atend_ == other.atend_;

		return cur_ == other.cur_;
	}

	// Pre-increment: decode the next character.
	utf8_iterator<InputIterator> &operator++(void) {
		advance();
		return *this;
	}

private:
	int curval;			// most recently decoded code point
	InputIterator cur_, end_;	// next undecoded byte / end of input
	bool atend_;			// set once advance() found no more input

	void advance();
};
|
||||
|
||||
template<typename InputIterator>
|
||||
void
|
||||
utf8_iterator<InputIterator>::advance()
|
||||
{
|
||||
int c=0;
|
||||
unsigned char byte;
|
||||
int bytes = 0;
|
||||
|
||||
if (cur_ == end_) {
|
||||
atend_ = true;
|
||||
curval = 0;
|
||||
return;
|
||||
}
|
||||
|
||||
do {
|
||||
byte = (unsigned char)*cur_;
|
||||
if (byte < 0x80) {
|
||||
c = byte;
|
||||
bytes = 0;
|
||||
} else if (byte >= 0xc0) {
|
||||
// Start of UTF-8 character
|
||||
// If this is unexpected, due to an overshort sequence, we ignore the invalid
|
||||
// sequence and resynchronise here
|
||||
if (byte < 0xe0) {
|
||||
bytes = 1;
|
||||
c = byte & 0x1f;
|
||||
} else if (byte < 0xf0) {
|
||||
bytes = 2;
|
||||
c = byte & 0x0f;
|
||||
} else {
|
||||
bytes = 3;
|
||||
c = byte & 7;
|
||||
}
|
||||
} else if (bytes) {
|
||||
c <<= 6;
|
||||
c |= byte & 0x3f;
|
||||
--bytes;
|
||||
} else {
|
||||
// Unexpected continuation, ignore
|
||||
}
|
||||
++cur_;
|
||||
} while (bytes && cur_ != end_);
|
||||
curval = c;
|
||||
}
|
||||
|
||||
template<typename InputIterator>
|
||||
bool operator!= (utf8_iterator<InputIterator> const &a, utf8_iterator<InputIterator> const &b)
|
||||
{
|
||||
return !(a == b);
|
||||
}
|
||||
|
||||
} // namespace utf8
|
||||
|
||||
#endif /* !UTF8_H */
|
Loading…
Reference in a new issue