mirror of
https://gerrit.wikimedia.org/r/mediawiki/extensions/AbuseFilter.git
synced 2024-12-05 10:58:18 +00:00
110 lines
2.1 KiB
C
110 lines
2.1 KiB
C
|
#ifndef UTF8_H
|
||
|
#define UTF8_H
|
||
|
|
||
|
#include <string>
|
||
|
#include <cstddef>
|
||
|
|
||
|
namespace utf8 {
|
||
|
|
||
|
// Declaration only; the definition is not part of this header.
// p and charStart are taken by non-const reference, so the callee is able to
// update both of them; end is passed by value.
// NOTE(review): presumably decodes the character at p, returns its code point
// and records where that character began in charStart - confirm against the
// definition before relying on this.
int next_utf8_char(std::string::const_iterator &p, std::string::const_iterator &charStart, std::string::const_iterator end);
|
||
|
// Declaration only; the definition is not part of this header.
// NOTE(review): presumably returns the UTF-8 byte sequence encoding `codepoint`;
// behaviour for negative or out-of-range values is not visible here - confirm
// against the definition.
std::string codepoint_to_utf8(int codepoint);
|
||
|
// Declaration only; the definition is not part of this header.
// NOTE(review): presumably returns the length of s in UTF-8 characters rather
// than in bytes - confirm against the definition.
std::size_t utf8_strlen(std::string const &s);
|
||
|
// Declaration only; the definition is not part of this header.
// s is taken by const reference and a new string is returned by value.
// NOTE(review): presumably returns a lower-cased copy of the UTF-8 string s;
// which characters are case-mapped is not visible here - confirm against the
// definition.
std::string utf8_tolower(std::string const &s);
|
||
|
|
||
|
// Weak UTF-8 decoder, exposed as a forward-moving iterator over code points.
// Will return garbage on invalid input (overshort sequences, overlong sequences, etc.)
// Stolen from wikidiff2 extension by Tim Starling (no point in reinventing the wheel)
//
// Usage: construct with a [begin, end) range of bytes to obtain the start
// iterator, and default-construct to obtain the matching end sentinel:
//
//     utf8_iterator<std::string::const_iterator> it(s.begin(), s.end()), last;
//     for (; it != last; ++it) { int codepoint = *it; ... }
template<typename InputIterator>
struct utf8_iterator {
	// Start iterator over the bytes in [begin, end). The first code point is
	// decoded immediately; an empty range produces an iterator that already
	// compares equal to the end sentinel.
	utf8_iterator(InputIterator begin, InputIterator end)
		: curval(0)
		, cur_(begin)
		, end_(end)
		, atend_(false)
	{
		advance();
	}

	// End sentinel. Every member is given a defined value so that copying,
	// comparing or dereferencing the sentinel never reads indeterminate data
	// (previously curval, cur_ and end_ were left uninitialised here).
	utf8_iterator()
		: curval(0)
		, cur_()
		, end_()
		, atend_(true)
	{
	}

	// Code point at the current position; 0 once the iterator is at the end.
	int operator* (void) const {
		return curval;
	}

	// Two iterators are equal when both are at the end, or when neither is and
	// their underlying byte positions match. An end iterator never equals a
	// non-end iterator, whatever their stored positions are.
	bool operator==(utf8_iterator<InputIterator> const &other) const {
		if (atend_ || other.atend_)
			return atend_ == other.atend_;

		return cur_ == other.cur_;
	}

	// Pre-increment: decode the next code point (or become the end iterator).
	utf8_iterator<InputIterator> &operator++(void) {
		advance();
		return *this;
	}

private:
	int curval;                // most recently decoded code point
	InputIterator cur_, end_;  // cur_ is the first byte not yet consumed
	bool atend_;               // true for the sentinel and after the input is exhausted

	// Decodes one code point starting at cur_ into curval and moves cur_ past it.
	void advance();
};
|
||
|
|
||
|
template<typename InputIterator>
|
||
|
void
|
||
|
utf8_iterator<InputIterator>::advance()
|
||
|
{
|
||
|
int c=0;
|
||
|
unsigned char byte;
|
||
|
int bytes = 0;
|
||
|
|
||
|
if (cur_ == end_) {
|
||
|
atend_ = true;
|
||
|
curval = 0;
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
do {
|
||
|
byte = (unsigned char)*cur_;
|
||
|
if (byte < 0x80) {
|
||
|
c = byte;
|
||
|
bytes = 0;
|
||
|
} else if (byte >= 0xc0) {
|
||
|
// Start of UTF-8 character
|
||
|
// If this is unexpected, due to an overshort sequence, we ignore the invalid
|
||
|
// sequence and resynchronise here
|
||
|
if (byte < 0xe0) {
|
||
|
bytes = 1;
|
||
|
c = byte & 0x1f;
|
||
|
} else if (byte < 0xf0) {
|
||
|
bytes = 2;
|
||
|
c = byte & 0x0f;
|
||
|
} else {
|
||
|
bytes = 3;
|
||
|
c = byte & 7;
|
||
|
}
|
||
|
} else if (bytes) {
|
||
|
c <<= 6;
|
||
|
c |= byte & 0x3f;
|
||
|
--bytes;
|
||
|
} else {
|
||
|
// Unexpected continuation, ignore
|
||
|
}
|
||
|
++cur_;
|
||
|
} while (bytes && cur_ != end_);
|
||
|
curval = c;
|
||
|
}
|
||
|
|
||
|
template<typename InputIterator>
|
||
|
bool operator!= (utf8_iterator<InputIterator> const &a, utf8_iterator<InputIterator> const &b)
|
||
|
{
|
||
|
return !(a == b);
|
||
|
}
|
||
|
|
||
|
} // namespace utf8
|
||
|
|
||
|
#endif /* !UTF8_H */
|