mediawiki-extensions-AbuseF.../parser_native/utf8.h
River Tarnell 3b7ae936ce - add missing files from last commit
- convert next_utf8_char into an iterator
2008-08-08 03:46:02 +00:00

110 lines
2.1 KiB
C++

#ifndef UTF8_H
#define UTF8_H
#include <cstddef>
#include <iterator>
#include <string>
namespace utf8 {
// Free-function UTF-8 helpers. Only the declarations are visible in this
// header; the notes below are inferred from the names and signatures.
// NOTE(review): confirm each description against the definitions.

// Presumably decodes the code point that starts at p, moves p past it and
// records where the decoded character began in charStart; `end` bounds the
// input. Both p and charStart are taken by reference, so the caller's
// iterators are modified.
int next_utf8_char(std::string::const_iterator &p, std::string::const_iterator &charStart, std::string::const_iterator end);
// Presumably returns the UTF-8 byte sequence encoding `codepoint`.
std::string codepoint_to_utf8(int codepoint);
// Presumably returns the length of s in code points rather than bytes.
std::size_t utf8_strlen(std::string const &s);
// Presumably returns a lower-cased copy of the UTF-8 string s.
std::string utf8_tolower(std::string const &s);
// Weak UTF-8 decoder, exposed as an input iterator over code points.
//
// utf8_iterator(begin, end) walks the bytes in [begin, end) and yields one
// int per decoded character; a default-constructed utf8_iterator is the
// end-of-sequence sentinel. Typical use:
//
//     for (utf8_iterator<It> it(first, last), stop; !(it == stop); ++it)
//         use(*it);
//
// Will return garbage on invalid input (overshort sequences, overlong
// sequences, etc.). Specifically, no validation is performed:
//   - a continuation byte (0x80-0xbf) with no preceding lead byte yields 0;
//   - a sequence cut short by the end of input yields the bits gathered so far;
//   - a lead byte or ASCII byte in the middle of a sequence discards the
//     unfinished sequence and restarts decoding at that byte;
//   - every byte >= 0xf0 is treated as the lead of a four-byte sequence.
//
// Stolen from wikidiff2 extension by Tim Starling (no point in reinventing the wheel)
template<typename InputIterator>
struct utf8_iterator {
    // Standard iterator member types so that std::iterator_traits (and
    // therefore the standard algorithms) can work with this iterator.
    // operator* returns by value, hence the input-iterator category.
    typedef std::input_iterator_tag iterator_category;
    typedef int                     value_type;
    typedef std::ptrdiff_t          difference_type;
    typedef int const *             pointer;
    typedef int                     reference;

    // Starts iteration over [begin, end) and immediately decodes the first
    // character; an empty range produces an iterator equal to the sentinel.
    utf8_iterator(InputIterator begin, InputIterator end)
        : curval(0)
        , cur_(begin)
        , end_(end)
        , atend_(false)
    {
        advance();
    }

    // End-of-sequence sentinel. Every member is value-initialised so the
    // sentinel can be copied and compared without reading indeterminate
    // values (which matters when InputIterator is a raw pointer).
    utf8_iterator()
        : curval(0)
        , cur_()
        , end_()
        , atend_(true)
    {
    }

    // Returns the most recently decoded code point (0 once at the end).
    int operator* (void) const {
        return curval;
    }

    // Two iterators are equal when both are at the end, or when neither is
    // and both have consumed the same amount of the underlying range.
    bool operator==(utf8_iterator<InputIterator> const &other) const {
        if (atend_ || other.atend_)
            return atend_ == other.atend_;
        return cur_ == other.cur_;
    }

    // Pre-increment: decodes the next character and returns *this.
    utf8_iterator<InputIterator> &operator++(void) {
        advance();
        return *this;
    }

    // Post-increment: decodes the next character and returns a copy of the
    // iterator as it was before the call.
    utf8_iterator<InputIterator> operator++(int) {
        utf8_iterator<InputIterator> previous(*this);
        advance();
        return previous;
    }

private:
    int curval;             // code point returned by operator*
    InputIterator cur_;     // first byte that has not been consumed yet
    InputIterator end_;     // one past the last byte of the input
    bool atend_;            // true once advance() was called with nothing left

    void advance();
};

// Decodes the character starting at cur_ into curval and leaves cur_ just
// past the bytes consumed. When no input is left, marks the iterator as
// being at the end and sets curval to 0 instead.
template<typename InputIterator>
void
utf8_iterator<InputIterator>::advance()
{
    if (cur_ == end_) {
        atend_ = true;
        curval = 0;
        return;
    }

    int c = 0;      // code point bits accumulated so far
    int bytes = 0;  // continuation bytes still expected
    do {
        unsigned char const byte = static_cast<unsigned char>(*cur_);
        if (byte < 0x80) {
            // Plain ASCII; also abandons any unfinished multi-byte sequence.
            c = byte;
            bytes = 0;
        } else if (byte >= 0xc0) {
            // Start of UTF-8 character
            // If this is unexpected, due to an overshort sequence, we ignore
            // the invalid sequence and resynchronise here
            if (byte < 0xe0) {
                bytes = 1;
                c = byte & 0x1f;
            } else if (byte < 0xf0) {
                bytes = 2;
                c = byte & 0x0f;
            } else {
                bytes = 3;
                c = byte & 7;
            }
        } else if (bytes) {
            // Expected continuation byte: append its six payload bits.
            c <<= 6;
            c |= byte & 0x3f;
            --bytes;
        } else {
            // Unexpected continuation: nothing is accumulated, so the loop
            // ends with c == 0 and that is what gets reported.
        }
        ++cur_;
    } while (bytes && cur_ != end_);

    curval = c;
}
// Inequality for utf8_iterator, defined as the negation of operator==.
template<typename InputIterator>
bool operator!= (utf8_iterator<InputIterator> const &a, utf8_iterator<InputIterator> const &b)
{
    bool const same = (a == b);
    return !same;
}
} // namespace utf8
#endif /* !UTF8_H */