mediawiki-extensions-AbuseF.../parser_native/utf8.h
River Tarnell 305efb0961 - implement 'rlike' and 'regex' operators
- copyright header for all files
2008-08-08 12:35:13 +00:00

121 lines
2.5 KiB
C++

/*
* Copyright (c) 2008 Andrew Garrett.
* Copyright (c) 2008 River Tarnell <river@wikimedia.org>
* Derived from public domain code contributed by Victor Vasiliev.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely. This software is provided 'as-is', without any express or
* implied warranty.
*/
#ifndef UTF8_H
#define UTF8_H
#include <string>
#include <cstddef>
namespace utf8 {
// Free-function UTF-8 helpers. Only the declarations are visible in this
// header, so the notes below are inferred from names and signatures and should
// be confirmed against the definitions.

// Takes p and charStart by non-const reference and end by value; returns an int.
// Presumably decodes the character at p, advances p past it and records where
// that character began in charStart -- TODO confirm against the definition.
int next_utf8_char(std::string::const_iterator &p, std::string::const_iterator &charStart, std::string::const_iterator end);
// Presumably returns the UTF-8 byte sequence encoding codepoint -- TODO confirm.
std::string codepoint_to_utf8(int codepoint);
// Presumably the length of s counted in characters rather than bytes -- TODO confirm.
std::size_t utf8_strlen(std::string const &s);
// Presumably returns a lower-cased copy of s (s itself is const) -- TODO confirm.
std::string utf8_tolower(std::string const &s);
// Weak UTF-8 decoding iterator.
//
// Wraps a [begin, end) range of bytes and yields one decoded code point (as an
// int) per step. No validation is performed: invalid input (overshort
// sequences, overlong sequences, etc.) produces garbage values, not errors.
// In particular:
//  - a lead byte or ASCII byte met in the middle of a sequence abandons the
//    partial character and decoding resynchronises on that byte;
//  - a stray continuation byte is consumed on its own and yields 0;
//  - a sequence cut short by the end of the range yields the bits read so far.
//
// A default-constructed iterator is the end marker. An iterator over a range
// compares equal to it once it has been incremented past the last character.
//
// Decoding loop taken from the wikidiff2 extension by Tim Starling.
template<typename InputIterator>
struct utf8_iterator {
	// Iterate over the bytes in [begin, end). The first character is decoded
	// immediately, so an empty range produces an end iterator straight away.
	utf8_iterator(InputIterator begin, InputIterator end)
		: curval(0)
		, cur_(begin)
		, end_(end)
		, atend_(false)
	{
		advance();
	}

	// End-of-sequence marker. Every member is given a definite value so that
	// copying or dereferencing the marker never reads indeterminate data.
	utf8_iterator()
		: curval(0)
		, cur_()
		, end_()
		, atend_(true)
	{
	}

	// The code point most recently decoded; 0 once the end has been reached.
	int operator* () const {
		return curval;
	}

	// Two end iterators are equal, an end iterator never equals a live one,
	// and two live iterators are equal when they sit at the same byte position.
	bool operator==(utf8_iterator<InputIterator> const &other) const {
		if (atend_ || other.atend_)
			return atend_ == other.atend_;
		return cur_ == other.cur_;
	}

	// Pre-increment: decode the next character.
	utf8_iterator<InputIterator> &operator++() {
		advance();
		return *this;
	}

private:
	int curval;			// last decoded code point
	InputIterator cur_, end_;	// next unread byte / end of the byte range
	bool atend_;			// true once advance() has run off the end

	void advance();
};

// Decode one character starting at cur_ into curval and leave cur_ just past
// it. If no bytes remain, mark the iterator as ended and set curval to 0.
template<typename InputIterator>
void
utf8_iterator<InputIterator>::advance()
{
	if (cur_ == end_) {
		atend_ = true;
		curval = 0;
		return;
	}

	int c = 0;
	int bytes = 0;	// continuation bytes still expected for this character

	do {
		unsigned char const byte = static_cast<unsigned char>(*cur_);

		if (byte < 0x80) {
			// Plain ASCII; also ends any sequence that was in progress.
			c = byte;
			bytes = 0;
		} else if (byte >= 0xc0) {
			// Start of UTF-8 character.
			// If this is unexpected, due to an overshort sequence, we ignore
			// the invalid sequence and resynchronise here.
			if (byte < 0xe0) {
				bytes = 1;
				c = byte & 0x1f;
			} else if (byte < 0xf0) {
				bytes = 2;
				c = byte & 0x0f;
			} else {
				// Everything from 0xf0 up is treated as a 4-byte lead.
				bytes = 3;
				c = byte & 7;
			}
		} else if (bytes) {
			// Expected continuation byte: append its low six bits.
			c <<= 6;
			c |= byte & 0x3f;
			--bytes;
		} else {
			// Unexpected continuation: the byte is consumed and c stays 0.
		}
		++cur_;
	} while (bytes && cur_ != end_);

	curval = c;
}
// Inequality for utf8_iterator: true exactly when operator== reports the two
// iterators as different.
template<typename InputIterator>
bool operator!= (utf8_iterator<InputIterator> const &lhs, utf8_iterator<InputIterator> const &rhs)
{
	if (lhs == rhs)
		return false;
	return true;
}
} // namespace utf8
#endif /* !UTF8_H */