- add missing files from last commit

- convert next_utf8_char into an iterator
2024-11-23 21:53:35 +00:00 · 2008-08-08 03:46:02 +00:00 · 2008-08-08 03:46:02 +00:00 · 3b7ae936ce
parent ff2465007f
commit 3b7ae936ce
5 changed files with 310 additions and 43 deletions
--- a/parser_native/affunctions.cpp
+++ b/parser_native/affunctions.cpp
@ -45,22 +45,20 @@ af_count(std::vector<datum> const &args) {

 datum
 af_norm(std::vector<datum> const &args) {
-	if (!args.size()) {
+	if (!args.size())
 		throw exception( "Not enough arguments to norm" );
-	}
 	
 	std::string orig = args[0].toString();
 	
-	std::string::const_iterator p, charStart, end;
-	int chr = 0, lastchr = 0;
+	int lastchr = 0;
 	equiv_set const &equivs = equiv_set::instance();
 	std::string result;
 	
-	p = orig.begin();
-	end = orig.end();
-	
-	while (chr = utf8::next_utf8_char( p, charStart, end )) {
-		chr = equivs.get(chr);
+	utf8::utf8_iterator<std::string::const_iterator>
+	       	it(orig.begin(), orig.end()), end;
+
+	for (; it != end; ++it) {
+		int chr = equivs.get(*it);
 		
 		if (chr != lastchr && isalnum(chr))
 			result.append(utf8::codepoint_to_utf8(chr));
@ -73,18 +71,15 @@ af_norm(std::vector<datum> const &args) {

 std::string 
 rmdoubles(std::string const &orig) {
-	std::string::const_iterator p, charStart, end;
-	int chr,lastchr = 0;
+	int lastchr = 0;
 	std::string result;
 	
-	p = orig.begin();
-	end = orig.end();
-	while (chr = utf8::next_utf8_char( p, charStart, end )) {
-		if (chr != lastchr) {
-			result.append(utf8::codepoint_to_utf8(chr));
-		}
+	utf8::utf8_iterator<std::string::const_iterator> it(orig.begin(), orig.end()), end;
+	for (; it != end; ++it) {
+		if (*it != lastchr)
+			result.append(utf8::codepoint_to_utf8(*it));
 		
-		lastchr = chr;
+		lastchr = *it;
 	}
 	
 	return result;
@ -92,24 +87,21 @@ rmdoubles(std::string const &orig) {

 datum
 af_specialratio(std::vector<datum> const &args) {
+	// Returns the fraction of decoded characters in args[0] for which
+	// isalnum() is false.  Throws if no argument was supplied; an empty
+	// string yields 0.
-	if (!args.size()) {
+	if (!args.size())
 		throw exception( "Not enough arguments to specialratio" );
-	}
 	
 	std::string orig = args[0].toString();
-	std::string::const_iterator p, charStart, end;
-	int chr;
+	int len = 0;
 	int specialcount = 0;
 	
-	p = orig.begin();
-	end = orig.end();
-	while (chr = utf8::next_utf8_char( p, charStart, end )) {
-		if (!isalnum(chr)) {
+	utf8::utf8_iterator<std::string::const_iterator> it(orig.begin(), orig.end()), end;
+	for (; it != end; ++it) {
+		len++;
+		if (!isalnum(*it))
 			specialcount++;
-		}
 	}
 	
-	double ratio = (float)(specialcount) / (float)(utf8::utf8_strlen(orig));
+	// Zero decoded characters would make this 0/0 (NaN); report 0 instead.
+	double ratio = len ? (float)specialcount / len : 0.0f;
 		
 	return datum(ratio);
 }
@ -125,16 +117,12 @@ af_rmspecials(std::vector<datum> const &args) {

 std::string 
 rmspecials(std::string const &orig) {
-	std::string::const_iterator p, charStart, end;
-	int chr = 0;
 	std::string result;
 	
-	p = orig.begin();
-	end = orig.end();
-	while (chr = utf8::next_utf8_char( p, charStart, end )) {
-		if (isalnum(chr)) {
-			result.append(utf8::codepoint_to_utf8(chr));
-		}
+	utf8::utf8_iterator<std::string::const_iterator> it(orig.begin(), orig.end()), end;
+	for (; it != end; ++it) {
+		if (isalnum(*it))
+			result.append(utf8::codepoint_to_utf8(*it));
 	}
 	
 	return result;
@ -178,16 +166,12 @@ af_lcase(std::vector<datum> const &args) {

 std::string 
 confusable_character_normalise(std::string const &orig) {
-	std::string::const_iterator p, charStart, end;
-	int chr;
 	equiv_set const &equivs = equiv_set::instance();
 	std::string result;
 	
-	p = orig.begin();
-	end = orig.end();
-	
-	while (chr = utf8::next_utf8_char( p, charStart, end )) {
-		chr = equivs.get(chr);
+	utf8::utf8_iterator<std::string::const_iterator> it(orig.begin(), orig.end()), end;
+	for (; it != end; ++it) {
+		int chr = equivs.get(*it);
 		result.append(utf8::codepoint_to_utf8(chr));
 	}
 	
--- a/parser_native/equiv.cpp
+++ b/parser_native/equiv.cpp
@ -0,0 +1,57 @@
+#include	<fstream>
+#include	<string>
+
+#include	<boost/lexical_cast.hpp>
+
+#include	"equiv.h"
+#include	"aftypes.h"
+
+#define EQUIVSET_LOC "equivset.txt"
+
+namespace afp {
+
+// Builds the equivalence table from the file named by EQUIVSET_LOC.
+// Each usable line has the form "codepoint:codepoint" (two decimal
+// integers).  Lines without a ':', lines whose numbers do not parse and
+// lines where either number is 0 are skipped; a later entry for the same
+// code point replaces an earlier one.
+// Throws exception if the file cannot be opened.
+equiv_set::equiv_set()
+{
+	std::ifstream input(EQUIVSET_LOC);
+
+	if (!input)
+		throw exception( "Unable to open equivalence sets!" );
+
+	for (std::string entry; getline(input, entry); ) {
+		size_t const colon = entry.find(':');
+
+		if (colon == entry.npos)
+			continue;	// not a codepoint:codepoint line
+
+		try {
+			int const from = boost::lexical_cast<int>(entry.substr(0, colon));
+			int const to = boost::lexical_cast<int>(entry.substr(colon + 1));
+
+			if (from != 0 && to != 0)
+				equivs_[from] = to;
+		} catch (boost::bad_lexical_cast &) {
+			// Unparseable number: ignore this line and carry on.
+		}
+	}
+}
+
+// Returns the canonical code point recorded for c, or c itself when the
+// table has no entry for it.
+int
+equiv_set::get(int c) const
+{
+	std::map<int, int>::const_iterator const found = equivs_.find(c);
+
+	return found != equivs_.end() ? found->second : c;
+}
+
+// Returns the single shared equiv_set.  It is a function-local static, so
+// it is constructed (and the data file read) the first time this is called.
+equiv_set const &
+equiv_set::instance()
+{
+	static equiv_set singleton;
+	return singleton;
+}
+
+} // namespace afp
+
--- a/parser_native/equiv.h
+++ b/parser_native/equiv.h
@ -0,0 +1,23 @@
+#ifndef EQUIV_H
+#define EQUIV_H
+
+#include	<map>
+
+#include	<boost/noncopyable.hpp>
+
+namespace afp {
+
+// Table mapping code points to their canonical equivalents.  The only way
+// to obtain one is instance(): the constructor is private and the type is
+// non-copyable.
+struct equiv_set : boost::noncopyable {
+	// Access the shared table.
+	static equiv_set const &instance();
+
+	// Look up the canonical form of a code point; a code point with no
+	// entry is returned unchanged.
+	int get(int) const;
+
+private:
+	// Loads the table from the equivalence-set data file.
+	equiv_set();
+
+	// code point -> canonical code point
+	std::map<int, int> equivs_;
+};
+
+} // namespace afp
+
+#endif	/* !EQUIV_H */
--- a/parser_native/utf8.cpp
+++ b/parser_native/utf8.cpp
@ -0,0 +1,94 @@
+#include	"utf8.h"
+
+#include	<unicode/utf8.h>
+#include	<unicode/ustring.h>
+
+#include	"aftypes.h"
+
+namespace utf8 {
+
+/*
+ * Encode one Unicode code point as UTF-8.
+ * Ported from MediaWiki core function in PHP.
+ *
+ * codepoint must lie in [0, 0x10FFFF]; the result is its 1 to 4 byte
+ * encoding.  Throws afp::exception for any value outside that range.
+ */
+std::string
+codepoint_to_utf8(int codepoint) {
+	// Reject negative values up front: they are not code points, and would
+	// otherwise take the one-byte branch and be emitted as a raw byte.
+	if (codepoint < 0 || codepoint >= 0x110000)
+		throw afp::exception("Asked for code outside of range\n");
+
+	std::string ret;
+	
+	if (codepoint < 0x80) {
+		// 0xxxxxxx
+		ret.append(1, static_cast<char>(codepoint));
+		return ret;
+	}
+	
+	if (codepoint < 0x800) {
+		// 110xxxxx 10xxxxxx
+		ret.append(1, static_cast<char>((codepoint >> 6 & 0x3f) | 0xc0));
+		ret.append(1, static_cast<char>((codepoint & 0x3f) | 0x80));
+		return ret;
+	}
+	
+	if (codepoint < 0x10000) {
+		// 1110xxxx 10xxxxxx 10xxxxxx
+		ret.append(1, static_cast<char>((codepoint >> 12 & 0x0f) | 0xe0));
+		ret.append(1, static_cast<char>((codepoint >> 6 & 0x3f) | 0x80));
+		ret.append(1, static_cast<char>((codepoint & 0x3f) | 0x80));
+		return ret;
+	}
+	
+	// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+	ret.append(1, static_cast<char>((codepoint >> 18 & 0x07) | 0xf0));
+	ret.append(1, static_cast<char>((codepoint >> 12 & 0x3f) | 0x80));
+	ret.append(1, static_cast<char>((codepoint >> 6 & 0x3f) | 0x80));
+	ret.append(1, static_cast<char>((codepoint & 0x3f) | 0x80));
+	return ret;
+}
+
+/*
+ * Count the characters (code points) in a UTF-8 encoded string.
+ *
+ * Each code point is encoded as one leading byte followed by zero or more
+ * continuation bytes of the form 10xxxxxx, so the number of bytes that are
+ * NOT continuation bytes is the number of code points.  Stray continuation
+ * bytes in malformed input are therefore not counted.  Returns 0 for an
+ * empty string.
+ */
+std::size_t
+utf8_strlen(std::string const &s)
+{
+	std::size_t ret = 0;
+
+	for (std::string::const_iterator it = s.begin(), end = s.end();
+		it != end; ++it)
+	{
+		// Go through unsigned char so the mask works when char is signed.
+		if ((static_cast<unsigned char>(*it) & 0xc0) != 0x80)
+			++ret;
+	}
+
+	return ret;
+}
+
+/*
+ * Return a lower-cased copy of a UTF-8 string.
+ *
+ * The text is converted to UTF-16, lower-cased by ICU's u_strToLower()
+ * (locale argument NULL, as before) and converted back to UTF-8.  If any
+ * ICU step fails, for example because s is not well-formed UTF-8, s is
+ * returned unchanged.
+ */
+std::string
+utf8_tolower(std::string const &s)
+{
+	// Nothing to convert; this also keeps the &vec[0] accesses below valid.
+	if (s.empty())
+		return s;
+
+	UErrorCode error = U_ZERO_ERROR;
+
+	// UTF-8 -> UTF-16.  A UTF-8 sequence never produces more UTF-16 code
+	// units than it has bytes, so s.size() units are always enough.
+	std::vector<UChar> ustring(s.size());
+	int32_t ulen = 0;
+	u_strFromUTF8(&ustring[0], static_cast<int32_t>(ustring.size()), &ulen,
+			s.data(), static_cast<int32_t>(s.size()), &error);
+	if (U_FAILURE(error) || ulen == 0)
+		return s;
+
+	// Lower-casing can change the length, so start with the source length
+	// and retry once with the size ICU reports if that was too small.
+	std::vector<UChar> dest(ulen);
+	int32_t dlen = u_strToLower(&dest[0], static_cast<int32_t>(dest.size()),
+			&ustring[0], ulen, NULL, &error);
+	if (error == U_BUFFER_OVERFLOW_ERROR) {
+		error = U_ZERO_ERROR;
+		dest.resize(dlen);
+		dlen = u_strToLower(&dest[0], static_cast<int32_t>(dest.size()),
+				&ustring[0], ulen, NULL, &error);
+	}
+	if (U_FAILURE(error) || dlen == 0)
+		return s;
+
+	// UTF-16 -> UTF-8.  One UTF-16 code unit needs at most three bytes.
+	std::vector<char> u8string(static_cast<std::size_t>(dlen) * 3);
+	int32_t u8len = 0;
+	u_strToUTF8(&u8string[0], static_cast<int32_t>(u8string.size()), &u8len,
+			&dest[0], dlen, &error);
+	if (U_FAILURE(error))
+		return s;
+
+	return std::string(u8string.begin(), u8string.begin() + u8len);
+}
+
+
+} // namespace utf8
--- a/parser_native/utf8.h
+++ b/parser_native/utf8.h
@ -0,0 +1,109 @@
+#ifndef UTF8_H
+#define UTF8_H
+
+#include	<string>
+#include	<cstddef>
+
+namespace utf8 {
+
+int next_utf8_char(std::string::const_iterator &p, std::string::const_iterator &charStart, std::string::const_iterator end);
+std::string codepoint_to_utf8(int codepoint);
+std::size_t utf8_strlen(std::string const &s);
+std::string utf8_tolower(std::string const &s);
+
+// Weak UTF-8 decoder
+// Will return garbage on invalid input (overshort sequences, overlong sequences, etc.)
+// Stolen from wikidiff2 extension by Tim Starling (no point in reinventing the wheel)
+//
+// Forward-only iterator that yields the code points of a UTF-8 byte range.
+// Construct it from a [begin, end) pair to iterate, and default-construct
+// it to obtain the end marker to compare against.
+template<typename InputIterator>
+struct utf8_iterator {
+	// Start iterating over [begin, end); the first code point is decoded
+	// immediately.  An empty range gives an iterator equal to the end marker.
+	utf8_iterator(InputIterator begin, InputIterator end)
+		: curval(0)
+		, cur_(begin)
+		, end_(end)
+		, atend_(false)
+	{
+		advance();
+	}
+
+	// End marker.  Every member is initialised so that dereferencing or
+	// copying it never reads an indeterminate value.
+	utf8_iterator()
+		: curval(0)
+		, cur_()
+		, end_()
+		, atend_(true)
+	{
+	}
+
+	// Code point decoded by the most recent advance (0 at the end).
+	int operator* (void) const {
+		return curval;
+	}
+
+	// Two iterators are equal if both are at the end, or if neither is and
+	// they stand at the same position in the underlying range.
+	bool operator==(utf8_iterator<InputIterator> const &other) const {
+		if (atend_ || other.atend_)
+			return atend_ == other.atend_;
+
+		return cur_ == other.cur_;
+	}
+
+	// Decode the next code point.
+	utf8_iterator<InputIterator> &operator++(void) {
+		advance();
+		return *this;
+	}
+
+private:
+	int curval;			// current code point
+	InputIterator cur_, end_;	// next undecoded byte, end of input
+	bool atend_;			// true once input is exhausted
+
+	void advance();
+};
+
+// Decode the next code point from the input into curval.  When the input
+// is already exhausted, marks the iterator as being at the end and sets
+// curval to 0 instead.
+template<typename InputIterator>
+void
+utf8_iterator<InputIterator>::advance()
+{
+	if (cur_ == end_) {
+		atend_ = true;
+		curval = 0;
+		return;
+	}
+
+	int codepoint = 0;
+	int pending = 0;	// continuation bytes still expected
+
+	for (;;) {
+		unsigned char const octet = (unsigned char)*cur_;
+		++cur_;
+
+		if (octet < 0x80) {
+			// Plain ASCII
+			codepoint = octet;
+			pending = 0;
+		} else if (octet < 0xc0) {
+			// Continuation byte; an unexpected one is ignored
+			if (pending) {
+				codepoint = (codepoint << 6) | (octet & 0x3f);
+				--pending;
+			}
+		} else if (octet < 0xe0) {
+			// Start of a UTF-8 character.  If one turns up in the middle
+			// of a sequence (overshort sequence), the unfinished sequence
+			// is dropped and decoding resynchronises here.
+			pending = 1;
+			codepoint = octet & 0x1f;
+		} else if (octet < 0xf0) {
+			pending = 2;
+			codepoint = octet & 0x0f;
+		} else {
+			pending = 3;
+			codepoint = octet & 7;
+		}
+
+		if (!pending || cur_ == end_)
+			break;
+	}
+
+	curval = codepoint;
+}
+
+// Inequality for utf8_iterator: the negation of its operator==.
+template<typename InputIterator>
+bool operator!= (utf8_iterator<InputIterator> const &a, utf8_iterator<InputIterator> const &b)
+{
+	bool const same = (a == b);
+	return !same;
+}
+
+} // namespace utf8
+
+#endif	/* !UTF8_H */