2008-08-08 12:35:13 +00:00
|
|
|
/*
|
|
|
|
* Copyright (c) 2008 Andrew Garrett.
|
|
|
|
* Copyright (c) 2008 River Tarnell <river@wikimedia.org>
|
|
|
|
* Derived from public domain code contributed by Victor Vasiliev.
|
|
|
|
*
|
|
|
|
* Permission is granted to anyone to use this software for any purpose,
|
|
|
|
* including commercial applications, and to alter it and redistribute it
|
|
|
|
* freely. This software is provided 'as-is', without any express or
|
|
|
|
* implied warranty.
|
|
|
|
*/
|
|
|
|
|
2008-08-08 03:57:25 +00:00
|
|
|
#include <algorithm>
|
|
|
|
#include <fstream>
|
|
|
|
#include <sstream>
|
|
|
|
#include <ios>
|
|
|
|
#include <iostream>
|
2008-08-07 13:57:40 +00:00
|
|
|
|
2008-08-09 11:16:45 +00:00
|
|
|
#include <unicode/uchar.h>
|
|
|
|
|
|
|
|
#include <boost/format.hpp>
|
|
|
|
|
2008-08-08 03:57:25 +00:00
|
|
|
#include "utf8.h"
|
|
|
|
#include "equiv.h"
|
|
|
|
#include "affunctions.h"
|
|
|
|
|
2008-08-09 11:16:45 +00:00
|
|
|
namespace {
|
|
|
|
|
|
|
|
/*
 * Thrown by check_args() when a filter function is called with more
 * arguments than it accepts.
 */
struct too_many_arguments_exception : afp::exception {
	too_many_arguments_exception(char const *message)
		: afp::exception(message)
	{
	}
};
|
|
|
|
|
|
|
|
/*
 * Thrown by check_args() when a filter function is called with fewer
 * arguments than it requires.
 */
struct too_few_arguments_exception : afp::exception {
	too_few_arguments_exception(char const *message)
		: afp::exception(message)
	{
	}
};
|
|
|
|
|
|
|
|
void
|
|
|
|
check_args(std::string const &fname, int args, int min, int max = 0)
|
|
|
|
{
|
|
|
|
if (max == 0)
|
|
|
|
max = min;
|
|
|
|
if (args < min) {
|
|
|
|
std::string s = str(boost::format(
|
|
|
|
"too few arguments for function %s (got %d, expected %d)")
|
|
|
|
% fname % args % min);
|
|
|
|
throw too_few_arguments_exception(s.c_str());
|
|
|
|
} else if (args > max) {
|
|
|
|
std::string s = str(boost::format(
|
|
|
|
"too many arguments for function %s (got %d, expected %d)")
|
|
|
|
% fname % args % min);
|
|
|
|
throw too_many_arguments_exception(s.c_str());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
} // anonymous namespace
|
2008-07-29 11:03:26 +00:00
|
|
|
|
2008-08-08 03:23:34 +00:00
|
|
|
namespace afp {
|
2008-07-29 11:03:26 +00:00
|
|
|
|
2008-08-08 03:23:34 +00:00
|
|
|
/*
 * count(haystack) / count(needle, haystack)
 *
 * With two arguments, returns the number of non-overlapping occurrences of
 * needle in haystack.  With one argument, the needle is "," and the result
 * is the number of occurrences plus one (i.e. the number of comma-separated
 * items; an empty string counts as one item).
 *
 * An empty needle is reported as zero occurrences.
 *
 * Throws via check_args() unless one or two arguments are supplied.
 */
datum
af_count(std::vector<datum> const &args) {
	check_args("count", args.size(), 1, 2);

	std::string needle, haystack;

	if (args.size() < 2) {
		needle = ",";
		haystack = args[0].toString();
	} else {
		needle = args[0].toString();
		haystack = args[1].toString();
	}

	unsigned int count = 0;

	// An empty needle matches at every position, so searching for it would
	// never terminate; treat it as "no occurrences".
	if (!needle.empty()) {
		std::string::size_type pos = haystack.find(needle);
		while (pos != std::string::npos) {
			++count;
			// Resume after the match; restarting at `pos` would find the
			// same occurrence again and loop forever.
			pos = haystack.find(needle, pos + needle.size());
		}
	}

	// In single-argument mode the result is the number of items, which is
	// one more than the number of separators.
	if (args.size() < 2)
		++count;

	return datum::from_int(static_cast<long int>(count));
}
|
|
|
|
|
2008-08-08 03:23:34 +00:00
|
|
|
/*
 * norm(string)
 *
 * Walks the argument one UTF-8 code point at a time, passes each code point
 * through equiv_set::get(), and keeps the mapped code point only when it is
 * alphanumeric (per ICU u_isalnum) and differs from the previously mapped
 * code point.  The "previous" value is updated for every code point,
 * including ones that were dropped.
 *
 * Throws via check_args() unless exactly one argument is supplied.
 */
datum
af_norm(std::vector<datum> const &args) {
	check_args("norm", args.size(), 1);

	std::string input = args[0].toString();
	int previous = 0;
	equiv_set const &table = equiv_set::instance();
	std::string normalised;

	utf8::utf8_iterator<std::string::const_iterator>
		cur(input.begin(), input.end()), stop;

	while (cur != stop) {
		int const mapped = table.get(*cur);
		bool const repeated = (mapped == previous);

		if (!repeated && u_isalnum(mapped))
			normalised += utf8::codepoint_to_utf8(mapped);

		previous = mapped;
		++cur;
	}

	return datum::from_string(normalised);
}
|
|
|
|
|
2008-08-08 03:23:34 +00:00
|
|
|
std::string
|
2008-08-08 00:23:30 +00:00
|
|
|
rmdoubles(std::string const &orig) {
|
2008-08-08 03:46:02 +00:00
|
|
|
int lastchr = 0;
|
2008-08-08 03:23:34 +00:00
|
|
|
std::string result;
|
2008-07-31 16:28:24 +00:00
|
|
|
|
2008-08-08 03:46:02 +00:00
|
|
|
utf8::utf8_iterator<std::string::const_iterator> it(orig.begin(), orig.end()), end;
|
|
|
|
for (; it != end; ++it) {
|
|
|
|
if (*it != lastchr)
|
|
|
|
result.append(utf8::codepoint_to_utf8(*it));
|
2008-07-31 16:28:24 +00:00
|
|
|
|
2008-08-08 03:46:02 +00:00
|
|
|
lastchr = *it;
|
2008-07-31 16:28:24 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
2008-08-08 03:23:34 +00:00
|
|
|
/*
 * specialratio(string)
 *
 * Returns the fraction of code points in the argument that are not
 * alphanumeric (per ICU u_isalnum), as a double in [0, 1].  An empty
 * argument yields 0 rather than dividing by zero.
 *
 * Throws via check_args() unless exactly one argument is supplied.
 */
datum
af_specialratio(std::vector<datum> const &args) {
	check_args("specialratio", args.size(), 1);

	std::string orig = args[0].toString();
	int len = 0;
	int specialcount = 0;

	utf8::utf8_iterator<std::string::const_iterator> it(orig.begin(), orig.end()), end;
	for (; it != end; ++it) {
		++len;
		if (!u_isalnum(*it))
			++specialcount;
	}

	// No code points at all: 0/0 would produce NaN, so report "no specials".
	if (len == 0)
		return datum::from_double(0.0);

	// Divide in double so the result is not first rounded to float precision.
	double const ratio = static_cast<double>(specialcount) / len;

	return datum::from_double(ratio);
}
|
|
|
|
|
2008-08-08 03:23:34 +00:00
|
|
|
/*
 * rmspecials(string)
 *
 * Filter-function wrapper around rmspecials(): converts the single argument
 * to a string, strips it, and wraps the result in a datum.
 *
 * Throws via check_args() unless exactly one argument is supplied.
 */
datum
af_rmspecials(std::vector<datum> const &args) {
	check_args("rmspecials", args.size(), 1);

	std::string const stripped = rmspecials(args[0].toString());
	return datum::from_string(stripped);
}
|
|
|
|
|
2008-08-08 00:23:30 +00:00
|
|
|
std::string
|
|
|
|
rmspecials(std::string const &orig) {
|
2008-08-08 03:23:34 +00:00
|
|
|
std::string result;
|
2008-07-29 11:03:26 +00:00
|
|
|
|
2008-08-08 03:46:02 +00:00
|
|
|
utf8::utf8_iterator<std::string::const_iterator> it(orig.begin(), orig.end()), end;
|
|
|
|
for (; it != end; ++it) {
|
2008-08-08 03:57:25 +00:00
|
|
|
if (u_isalnum(*it))
|
2008-08-08 03:46:02 +00:00
|
|
|
result.append(utf8::codepoint_to_utf8(*it));
|
2008-07-29 11:03:26 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
2008-08-08 03:23:34 +00:00
|
|
|
/*
 * ccnorm(string)
 *
 * Filter-function wrapper around confusable_character_normalise(): converts
 * the single argument to a string, normalises it, and wraps the result in a
 * datum.
 *
 * Throws via check_args() unless exactly one argument is supplied.
 */
datum
af_ccnorm(std::vector<datum> const &args) {
	check_args("ccnorm", args.size(), 1);

	std::string const normalised = confusable_character_normalise(args[0].toString());
	return datum::from_string(normalised);
}
|
2008-07-29 15:11:59 +00:00
|
|
|
|
2008-08-08 03:23:34 +00:00
|
|
|
/*
 * rmdoubles(string)
 *
 * Filter-function wrapper around rmdoubles(): converts the single argument
 * to a string, collapses repeated consecutive code points, and wraps the
 * result in a datum.
 *
 * Throws via check_args() unless exactly one argument is supplied.
 */
datum
af_rmdoubles(std::vector<datum> const &args) {
	// Pass this function's own name so arity errors identify the right call.
	check_args("rmdoubles", args.size(), 1);

	return datum::from_string(rmdoubles(args[0].toString()));
}
|
|
|
|
|
2008-08-08 03:23:34 +00:00
|
|
|
/*
 * length(string)
 *
 * Returns utf8::utf8_strlen() of the single argument as an integer datum
 * (presumably the length in code points rather than bytes -- confirm
 * against utf8.h).
 *
 * Throws via check_args() unless exactly one argument is supplied.
 */
datum
af_length(std::vector<datum> const &args) {
	// Pass this function's own name so arity errors identify the right call.
	check_args("length", args.size(), 1);

	return datum::from_int(static_cast<long int>(utf8::utf8_strlen(args[0].toString())));
}
|
|
|
|
|
2008-08-08 03:23:34 +00:00
|
|
|
/*
 * lcase(string)
 *
 * Returns the single argument passed through utf8::utf8_tolower(), wrapped
 * in a string datum.
 *
 * Throws via check_args() unless exactly one argument is supplied.
 */
datum
af_lcase(std::vector<datum> const &args) {
	// Pass this function's own name so arity errors identify the right call.
	check_args("lcase", args.size(), 1);

	return datum::from_string(utf8::utf8_tolower(args[0].toString()));
}
|
|
|
|
|
2008-08-08 00:23:30 +00:00
|
|
|
std::string
|
|
|
|
confusable_character_normalise(std::string const &orig) {
|
2008-08-08 03:23:34 +00:00
|
|
|
equiv_set const &equivs = equiv_set::instance();
|
|
|
|
std::string result;
|
2008-07-29 11:03:26 +00:00
|
|
|
|
2008-08-08 03:46:02 +00:00
|
|
|
utf8::utf8_iterator<std::string::const_iterator> it(orig.begin(), orig.end()), end;
|
2008-08-08 05:39:51 +00:00
|
|
|
for (; it != end; ++it)
|
|
|
|
result += utf8::codepoint_to_utf8(equivs.get(*it));
|
2008-07-29 11:03:26 +00:00
|
|
|
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
2008-08-08 03:23:34 +00:00
|
|
|
} // namespace afp
|