// MeCab -- Yet Another Part-of-Speech and Morphological Analyzer // // // Copyright(C) 2001-2006 Taku Kudo // Copyright(C) 2004-2006 Nippon Telegraph and Telephone Corporation #ifndef MECAB_UTILS_H #define MECAB_UTILS_H #include #include #include #include #include #include #include "common.h" #ifdef HAVE_CONFIG_H #include "config.h" #endif #ifdef HAVE_STDINT_H #include #else // HAVE_STDINT_H #if defined(_WIN32) && !defined(__CYGWIN__) #if defined(_MSC_VER) && (_MSC_VER <= 1500) typedef unsigned char uint8_t; typedef unsigned long uint32_t; typedef unsigned long long uint64_t; #else // _MSC_VER #include #endif // _MSC_VER #else // _WIN32 typedef unsigned char uint8_t; typedef unsigned long uint32_t; typedef unsigned __int64 uint64_t; #endif // _WIN32 #endif // HAVE_STDINT_H namespace MeCab { class Param; enum { EUC_JP, CP932, UTF8, UTF16, UTF16LE, UTF16BE, ASCII }; int decode_charset(const char *charset); void inline dtoa(double val, char *s) { std::sprintf(s, "%-16f", val); char *p = s; for (; *p != ' '; ++p) {} *p = '\0'; return; } template inline void itoa(T val, char *s) { char *t; T mod; if (val < 0) { *s++ = '-'; val = -val; } t = s; while (val) { mod = val % 10; *t++ = static_cast(mod) + '0'; val /= 10; } if (s == t) *t++ = '0'; *t = '\0'; std::reverse(s, t); return; } template inline void uitoa(T val, char *s) { char *t; T mod; t = s; while (val) { mod = val % 10; *t++ = static_cast(mod) + '0'; val /= 10; } if (s == t) *t++ = '0'; *t = '\0'; std::reverse(s, t); return; } inline const char *read_ptr(const char **ptr, size_t size) { const char *r = *ptr; *ptr += size; return r; } template inline void read_static(const char **ptr, T& value) { const char *r = read_ptr(ptr, sizeof(T)); memcpy(&value, r, sizeof(T)); } bool file_exists(const char *filename); int load_request_type(const Param ¶m); bool load_dictionary_resource(Param *); bool escape_csv_element(std::string *w); void enum_csv_dictionaries(const char *path, std::vector *dics); int progress_bar(const char* message, size_t current, size_t total); bool toLower(std::string *); std::string create_filename(const std::string &path, const std::string &file); void remove_filename(std::string *s); void remove_pathname(std::string *s); void replace_string(std::string *s, const std::string &src, const std::string &dst); template inline size_t tokenizeCSV(char *str, Iterator out, size_t max) { char *eos = str + std::strlen(str); char *start = 0; char *end = 0; size_t n = 0; for (; str < eos; ++str) { // skip white spaces while (*str == ' ' || *str == '\t') ++str; if (*str == '"') { start = ++str; end = start; for (; str < eos; ++str) { if (*str == '"') { str++; if (*str != '"') break; } *end++ = *str; } str = std::find(str, eos, ','); } else { start = str; str = std::find(str, eos, ','); end = str; } if (max-- > 1) *end = '\0'; *out++ = start; ++n; if (max == 0) break; } return n; } template inline size_t tokenize(char *str, const char *del, Iterator out, size_t max) { char *stre = str + std::strlen(str); const char *dele = del + std::strlen(del); size_t size = 0; while (size < max) { char *n = std::find_first_of(str, stre, del, dele); *n = '\0'; *out++ = str; ++size; if (n == stre) break; str = n + 1; } return size; } // continus run of space is regarded as one space template inline size_t tokenize2(char *str, const char *del, Iterator out, size_t max) { char *stre = str + std::strlen(str); const char *dele = del + std::strlen(del); size_t size = 0; while (size < max) { char *n = std::find_first_of(str, stre, del, dele); *n = '\0'; if (*str != '\0') { *out++ = str; ++size; } if (n == stre) break; str = n + 1; } return size; } inline double logsumexp(double x, double y, bool flg) { #define MINUS_LOG_EPSILON 50 if (flg) return y; // init mode double vmin = std::min(x, y); double vmax = std::max(x, y); if (vmax > vmin + MINUS_LOG_EPSILON) { return vmax; } else { return vmax + std::log(std::exp(vmin - vmax) + 1.0); } } inline short int tocost(double d, int n) { static const short max = +32767; static const short min = -32767; return static_cast(std::max( std::min( -n * d, static_cast(max)), static_cast(min)) ); } inline char getEscapedChar(const char p) { switch (p) { case '0': return '\0'; case 'a': return '\a'; case 'b': return '\b'; case 't': return '\t'; case 'n': return '\n'; case 'v': return '\v'; case 'f': return '\f'; case 'r': return '\r'; case 's': return ' '; case '\\': return '\\'; default: break; } return '\0'; // never be here } // return 64 bit hash uint64_t fingerprint(const char *str, size_t size); uint64_t fingerprint(const std::string &str); #if defined(_WIN32) && !defined(__CYGWIN__) std::wstring Utf8ToWide(const std::string &input); std::string WideToUtf8(const std::wstring &input); #endif } #endif