519 lines
13 KiB
C++
519 lines
13 KiB
C++
/*
|
|
Darts -- Double-ARray Trie System
|
|
|
|
|
|
Copyright(C) 2001-2007 Taku Kudo <taku@chasen.org>
|
|
*/
|
|
#ifndef DARTS_H_
|
|
#define DARTS_H_
|
|
|
|
#define DARTS_VERSION "0.31"
|
|
#include <vector>
|
|
#include <cstring>
|
|
#include <cstdio>
|
|
|
|
#ifdef HAVE_ZLIB_H
|
|
namespace zlib {
|
|
#include <zlib.h>
|
|
}
|
|
#define SH(p)((unsigned short)(unsigned char)((p)[0]) | ((unsigned short)(unsigned char)((p)[1]) << 8))
|
|
#define LG(p)((unsigned long)(SH(p)) |((unsigned long)(SH((p)+2)) << 16))
|
|
#endif
|
|
|
|
namespace MeCab {
|
|
|
|
namespace Darts {
|
|
|
|
template <class T> inline T _max(T x, T y) { return(x > y) ? x : y; }
|
|
template <class T> inline T* _resize(T* ptr, size_t n, size_t l, T v) {
|
|
T *tmp = new T[l];
|
|
for (size_t i = 0; i < n; ++i) tmp[i] = ptr[i];
|
|
for (size_t i = n; i < l; ++i) tmp[i] = v;
|
|
delete [] ptr;
|
|
return tmp;
|
|
}
|
|
|
|
template <class T>
|
|
class Length {
|
|
public: size_t operator()(const T *key) const
|
|
{ size_t i; for (i = 0; key[i] != (T)0; ++i) {} return i; }
|
|
};
|
|
|
|
template <> class Length<char> {
|
|
public: size_t operator()(const char *key) const
|
|
{ return std::strlen(key); }
|
|
};
|
|
|
|
template <class node_type_, class node_u_type_,
|
|
class array_type_, class array_u_type_,
|
|
class length_func_ = Length<node_type_> >
|
|
class DoubleArrayImpl {
|
|
private:
|
|
|
|
struct node_t {
|
|
array_u_type_ code;
|
|
size_t depth;
|
|
size_t left;
|
|
size_t right;
|
|
};
|
|
|
|
struct unit_t {
|
|
array_type_ base;
|
|
array_u_type_ check;
|
|
};
|
|
|
|
unit_t *array_;
|
|
unsigned char *used_;
|
|
size_t size_;
|
|
size_t alloc_size_;
|
|
node_type_ **key_;
|
|
size_t key_size_;
|
|
size_t *length_;
|
|
array_type_ *value_;
|
|
size_t progress_;
|
|
size_t next_check_pos_;
|
|
bool no_delete_;
|
|
int error_;
|
|
int (*progress_func_)(size_t, size_t);
|
|
|
|
size_t resize(const size_t new_size) {
|
|
unit_t tmp;
|
|
tmp.base = 0;
|
|
tmp.check = 0;
|
|
array_ = _resize(array_, alloc_size_, new_size, tmp);
|
|
used_ = _resize(used_, alloc_size_, new_size,
|
|
static_cast<unsigned char>(0));
|
|
alloc_size_ = new_size;
|
|
return new_size;
|
|
}
|
|
|
|
size_t fetch(const node_t &parent, std::vector <node_t> &siblings) {
|
|
if (error_ < 0) return 0;
|
|
|
|
array_u_type_ prev = 0;
|
|
|
|
for (size_t i = parent.left; i < parent.right; ++i) {
|
|
if ((length_ ? length_[i] : length_func_()(key_[i])) < parent.depth)
|
|
continue;
|
|
|
|
const node_u_type_ *tmp = reinterpret_cast<node_u_type_ *>(key_[i]);
|
|
|
|
array_u_type_ cur = 0;
|
|
if ((length_ ? length_[i] : length_func_()(key_[i])) != parent.depth)
|
|
cur = (array_u_type_)tmp[parent.depth] + 1;
|
|
|
|
if (prev > cur) {
|
|
error_ = -3;
|
|
return 0;
|
|
}
|
|
|
|
if (cur != prev || siblings.empty()) {
|
|
node_t tmp_node;
|
|
tmp_node.depth = parent.depth + 1;
|
|
tmp_node.code = cur;
|
|
tmp_node.left = i;
|
|
if (!siblings.empty()) siblings[siblings.size()-1].right = i;
|
|
|
|
siblings.push_back(tmp_node);
|
|
}
|
|
|
|
prev = cur;
|
|
}
|
|
|
|
if (!siblings.empty())
|
|
siblings[siblings.size()-1].right = parent.right;
|
|
|
|
return siblings.size();
|
|
}
|
|
|
|
size_t insert(const std::vector <node_t> &siblings) {
|
|
if (error_ < 0) return 0;
|
|
|
|
size_t begin = 0;
|
|
size_t pos = _max((size_t)siblings[0].code + 1, next_check_pos_) - 1;
|
|
size_t nonzero_num = 0;
|
|
int first = 0;
|
|
|
|
if (alloc_size_ <= pos) resize(pos + 1);
|
|
|
|
while (true) {
|
|
next:
|
|
++pos;
|
|
|
|
if (alloc_size_ <= pos) resize(pos + 1);
|
|
|
|
if (array_[pos].check) {
|
|
++nonzero_num;
|
|
continue;
|
|
} else if (!first) {
|
|
next_check_pos_ = pos;
|
|
first = 1;
|
|
}
|
|
|
|
begin = pos - siblings[0].code;
|
|
if (alloc_size_ <= (begin + siblings[siblings.size()-1].code))
|
|
resize(static_cast<size_t>(alloc_size_ *
|
|
_max(1.05, 1.0 * key_size_ / progress_)));
|
|
|
|
if (used_[begin]) continue;
|
|
|
|
for (size_t i = 1; i < siblings.size(); ++i)
|
|
if (array_[begin + siblings[i].code].check != 0) goto next;
|
|
|
|
break;
|
|
}
|
|
|
|
// -- Simple heuristics --
|
|
// if the percentage of non-empty contents in check between the index
|
|
// 'next_check_pos' and 'check' is greater than some constant
|
|
// value(e.g. 0.9),
|
|
// new 'next_check_pos' index is written by 'check'.
|
|
if (1.0 * nonzero_num/(pos - next_check_pos_ + 1) >= 0.95)
|
|
next_check_pos_ = pos;
|
|
|
|
used_[begin] = 1;
|
|
size_ = _max(size_,
|
|
begin +
|
|
static_cast<size_t>(siblings[siblings.size() - 1].code + 1));
|
|
|
|
for (size_t i = 0; i < siblings.size(); ++i)
|
|
array_[begin + siblings[i].code].check = begin;
|
|
|
|
for (size_t i = 0; i < siblings.size(); ++i) {
|
|
std::vector <node_t> new_siblings;
|
|
|
|
if (!fetch(siblings[i], new_siblings)) {
|
|
array_[begin + siblings[i].code].base =
|
|
value_ ?
|
|
static_cast<array_type_>(-value_[siblings[i].left]-1) :
|
|
static_cast<array_type_>(-siblings[i].left-1);
|
|
|
|
if (value_ && (array_type_)(-value_[siblings[i].left]-1) >= 0) {
|
|
error_ = -2;
|
|
return 0;
|
|
}
|
|
|
|
++progress_;
|
|
if (progress_func_)(*progress_func_)(progress_, key_size_);
|
|
|
|
} else {
|
|
size_t h = insert(new_siblings);
|
|
array_[begin + siblings[i].code].base = h;
|
|
}
|
|
}
|
|
|
|
return begin;
|
|
}
|
|
|
|
public:
|
|
|
|
typedef array_type_ value_type;
|
|
typedef node_type_ key_type;
|
|
typedef array_type_ result_type; // for compatibility
|
|
|
|
struct result_pair_type {
|
|
value_type value;
|
|
size_t length;
|
|
};
|
|
|
|
explicit DoubleArrayImpl(): array_(0), used_(0),
|
|
size_(0), alloc_size_(0),
|
|
no_delete_(0), error_(0) {}
|
|
~DoubleArrayImpl() { clear(); }
|
|
|
|
void set_result(value_type& x, value_type r, size_t) const {
|
|
x = r;
|
|
}
|
|
|
|
void set_result(result_pair_type& x, value_type r, size_t l) const {
|
|
x.value = r;
|
|
x.length = l;
|
|
}
|
|
|
|
void set_array(void *ptr, size_t size = 0) {
|
|
clear();
|
|
array_ = reinterpret_cast<unit_t *>(ptr);
|
|
no_delete_ = true;
|
|
size_ = size;
|
|
}
|
|
|
|
const void *array() const {
|
|
return const_cast<const void *>(reinterpret_cast<void *>(array_));
|
|
}
|
|
|
|
void clear() {
|
|
if (!no_delete_)
|
|
delete [] array_;
|
|
delete [] used_;
|
|
array_ = 0;
|
|
used_ = 0;
|
|
alloc_size_ = 0;
|
|
size_ = 0;
|
|
no_delete_ = false;
|
|
}
|
|
|
|
size_t unit_size() const { return sizeof(unit_t); }
|
|
size_t size() const { return size_; }
|
|
size_t total_size() const { return size_ * sizeof(unit_t); }
|
|
|
|
size_t nonzero_size() const {
|
|
size_t result = 0;
|
|
for (size_t i = 0; i < size_; ++i)
|
|
if (array_[i].check) ++result;
|
|
return result;
|
|
}
|
|
|
|
int build(size_t key_size,
|
|
key_type **key,
|
|
size_t *length = 0,
|
|
value_type *value = 0,
|
|
int (*progress_func)(size_t, size_t) = 0) {
|
|
if (!key_size || !key) return 0;
|
|
|
|
progress_func_ = progress_func;
|
|
key_ = key;
|
|
length_ = length;
|
|
key_size_ = key_size;
|
|
value_ = value;
|
|
progress_ = 0;
|
|
|
|
resize(8192);
|
|
|
|
array_[0].base = 1;
|
|
next_check_pos_ = 0;
|
|
|
|
node_t root_node;
|
|
root_node.left = 0;
|
|
root_node.right = key_size;
|
|
root_node.depth = 0;
|
|
|
|
std::vector <node_t> siblings;
|
|
fetch(root_node, siblings);
|
|
insert(siblings);
|
|
|
|
size_ += (1 << 8 * sizeof(key_type)) + 1;
|
|
if (size_ >= alloc_size_) resize(size_);
|
|
|
|
delete [] used_;
|
|
used_ = 0;
|
|
|
|
return error_;
|
|
}
|
|
|
|
int open(const char *file,
|
|
const char *mode = "rb",
|
|
size_t offset = 0,
|
|
size_t size = 0) {
|
|
std::FILE *fp = std::fopen(file, mode);
|
|
if (!fp) return -1;
|
|
if (std::fseek(fp, offset, SEEK_SET) != 0) return -1;
|
|
|
|
if (!size) {
|
|
if (std::fseek(fp, 0L, SEEK_END) != 0) return -1;
|
|
size = std::ftell(fp);
|
|
if (std::fseek(fp, offset, SEEK_SET) != 0) return -1;
|
|
}
|
|
|
|
clear();
|
|
|
|
size_ = size;
|
|
size_ /= sizeof(unit_t);
|
|
array_ = new unit_t[size_];
|
|
if (size_ != std::fread(reinterpret_cast<unit_t *>(array_),
|
|
sizeof(unit_t), size_, fp)) return -1;
|
|
std::fclose(fp);
|
|
|
|
return 0;
|
|
}
|
|
|
|
int save(const char *file,
|
|
const char *mode = "wb",
|
|
size_t offset = 0) {
|
|
if (!size_) return -1;
|
|
std::FILE *fp = std::fopen(file, mode);
|
|
if (!fp) return -1;
|
|
if (size_ != std::fwrite(reinterpret_cast<unit_t *>(array_),
|
|
sizeof(unit_t), size_, fp))
|
|
return -1;
|
|
std::fclose(fp);
|
|
return 0;
|
|
}
|
|
|
|
#ifdef HAVE_ZLIB_H
|
|
int gzopen(const char *file,
|
|
const char *mode = "rb",
|
|
size_t offset = 0,
|
|
size_t size = 0) {
|
|
std::FILE *fp = std::fopen(file, mode);
|
|
if (!fp) return -1;
|
|
clear();
|
|
|
|
size_ = size;
|
|
if (!size_) {
|
|
if (-1L != static_cast<long>(std::fseek(fp, -8, SEEK_END))) {
|
|
char buf[8];
|
|
if (std::fread(static_cast<char*>(buf),
|
|
1, 8, fp) != sizeof(buf)) {
|
|
std::fclose(fp);
|
|
return -1;
|
|
}
|
|
size_ = LG(buf+4);
|
|
size_ /= sizeof(unit_t);
|
|
}
|
|
}
|
|
std::fclose(fp);
|
|
|
|
if (!size_) return -1;
|
|
|
|
zlib::gzFile gzfp = zlib::gzopen(file, mode);
|
|
if (!gzfp) return -1;
|
|
array_ = new unit_t[size_];
|
|
if (zlib::gzseek(gzfp, offset, SEEK_SET) != 0) return -1;
|
|
zlib::gzread(gzfp, reinterpret_cast<unit_t *>(array_),
|
|
sizeof(unit_t) * size_);
|
|
zlib::gzclose(gzfp);
|
|
return 0;
|
|
}
|
|
|
|
int gzsave(const char *file, const char *mode = "wb",
|
|
size_t offset = 0) {
|
|
zlib::gzFile gzfp = zlib::gzopen(file, mode);
|
|
if (!gzfp) return -1;
|
|
zlib::gzwrite(gzfp, reinterpret_cast<unit_t *>(array_),
|
|
sizeof(unit_t) * size_);
|
|
zlib::gzclose(gzfp);
|
|
return 0;
|
|
}
|
|
#endif
|
|
|
|
template <class T>
|
|
inline void exactMatchSearch(const key_type *key,
|
|
T & result,
|
|
size_t len = 0,
|
|
size_t node_pos = 0) const {
|
|
result = exactMatchSearch<T>(key, len, node_pos);
|
|
return;
|
|
}
|
|
|
|
template <class T>
|
|
inline T exactMatchSearch(const key_type *key,
|
|
size_t len = 0,
|
|
size_t node_pos = 0) const {
|
|
if (!len) len = length_func_()(key);
|
|
|
|
T result;
|
|
set_result(result, -1, 0);
|
|
|
|
register array_type_ b = array_[node_pos].base;
|
|
register array_u_type_ p;
|
|
|
|
for (register size_t i = 0; i < len; ++i) {
|
|
p = b +(node_u_type_)(key[i]) + 1;
|
|
if (static_cast<array_u_type_>(b) == array_[p].check)
|
|
b = array_[p].base;
|
|
else
|
|
return result;
|
|
}
|
|
|
|
p = b;
|
|
array_type_ n = array_[p].base;
|
|
if (static_cast<array_u_type_>(b) == array_[p].check && n < 0)
|
|
set_result(result, -n-1, len);
|
|
|
|
return result;
|
|
}
|
|
|
|
template <class T>
|
|
size_t commonPrefixSearch(const key_type *key,
|
|
T* result,
|
|
size_t result_len,
|
|
size_t len = 0,
|
|
size_t node_pos = 0) const {
|
|
if (!len) len = length_func_()(key);
|
|
|
|
register array_type_ b = array_[node_pos].base;
|
|
register size_t num = 0;
|
|
register array_type_ n;
|
|
register array_u_type_ p;
|
|
|
|
for (register size_t i = 0; i < len; ++i) {
|
|
p = b; // + 0;
|
|
n = array_[p].base;
|
|
if ((array_u_type_) b == array_[p].check && n < 0) {
|
|
// result[num] = -n-1;
|
|
if (num < result_len) set_result(result[num], -n-1, i);
|
|
++num;
|
|
}
|
|
|
|
p = b +(node_u_type_)(key[i]) + 1;
|
|
if ((array_u_type_) b == array_[p].check)
|
|
b = array_[p].base;
|
|
else
|
|
return num;
|
|
}
|
|
|
|
p = b;
|
|
n = array_[p].base;
|
|
|
|
if ((array_u_type_)b == array_[p].check && n < 0) {
|
|
if (num < result_len) set_result(result[num], -n-1, len);
|
|
++num;
|
|
}
|
|
|
|
return num;
|
|
}
|
|
|
|
value_type traverse(const key_type *key,
|
|
size_t &node_pos,
|
|
size_t &key_pos,
|
|
size_t len = 0) const {
|
|
if (!len) len = length_func_()(key);
|
|
|
|
register array_type_ b = array_[node_pos].base;
|
|
register array_u_type_ p;
|
|
|
|
for (; key_pos < len; ++key_pos) {
|
|
p = b +(node_u_type_)(key[key_pos]) + 1;
|
|
if (static_cast<array_u_type_>(b) == array_[p].check) {
|
|
node_pos = p;
|
|
b = array_[p].base;
|
|
} else {
|
|
return -2; // no node
|
|
}
|
|
}
|
|
|
|
p = b;
|
|
array_type_ n = array_[p].base;
|
|
if (static_cast<array_u_type_>(b) == array_[p].check && n < 0)
|
|
return -n-1;
|
|
|
|
return -1; // found, but no value
|
|
}
|
|
};
|
|
|
|
#if 4 == 2
|
|
typedef Darts::DoubleArrayImpl<char, unsigned char, short,
|
|
unsigned short> DoubleArray;
|
|
#define DARTS_ARRAY_SIZE_IS_DEFINED 1
|
|
#endif
|
|
|
|
#if 4 == 4 && !defined(DARTS_ARRAY_SIZE_IS_DEFINED)
|
|
typedef Darts::DoubleArrayImpl<char, unsigned char, int,
|
|
unsigned int> DoubleArray;
|
|
#define DARTS_ARRAY_SIZE_IS_DEFINED 1
|
|
#endif
|
|
|
|
#if 4 == 4 && !defined(DARTS_ARRAY_SIZE_IS_DEFINED)
|
|
typedef Darts::DoubleArrayImpl<char, unsigned char, long,
|
|
unsigned long> DoubleArray;
|
|
#define DARTS_ARRAY_SIZE_IS_DEFINED 1
|
|
#endif
|
|
|
|
#if 4 == 8 && !defined(DARTS_ARRAY_SIZE_IS_DEFINED)
|
|
typedef Darts::DoubleArrayImpl<char, unsigned char, long long,
|
|
unsigned long long> DoubleArray;
|
|
#endif
|
|
}
|
|
}
|
|
#endif
|