// MeCab -- Yet Another Part-of-Speech and Morphological Analyzer // // // Copyright(C) 2001-2006 Taku Kudo // Copyright(C) 2004-2006 Nippon Telegraph and Telephone Corporation #ifndef MECAB_FEATUREINDEX_H_ #define MECAB_FEATUREINDEX_H_ #include #include #include "mecab.h" #include "mmap.h" #include "darts.h" #include "freelist.h" #include "common.h" #include "learner_node.h" #include "string_buffer.h" #include "dictionary_rewriter.h" namespace MeCab { class Param; class FeatureIndex { public: virtual bool open(const Param ¶m) = 0; virtual void clear() = 0; virtual void close() = 0; virtual bool buildFeature(LearnerPath *path) = 0; void set_alpha(const double *alpha); size_t size() const { return maxid_; } bool buildUnigramFeature(LearnerPath *, const char *); bool buildBigramFeature(LearnerPath *, const char *, const char*); void calcCost(LearnerPath *path); void calcCost(LearnerNode *node); const char *strdup(const char *str); static bool convert(const Param ¶m, const char *text_filename, std::string *output); static bool compile(const Param ¶m, const char *text_filename, const char *binary_filename); explicit FeatureIndex(): feature_freelist_(8192 * 32), char_freelist_(8192 * 32), maxid_(0), alpha_(0) {} virtual ~FeatureIndex() {} protected: std::vector feature_; ChunkFreeList feature_freelist_; ChunkFreeList char_freelist_; std::vector unigram_templs_; std::vector bigram_templs_; DictionaryRewriter rewrite_; StringBuffer os_; size_t maxid_; const double *alpha_; virtual int id(const char *key) = 0; const char* getIndex(char **, char **, size_t); bool openTemplate(const Param ¶m); }; class EncoderFeatureIndex: public FeatureIndex { public: bool open(const Param ¶m); void close(); void clear(); bool reopen(const char *filename, const char *charset, std::vector *alpha, Param *param); bool save(const char *filename, const char *header) const; void shrink(size_t freq, std::vector *observed); bool buildFeature(LearnerPath *path); void clearcache(); private: std::map dic_; std::map > feature_cache_; int id(const char *key); }; class DecoderFeatureIndex: public FeatureIndex { public: bool open(const Param ¶m); void clear(); void close(); bool buildFeature(LearnerPath *path); const char *charset() const { return charset_; } private: bool openFromArray(const char *begin, const char *end); bool openBinaryModel(const Param ¶m); bool openTextModel(const Param ¶m); int id(const char *key); Mmap mmap_; std::string model_buffer_; const uint64_t *key_; const char *charset_; }; } #endif