00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022 #ifndef TOKEN_H
00023 #define TOKEN_H TOKEN_H
00024
00025 #include <iostream>
00026 #include <sstream>
00027 #include <set>
00028 #include <boost/shared_ptr.hpp>
00029
00030 #include "support/safe_bool.hpp"
00031 #include "support/unicodehelpers.h"
00032
00033 #include "sqlitepp/sqlitepp.h"
00034
00035 #include "ph_types.h"
00036
00037 namespace PhraseHunter {
00038
00039
00040
00041
00042
00043
00044
00045
00046 class Token: public safe_bool<>
00047 {
00048 protected:
00049 schma::UnicodePtr m_tokenstring;
00050 OccurrenceMap m_occurrences;
00051 Token(const char* token): m_tokenstring(new UnicodeString(token))
00052 {}
00053 Token(schma::UnicodePtr tokenstring) : m_tokenstring(tokenstring)
00054 {}
00055
00056 static const int SPACE_BETWEEN_TWO_TOKENS = 2;
00057
00058 public:
00059 virtual ~Token() {}
00060
00061
00062 virtual size_t length() const
00063 {
00064 return m_tokenstring->length() + SPACE_BETWEEN_TWO_TOKENS;
00065 }
00066
00067 bool isUniform() const
00068 {
00069 return false;
00070 }
00071
00072 virtual bool isEmpty() const { return m_occurrences.empty(); }
00073
00074
00075 virtual unsigned int corpusFrequency() const = 0;
00076
00077
00078 schma::UnicodePtr tokenString() const { return m_tokenstring; }
00079
00080
00081 virtual unsigned int documentFrequency() const { return m_occurrences.size(); }
00082
00083
00084 bool inDoc(DocID docID) const { return m_occurrences.count(docID) > 0; }
00085
00086
00087 const PositionList& documentOccurrences(DocID docID) { return m_occurrences[docID]; }
00088
00089
00090 const OccurrenceMap& allOccurrences() const { return m_occurrences; }
00091
00092
00093 virtual std::vector<DocID> documentIDs() const;
00094 virtual unsigned int numTokens() const { return 1; }
00095
00096 virtual TokenID id() const { return InvalidTokenID; }
00097 };
00098
00099
00100 class EmptyToken: public Token {
00101 private:
00102 EmptyToken(): Token("") {}
00103 static TokenPtr s_inst;
00104
00105 protected:
00106 bool boolean_test() const
00107 { return false; }
00108
00109 public:
00110
00111 static TokenPtr instance()
00112 { return s_inst; }
00113
00114
00115 unsigned int corpusFrequency() const { return 0; }
00116
00117 bool isEmpty() const { return true; }
00118
00119 unsigned int numTokens() const { return 0; }
00120 };
00121
00122
00123 class CorpusTokenBase: public Token
00124 {
00125 protected:
00126 CorpusTokenBase(const char* token, TokenID id):
00127 Token(token), m_id(id), m_corpusfreq(0) {}
00128 CorpusTokenBase(schma::UnicodePtr token, TokenID id):
00129 Token(token), m_id(id), m_corpusfreq(0) {}
00130
00131 bool boolean_test() const
00132 { return m_corpusfreq > 0; }
00133
00134 TokenID m_id;
00135 unsigned int m_corpusfreq;
00136
00137 public:
00138
00139 TokenID id() const { return m_id; }
00140
00141 unsigned int corpusFrequency() const { return m_corpusfreq; }
00142 };
00143
00144
00145
00146
00147
00148
00149
00150
00151
00152 class CorpusToken: public CorpusTokenBase
00153 {
00154 private:
00155 void insertPositions(SQLitePP::ResultIterator);
00156 CorpusToken(const char*, TokenID, SQLitePP::SqliteDB&);
00157 CorpusToken(const char*, TokenID, SQLitePP::SqliteDB&, const std::set<DocID>&);
00158 CorpusToken(schma::UnicodePtr, TokenID, SQLitePP::SqliteDB&);
00159 CorpusToken(schma::UnicodePtr, TokenID, SQLitePP::SqliteDB&, const std::set<DocID>&);
00160
00161 void init(TokenID, SQLitePP::SqliteDB&);
00162 void init(TokenID, SQLitePP::SqliteDB&, const std::set<DocID>&);
00163
00164 public:
00165
00166 template<typename T>
00167 static TokenPtr loadFromCorpus(T tokenstr, TokenID id, SQLitePP::SqliteDB& db)
00168 {
00169 p_assert(id != InvalidTokenID, "Invalid token id");
00170 return TokenPtr(new CorpusToken(tokenstr, id, db));
00171 }
00172
00173
00174 template<typename T>
00175 static TokenPtr loadFromCorpus(T tokenstr, TokenID id, SQLitePP::SqliteDB& db,
00176 const std::set<DocID>& docs)
00177 {
00178 p_assert(id != InvalidTokenID, "Invalid token id");
00179 TokenPtr t(new CorpusToken(tokenstr, id, db, docs));
00180 return (t->isEmpty())
00181 ? EmptyToken::instance()
00182 : t;
00183 }
00184
00185
00186
00187
00188 static TokenPtr loadFromToken(const TokenPtr& token, SQLitePP::SqliteDB& db)
00189 {
00190 return loadFromCorpus<schma::UnicodePtr>(token->tokenString(), token->id(), db);
00191 }
00192
00193
00194
00195
00196 static TokenPtr loadFromToken(const TokenPtr& token, SQLitePP::SqliteDB& db, const std::set<DocID>& docs)
00197 {
00198 return loadFromCorpus<schma::UnicodePtr>(token->tokenString(), token->id(), db, docs);
00199 }
00200
00201
00202 bool isEmpty() const
00203 { return m_corpusfreq == 0; }
00204
00205 };
00206
00207
00208 class LightCorpusToken: public CorpusTokenBase
00209 {
00210 private:
00211 unsigned int m_documentFreq;
00212 LightCorpusToken(const char* tokenstr, TokenID id,
00213 unsigned int freq, unsigned int docFreq)
00214 : CorpusTokenBase(tokenstr, id), m_documentFreq(docFreq)
00215 {
00216 m_corpusfreq = freq;
00217 }
00218
00219 public:
00220
00221
00222
00223
00224
00225 static TokenPtr search(const char* tokenstr, SQLitePP::SqliteDB& db);
00226
00227 unsigned int documentFrequency() const { return m_documentFreq; }
00228 };
00229
00230
00231
00232
00233
00234
00235
00236 class MutableToken: public Token
00237 {
00238 protected:
00239 unsigned int m_totalOccurrences;
00240 bool boolean_test() const
00241 { return m_totalOccurrences > 0; }
00242
00243 MutableToken(const char* tokenstring): Token(tokenstring), m_totalOccurrences(0)
00244 {}
00245 MutableToken(schma::UnicodePtr tokenstring): Token(tokenstring), m_totalOccurrences(0)
00246 {}
00247
00248 public:
00249
00250 unsigned int corpusFrequency() const { return m_totalOccurrences; }
00251
00252 void removeDocument(DocID docID);
00253
00254
00255
00256
00257
00258 void addOccurrence(DocID docID, IdxPos position);
00259
00260 };
00261
00262
00263 class Phrase: public MutableToken
00264 {
00265 public:
00266
00267
00268
00269
00270
00271
00272
00273
00274
00275
00276 enum Direction {
00277 LeftToRight,
00278 RightToLeft
00279 };
00280
00281 private:
00282 template<Direction d>
00283 struct adja
00284 {};
00285 int m_tokencount;
00286
00287 Phrase(const char* token, int tokencount)
00288 : MutableToken(token), m_tokencount(tokencount)
00289 {}
00290 Phrase(schma::UnicodePtr token, int tokencount)
00291 : MutableToken(token), m_tokencount(tokencount)
00292 {}
00293
00294 public:
00295
00296
00297
00298
00299 template<Direction d>
00300 static TokenPtr getAdjacent(TokenPtr left, TokenPtr right);
00301
00302
00303 static TokenPtr mergeTokens(TokenPtr left, TokenPtr right);
00304
00305
00306 inline size_t length() const
00307 {
00308 return m_tokenstring->length() + m_tokencount + 1;
00309 }
00310
00311
00312 inline unsigned int numTokens() const
00313 {
00314 return m_tokencount;
00315 }
00316 };
00317
00318 }
00319
00320 #endif // TOKEN_H