00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021 #include <fstream>
00022 #include <tr1/unordered_map>
00023
00024
00025 #include "sqlitepp/sqlitepp.h"
00026
00027 #include "phrasehunter/indexer.h"
00028 #include "phrasehunter/tokenizer.h"
00029 #include "support/unicodehelpers.h"
00030
00031 using namespace SQLitePP;
00032
00033 namespace PhraseHunter {
00034
00035 void IndexManager::removeFromIndex(DocID docID)
00036 {
00037 (*m_db.statement("DELETE FROM occurrences WHERE docid = ?"))
00038 .bindArgs(docID)
00039 .exec();
00040 }
00041
00042
00043 typedef std::tr1::unordered_map<std::string, std::pair<TokenID, PositionList> > idx_map;
00044
00045 void IndexManager::addToIndex(TextSaver& token_stream, DocID docID)
00046 throw(Exceptions::FileError)
00047 {
00048 char buf[1024];
00049 idx_map tokens;
00050
00051 TextSaver::TokenInformation ti;
00052
00053 m_db.begin();
00054
00055 Statement::Pointer getid = m_db.statement("SELECT id FROM tokens WHERE word = ?");
00056 Statement::Pointer insertTok = m_db.statement("INSERT INTO tokens (word) VALUES (?)");
00057 Statement::Pointer insertPos = m_db.statement("\
00058 INSERT INTO occurrences (wordid, docid, positions) VALUES (?,?,?)");
00059
00060 UErrorCode status;
00061
00062 while(token_stream.hasMoreTokens()) {
00063 ti = token_stream.nextToken();
00064
00065 ti.token->toLower();
00066 status = U_ZERO_ERROR;
00067
00068 ti.token->extract(buf, 1023, schma::UTF8Converter(), status);
00069
00070 idx_map::iterator it = tokens.find(buf);
00071
00072 if(it == tokens.end()) {
00073 int id;
00074 getid->bindArg(buf);
00075 ResultIterator ri = getid->query();
00076
00077 if(ri.hasMoreRows()) {
00078 id = ri.get<int>(0);
00079 } else {
00080 id = (*insertTok)
00081 .bindArg(buf)
00082 .exec()
00083 .lastInsertId();
00084
00085 insertTok->reset();
00086 }
00087 getid->reset();
00088 PositionList v;
00089 v.push_back(ti.offset);
00090
00091 tokens[buf] = std::make_pair(id, v);
00092 } else {
00093 it->second.second.push_back(ti.offset);
00094 }
00095 }
00096
00097 for(idx_map::iterator it = tokens.begin();
00098 it != tokens.end();
00099 ++it) {
00100 (*insertPos)
00101 .bindArgs(it->second.first, docID,
00102 Blob(reinterpret_cast<const void*>(&it->second.second[0]),
00103 it->second.second.size() * sizeof(PositionList::value_type)))
00104 .exec()
00105 .reset();
00106 }
00107 m_db.commit();
00108 }
00109
00110 IndexManager::~IndexManager()
00111 {
00112 (*m_db.statement("\
00113 UPDATE tokens SET \
00114 frequency = (SELECT SUM(LENGTH(positions)) / ? FROM occurrences WHERE wordid = tokens.id ), \
00115 documentfrequency = (SELECT DISTINCT docid FROM occurrences WHERE wordid = tokens.id)"))
00116 .bindArgs(sizeof(IdxPos))
00117 .exec();
00118
00119
00120 m_db.statement("UPDATE tokens SET rank = NULL")->exec();
00121
00122 Statement::Pointer getByFrequency =
00123 m_db.statement("SELECT id FROM tokens ORDER BY frequency DESC");
00124
00125 Statement::Pointer setRank =
00126 m_db.statement("UPDATE tokens SET rank = ? WHERE id = ?");
00127
00128 int rank = 1;
00129 for(ResultIterator ri = getByFrequency->query();
00130 ri.hasMoreRows();
00131 ri.next()) {
00132 (*setRank)
00133 .bindArgs(rank++, ri.get<int>(0))
00134 .exec()
00135 .reset();
00136 }
00137 }
00138
00139
00140 }