00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021 #include <boost/lambda/lambda.hpp>
00022 #include <algorithm>
00023
00024 #include "phrasehunter/token.h"
00025
00026 using namespace SQLitePP;
00027 using namespace boost::lambda;
00028
00029 namespace PhraseHunter {
00030
// Shared sentinel instance handed out by EmptyToken::instance() wherever a
// lookup finds no token (see e.g. LightCorpusToken::search below).
TokenPtr EmptyToken::s_inst = TokenPtr(new EmptyToken);
00032
00033 std::vector<DocID> Token::documentIDs() const
00034 {
00035 std::vector<DocID> ids;
00036 for(OccurrenceMap::const_iterator docs = m_occurrences.begin();
00037 docs != m_occurrences.end(); ++docs) {
00038 ids.push_back(docs->first);
00039 }
00040 return ids;
00041 }
00042
00043 void CorpusToken::insertPositions(ResultIterator occ_it)
00044 {
00045 for(; occ_it.hasMoreRows(); occ_it.next()) {
00046 DocID docID = occ_it.get<int>(0);
00047 Blob posblob = occ_it.get<Blob>(1);
00048
00049 unsigned int numOcc = posblob.second / sizeof(IdxPos);
00050 PositionList& pl = m_occurrences[docID];
00051 pl.resize(numOcc);
00052
00053 m_corpusfreq += numOcc;
00054 memcpy(&pl[0], posblob.first, posblob.second);
00055
00056
00057
00058 }
00059 }
00060
// Construct from a C-string token and load its occurrences for the whole corpus.
CorpusToken::CorpusToken(const char* token, TokenID id, SqliteDB& db)
    : CorpusTokenBase(token, id)
{
    init(id, db);
}
00066
// Construct from a C-string token, loading occurrences only for the given
// subset of documents.
CorpusToken::CorpusToken(const char* token, TokenID id, SqliteDB& db, const std::set<DocID>& documentIDs)
    : CorpusTokenBase(token, id)
{
    init(id, db, documentIDs);
}
00072
// Construct from a Unicode token and load its occurrences for the whole corpus.
CorpusToken::CorpusToken(schma::UnicodePtr token, TokenID id, SqliteDB& db)
    : CorpusTokenBase(token, id)
{
    init(id, db);
}
00078
// Construct from a Unicode token, loading occurrences only for the given
// subset of documents.
CorpusToken::CorpusToken(schma::UnicodePtr token, TokenID id, SqliteDB& db, const std::set<DocID>& documentIDs)
    : CorpusTokenBase(token, id)
{
    init(id, db, documentIDs);
}
00084
00085 void CorpusToken::init(TokenID id, SqliteDB& db)
00086 {
00087 Statement::Pointer occs = db.cachedStatement("SELECT docid, positions FROM occurrences WHERE wordid = ?");
00088 occs->bindArgs(id);
00089 insertPositions(occs->query());
00090 }
00091
00092 void CorpusToken::init(TokenID id, SqliteDB& db, const std::set<DocID>& documentIDs)
00093 {
00094 std::stringstream valueList;
00095 std::for_each(documentIDs.begin(), documentIDs.end(), valueList << _1 << ',');
00096
00097 Statement::Pointer occs =
00098 db.statement(("SELECT docid, positions FROM occurrences\
00099 WHERE wordid = ? AND docid IN (" + valueList.str() + ")").c_str());
00100 occs->bindArgs(id);
00101 insertPositions(occs->query());
00102 }
00103
00104 TokenPtr LightCorpusToken::search(const char* tokenstr, SQLitePP::SqliteDB& db)
00105 {
00106 Statement::Pointer getID = db.cachedStatement("SELECT id, frequency, documentfrequency FROM tokens WHERE word = ?");
00107 getID->bindArgs(tokenstr);
00108
00109 ResultIterator ri = getID->query();
00110 return (ri.hasMoreRows())
00111 ? TokenPtr(new LightCorpusToken(tokenstr, ri.get<int>(0), ri.get<int>(1), ri.get<int>(2)))
00112 : EmptyToken::instance();
00113 }
00114
// Drop all cached occurrences of this token within one document; no-op when
// the document is not present.
// NOTE(review): defined `inline` in a .cpp file — the definition is invisible
// to other translation units; confirm it is only called from this file.
inline void MutableToken::removeDocument(DocID docID)
{
    if(inDoc(docID)) {
        m_occurrences.erase(docID);
    }
}
00121
00122 void MutableToken::addOccurrence(DocID docID, IdxPos position)
00123 {
00124 OccurrenceMap::iterator it = m_occurrences.find(docID);
00125 if(it == m_occurrences.end()) {
00126 PositionList p;
00127 p.push_back(position);
00128 m_occurrences[docID] = p;
00129 } else {
00130 if(position >= *it->second.rend()) {
00131 it->second.push_back(position);
00132 } else {
00133 it->second.push_back(it->second.back());
00134 int pos;
00135 for(pos = it->second.size()-1; it->second[pos]-1 > position; --pos)
00136 it->second[pos] = it->second[pos]-1;
00137 it->second[pos] = position;
00138 }
00139 }
00140 ++m_totalOccurrences;
00141 }
00142
00143 template<>
00144 struct Phrase::adja<Phrase::LeftToRight>
00145 {
00146 adja(TokenPtr left, TokenPtr right):
00147 occ_outer(left->allOccurrences()),
00148 occ_inner(right->allOccurrences()),
00149 innerOffset(left->length()),
00150 startOffset(0)
00151 {}
00152 const OccurrenceMap& occ_outer, occ_inner;
00153 const int innerOffset;
00154 const int startOffset;
00155 };
00156
00157 template<>
00158 struct Phrase::adja<Phrase::RightToLeft>
00159 {
00160 adja(TokenPtr left, TokenPtr right):
00161 occ_outer(right->allOccurrences()),
00162 occ_inner(left->allOccurrences()),
00163 innerOffset(-left->length()),
00164 startOffset(-left->length())
00165 {}
00166 const OccurrenceMap& occ_outer, occ_inner;
00167 const int innerOffset;
00168 const int startOffset;
00169 };
00170
00171 template<Phrase::Direction d>
00172 TokenPtr Phrase::getAdjacent(TokenPtr left, TokenPtr right)
00173 {
00174 schma::UnicodePtr phrase(new UnicodeString(*(left->tokenString())));
00175 phrase->append(' ');
00176 phrase->append(*(right->tokenString()));
00177
00178 Phrase* result = new Phrase(phrase, left->numTokens() + right->numTokens());
00179
00180 adja<d> a(left, right);
00181
00182 const OccurrenceMap& occ_outer = a.occ_outer;
00183 const OccurrenceMap& occ_inner = a.occ_inner;
00184
00185 for (OccurrenceMap::const_iterator outer_doc = occ_outer.begin();
00186 outer_doc != occ_outer.end();
00187 ++outer_doc) {
00188
00189 OccurrenceMap::const_iterator inner_doc =
00190 occ_inner.find(outer_doc->first);
00191
00192 if (inner_doc == occ_inner.end())
00193 continue;
00194
00195 for (PositionList::const_iterator outer_pos = outer_doc->second.begin();
00196 outer_pos != outer_doc->second.end();
00197 ++outer_pos) {
00198
00199 if(binary_search(inner_doc->second.begin(), inner_doc->second.end(),
00200 *outer_pos + a.innerOffset)) {
00201 result->addOccurrence(outer_doc->first, *outer_pos + a.startOffset);
00202 }
00203 }
00204 }
00205 return (result->isEmpty())
00206 ? EmptyToken::instance()
00207 : TokenPtr(result);
00208 }
00209
00210 TokenPtr Phrase::mergeTokens(TokenPtr left, TokenPtr right)
00211 {
00212 return (left->corpusFrequency() <= right->corpusFrequency())
00213 ? getAdjacent<LeftToRight>(left, right)
00214 : getAdjacent<RightToLeft>(left, right);
00215
00216 }
00217
00218 }