00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021 #include <boost/lambda/lambda.hpp>
00022 #include <algorithm>
00023
00024 #include "phrasehunter/token.h"
00025
00026 using namespace SQLitePP;
00027 using namespace boost::lambda;
00028
00029 namespace PhraseHunter {
00030
// Shared sentinel instance handed out by EmptyToken::instance() wherever a
// lookup finds no token (see e.g. LightCorpusToken::search below).
TokenPtr EmptyToken::s_inst = TokenPtr(new EmptyToken);
00032
00033 std::vector<DocID> Token::documentIDs() const
00034 {
00035 std::vector<DocID> ids;
00036 for(OccurrenceMap::const_iterator docs = m_occurrences.begin();
00037 docs != m_occurrences.end(); ++docs) {
00038 ids.push_back(docs->first);
00039 }
00040 return ids;
00041 }
00042
00043 void CorpusToken::insertPositions(ResultIterator occ_it)
00044 {
00045 for(; occ_it.hasMoreRows(); occ_it.next()) {
00046 DocID docID = occ_it.get<int>(0);
00047 Blob posblob = occ_it.get<Blob>(1);
00048
00049 unsigned int numOcc = posblob.second / sizeof(IdxPos);
00050 PositionList& pl = m_occurrences[docID];
00051 pl.resize(numOcc);
00052
00053 m_corpusfreq += numOcc;
00054 memcpy(&pl[0], posblob.first, posblob.second);
00055
00056
00057
00058 }
00059 }
00060
// Construct from a C-string token and load its occurrences for the whole corpus.
CorpusToken::CorpusToken(const char* token, TokenID id, SqliteDB& db)
    : CorpusTokenBase(token, id)
{
    init(id, db);
}
00066
// Construct from a C-string token, loading occurrences only for the given
// subset of documents.
CorpusToken::CorpusToken(const char* token, TokenID id, SqliteDB& db, const std::set<DocID>& documentIDs)
    : CorpusTokenBase(token, id)
{
    init(id, db, documentIDs);
}
00072
// Construct from a Unicode token and load its occurrences for the whole corpus.
CorpusToken::CorpusToken(schma::UnicodePtr token, TokenID id, SqliteDB& db)
    : CorpusTokenBase(token, id)
{
    init(id, db);
}
00078
// Construct from a Unicode token, loading occurrences only for the given
// subset of documents.
CorpusToken::CorpusToken(schma::UnicodePtr token, TokenID id, SqliteDB& db, const std::set<DocID>& documentIDs)
    : CorpusTokenBase(token, id)
{
    init(id, db, documentIDs);
}
00084
00085 void CorpusToken::init(TokenID id, SqliteDB& db)
00086 {
00087 Statement::Pointer occs = db.cachedStatement("SELECT docid, positions FROM occurrences WHERE wordid = ?");
00088 occs->bindArgs(id);
00089 insertPositions(occs->query());
00090 }
00091
00092 void CorpusToken::init(TokenID id, SqliteDB& db, const std::set<DocID>& documentIDs)
00093 {
00094 std::stringstream valueList;
00095 std::for_each(documentIDs.begin(), documentIDs.end(), valueList << _1 << ',');
00096
00097 Statement::Pointer occs =
00098 db.statement(("SELECT docid, positions FROM occurrences\
00099 WHERE wordid = ? AND docid IN (" + valueList.str() + ")").c_str());
00100 occs->bindArgs(id);
00101 insertPositions(occs->query());
00102 }
00103
00104 TokenPtr LightCorpusToken::search(const char* tokenstr, SQLitePP::SqliteDB& db)
00105 {
00106 Statement::Pointer getID = db.cachedStatement("SELECT id, frequency, documentfrequency FROM tokens WHERE word = ?");
00107 getID->bindArgs(tokenstr);
00108
00109 ResultIterator ri = getID->query();
00110 return (ri.hasMoreRows())
00111 ? TokenPtr(new LightCorpusToken(tokenstr, ri.get<int>(0), ri.get<int>(1), ri.get<int>(2)))
00112 : EmptyToken::instance();
00113 }
00114
// Drop all cached occurrences of this token within one document; no-op when
// the document is not present.
// NOTE(review): defined `inline` in a .cpp file — the definition is invisible
// to other translation units; confirm it is only called from this file.
inline void MutableToken::removeDocument(DocID docID)
{
    if(inDoc(docID)) {
        m_occurrences.erase(docID);
    }
}
00121
00122 void MutableToken::addOccurrence(DocID docID, IdxPos position)
00123 {
00124 OccurrenceMap::iterator it = m_occurrences.find(docID);
00125 if(it == m_occurrences.end()) {
00126 PositionList p;
00127 p.push_back(position);
00128 m_occurrences[docID] = p;
00129 } else {
00130 if(position >= *it->second.rend()) {
00131 it->second.push_back(position);
00132 } else {
00133 it->second.push_back(it->second.back());
00134 int pos;
00135 for(pos = it->second.size()-1; it->second[pos]-1 > position; --pos)
00136 it->second[pos] = it->second[pos]-1;
00137 it->second[pos] = position;
00138 }
00139 }
00140 ++m_totalOccurrences;
00141 }
00142
00143 template<>
00144 struct Phrase::adja<Phrase::LeftToRight>
00145 {
00146 adja(TokenPtr left, TokenPtr right):
00147 occ_outer(left->allOccurrences()),
00148 occ_inner(right->allOccurrences()),
00149 innerOffset(left->length()),
00150 startOffset(0)
00151 {}
00152 const OccurrenceMap& occ_outer, occ_inner;
00153 const int innerOffset;
00154 const int startOffset;
00155 };
00156
00157 template<>
00158 struct Phrase::adja<Phrase::RightToLeft>
00159 {
00160 adja(TokenPtr left, TokenPtr right):
00161 occ_outer(right->allOccurrences()),
00162 occ_inner(left->allOccurrences()),
00163 innerOffset(-left->length()),
00164 startOffset(-left->length())
00165 {}
00166 const OccurrenceMap& occ_outer, occ_inner;
00167 const int innerOffset;
00168 const int startOffset;
00169 };
00170
00171 template<Phrase::Direction d>
00172 TokenPtr Phrase::getAdjacent(TokenPtr left, TokenPtr right)
00173 {
00174 schma::UnicodePtr phrase(new UnicodeString(*(left->tokenString())));
00175 phrase->append(' ');
00176 phrase->append(*(right->tokenString()));
00177
00178 Phrase* result = new Phrase(phrase, left->numTokens() + right->numTokens());
00179
00180 adja<d> a(left, right);
00181
00182 const OccurrenceMap& occ_outer = a.occ_outer;
00183 const OccurrenceMap& occ_inner = a.occ_inner;
00184
00185 for (OccurrenceMap::const_iterator outer_doc = occ_outer.begin();
00186 outer_doc != occ_outer.end();
00187 ++outer_doc) {
00188
00189 OccurrenceMap::const_iterator inner_doc =
00190 occ_inner.find(outer_doc->first);
00191
00192 if (inner_doc == occ_inner.end())
00193 continue;
00194
00195 for (PositionList::const_iterator outer_pos = outer_doc->second.begin();
00196 outer_pos != outer_doc->second.end();
00197 ++outer_pos) {
00198
00199 if(binary_search(inner_doc->second.begin(), inner_doc->second.end(),
00200 *outer_pos + a.innerOffset)) {
00201 result->addOccurrence(outer_doc->first, *outer_pos + a.startOffset);
00202 }
00203 }
00204 }
00205 return (result->isEmpty())
00206 ? EmptyToken::instance()
00207 : TokenPtr(result);
00208 }
00209
00210 TokenPtr Phrase::mergeTokens(TokenPtr left, TokenPtr right)
00211 {
00212 return (left->corpusFrequency() <= right->corpusFrequency())
00213 ? getAdjacent<LeftToRight>(left, right)
00214 : getAdjacent<RightToLeft>(left, right);
00215
00216 }
00217
00218 }