The Phrasehunter: phrasehunter/include/phrasehunter/token.h Source File

00001 // -*- C++ -*-
00002 /*
00003   Phrasehunter - index and query text corpora
00004   Copyright (C) 2006  Torsten Marek (shlomme@gmx.de) &
00005   Armin Schmidt (armin.sch@gmail.com)
00006   
00007   This program is free software; you can redistribute it and/or
00008   modify it under the terms of the GNU General Public License
00009   as published by the Free Software Foundation; either version 2
00010   of the License, or (at your option) any later version.
00011   
00012   This program is distributed in the hope that it will be useful,
00013   but WITHOUT ANY WARRANTY; without even the implied warranty of
00014   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00015   GNU General Public License for more details.
00016   
00017   You should have received a copy of the GNU General Public License
00018   along with this program; if not, write to the Free Software
00019   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
00020 */
00021 
00022 #ifndef TOKEN_H
00023 #define TOKEN_H TOKEN_H
00024 
00025 #include <iostream>
00026 #include <sstream>
00027 #include <set>
00028 #include <boost/shared_ptr.hpp>
00029 
00030 #include "support/safe_bool.hpp"
00031 #include "support/unicodehelpers.h"
00032 
00033 #include "sqlitepp/sqlitepp.h"
00034 
00035 #include "ph_types.h"
00036 
00037 namespace PhraseHunter {
00038 
00039 /**
00040    \brief Virtual base class for all Token descendants.
00041 
00042    A Token consists of a particular string (i.e. word type) and a map of its
00043    occurrences, i.e. documents and offsets, in the corpus. Tokens may be used in boolean
00044    expressions.
00045 */
00046 class Token: public safe_bool<>
00047 {
00048 protected:
00049     schma::UnicodePtr m_tokenstring;
00050     OccurrenceMap m_occurrences;
00051     Token(const char* token): m_tokenstring(new UnicodeString(token))
00052     {}
00053     Token(schma::UnicodePtr tokenstring) : m_tokenstring(tokenstring)
00054     {}
00055 
00056     static const int SPACE_BETWEEN_TWO_TOKENS = 2;
00057     
00058 public:
00059     virtual ~Token() {}
00060 
00061     //! Return the real length (bytes) of a token string in the index.
00062     virtual size_t length() const 
00063     {
00064         return m_tokenstring->length() + SPACE_BETWEEN_TWO_TOKENS;
00065     }
00066     //! Returns true if all occurrences in all files are expected to look *exactly* the same.
00067     bool isUniform() const 
00068     {
00069         return false; 
00070     }
00071     //! Returns true if there are no actual occurrences of this Token in a corpus, i.e. its occurrence map is empty.
00072     virtual bool isEmpty() const { return m_occurrences.empty(); }
00073 
00074     //! Return the number of times this Token occurs in the corpus.
00075     virtual unsigned int corpusFrequency() const = 0;
00076     
00077     //! Returns the string to which this Token belongs as a UnicodePtr.
00078     schma::UnicodePtr tokenString() const { return m_tokenstring; }
00079 
00080     //! Returns the number of documents a Token occurs in.
00081     virtual unsigned int documentFrequency() const { return m_occurrences.size(); }
00082 
00083     //! Returns true if this Token occurs in a document.
00084     bool inDoc(DocID docID) const { return m_occurrences.count(docID) > 0; }
00085 
00086     //! Returns a reference to all offsets of a Token in a particular document as a PositionList.
00087     const PositionList& documentOccurrences(DocID docID) { return m_occurrences[docID]; }
00088 
00089     //! Return a reference to the entire OccurrenceMap of a Token.
00090     const OccurrenceMap& allOccurrences() const { return m_occurrences; }
00091 
00092     //! Returns the DocIDs of all documents, in which this Token occurs.
00093     virtual std::vector<DocID> documentIDs() const;
00094     virtual unsigned int numTokens() const { return 1; }
00095     //! Returns the ID for this Token.
00096     virtual TokenID id() const { return InvalidTokenID; }
00097 };
00098 
00099 //! A Token without a token string nor any occurrences.
00100 class EmptyToken: public Token {
00101 private:
00102     EmptyToken(): Token("") {}
00103     static TokenPtr s_inst;
00104 
00105 protected:
00106     bool boolean_test() const
00107     { return false; }
00108     
00109 public:
00110     //! Return instance() anywhere you would otherwise return NULL for a TokenPtr
00111     static TokenPtr instance() 
00112     { return s_inst; }
00113 
00114     //! Always returns 0.
00115     unsigned int corpusFrequency() const { return 0; }
00116     //! Always returns true.
00117     bool isEmpty() const { return true; }
00118     //! Always returns 0.
00119     unsigned int numTokens() const { return 0; }
00120 };
00121 
00122 //! Base class for all CorpusToken classes.
00123 class CorpusTokenBase: public Token
00124 {
00125 protected:
00126     CorpusTokenBase(const char* token, TokenID id): 
00127         Token(token), m_id(id), m_corpusfreq(0) {}
00128     CorpusTokenBase(schma::UnicodePtr token, TokenID id): 
00129         Token(token), m_id(id), m_corpusfreq(0) {}
00130     
00131     bool boolean_test() const 
00132     { return m_corpusfreq > 0; }
00133     
00134     TokenID m_id;
00135     unsigned int m_corpusfreq;
00136     
00137 public:
00138     //! Returns the ID of this Token.
00139     TokenID id() const { return m_id; }
00140     //! Return the number of times this Token occurs in the corpus.
00141     unsigned int corpusFrequency() const { return m_corpusfreq; }
00142 };
00143 
00144     
00145 /**
00146    \brief A Token of only one word.
00147 
00148    This is a non-phrasal Token. For reading only. A PhraseHunter::CorpusToken
00149    has an individual id, by which it can uniquely be identified and called from
00150    the database.
00151 */
00152 class CorpusToken: public CorpusTokenBase
00153 {
00154 private:
00155     void insertPositions(SQLitePP::ResultIterator);
00156     CorpusToken(const char*, TokenID, SQLitePP::SqliteDB&);
00157     CorpusToken(const char*, TokenID, SQLitePP::SqliteDB&, const std::set<DocID>&);
00158     CorpusToken(schma::UnicodePtr, TokenID, SQLitePP::SqliteDB&);
00159     CorpusToken(schma::UnicodePtr, TokenID, SQLitePP::SqliteDB&, const std::set<DocID>&);
00160 
00161     void init(TokenID, SQLitePP::SqliteDB&);
00162     void init(TokenID, SQLitePP::SqliteDB&, const std::set<DocID>&);
00163 
00164 public:
00165     //! Don't call this template function directly. Instead, use the non-template version below.
00166     template<typename T>
00167     static TokenPtr loadFromCorpus(T tokenstr, TokenID id, SQLitePP::SqliteDB& db)
00168     {
00169         p_assert(id != InvalidTokenID, "Invalid token id");
00170         return TokenPtr(new CorpusToken(tokenstr, id, db));
00171     }
00172 
00173     //! Don't call this template function directly. Instead, use the non-template version below.
00174     template<typename T>
00175     static TokenPtr loadFromCorpus(T tokenstr, TokenID id, SQLitePP::SqliteDB& db, 
00176                                    const std::set<DocID>& docs)
00177     {
00178         p_assert(id != InvalidTokenID, "Invalid token id");
00179         TokenPtr t(new CorpusToken(tokenstr, id, db, docs));
00180         return (t->isEmpty()) 
00181             ? EmptyToken::instance()
00182             : t;
00183     }
00184     /**
00185        \brief Loads a CorpusToken. For initializing the OccurrenceMap of a LightCorpusToken.
00186        \return The so-(re)loaded CorpusToken
00187     */
00188     static TokenPtr loadFromToken(const TokenPtr& token, SQLitePP::SqliteDB& db)
00189     {
00190         return loadFromCorpus<schma::UnicodePtr>(token->tokenString(), token->id(), db);
00191     }
00192     /**
00193        \brief Like above function, but ignoring all occurrences that are not in docs.
00194        \return The so-(re)loaded CorpusToken
00195     */
00196     static TokenPtr loadFromToken(const TokenPtr& token, SQLitePP::SqliteDB& db, const std::set<DocID>& docs)
00197     {
00198         return loadFromCorpus<schma::UnicodePtr>(token->tokenString(), token->id(), db, docs);
00199     }
00200 
00201     //! Returns true if this Token has no occurrences in the corpus.
00202     bool isEmpty() const 
00203     { return m_corpusfreq == 0; }
00204     
00205 };
00206 
00207 //! A corpus token that only carries corpus frequency information (opposed to CorpusToken)
00208 class LightCorpusToken: public CorpusTokenBase
00209 {
00210 private:
00211     unsigned int m_documentFreq;
00212     LightCorpusToken(const char* tokenstr, TokenID id, 
00213                      unsigned int freq, unsigned int docFreq) 
00214         : CorpusTokenBase(tokenstr, id), m_documentFreq(docFreq) 
00215     {
00216         m_corpusfreq = freq;
00217     }
00218     
00219 public:
00220     /**
00221        \brief Search and initialize a LightCorpusToken.
00222        Special search function for LightCorpusToken if you don't want to use
00223        SearchEngine::searchToken() which always loads the whole OccurrenceMap
00224     */
00225     static TokenPtr search(const char* tokenstr, SQLitePP::SqliteDB& db);
00226     //! Returns the number of documents a Token occurs in.
00227     unsigned int documentFrequency() const { return m_documentFreq; }
00228 };
00229 
00230 /**
00231    \brief A Token whose OccurrenceMap can be modified.
00232    The main purpose of MutableToken is to be a base class for phrases,
00233    whose occurrence matrix needs to be modified as phrases are themselve
00234    not indexed.
00235 */
00236 class MutableToken: public Token
00237 {
00238 protected:
00239     unsigned int m_totalOccurrences;
00240     bool boolean_test() const 
00241     { return m_totalOccurrences > 0; }
00242     
00243     MutableToken(const char* tokenstring): Token(tokenstring), m_totalOccurrences(0)
00244     {}
00245     MutableToken(schma::UnicodePtr tokenstring): Token(tokenstring), m_totalOccurrences(0)
00246     {}
00247     
00248 public:
00249     //! Returns the number of documents a Token occurs in.
00250     unsigned int corpusFrequency() const { return m_totalOccurrences; }
00251     //! Remove the document referred to by docID from the occurrence matrix.
00252     void removeDocument(DocID docID);
00253     /**
00254        \brief Add a position to the occurrence matrix of this Token.
00255        \param docID The document ID referring to the document of the new occurrence.
00256        \param position The byte offset, i.e. byte position, of the new occurrence.
00257     */
00258     void addOccurrence(DocID docID, IdxPos position);
00259     
00260 };
00261 
00262 //! A phrasal token consists of several words or tokens
00263 class Phrase: public MutableToken
00264 {
00265 public:
00266     /**
00267        \brief The direction which should be used to merge a Token to this Phrase.
00268        When merging two Tokens into a Phrase, we need to go through the entire
00269        occurrence matrix of one Token and see if its position is next to an occurrence
00270        of the other Token. This process can be quite time consuming but may be
00271        considerably speed up by going through the Token with less occurrences as a
00272        starting point. Phrase::Direction indicates the Token to start with: If the left
00273        Token is less frequent, call getAdjacent<LeftToRight>(TokenPtr left, TokenPtr right),
00274        otherwise call getAdjacent<RightToLeft>(TokenPtr left, TokenPtr right).
00275     */
00276     enum Direction {
00277         LeftToRight,
00278         RightToLeft
00279     };
00280 
00281 private:
00282     template<Direction d>
00283     struct adja
00284     {};
00285     int m_tokencount;
00286     
00287     Phrase(const char* token, int tokencount)
00288         : MutableToken(token), m_tokencount(tokencount) 
00289     {}
00290     Phrase(schma::UnicodePtr token, int tokencount)
00291         : MutableToken(token), m_tokencount(tokencount)
00292     {}
00293     
00294 public:
00295     /**
00296        \brief Merge two Tokens and specify the merging Direction.
00297        Usually, you want to call mergeTokens() instead.
00298     */
00299     template<Direction d>
00300     static TokenPtr getAdjacent(TokenPtr left, TokenPtr right);
00301     
00302     //! Merge two Token objects into a Phrase.
00303     static TokenPtr mergeTokens(TokenPtr left, TokenPtr right);
00304 
00305     //! Returns the length of the Phrase as it is in the text repository.
00306     inline size_t length() const 
00307     {
00308         return m_tokenstring->length() + m_tokencount + 1;
00309     }
00310 
00311     //! Returns how often this Phrase occurs in the corpus.
00312     inline unsigned int numTokens() const
00313     {
00314         return m_tokencount;
00315     }
00316 };
00317 
00318 }
00319 
00320 #endif // TOKEN_H