The Phrasehunter: phrasehunter/include/phrasehunter/statistics.h Source File

00001 // -*- C++ -*-
00002 /*
00003   Phrasehunter - index and query text corpora
00004   Copyright (C) 2006  Torsten Marek (shlomme@gmx.de) &
00005   Armin Schmidt (armin.sch@gmail.com)
00006   
00007   This program is free software; you can redistribute it and/or
00008   modify it under the terms of the GNU General Public License
00009   as published by the Free Software Foundation; either version 2
00010   of the License, or (at your option) any later version.
00011   
00012   This program is distributed in the hope that it will be useful,
00013   but WITHOUT ANY WARRANTY; without even the implied warranty of
00014   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00015   GNU General Public License for more details.
00016   
00017   You should have received a copy of the GNU General Public License
00018   along with this program; if not, write to the Free Software
00019   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
00020 */
00021 
00022 #ifndef STATISTICS_H
00023 #define STATISTICS_H STATISTICS_H
00024 
00025 #include "ph_types.h"
00026 #include "searchengine.h"
00027 #include "support/unicodehelpers.h"
00028 
00029 namespace PhraseHunter {
00030 
00031 //! Provides statistical calculations (frequencies, MLE probabilities, context-based similarity) on a corpus.
00032 class StatisticsEngine
00033 {
00034 public:
00035     //! Constructor. Should not be called directly. Instead, use CorpusManager::statisticsEngine().
00036     StatisticsEngine(SearchEngine*, ContextReader*, SQLitePP::SqliteDB& db);
00037     ~StatisticsEngine() {}
00038 
00039     //! Returns the rank of a Token.
00040     unsigned int rank(TokenPtr) const;
00041     //! Returns the frequency count of a Token.
00042     unsigned int overallFrequency(TokenPtr) const;
00043     /**
00044        \brief Returns the frequency count of the intersection of two Tokens.
00045        The size of the intersection is determined by the number of times the
00046        offsets of the two tokens are not further apart than getContextSize().
00047     */
00048     unsigned int frequency_x_intersection_y(TokenPtr, TokenPtr) const;
00049     //! Returns the relative frequency of the Token.
00050     double relative_frequency(TokenPtr) const;
00051     //! Returns the probability of the intersection of two Tokens according to maximum likelihood estimation.
00052     double mle_x_intersection_y(TokenPtr, TokenPtr) const;
00053     //! Returns the probability of one Token occurring given that the other has occurred, according to maximum likelihood estimation.
00054     double mle_x_given_y(TokenPtr, TokenPtr) const;
00055 
00056     //! Returns the size of the sample space, i.e. the number of all word tokens and everything else that is considered a token.
00057     unsigned int getSizeOfSampleSpace() const { return m_sizeOfSampleSpace; }
00058     //! Returns the max number of bytes two Tokens may be apart to be considered intersecting (stored unsigned, converted explicitly to int).
00059     int getContextSize() const { return static_cast<int>(m_contextWindow); }
00060     //! Returns the average word length in bytes.
00061     double getAverageWordLength() const { return m_averageWordLength; }
00062     //! Returns the number of Documents in the corpus.
00063     unsigned int getNumberOfDocuments() const { return m_numberOfDocuments; }
00064     //! Returns the number of word types (and everything else considered a token) as opposed to word tokens.
00065     unsigned int getNumberOfTypes() const { return m_numberOfTypes; }
00066 
00067     /**
00068        \brief Set how large the context may be within which two tokens may be considered intersecting. The byte window is estimated as words * (average word length truncated to an integer + 1).
00069        \param words The context size in number of words. Values <= 0 yield an empty (0 byte) window instead of wrapping around in the unsigned member.
00070     */
00071     void setNumberContextWords(int words) {
00072         m_contextWindow = (words > 0) ? static_cast<unsigned int>(static_cast<int>(m_averageWordLength) * words + words) : 0u;
00073     }
00074 
00075     /**
00076        \brief Get all context strings for a particular Token.
00077        \return A map with all context strings as UnicodePtr and their frequencies.
00078        \param t A Token.
00079        \param contextlen The maximum size of the context in bytes.
00080     */
00081     std::map<schma::UnicodePtr, int> getContextVector(TokenPtr t, int contextlen) const;
00082     /**
00083        \brief Returns the pointwise mutual information of a Token and a particular context.
00084        \param token A Token.
00085        \param contextVec The context vector of token.
00086        \param context A particular context.
00087     */
00088     double mutual_information(const TokenPtr& token, const std::map<schma::UnicodePtr,int>& contextVec, schma::UnicodePtr context) const;
00089     /**
00090        \brief Calculate the similarity between two tokens.
00091        \param t1 First Token.
00092        \param t2 Second Token.
00093        \param contextlen The maximum size of the context in bytes.
00094     */
00095     double similarity(TokenPtr t1, TokenPtr t2, int contextlen) const;
00096     /**
00097        \brief Find all Tokens that are semantically similar to a particular Token.
00098        \param t The Token for which to find similar ones.
00099        \param contextlen The maximum size of the context in bytes.
00100     */
00101     std::multimap<double,TokenPtr> similarTokens(TokenPtr t, int contextlen) const;
00102 
00103     static unsigned int sizeOfDocSet(const TokenVector&); // NOTE(review): undocumented; presumably counts the documents covered by the given tokens - confirm in the .cpp.
00104     
00105 private:
00106     static const int MAX_CANDIDATES = 1000; // NOTE(review): presumably caps the result of getCandidates() - confirm in the .cpp.
00107     static const int MAX_DEPENDENTS = 20; // NOTE(review): usage is not visible in this header - confirm in the .cpp.
00108     
00109     unsigned int m_sizeOfSampleSpace; //!< Value returned by getSizeOfSampleSpace().
00110     double m_averageWordLength; //!< Average word length in bytes; see getAverageWordLength().
00111     unsigned int m_numberOfDocuments; //!< Value returned by getNumberOfDocuments().
00112     unsigned int m_numberOfTypes; //!< Value returned by getNumberOfTypes().
00113     unsigned int m_contextWindow; //!< Context window in bytes; see getContextSize() and setNumberContextWords().
00114     
00115     SearchEngine* m_searcher;
00116     ContextReader* m_reader;
00117     SQLitePP::SqliteDB& m_db;
00118 
00119     unsigned freq_intersection_in_doc(const PositionList&, const PositionList&) const;
00120     TokenVector getCandidates(TokenPtr) const;
00121     std::pair<unsigned int, unsigned int> contextBounds(unsigned int position, int numWord) const;
00122 
00123 };
00124 } // namespace PhraseHunter
00125 
00126 #endif // STATISTICS_H