00001 // -*- C++ -*- 00002 /* 00003 Phrasehunter - index and query text corpora 00004 Copyright (C) 2006 Torsten Marek (shlomme@gmx.de) & 00005 Armin Schmidt (armin.sch@gmail.com) 00006 00007 This program is free software; you can redistribute it and/or 00008 modify it under the terms of the GNU General Public License 00009 as published by the Free Software Foundation; either version 2 00010 of the License, or (at your option) any later version. 00011 00012 This program is distributed in the hope that it will be useful, 00013 but WITHOUT ANY WARRANTY; without even the implied warranty of 00014 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00015 GNU General Public License for more details. 00016 00017 You should have received a copy of the GNU General Public License 00018 along with this program; if not, write to the Free Software 00019 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 00020 */ 00021 00022 #ifndef STATISTICS_H 00023 #define STATISTICS_H STATISTICS_H 00024 00025 #include "ph_types.h" 00026 #include "searchengine.h" 00027 #include "support/unicodehelpers.h" 00028 00029 namespace PhraseHunter { 00030 00031 //! Class that provides functionality to do statistical calculations on a corpus 00032 class StatisticsEngine 00033 { 00034 public: 00035 //! Constructor. Should not be called directly. Instead, use CorpusManager::statisticsEngine(). 00036 StatisticsEngine(SearchEngine*, ContextReader*, SQLitePP::SqliteDB& db); 00037 ~StatisticsEngine() {}; 00038 00039 //! Returns the rank of a Token. 00040 unsigned int rank(TokenPtr) const; 00041 //! Returns the frequency count of a Token. 00042 unsigned int overallFrequency(TokenPtr) const; 00043 /** 00044 \brief Returns the frequency count of the intersection of two Token. 00045 The size of the intersection is determined by the number of times the 00046 offsets of the two tokens are not further apart than getContextSize(). 00047 */ 00048 unsigned int frequency_x_intersection_y(TokenPtr, TokenPtr) const; 00049 //! Returns the relative frequency of the Token. 00050 double relative_frequency(TokenPtr) const; 00051 //! Returns the probability of the intersection of two tokens according to maximum likelyhood estimation. 00052 double mle_x_intersection_y(TokenPtr, TokenPtr) const; 00053 //! Returns the probability of one Token occuring given the other had occurred according to maximum likelyhood estimation. 00054 double mle_x_given_y(TokenPtr, TokenPtr) const; 00055 00056 //! Returns the size of the sample space, i.e. the number of all word tokens and everything else that is considered a token. 00057 unsigned int getSizeOfSampleSpace() const { return m_sizeOfSampleSpace; } 00058 //! Returns the max number of bytes two Tokens may be apart to be considered intersecting. 00059 int getContextSize() const { return m_contextWindow;} 00060 //! Returns the average word length in bytes. 00061 double getAverageWordLength() const { return m_averageWordLength; } 00062 //! Returns the number of Documents in the corpus. 00063 unsigned int getNumberOfDocuments() const { return m_numberOfDocuments; } 00064 //! Returns the number of word types (and everything else considered a token) as opposed to word tokens. 00065 unsigned int getNumberOfTypes() const { return m_numberOfTypes; } 00066 00067 /** 00068 \brief Set how large the context may be within which two tokens may be considered intersecting. 00069 \param words The context size in number of words. 00070 */ 00071 void setNumberContextWords(int words) { 00072 m_contextWindow = static_cast<int>(m_averageWordLength) * words + words; 00073 } 00074 00075 /** 00076 \brief Get all context strings for a particular Token. 00077 \return A map with all context strings as UnicodePtr and their frequencies. 00078 \param t A Token. 00079 \param contextlen The maximum size of the context in bytes. 00080 */ 00081 std::map<schma::UnicodePtr, int> getContextVector(TokenPtr t, int contextlen) const; 00082 /** 00083 \brief Returns the pointwise mutual information of a Token and a particular context. 00084 \param token A Token. 00085 \param contextVec The context vector of token. 00086 \param context A particular context. 00087 */ 00088 double mutual_information(const TokenPtr& token, const std::map<schma::UnicodePtr,int>& contextVec, schma::UnicodePtr context) const; 00089 /** 00090 \brief Calculate the similarity between two tokens. 00091 \param t1 First Token. 00092 \param t2 Second Token. 00093 \param contextlen The maximum size of the context in bytes. 00094 */ 00095 double similarity(TokenPtr t1, TokenPtr t2, int contextlen) const; 00096 /** 00097 \brief Find all Tokens that are semantically similar to a particular Token. 00098 \param t The Token for which to find similar ones. 00099 \param contextlen The maximum size of the context in bytes. 00100 */ 00101 std::multimap<double,TokenPtr> similarTokens(TokenPtr t, int contextlen) const; 00102 00103 static unsigned int sizeOfDocSet(const TokenVector&); 00104 00105 private: 00106 static const int MAX_CANDIDATES = 1000; 00107 static const int MAX_DEPENDENTS = 20; 00108 00109 unsigned int m_sizeOfSampleSpace; 00110 double m_averageWordLength; 00111 unsigned int m_numberOfDocuments; 00112 unsigned int m_numberOfTypes; 00113 unsigned int m_contextWindow; 00114 00115 SearchEngine* m_searcher; 00116 ContextReader* m_reader; 00117 SQLitePP::SqliteDB& m_db; 00118 00119 unsigned freq_intersection_in_doc(const PositionList&, const PositionList&) const; 00120 TokenVector getCandidates(TokenPtr) const; 00121 std::pair<unsigned int, unsigned int> contextBounds(unsigned int position, int numWord) const; 00122 00123 }; 00124 } // namespace PhraseHunter 00125 00126 #endif // STATISTICS_H