#include <statistics.h>
Collaboration diagram for PhraseHunter::StatisticsEngine:
Public Member Functions | |
StatisticsEngine (SearchEngine *, ContextReader *, SQLitePP::SqliteDB &db) | |
Constructor. Should not be called directly. Instead, use CorpusManager::statisticsEngine(). | |
~StatisticsEngine () | |
unsigned int | rank (TokenPtr) const |
Returns the rank of a Token. | |
unsigned int | overallFrequency (TokenPtr) const |
Returns the frequency count of a Token. | |
unsigned int | frequency_x_intersection_y (TokenPtr, TokenPtr) const |
Returns the frequency count of the intersection of two Token. The size of the intersection is determined by the number of times the offsets of the two tokens are not further apart than getContextSize(). | |
double | relative_frequency (TokenPtr) const |
Returns the relative frequency of the Token. | |
double | mle_x_intersection_y (TokenPtr, TokenPtr) const |
Returns the probability of the intersection of two tokens according to maximum likelyhood estimation. | |
double | mle_x_given_y (TokenPtr, TokenPtr) const |
Returns the probability of one Token occuring given the other had occurred according to maximum likelyhood estimation. | |
unsigned int | getSizeOfSampleSpace () const |
Returns the size of the sample space, i.e. the number of all word tokens and everything else that is considered a token. | |
int | getContextSize () const |
Returns the max number of bytes two Tokens may be apart to be considered intersecting. | |
double | getAverageWordLength () const |
Returns the average word length in bytes. | |
unsigned int | getNumberOfDocuments () const |
Returns the number of Documents in the corpus. | |
unsigned int | getNumberOfTypes () const |
Returns the number of word types (and everything else considered a token) as opposed to word tokens. | |
void | setNumberContextWords (int words) |
Set how large the context may be within which two tokens may be considered intersecting. | |
std::map< schma::UnicodePtr, int > | getContextVector (TokenPtr t, int contextlen) const |
Get all context strings for a particular Token. | |
double | mutual_information (const TokenPtr &token, const std::map< schma::UnicodePtr, int > &contextVec, schma::UnicodePtr context) const |
Returns the pointwise mutual information of a Token and a particular context. | |
double | similarity (TokenPtr t1, TokenPtr t2, int contextlen) const |
Calculate the similarity between two tokens. | |
std::multimap< double, TokenPtr > | similarTokens (TokenPtr t, int contextlen) const |
Find all Tokens that are semantically similar to a particular Token. | |
Static Public Member Functions | |
static unsigned int | sizeOfDocSet (const TokenVector &) |
Private Member Functions | |
unsigned | freq_intersection_in_doc (const PositionList &, const PositionList &) const |
TokenVector | getCandidates (TokenPtr) const |
std::pair< unsigned int, unsigned int > | contextBounds (unsigned int position, int numWord) const |
Private Attributes | |
unsigned int | m_sizeOfSampleSpace |
double | m_averageWordLength |
unsigned int | m_numberOfDocuments |
unsigned int | m_numberOfTypes |
unsigned int | m_contextWindow |
SearchEngine * | m_searcher |
ContextReader * | m_reader |
SQLitePP::SqliteDB & | m_db |
Static Private Attributes | |
static const int | MAX_CANDIDATES = 1000 |
static const int | MAX_DEPENDENTS = 20 |
Definition at line 32 of file statistics.h.
PhraseHunter::StatisticsEngine::StatisticsEngine | ( | SearchEngine * | , | |
ContextReader * | , | |||
SQLitePP::SqliteDB & | db | |||
) |
Constructor. Should not be called directly. Instead, use CorpusManager::statisticsEngine().
Definition at line 61 of file statistics.cpp.
References SQLitePP::ResultIterator::get(), m_averageWordLength, m_contextWindow, m_db, m_numberOfDocuments, m_numberOfTypes, m_sizeOfSampleSpace, and SQLitePP::SqliteDB::statement().
PhraseHunter::StatisticsEngine::~StatisticsEngine | ( | ) | [inline] |
Definition at line 37 of file statistics.h.
unsigned int PhraseHunter::StatisticsEngine::rank | ( | TokenPtr | ) | const |
Returns the rank of a Token.
Definition at line 86 of file statistics.cpp.
References SQLitePP::ResultIterator::get(), m_db, m_searcher, and SQLitePP::SqliteDB::statement().
Referenced by SearchTab::search().
unsigned int PhraseHunter::StatisticsEngine::overallFrequency | ( | TokenPtr | ) | const |
Returns the frequency count of a Token.
Definition at line 98 of file statistics.cpp.
References SQLitePP::ResultIterator::get(), m_db, m_searcher, and SQLitePP::SqliteDB::statement().
Referenced by mutual_information(), and relative_frequency().
unsigned int PhraseHunter::StatisticsEngine::frequency_x_intersection_y | ( | TokenPtr | , | |
TokenPtr | ||||
) | const |
Returns the frequency count of the intersection of two Token. The size of the intersection is determined by the number of times the offsets of the two tokens are not further apart than getContextSize().
Definition at line 110 of file statistics.cpp.
References freq_intersection_in_doc().
Referenced by mle_x_intersection_y().
double PhraseHunter::StatisticsEngine::relative_frequency | ( | TokenPtr | ) | const |
Returns the relative frequency of the Token.
Definition at line 153 of file statistics.cpp.
References getSizeOfSampleSpace(), and overallFrequency().
Referenced by mle_x_given_y(), and mutual_information().
Returns the probability of the intersection of two tokens according to maximum likelyhood estimation.
Definition at line 161 of file statistics.cpp.
References frequency_x_intersection_y(), and getSizeOfSampleSpace().
Referenced by mle_x_given_y().
Returns the probability of one Token occuring given the other had occurred according to maximum likelyhood estimation.
Definition at line 169 of file statistics.cpp.
References mle_x_intersection_y(), and relative_frequency().
unsigned int PhraseHunter::StatisticsEngine::getSizeOfSampleSpace | ( | ) | const [inline] |
Returns the size of the sample space, i.e. the number of all word tokens and everything else that is considered a token.
Definition at line 57 of file statistics.h.
References m_sizeOfSampleSpace.
Referenced by SearchTab::enableSearch(), mle_x_intersection_y(), mutual_information(), and relative_frequency().
int PhraseHunter::StatisticsEngine::getContextSize | ( | ) | const [inline] |
Returns the max number of bytes two Tokens may be apart to be considered intersecting.
Definition at line 59 of file statistics.h.
References m_contextWindow.
Referenced by getCandidates().
double PhraseHunter::StatisticsEngine::getAverageWordLength | ( | ) | const [inline] |
Returns the average word length in bytes.
Definition at line 61 of file statistics.h.
References m_averageWordLength.
unsigned int PhraseHunter::StatisticsEngine::getNumberOfDocuments | ( | ) | const [inline] |
Returns the number of Documents in the corpus.
Definition at line 63 of file statistics.h.
References m_numberOfDocuments.
Referenced by SearchTab::enableSearch().
unsigned int PhraseHunter::StatisticsEngine::getNumberOfTypes | ( | ) | const [inline] |
Returns the number of word types (and everything else considered a token) as opposed to word tokens.
Definition at line 65 of file statistics.h.
References m_numberOfTypes.
Referenced by SearchTab::enableSearch().
void PhraseHunter::StatisticsEngine::setNumberContextWords | ( | int | words | ) | [inline] |
Set how large the context may be within which two tokens may be considered intersecting.
words | The context size in number of words. |
Definition at line 71 of file statistics.h.
References m_averageWordLength, and m_contextWindow.
std::map< schma::UnicodePtr, int > PhraseHunter::StatisticsEngine::getContextVector | ( | TokenPtr | t, | |
int | contextlen | |||
) | const |
Get all context strings for a particular Token.
t | A Token. | |
contextlen | The maximum size of the context in bytes. |
Definition at line 250 of file statistics.cpp.
References PhraseHunter::ContextReader::context(), m_averageWordLength, m_reader, and m_searcher.
Referenced by similarity().
double PhraseHunter::StatisticsEngine::mutual_information | ( | const TokenPtr & | token, | |
const std::map< schma::UnicodePtr, int > & | contextVec, | |||
schma::UnicodePtr | context | |||
) | const |
Returns the pointwise mutual information of a Token and a particular context.
token | A Token. | |
contextVec | The context vector of token. | |
context | A particular context. |
Definition at line 277 of file statistics.cpp.
References getSizeOfSampleSpace(), m_searcher, overallFrequency(), relative_frequency(), PhraseHunter::SearchEngine::searchPhrase(), and PhraseHunter::SearchEngine::searchToken().
Referenced by similarity().
double PhraseHunter::StatisticsEngine::similarity | ( | TokenPtr | t1, | |
TokenPtr | t2, | |||
int | contextlen | |||
) | const |
Calculate the similarity between two tokens.
Definition at line 309 of file statistics.cpp.
References getContextVector(), and mutual_information().
Referenced by similarTokens().
std::multimap< double, TokenPtr > PhraseHunter::StatisticsEngine::similarTokens | ( | TokenPtr | t, | |
int | contextlen | |||
) | const |
Find all Tokens that are semantically similar to a particular Token.
t | The Token for which to find similar ones. | |
contextlen | The maximum size of the context in bytes. |
Definition at line 338 of file statistics.cpp.
References getCandidates(), and similarity().
static unsigned int PhraseHunter::StatisticsEngine::sizeOfDocSet | ( | const TokenVector & | ) | [static] |
Referenced by SearchTab::search().
unsigned PhraseHunter::StatisticsEngine::freq_intersection_in_doc | ( | const PositionList & | , | |
const PositionList & | ||||
) | const [private] |
Definition at line 130 of file statistics.cpp.
References m_contextWindow.
Referenced by frequency_x_intersection_y().
Definition at line 178 of file statistics.cpp.
References contextBounds(), SQLitePP::ResultIterator::get(), getContextSize(), SQLitePP::ResultIterator::hasMoreRows(), PhraseHunter::CorpusToken::loadFromCorpus(), m_db, and SQLitePP::SqliteDB::statement().
Referenced by similarTokens().
std::pair<unsigned int, unsigned int> PhraseHunter::StatisticsEngine::contextBounds | ( | unsigned int | position, | |
int | numWord | |||
) | const [private] |
Referenced by getCandidates().
const int PhraseHunter::StatisticsEngine::MAX_CANDIDATES = 1000 [static, private] |
Definition at line 106 of file statistics.h.
const int PhraseHunter::StatisticsEngine::MAX_DEPENDENTS = 20 [static, private] |
Definition at line 107 of file statistics.h.
unsigned int PhraseHunter::StatisticsEngine::m_sizeOfSampleSpace [private] |
Definition at line 109 of file statistics.h.
Referenced by getSizeOfSampleSpace(), and StatisticsEngine().
double PhraseHunter::StatisticsEngine::m_averageWordLength [private] |
Definition at line 110 of file statistics.h.
Referenced by getAverageWordLength(), getContextVector(), setNumberContextWords(), and StatisticsEngine().
unsigned int PhraseHunter::StatisticsEngine::m_numberOfDocuments [private] |
Definition at line 111 of file statistics.h.
Referenced by getNumberOfDocuments(), and StatisticsEngine().
unsigned int PhraseHunter::StatisticsEngine::m_numberOfTypes [private] |
Definition at line 112 of file statistics.h.
Referenced by getNumberOfTypes(), and StatisticsEngine().
unsigned int PhraseHunter::StatisticsEngine::m_contextWindow [private] |
Definition at line 113 of file statistics.h.
Referenced by freq_intersection_in_doc(), getContextSize(), setNumberContextWords(), and StatisticsEngine().
Definition at line 115 of file statistics.h.
Referenced by getContextVector(), mutual_information(), overallFrequency(), and rank().
Definition at line 117 of file statistics.h.
Referenced by getCandidates(), overallFrequency(), rank(), and StatisticsEngine().