00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021 #include <climits>
00022
00023 #include <unicode/regex.h>
00024
00025 #include "phrasehunter/searchengine.h"
00026 #include "phrasehunter/tokenizer.h"
00027 #include "phrasehunter/token.h"
00028
00029 using namespace SQLitePP;
00030
00031 namespace PhraseHunter {
00032
00033 SearchEngine::SearchEngine(SqliteDB& db):
00034 m_db(db), SPECIAL_CHARACTERS(".[{()\\*+?|^$")
00035 {}
00036
00037 TokenVector SearchEngine::searchRegexToken(schma::UnicodePtr re) const
00038 {
00039 UErrorCode c = U_ZERO_ERROR;
00040 RegexMatcher matcher(*re, 0, c);
00041
00042 std::vector<TokenPtr> results;
00043 std::string cutRe = cutRegex(schma::toStdString(re));
00044 Statement::Pointer s;
00045
00046 if(cutRe.empty()) {
00047 s = m_db.statement("SELECT word FROM tokens ORDER BY word");
00048 }
00049 else {
00050 s = m_db.statement("SELECT word FROM tokens WHERE word LIKE ? ORDER BY word");
00051 s->bindArgs(cutRe + "%");
00052 }
00053
00054 for(ResultIterator ri = s->query(); ri.hasMoreRows(); ri.next()){
00055 matcher.reset(*(ri.get<schma::UnicodePtr>(0)));
00056
00057 if(matcher.find())
00058 results.push_back(searchToken(ri.get<schma::UnicodePtr>(0)));
00059 }
00060 return results;
00061 }
00062
00063 struct LengthComparator
00064 {
00065 bool operator()(const TokenVector& a, const TokenVector& o) const
00066 {
00067 return a.size() < o.size();
00068 }
00069 };
00070
00071 struct TokenVectorValidator
00072 {
00073 bool operator()(const TokenVector& a) const
00074 {
00075 return !a.empty();
00076 }
00077 };
00078
00079 struct TokenVectorMerger
00080 {
00081 TokenVector operator()(const TokenVector& v1, const TokenVector& v2) const
00082 {
00083 TokenVector results;
00084 results.reserve(v1.size() * v2.size());
00085 for(TokenVector::const_iterator i=v1.begin(); i!=v1.end(); ++i) {
00086 for(TokenVector::const_iterator j=v2.begin(); j!=v2.end(); ++j) {
00087 TokenPtr phrase = Phrase::mergeTokens(*i, *j);
00088 if (*phrase)
00089 results.push_back(phrase);
00090 }
00091 }
00092 return results;
00093 }
00094 };
00095
00096 TokenVector SearchEngine::searchPhrasalRegex(schma::UnicodePtr re) const
00097 {
00098 typedef std::vector<TokenVector> TokenMatrix;
00099
00100 static TokenVector emptyVector;
00101 schma::UnicodeVector regexes = schma::splitString(schma::toStdString(re));
00102
00103 if(regexes.size() < 1) {
00104 return emptyVector;
00105 }
00106
00107 TokenMatrix matrix;
00108 matrix.reserve(regexes.size());
00109 for(schma::UnicodeVector::const_iterator i = regexes.begin(); i != regexes.end(); ++i) {
00110 matrix.push_back(searchRegexToken(*i));
00111 if(matrix.back().empty()) {
00112 return emptyVector;
00113 }
00114 }
00115 return combine(TokenMatrix::const_iterator(matrix.begin()),
00116 TokenMatrix::const_iterator(matrix.end()),
00117 LengthComparator(),
00118 TokenVectorMerger(),
00119 TokenVectorValidator());
00120 }
00121
00122 std::string SearchEngine::cutRegex(const std::string& re) const
00123 {
00124 std::string::size_type firstSpecial = re.find_first_of(SPECIAL_CHARACTERS);
00125 std::string result(re, 0, firstSpecial);
00126 if(re[firstSpecial] == '\\') {
00127 for(unsigned int i = static_cast<unsigned int>(firstSpecial);
00128 i < re.length(); ++i) {
00129 if(SPECIAL_CHARACTERS.find(re[i+1]) == std::string::npos)
00130 return result;
00131 result.push_back(re[i+1]);
00132 ++i;
00133 }
00134 }
00135 return result;
00136 }
00137
00138 TokenPtr SearchEngine::searchToken(schma::UnicodePtr stringtoken) const
00139 {
00140 Statement::Pointer getID = m_db.cachedStatement("SELECT id FROM tokens WHERE word = ?");
00141 getID->bindArgs(stringtoken);
00142
00143 ResultIterator ri = getID->query();
00144 return (ri.hasMoreRows())
00145 ? CorpusToken::loadFromCorpus(stringtoken, ri.get<int>(0), m_db)
00146 : EmptyToken::instance();
00147 }
00148
00149 struct OccurrenceComparator
00150 {
00151 bool operator()(const TokenPtr& a, const TokenPtr& o) const
00152 {
00153 return a->corpusFrequency() < o->corpusFrequency();
00154 }
00155 };
00156
00157 struct TokenValidator
00158 {
00159 bool operator()(const TokenPtr& a) const
00160 {
00161 return *a;
00162 }
00163 };
00164
00165 struct TokenMerger
00166 {
00167 TokenPtr operator()(const TokenPtr& a, const TokenPtr& o) const
00168 {
00169 return Phrase::mergeTokens(a, o);
00170 }
00171 };
00172
00173 TokenPtr SearchEngine::searchPhrase(schma::UnicodePtr search_string) const
00174 {
00175 Tokenizer t(new StringInput(schma::toStdString(search_string)));
00176 if(!t.hasMoreTokens())
00177 return EmptyToken::instance();
00178 bool whitespace;
00179
00180 TokenVector tokens;
00181
00182 while(t.hasMoreTokens()) {
00183 TokenPtr token = searchToken(schma::UnicodePtr(new UnicodeString(t.nextUnencodedToken(&whitespace).c_str())));
00184 if(!*token) {
00185 return token;
00186 }
00187 tokens.push_back(token);
00188 }
00189 return combine(TokenVector::const_iterator(tokens.begin()),
00190 TokenVector::const_iterator(tokens.end()),
00191 OccurrenceComparator(),
00192 TokenMerger(),
00193 TokenValidator());
00194
00195 }
00196
00197 template<typename _RandomAccessIt,
00198 typename _MinChooser,
00199 typename _Merger,
00200 typename _Validator>
00201 typename _RandomAccessIt::value_type SearchEngine::combine(_RandomAccessIt begin,
00202 _RandomAccessIt end,
00203 _MinChooser better,
00204 _Merger merge,
00205 _Validator valid) const
00206 {
00207 _RandomAccessIt start =
00208 std::min_element(begin, end, better);
00209
00210 _RandomAccessIt left = start - 1;
00211 _RandomAccessIt right = start + 1;
00212
00213 typename _RandomAccessIt::value_type aggregate = *start;
00214
00215 for(int i = 0; valid(aggregate) && i < (end - begin) - 1; ++i) {
00216 if (left < begin ||
00217 (right != end &&
00218 better(*right, *left))) {
00219 aggregate = merge(aggregate, *right);
00220 ++right;
00221 } else {
00222 aggregate = merge(*left, aggregate);
00223 --left;
00224 }
00225 }
00226 return aggregate;
00227 }
00228
00229 }