00001 // -*- C++ -*- 00002 /* 00003 Phrasehunter - index and query text corpora 00004 Copyright (C) 2006 Torsten Marek (shlomme@gmx.de) & 00005 Armin Schmidt (armin.sch@gmail.com) 00006 00007 This program is free software; you can redistribute it and/or 00008 modify it under the terms of the GNU General Public License 00009 as published by the Free Software Foundation; either version 2 00010 of the License, or (at your option) any later version. 00011 00012 This program is distributed in the hope that it will be useful, 00013 but WITHOUT ANY WARRANTY; without even the implied warranty of 00014 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00015 GNU General Public License for more details. 00016 00017 You should have received a copy of the GNU General Public License 00018 along with this program; if not, write to the Free Software 00019 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 00020 */ 00021 00022 #ifndef SEARCHENGINE_H 00023 #define SEARCHENGINE_H SEARCHENGINE_H 00024 00025 #include <string> 00026 00027 #include "sqlitepp/sqlitepp.h" 00028 00029 #include "phexception.h" 00030 #include "ph_types.h" 00031 #include "support/unicodehelpers.h" 00032 00033 namespace PhraseHunter { 00034 00035 //! Class that provides functionality for querying a corpus. 00036 class SearchEngine 00037 { 00038 public: 00039 //! Constructor. Should not be called directly. Instead, use CorpusManager::searchEngine(). 00040 SearchEngine(SQLitePP::SqliteDB& sqlite); 00041 00042 ~SearchEngine() {} 00043 00044 //! Search for a single-word Token. For phrases, use searchPhrase(). 00045 TokenPtr searchToken(schma::UnicodePtr stringtoken) const; 00046 //! Search for a Phrase. 00047 TokenPtr searchPhrase(schma::UnicodePtr search_string) const; 00048 00049 /** 00050 \brief Search for all tokens that match a regular expression 00051 \param re The regular expression to match. 00052 */ 00053 TokenVector searchRegexToken(schma::UnicodePtr re) const; 00054 /** 00055 \brief Search for all Phrase tokens, where each word matches the respective regex. 00056 \param re The phrasal regular expression. 00057 */ 00058 TokenVector searchPhrasalRegex(schma::UnicodePtr re) const; 00059 00060 private: 00061 template<typename _RandomAccessIt, 00062 typename _MinChooser, 00063 typename _Merger, 00064 typename _Validator> 00065 typename _RandomAccessIt::value_type combine(_RandomAccessIt begin, 00066 _RandomAccessIt end, 00067 _MinChooser better, 00068 _Merger merge, 00069 _Validator valid) const; 00070 00071 std::string cutRegex(const std::string& re) const; 00072 00073 SQLitePP::SqliteDB& m_db; 00074 const std::string SPECIAL_CHARACTERS; 00075 }; 00076 } 00077 00078 #endif // SEARCHENGINE_H