searchengine.h

Go to the documentation of this file.
00001 // -*- C++ -*-
00002 /*
00003   Phrasehunter - index and query text corpora
00004   Copyright (C) 2006  Torsten Marek (shlomme@gmx.de) &
00005                       Armin Schmidt (armin.sch@gmail.com)
00006   
00007   This program is free software; you can redistribute it and/or
00008   modify it under the terms of the GNU General Public License
00009   as published by the Free Software Foundation; either version 2
00010   of the License, or (at your option) any later version.
00011   
00012   This program is distributed in the hope that it will be useful,
00013   but WITHOUT ANY WARRANTY; without even the implied warranty of
00014   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00015   GNU General Public License for more details.
00016   
00017   You should have received a copy of the GNU General Public License
00018   along with this program; if not, write to the Free Software
00019   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
00020 */
00021 
00022 #ifndef SEARCHENGINE_H
00023 #define SEARCHENGINE_H SEARCHENGINE_H
00024 
00025 #include <string>
00026 
00027 #include "sqlitepp/sqlitepp.h"
00028 
00029 #include "phexception.h"
00030 #include "ph_types.h"
00031 #include "support/unicodehelpers.h"
00032 
00033 namespace PhraseHunter {
00034 
00035 //! Class that provides functionality for querying a corpus (tokens, phrases and regular expressions).
00036 class SearchEngine
00037 {
00038 public:
00039     //! Constructor. Should not be called directly. Instead, use CorpusManager::searchEngine().
00040     SearchEngine(SQLitePP::SqliteDB& sqlite);
00041     // NOTE(review): m_db is a reference member, so 'sqlite' presumably has to outlive this object (ctor body not shown).
00042     ~SearchEngine() {}
00043 
00044     //! Search for a single-word Token. For phrases, use searchPhrase().
00045     TokenPtr searchToken(schma::UnicodePtr stringtoken) const;
00046     //! Search for a Phrase. For single words, use searchToken().
00047     TokenPtr searchPhrase(schma::UnicodePtr search_string) const;
00048 
00049     /**
00050        \brief Search for all tokens that match a regular expression.
00051        \param re The regular expression to match.
00052     */
00053     TokenVector searchRegexToken(schma::UnicodePtr re) const;
00054     /**
00055        \brief Search for all Phrase tokens, where each word matches the respective regex.
00056        \param re The phrasal regular expression.
00057     */
00058     TokenVector searchPhrasalRegex(schma::UnicodePtr re) const;
00059     // Template parameter names below avoid the _Uppercase form, which is reserved for the implementation.
00060 private:    
00061     template<typename RandomAccessIt,   // must expose a nested value_type (used as the return type)
00062              typename MinChooser,
00063              typename Merger,
00064              typename Validator>
00065     typename RandomAccessIt::value_type combine(RandomAccessIt begin,
00066                                                 RandomAccessIt end,
00067                                                 MinChooser better,
00068                                                 Merger merge,
00069                                                 Validator valid) const;
00070     // NOTE(review): presumably trims a regex string down to a usable part; definition not visible here.
00071     std::string cutRegex(const std::string& re) const;
00072     // Database handle, held by reference.
00073     SQLitePP::SqliteDB& m_db;
00074     const std::string SPECIAL_CHARACTERS;   // per-instance constant, fixed at construction
00075 };
00076 }
00077 
00078 #endif // SEARCHENGINE_H

Generated on Thu Dec 21 16:14:40 2006 for The Phrasehunter by doxygen 1.5.1