00001 // -*- C++ -*- 00002 /* 00003 Phrasehunter - index and query text corpora 00004 Copyright (C) 2006 Torsten Marek (shlomme@gmx.de) & 00005 Armin Schmidt (armin.sch@gmail.com) 00006 00007 This program is free software; you can redistribute it and/or 00008 modify it under the terms of the GNU General Public License 00009 as published by the Free Software Foundation; either version 2 00010 of the License, or (at your option) any later version. 00011 00012 This program is distributed in the hope that it will be useful, 00013 but WITHOUT ANY WARRANTY; without even the implied warranty of 00014 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00015 GNU General Public License for more details. 00016 00017 You should have received a copy of the GNU General Public License 00018 along with this program; if not, write to the Free Software 00019 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 00020 */ 00021 00022 #ifndef CONTEXTREADER_H 00023 #define CONTEXTREADER_H CONTEXTREADER_H 00024 00025 #include <boost/shared_ptr.hpp> 00026 #include <boost/filesystem/path.hpp> 00027 #include <boost/lexical_cast.hpp> 00028 00029 #include "ph_types.h" 00030 #include "phexception.h" 00031 00032 namespace PhraseHunter { 00033 00034 //! Class for reading the left and right context of a token. 00035 class ContextReader 00036 { 00037 public: 00038 /** 00039 \brief Constructor. Should not be called directly. Instead, use CorpusManager::contextReader(). 00040 \param Textdirectory The directory containing the plain source text files. 00041 */ 00042 ContextReader(const boost::filesystem::path& textdirectory); 00043 ~ContextReader() {} 00044 00045 /** 00046 \brief Get the context for one particular position in a particular document. 00047 \param doc The ID of the document. 00048 \param pos The offset of the token, i.e. the byte position in the document. 00049 \param tokenLength Byte length of word. 00050 \param width The width of each left and right context. 00051 */ 00052 TokenContextPtr getContextFromPosition(DocID doc, IdxPos pos, 00053 size_t tokenLength, unsigned int width) const 00054 throw(Exceptions::FileError); 00055 /** 00056 \brief Fill a TokenContext object with a particular context. 00057 \param context TokenContext object to fill. 00058 \param width the width of each left and right context. 00059 */ 00060 void fillContext(TokenContextPtr context, unsigned int width) const 00061 throw(Exceptions::FileError); 00062 00063 /** 00064 \brief Get the context of a token. 00065 \param token The token in question. 00066 \param width The width of each left and right context. 00067 */ 00068 std::vector<TokenContextPtr> context(TokenPtr token, unsigned int width) const 00069 throw(Exceptions::FileError); 00070 //! Overloaded convenience function. 00071 std::vector<TokenContextPtr> context(const TokenVector& tokens, unsigned int width) const 00072 throw(Exceptions::FileError); 00073 00074 private: 00075 static const int BufferSize = 5120; 00076 void readBlock(std::ifstream& file, TokenContextPtr) const; 00077 00078 unsigned int removeNewlines(char* buffer, unsigned int length) const; 00079 00080 inline boost::filesystem::path documentFile(DocID docId) const 00081 { 00082 return m_textDirectory / boost::lexical_cast<std::string>(docId); 00083 } 00084 00085 boost::filesystem::path m_textDirectory; 00086 }; 00087 00088 } // PhraseHunter 00089 00090 #endif // CONTEXTREADER_H