The Phrasehunter: phrasehunter/include/phrasehunter/contextreader.h Source File

00001 // -*- C++ -*-
00002 /*
00003   Phrasehunter - index and query text corpora
00004   Copyright (C) 2006  Torsten Marek (shlomme@gmx.de) &
00005   Armin Schmidt (armin.sch@gmail.com)
00006   
00007   This program is free software; you can redistribute it and/or
00008   modify it under the terms of the GNU General Public License
00009   as published by the Free Software Foundation; either version 2
00010   of the License, or (at your option) any later version.
00011   
00012   This program is distributed in the hope that it will be useful,
00013   but WITHOUT ANY WARRANTY; without even the implied warranty of
00014   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00015   GNU General Public License for more details.
00016   
00017   You should have received a copy of the GNU General Public License
00018   along with this program; if not, write to the Free Software
00019   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
00020 */
00021 
00022 #ifndef CONTEXTREADER_H
00023 #define CONTEXTREADER_H CONTEXTREADER_H
00024 
00025 #include <boost/shared_ptr.hpp>
00026 #include <boost/filesystem/path.hpp>
00027 #include <boost/lexical_cast.hpp>
00028 
00029 #include "ph_types.h"
00030 #include "phexception.h"
00031 
00032 namespace PhraseHunter {
00033 
00034 //! Class for reading the left and right context of a token.
      //! Document files are addressed as <text directory>/<document ID> (see documentFile()).
00035 class ContextReader 
00036 {
00037 public:
00038     /**
00039        \brief Constructor. Should not be called directly. Instead, use CorpusManager::contextReader().
00040        \param textdirectory The directory containing the plain source text files.
00041     */
00042     ContextReader(const boost::filesystem::path& textdirectory);
00043     ~ContextReader() {}
00044 
00045     /**
00046        \brief Get the context for one particular position in a particular document.
00047        \param doc The ID of the document.
00048        \param pos The offset of the token, i.e. the byte position in the document.
00049        \param tokenLength Byte length of word.
00050        \param width The width of each left and right context.
             \return A TokenContextPtr for the given document and position.
             \throws Exceptions::FileError The only exception type allowed by the exception
                     specification. NOTE(review): presumably raised when the document file
                     cannot be opened or read - confirm in the implementation.
00051     */
00052     TokenContextPtr getContextFromPosition(DocID doc, IdxPos pos, 
00053                                            size_t tokenLength, unsigned int width) const
00054       throw(Exceptions::FileError);
00055     /**
00056        \brief Fill a TokenContext object with a particular context.
00057        \param context TokenContext object to fill.
00058        \param width The width of each left and right context.
             \throws Exceptions::FileError The only exception type allowed by the exception
                     specification.
00059     */
00060     void fillContext(TokenContextPtr context, unsigned int width) const
00061         throw(Exceptions::FileError);
00062 
00063     /**
00064        \brief Get the context of a token.
00065        \param token The token in question.
00066        \param width The width of each left and right context.
             \return A vector of TokenContextPtr. NOTE(review): presumably one entry per
                     occurrence of the token - confirm in the implementation.
             \throws Exceptions::FileError The only exception type allowed by the exception
                     specification.
00067     */
00068     std::vector<TokenContextPtr> context(TokenPtr token, unsigned int width) const
00069         throw(Exceptions::FileError);
00070     //! Overloaded convenience function taking a TokenVector instead of a single TokenPtr.
00071     std::vector<TokenContextPtr> context(const TokenVector& tokens, unsigned int width) const
00072         throw(Exceptions::FileError);
00073 
00074 private:
          //! NOTE(review): presumably the size in bytes of the buffer used when reading
          //! from a document file - confirm against readBlock() in the implementation.
00075     static const int BufferSize = 5120;
          //! NOTE(review): defined elsewhere; presumably reads from the given file stream
          //! into the passed TokenContext - confirm in the implementation.
00076     void readBlock(std::ifstream& file, TokenContextPtr) const;
00077 
          //! NOTE(review): defined elsewhere; presumably processes newline characters in the
          //! first \a length bytes of \a buffer and returns the resulting length - TODO confirm.
00078     unsigned int removeNewlines(char* buffer, unsigned int length) const;
00079 
          //! Build the path of a document's file: the text directory joined with the
          //! string representation (via boost::lexical_cast) of \a docId.
00080     inline boost::filesystem::path documentFile(DocID docId) const
00081     {
00082         return m_textDirectory / boost::lexical_cast<std::string>(docId);    
00083     }
00084     
          //! Directory that documentFile() prepends to each document's file name.
          //! NOTE(review): presumably initialised from the constructor argument - confirm.
00085     boost::filesystem::path m_textDirectory;
00086 };
00087 
00088 } // namespace PhraseHunter
00089 
00090 #endif // CONTEXTREADER_H