The Phrasehunter: phrasehunter/include/phrasehunter/tokenizer.h Source File

00001 // -*- C++ -*-
00002 /*
00003   Phrasehunter - index and query text corpora
00004   Copyright (C) 2006  Torsten Marek (shlomme@gmx.de) &
00005   Armin Schmidt (armin.sch@gmail.com)
00006   
00007   This program is free software; you can redistribute it and/or
00008   modify it under the terms of the GNU General Public License
00009   as published by the Free Software Foundation; either version 2
00010   of the License, or (at your option) any later version.
00011   
00012   This program is distributed in the hope that it will be useful,
00013   but WITHOUT ANY WARRANTY; without even the implied warranty of
00014   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00015   GNU General Public License for more details.
00016   
00017   You should have received a copy of the GNU General Public License
00018   along with this program; if not, write to the Free Software
00019   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
00020 */
00021 
00022 #ifndef TOKENIZER_H
00023 #define TOKENIZER_H TOKENIZER_H
00024 
00025 #include <string>
00026 #include <fstream>
00027 #include <iostream>
00028 #include <stdexcept>
00029 #include <boost/scoped_ptr.hpp>
00030 
00031 #include <unicode/unistr.h>
00032 #include <unicode/ucnv.h>
00033 #include <unicode/normlzr.h>
00034 
00035 #include "ph_types.h"
00036 #include "phexception.h"
00037 #include "support/unicodehelpers.h"
00038 
// NOTE(review): not referenced anywhere in this header; presumably the upper
// bound on token length used by the tokenizer implementation -- TODO confirm.
// TextSaver::buf below is sized with a literal 1024 rather than this macro.
00039 #define MAX_TOKEN_LEN 1024
00040 
00041 namespace PhraseHunter {
00042 
/**
 * Abstract source of text chunks.
 *
 * Callers poll hasMoreData() and pull the next chunk with getMoreData().
 */
class Input
{
public:
    /// Virtual so implementations can be destroyed through an Input pointer.
    virtual ~Input() {}

    /// Reports whether the source may still have data to deliver.
    virtual bool hasMoreData() const = 0;

    /// Writes the next chunk of data into the string passed by pointer.
    virtual void getMoreData(std::string*) = 0;
};
00050 
00051 class StringInput: public Input
00052 {
00053     std::string m_inputstring;
00054     bool m_delivered;
00055     
00056 public:
00057     StringInput(const std::string& inputstring): m_inputstring(inputstring), m_delivered(false)
00058     {}
00059     inline bool hasMoreData() const
00060     {
00061         return !m_delivered;
00062     }
00063     inline void getMoreData(std::string* s) 
00064     {
00065         m_delivered = true;
00066         *s = m_inputstring;
00067     }
00068 };
00069 
00070 class FileInput: public Input
00071 {
00072     std::ifstream m_input;
00073 public:
00074     FileInput(const std::string& filename)
00075         throw (Exceptions::FileError)
00076         : m_input(filename.c_str(), std::ios::in)
00077     {  
00078         if (!m_input)
00079             throw Exceptions::FileError(filename);
00080     }
00081     inline bool hasMoreData() const
00082     {
00083         return !m_input.eof();
00084     }
00085     inline void getMoreData(std::string* s) 
00086     {
00087         assert(!m_input.eof());
00088         std::getline(m_input, *s);
00089     }
00090 };
00091 
00092 
00093 class Tokenizer 
00094 {
00095 protected:
00096     UConverter* m_conv;
00097     UErrorCode m_errcode;
00098     
00099     std::string m_tokenbuffer;
00100     size_t m_pos;
00101     std::string m_all_delim;
00102     
00103     static std::string m_whitespace;
00104     static std::string m_delimiters;
00105     
00106     bool m_insertWhitespace;
00107     
00108     bool containsTokens(const std::string& buffer, int pos = 0) const
00109     {
00110         return buffer.length() > 0 && buffer.find_first_not_of(m_whitespace, pos) != std::string::npos;
00111     }
00112     inline bool bufferIsFull() const
00113     {
00114         return containsTokens(m_tokenbuffer, m_pos);
00115     }
00116     
00117     bool fillBuffer() 
00118     {
00119         while(m_input->hasMoreData()) {
00120             m_input->getMoreData(&m_tokenbuffer);
00121             if(containsTokens(m_tokenbuffer)) {
00122                 m_insertWhitespace = true;
00123                 m_pos = 0;
00124                 return true;
00125             }
00126         }
00127         return false;
00128     }
00129     
00130     boost::scoped_ptr<Input> m_input;
00131     
00132 public:
00133     Tokenizer(Input* i) 
00134         : m_errcode(U_ZERO_ERROR), m_input(i)
00135     {
00136         m_conv = ucnv_open("UTF8", &m_errcode);
00137         assert(m_errcode == U_ZERO_ERROR);
00138         m_errcode = U_ZERO_ERROR;
00139         m_pos = 0;
00140         m_all_delim = m_delimiters;
00141         m_all_delim.append(m_whitespace);
00142         
00143         fillBuffer();
00144         m_insertWhitespace = false;
00145     }
00146     ~Tokenizer() 
00147     {
00148         ucnv_close(m_conv);
00149     }
00150     
00151     inline bool hasMoreTokens()
00152     {
00153         return bufferIsFull() || fillBuffer();
00154     }
00155     // TODO: less copying around!
00156     std::string nextUnencodedToken(bool* hasWhitespace = 0);
00157     schma::UnicodePtr nextToken(bool* hasWhitespace = 0);
00158 };
00159 
00160 class TextSaver 
00161 {
00162     char buf[1024];
00163     Tokenizer m_tokenizer;
00164     std::ofstream m_normalized;
00165     
00166     
00167 
00168 public:
00169     struct TokenInformation 
00170     {
00171         std::streamoff offset;
00172         schma::UnicodePtr token;
00173       
00174         TokenInformation(std::streamoff o, schma::UnicodePtr t)
00175             : offset(o), token(t)
00176         {}
00177         TokenInformation()
00178         {}
00179       
00180     };
00181 
00182     TextSaver(const std::string& infile, const std::string& outfile) 
00183         throw(Exceptions::FileError)
00184         : m_tokenizer(new FileInput(infile)), 
00185           m_normalized(outfile.c_str(), std::ios::out | std::ios::binary) // die, windows scum, die! you'll never read our precious files!
00186         
00187     {
00188         if(!m_normalized)
00189             throw Exceptions::FileError(outfile);
00190     }
00191     inline bool hasMoreTokens() 
00192     {
00193         return m_tokenizer.hasMoreTokens();
00194     }
00195     TokenInformation nextToken()
00196         throw(std::out_of_range);
00197 };
00198 
00199 }
00200 
00201 
00202 #endif // TOKENIZER_H