00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022 #ifndef TOKENIZER_H
00023 #define TOKENIZER_H TOKENIZER_H
00024
00025 #include <string>
00026 #include <fstream>
00027 #include <iostream>
00028 #include <stdexcept>
00029 #include <boost/scoped_ptr.hpp>
00030
00031 #include <unicode/unistr.h>
00032 #include <unicode/ucnv.h>
00033 #include <unicode/normlzr.h>
00034
00035 #include "ph_types.h"
00036 #include "phexception.h"
00037 #include "support/unicodehelpers.h"
00038
00039 #define MAX_TOKEN_LEN 1024
00040
00041 namespace PhraseHunter {
00042
/**
 * Abstract source of text, delivered to the consumer in chunks.
 *
 * Implementations report through hasMoreData() whether another chunk may be
 * requested and hand the chunk out through getMoreData().
 */
class Input
{
public:
    /** Virtual so that implementations can be destroyed through an Input pointer. */
    virtual ~Input() {}

    /** @return true while another chunk can be requested with getMoreData(). */
    virtual bool hasMoreData() const = 0;

    /**
     * Stores the next chunk of data in the string pointed to by @p destination.
     *
     * @param destination string that receives the chunk.
     */
    virtual void getMoreData(std::string* destination) = 0;
};
00050
00051 class StringInput: public Input
00052 {
00053 std::string m_inputstring;
00054 bool m_delivered;
00055
00056 public:
00057 StringInput(const std::string& inputstring): m_inputstring(inputstring), m_delivered(false)
00058 {}
00059 inline bool hasMoreData() const
00060 {
00061 return !m_delivered;
00062 }
00063 inline void getMoreData(std::string* s)
00064 {
00065 m_delivered = true;
00066 *s = m_inputstring;
00067 }
00068 };
00069
00070 class FileInput: public Input
00071 {
00072 std::ifstream m_input;
00073 public:
00074 FileInput(const std::string& filename)
00075 throw (Exceptions::FileError)
00076 : m_input(filename.c_str(), std::ios::in)
00077 {
00078 if (!m_input)
00079 throw Exceptions::FileError(filename);
00080 }
00081 inline bool hasMoreData() const
00082 {
00083 return !m_input.eof();
00084 }
00085 inline void getMoreData(std::string* s)
00086 {
00087 assert(!m_input.eof());
00088 std::getline(m_input, *s);
00089 }
00090 };
00091
00092
00093 class Tokenizer
00094 {
00095 protected:
00096 UConverter* m_conv;
00097 UErrorCode m_errcode;
00098
00099 std::string m_tokenbuffer;
00100 size_t m_pos;
00101 std::string m_all_delim;
00102
00103 static std::string m_whitespace;
00104 static std::string m_delimiters;
00105
00106 bool m_insertWhitespace;
00107
00108 bool containsTokens(const std::string& buffer, int pos = 0) const
00109 {
00110 return buffer.length() > 0 && buffer.find_first_not_of(m_whitespace, pos) != std::string::npos;
00111 }
00112 inline bool bufferIsFull() const
00113 {
00114 return containsTokens(m_tokenbuffer, m_pos);
00115 }
00116
00117 bool fillBuffer()
00118 {
00119 while(m_input->hasMoreData()) {
00120 m_input->getMoreData(&m_tokenbuffer);
00121 if(containsTokens(m_tokenbuffer)) {
00122 m_insertWhitespace = true;
00123 m_pos = 0;
00124 return true;
00125 }
00126 }
00127 return false;
00128 }
00129
00130 boost::scoped_ptr<Input> m_input;
00131
00132 public:
00133 Tokenizer(Input* i)
00134 : m_errcode(U_ZERO_ERROR), m_input(i)
00135 {
00136 m_conv = ucnv_open("UTF8", &m_errcode);
00137 assert(m_errcode == U_ZERO_ERROR);
00138 m_errcode = U_ZERO_ERROR;
00139 m_pos = 0;
00140 m_all_delim = m_delimiters;
00141 m_all_delim.append(m_whitespace);
00142
00143 fillBuffer();
00144 m_insertWhitespace = false;
00145 }
00146 ~Tokenizer()
00147 {
00148 ucnv_close(m_conv);
00149 }
00150
00151 inline bool hasMoreTokens()
00152 {
00153 return bufferIsFull() || fillBuffer();
00154 }
00155
00156 std::string nextUnencodedToken(bool* hasWhitespace = 0);
00157 schma::UnicodePtr nextToken(bool* hasWhitespace = 0);
00158 };
00159
00160 class TextSaver
00161 {
00162 char buf[1024];
00163 Tokenizer m_tokenizer;
00164 std::ofstream m_normalized;
00165
00166
00167
00168 public:
00169 struct TokenInformation
00170 {
00171 std::streamoff offset;
00172 schma::UnicodePtr token;
00173
00174 TokenInformation(std::streamoff o, schma::UnicodePtr t)
00175 : offset(o), token(t)
00176 {}
00177 TokenInformation()
00178 {}
00179
00180 };
00181
00182 TextSaver(const std::string& infile, const std::string& outfile)
00183 throw(Exceptions::FileError)
00184 : m_tokenizer(new FileInput(infile)),
00185 m_normalized(outfile.c_str(), std::ios::out | std::ios::binary)
00186
00187 {
00188 if(!m_normalized)
00189 throw Exceptions::FileError(outfile);
00190 }
00191 inline bool hasMoreTokens()
00192 {
00193 return m_tokenizer.hasMoreTokens();
00194 }
00195 TokenInformation nextToken()
00196 throw(std::out_of_range);
00197 };
00198
00199 }
00200
00201
00202 #endif // TOKENIZER_H