00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021 #include "phrasehunter/tokenizer.h"
00022 #include "support/unicodehelpers.h"
00023
00024 namespace PhraseHunter {
00025
00026
00027
00028
00029 std::string Tokenizer::m_whitespace(" \n\t\r\x0c");
00030 std::string Tokenizer::m_delimiters("*!?.,'\"<[({})]>:;");
00031
00032
00033 std::string Tokenizer::nextUnencodedToken(bool *hasWhitespace)
00034 {
00035 size_t start = m_pos - 1;
00036 size_t delim_pos = std::string::npos;
00037
00038 start = m_tokenbuffer.find_first_not_of(m_whitespace, m_pos);
00039 if(hasWhitespace != 0)
00040 *hasWhitespace = start > m_pos || m_insertWhitespace;
00041 m_insertWhitespace = false;
00042 delim_pos = m_tokenbuffer.find_first_of(m_all_delim, start);
00043
00044 if(delim_pos == start)
00045 delim_pos++;
00046 else if(delim_pos == std::string::npos)
00047 delim_pos = m_tokenbuffer.length();
00048 m_pos = delim_pos;
00049 return m_tokenbuffer.substr(start, delim_pos - start);
00050 }
00051
00052 schma::UnicodePtr Tokenizer::nextToken(bool* hasWhitespace)
00053 {
00054 std::string t = nextUnencodedToken(hasWhitespace);
00055 UnicodeString utoken(t.c_str(), t.length(), m_conv, m_errcode);
00056
00057 m_errcode = U_ZERO_ERROR;
00058 schma::UnicodePtr result(new UnicodeString());
00059 Normalizer::normalize(utoken, UNORM_NFC, 0, *result, m_errcode);
00060 return result;
00061 }
00062
00063
00064
00065
00066 TextSaver::TokenInformation TextSaver::nextToken()
00067 throw(std::out_of_range)
00068 {
00069 std::streamoff start = m_normalized.tellp();
00070
00071 UErrorCode status = U_ZERO_ERROR;
00072 bool hasWhitespace;
00073 schma::UnicodePtr token(m_tokenizer.nextToken(&hasWhitespace));
00074 int len = token->extract(buf, 1023, schma::UTF8Converter(), status);
00075
00076
00077 if (len >= MAX_TOKEN_LEN)
00078 throw std::out_of_range("Token exceeds MAX_TOKEN_LEN");
00079 m_normalized << ((hasWhitespace) ? " " : "\n") << buf << std::endl;
00080 if(token->indexOf("\r") > 0) {
00081 std::cout << "bla" << std::endl;
00082 }
00083
00084 return TokenInformation(start + 1, token);
00085 }
00086
00087
00088 }