The Phrasehunter: phrasehunter/lib/tokenizer.cpp Source File

00001 /*
00002   Phrasehunter - index and query text corpora
00003   Copyright (C) 2006  Torsten Marek (shlomme@gmx.de) &
00004   Armin Schmidt (armin.sch@gmail.com)
00005   
00006   This program is free software; you can redistribute it and/or
00007   modify it under the terms of the GNU General Public License
00008   as published by the Free Software Foundation; either version 2
00009   of the License, or (at your option) any later version.
00010   
00011   This program is distributed in the hope that it will be useful,
00012   but WITHOUT ANY WARRANTY; without even the implied warranty of
00013   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00014   GNU General Public License for more details.
00015   
00016   You should have received a copy of the GNU General Public License
00017   along with this program; if not, write to the Free Software
00018   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
00019 */
00020 
00021 #include "phrasehunter/tokenizer.h"
00022 #include "support/unicodehelpers.h"
00023 
00024 namespace PhraseHunter {
00025 
00026 /**
00027    \class PhraseHunter::Tokenizer tokenizer.h
00028 */
00029 std::string Tokenizer::m_whitespace(" \n\t\r\x0c");
00030 std::string Tokenizer::m_delimiters("*!?.,'\"<[({})]>:;");
00031 
00032 
00033 std::string Tokenizer::nextUnencodedToken(bool *hasWhitespace) 
00034 {
00035     size_t start = m_pos - 1;
00036     size_t delim_pos = std::string::npos;
00037     // skip whitespace
00038     start = m_tokenbuffer.find_first_not_of(m_whitespace, m_pos);
00039     if(hasWhitespace != 0)
00040         *hasWhitespace = start > m_pos || m_insertWhitespace;
00041     m_insertWhitespace = false;
00042     delim_pos = m_tokenbuffer.find_first_of(m_all_delim, start);
00043     
00044     if(delim_pos == start)
00045         delim_pos++;
00046     else if(delim_pos == std::string::npos)
00047         delim_pos = m_tokenbuffer.length();
00048     m_pos = delim_pos;
00049     return m_tokenbuffer.substr(start, delim_pos - start);
00050 }
00051 
00052 schma::UnicodePtr Tokenizer::nextToken(bool* hasWhitespace)
00053 {
00054     std::string t = nextUnencodedToken(hasWhitespace);
00055     UnicodeString utoken(t.c_str(), t.length(), m_conv, m_errcode);
00056     
00057     m_errcode = U_ZERO_ERROR;
00058     schma::UnicodePtr result(new UnicodeString());
00059     Normalizer::normalize(utoken, UNORM_NFC, 0, *result, m_errcode);
00060     return result;      
00061 }
00062 
00063 /**
00064    \class PhraseHunter::TextSaver tokenizer.h
00065 */
00066 TextSaver::TokenInformation TextSaver::nextToken()
00067     throw(std::out_of_range)
00068 {
00069     std::streamoff start = m_normalized.tellp();
00070     
00071     UErrorCode status = U_ZERO_ERROR;
00072     bool hasWhitespace;
00073     schma::UnicodePtr token(m_tokenizer.nextToken(&hasWhitespace));
00074     int len = token->extract(buf, 1023, schma::UTF8Converter(), status);
00075     
00076     //FIXME: handle tokens > 1023 specially
00077     if (len >= MAX_TOKEN_LEN)
00078         throw std::out_of_range("Token exceeds MAX_TOKEN_LEN");
00079     m_normalized << ((hasWhitespace) ? " " : "\n") << buf << std::endl;
00080     if(token->indexOf("\r") > 0) {
00081         std::cout << "bla" << std::endl;
00082     }
00083     
00084     return TokenInformation(start + 1, token);
00085 }
00086 
00087 
00088 } // PhraseHunter