The Phrasehunter: phrasehunter/include/support/unicodehelpers.h Source File

00001 //-*- C++ -*-
00002 /*
00003   Phrasehunter - index and query text corpora
00004   Copyright (C) 2006  Torsten Marek (shlomme@gmx.de) &
00005   Armin Schmidt (armin.sch@gmail.com)
00006   
00007   This program is free software; you can redistribute it and/or
00008   modify it under the terms of the GNU General Public License
00009   as published by the Free Software Foundation; either version 2
00010   of the License, or (at your option) any later version.
00011   
00012   This program is distributed in the hope that it will be useful,
00013   but WITHOUT ANY WARRANTY; without even the implied warranty of
00014   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00015   GNU General Public License for more details.
00016   
00017   You should have received a copy of the GNU General Public License
00018   along with this program; if not, write to the Free Software
00019   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
00020 */
00021 
00022 #ifndef UNICODEHELPERS_H
00023 #define UNICODEHELPERS_H UNICODEHELPERS_H
00024 
#include <cassert>
#include <string>
#include <vector>
#include <unicode/ucnv.h>
#include <unicode/schriter.h>
#include <unicode/regex.h>
#include <boost/shared_ptr.hpp>
#include <boost/tokenizer.hpp>
#include <boost/shared_array.hpp>
00032 
00033 namespace schma {
00034 
00035 typedef boost::shared_ptr<UnicodeString> UnicodePtr;
00036 typedef boost::shared_array<char> charArray;
00037 typedef std::vector<UnicodePtr> UnicodeVector;
00038 
00039 inline UnicodeVector splitString(const std::string& s)
00040 {
00041     UnicodeVector result;
00042     
00043     boost::char_separator<char> sep(" ");
00044     boost::tokenizer<boost::char_separator<char> > tok(s, sep);
00045     
00046     for(boost::tokenizer<boost::char_separator<char> >::const_iterator it = tok.begin();
00047         it != tok.end(); ++it){
00048         result.push_back(UnicodePtr(new UnicodeString(it->c_str())));
00049     }
00050     return result;
00051 }
00052 
00053 /**
00054    \brief Class for converting utf8-encoded input into UnicodeString.
00055    Don't use directly. Call UTF8Converter() to get a pointer to a singleton
00056    _UTFConverter
00057 */
00058 class _UTF8Converter
00059 {
00060     UConverter* m_conv;
00061 public:
00062     _UTF8Converter()
00063     {
00064         UErrorCode errcode = U_ZERO_ERROR;
00065         m_conv = ucnv_open("UTF8", &errcode);
00066         assert(errcode == U_ZERO_ERROR);
00067     }
00068     ~_UTF8Converter() 
00069     {
00070         ucnv_close(m_conv);
00071     }
00072     inline UConverter* converter() const 
00073     {
00074         return m_conv;
00075     }
00076 };
00077 
00078 //! Returns a pointer to a singleton _UTF8Converter object.
00079 inline UConverter* UTF8Converter() 
00080 {
00081     static _UTF8Converter u;
00082     return u.converter();
00083 }
00084 
00085 //! Convert the UnicodeString a UnicodePtr points to into a shared char array.
00086 inline charArray toCharArray(UnicodePtr u)
00087 {
00088     char* buffer = new char[u->length()*4];
00089     UErrorCode c = U_ZERO_ERROR;
00090     u->extract(buffer,u->length()*4, 0, c);
00091     return charArray(buffer);
00092 }
00093 
00094 //! Convert the UnicodeString a UnicodePtr points to into a std::string.
00095 inline std::string toStdString(UnicodePtr u)
00096 {
00097     char buffer[u->length()*4];
00098     UErrorCode c = U_ZERO_ERROR;
00099     u->extract(buffer,u->length()*4, 0, c);
00100     return std::string(buffer);
00101 }
00102 
00103 }
00104 
00105 #endif