00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021 #include <fstream>
00022
00023 #include "phrasehunter/contextreader.h"
00024 #include "phrasehunter/tokencontext.h"
00025 #include "phrasehunter/token.h"
00026 #include "support/unicodehelpers.h"
00027
00028 namespace PhraseHunter {
00029
00030 ContextReader::ContextReader(const boost::filesystem::path& textDirectory):
00031 m_textDirectory(textDirectory)
00032 {}
00033
00034 unsigned int ContextReader::removeNewlines(char* buffer, unsigned int length) const
00035 {
00036 unsigned int offset = 0;
00037 for(unsigned int idx = 0; idx + offset < length; ++idx) {
00038 while(buffer[idx + offset] == '\n') {
00039 ++offset;
00040 if(idx + offset >= length)
00041 return offset;
00042 }
00043 buffer[idx] = buffer[idx+offset];
00044 }
00045 return offset;
00046 }
00047
00048 void ContextReader::readBlock(std::ifstream& file, TokenContextPtr result) const
00049 {
00050 static char buffer[BufferSize];
00051 IdxPos position = result->position();
00052 int startpos = position - BufferSize;
00053 int bytesRead;
00054 if (startpos < 0)
00055 bytesRead = file.readsome(buffer, position);
00056 else {
00057 file.seekg(startpos);
00058 bytesRead = file.readsome(buffer, BufferSize-1);
00059 }
00060 bytesRead -= removeNewlines(buffer, bytesRead);
00061 UErrorCode status = U_ZERO_ERROR;
00062
00063 result->setLeftContext(schma::UnicodePtr(new UnicodeString(buffer, bytesRead, schma::UTF8Converter(), status)));
00064
00065 bytesRead = file.readsome(buffer, result->tokenLength());
00066 bytesRead -= removeNewlines(buffer, bytesRead);
00067 status = U_ZERO_ERROR;
00068 result->setToken(schma::UnicodePtr(new UnicodeString(buffer, bytesRead, schma::UTF8Converter(), status)));
00069 result->token()->trim();
00070
00071 bytesRead = file.readsome(buffer, BufferSize-1);
00072 bytesRead -= removeNewlines(buffer, bytesRead);
00073 status = U_ZERO_ERROR;
00074 result->setRightContext(schma::UnicodePtr(new UnicodeString(buffer, bytesRead, schma::UTF8Converter(), status)));
00075 }
00076
00077
00078 TokenContextPtr ContextReader::getContextFromPosition(DocID docId, IdxPos pos, unsigned int tokenLength, unsigned int width) const
00079 throw(Exceptions::FileError)
00080 {
00081 TokenContextPtr output = TokenContext::emptyContext(docId, pos, tokenLength);
00082 fillContext(output, width);
00083
00084 return output;
00085 }
00086
00087 void ContextReader::fillContext(TokenContextPtr context, unsigned int width) const
00088 throw(Exceptions::FileError)
00089 {
00090 std::ifstream file(documentFile(context->docID()).string().c_str(),
00091 std::ios::in | std::ios::binary);
00092 if(!file) {
00093 throw Exceptions::FileError(documentFile(context->docID()).string());
00094 }
00095 readBlock(file, context);
00096 context->leftContext()->remove(0, context->leftContext()->length() - width);
00097 context->rightContext()->truncate(width);
00098 }
00099
00100 std::vector<TokenContextPtr> ContextReader::context(TokenPtr token, unsigned int width) const
00101 throw(Exceptions::FileError)
00102 {
00103 try {
00104 schma::UnicodePtr unitoken;
00105
00106 if(token->isUniform()) {
00107 unitoken = token->tokenString();
00108 }
00109
00110 std::vector<TokenContextPtr> result;
00111
00112 const OccurrenceMap& all_occurrences = token->allOccurrences();
00113 for (OccurrenceMap::const_iterator it = all_occurrences.begin();
00114 it != all_occurrences.end();
00115 ++it) {
00116 std::string corpus_file = documentFile(it->first).string();
00117
00118 for (PositionList::const_iterator it_inner = it->second.begin();
00119 it_inner != it->second.end(); ++it_inner) {
00120
00121 TokenContextPtr context = TokenContext::emptyContext(it->first, *it_inner, token->length());
00122 fillContext(context, width);
00123
00124 if(unitoken != NULL) {
00125 context->setToken(unitoken);
00126 }
00127 }
00128 }
00129 return result;
00130 }
00131 catch (Exceptions::FileError& _e) {
00132 throw;
00133 }
00134 }
00135
00136 std::vector<TokenContextPtr> ContextReader::context(const TokenVector& tokens, unsigned int width) const
00137 throw(Exceptions::FileError)
00138 {
00139 std::vector<TokenContextPtr> results;
00140 for (unsigned int i=0; i<tokens.size(); ++i) {
00141 std::vector<TokenContextPtr> oneContext = context(tokens[i], width);
00142 results.insert(results.end(), oneContext.begin(), oneContext.end());
00143 }
00144 return results;
00145 }
00146
00147 }