The Phrasehunter: phrasehunter/lib/contextreader.cpp Source File

00001 /*
00002   Phrasehunter - index and query text corpora
00003   Copyright (C) 2006  Torsten Marek (shlomme@gmx.de) &
00004   Armin Schmidt (armin.sch@gmail.com)
00005   
00006   This program is free software; you can redistribute it and/or
00007   modify it under the terms of the GNU General Public License
00008   as published by the Free Software Foundation; either version 2
00009   of the License, or (at your option) any later version.
00010   
00011   This program is distributed in the hope that it will be useful,
00012   but WITHOUT ANY WARRANTY; without even the implied warranty of
00013   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00014   GNU General Public License for more details.
00015   
00016   You should have received a copy of the GNU General Public License
00017   along with this program; if not, write to the Free Software
00018   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
00019 */
00020 
00021 #include <fstream>
00022 
00023 #include "phrasehunter/contextreader.h"
00024 #include "phrasehunter/tokencontext.h"
00025 #include "phrasehunter/token.h"
00026 #include "support/unicodehelpers.h"
00027 
00028 namespace PhraseHunter {
00029 
00030 ContextReader::ContextReader(const boost::filesystem::path& textDirectory):
00031     m_textDirectory(textDirectory)
00032 {}
00033 
00034 unsigned int ContextReader::removeNewlines(char* buffer, unsigned int length) const
00035 {
00036     unsigned int offset = 0;
00037     for(unsigned int idx = 0; idx + offset < length; ++idx) {
00038         while(buffer[idx + offset] == '\n') {
00039             ++offset;
00040             if(idx + offset >= length)
00041                 return offset;
00042         }
00043         buffer[idx] = buffer[idx+offset];
00044     }
00045     return offset;
00046 }
00047 
00048 void ContextReader::readBlock(std::ifstream& file, TokenContextPtr result) const
00049 {
00050     static char buffer[BufferSize];
00051     IdxPos position = result->position();
00052     int startpos = position - BufferSize;
00053     int bytesRead;
00054     if (startpos < 0) 
00055         bytesRead = file.readsome(buffer, position);
00056     else {
00057         file.seekg(startpos);
00058         bytesRead = file.readsome(buffer, BufferSize-1);
00059     }
00060     bytesRead -= removeNewlines(buffer, bytesRead);
00061     UErrorCode status = U_ZERO_ERROR;
00062 
00063     result->setLeftContext(schma::UnicodePtr(new UnicodeString(buffer, bytesRead, schma::UTF8Converter(), status)));
00064 
00065     bytesRead = file.readsome(buffer, result->tokenLength());
00066     bytesRead -= removeNewlines(buffer, bytesRead);
00067     status = U_ZERO_ERROR;
00068     result->setToken(schma::UnicodePtr(new UnicodeString(buffer, bytesRead, schma::UTF8Converter(), status)));
00069     result->token()->trim();
00070     
00071     bytesRead = file.readsome(buffer, BufferSize-1);
00072     bytesRead -= removeNewlines(buffer, bytesRead);
00073     status = U_ZERO_ERROR;
00074     result->setRightContext(schma::UnicodePtr(new UnicodeString(buffer, bytesRead, schma::UTF8Converter(), status)));
00075 }
00076 
00077 
00078 TokenContextPtr ContextReader::getContextFromPosition(DocID docId, IdxPos pos, unsigned int tokenLength, unsigned int width) const
00079     throw(Exceptions::FileError) 
00080 {
00081     TokenContextPtr output = TokenContext::emptyContext(docId, pos, tokenLength);
00082     fillContext(output, width);
00083     
00084     return output;
00085 }
00086 
00087 void ContextReader::fillContext(TokenContextPtr context, unsigned int width) const
00088     throw(Exceptions::FileError)
00089 {
00090     std::ifstream file(documentFile(context->docID()).string().c_str(), 
00091                    std::ios::in | std::ios::binary);
00092     if(!file) {
00093       throw Exceptions::FileError(documentFile(context->docID()).string());
00094     }
00095     readBlock(file, context);
00096     context->leftContext()->remove(0, context->leftContext()->length() - width);
00097     context->rightContext()->truncate(width);
00098 }
00099 
00100 std::vector<TokenContextPtr> ContextReader::context(TokenPtr token, unsigned int width) const
00101     throw(Exceptions::FileError)
00102 {
00103     try {
00104         schma::UnicodePtr unitoken;
00105       
00106         if(token->isUniform()) {
00107             unitoken = token->tokenString();
00108         }
00109         
00110         std::vector<TokenContextPtr> result;
00111       
00112         const OccurrenceMap& all_occurrences = token->allOccurrences();
00113         for (OccurrenceMap::const_iterator it = all_occurrences.begin();
00114              it != all_occurrences.end();
00115              ++it) {            
00116             std::string corpus_file = documentFile(it->first).string();
00117           
00118             for (PositionList::const_iterator it_inner = it->second.begin();
00119                  it_inner != it->second.end(); ++it_inner) {
00120 
00121                 TokenContextPtr context = TokenContext::emptyContext(it->first, *it_inner, token->length());
00122             fillContext(context, width);
00123                 
00124             if(unitoken != NULL) {
00125                     context->setToken(unitoken);
00126                 }
00127             }
00128         }
00129         return result;
00130     }
00131     catch (Exceptions::FileError& _e) {
00132         throw;
00133     }
00134 }
00135 
00136 std::vector<TokenContextPtr> ContextReader::context(const TokenVector& tokens, unsigned int width) const
00137     throw(Exceptions::FileError)
00138 {
00139     std::vector<TokenContextPtr> results;
00140     for (unsigned int i=0; i<tokens.size(); ++i) {
00141         std::vector<TokenContextPtr> oneContext = context(tokens[i], width);
00142         results.insert(results.end(), oneContext.begin(), oneContext.end());
00143     }
00144     return results;
00145 }
00146 
00147 }