corpus.cpp

Go to the documentation of this file.
00001 /*
00002   Phrasehunter - index and query text corpora
00003   Copyright (C) 2006  Torsten Marek (shlomme@gmx.de) &
00004   Armin Schmidt (armin.sch@gmail.com)
00005   
00006   This program is free software; you can redistribute it and/or
00007   modify it under the terms of the GNU General Public License
00008   as published by the Free Software Foundation; either version 2
00009   of the License, or (at your option) any later version.
00010   
00011   This program is distributed in the hope that it will be useful,
00012   but WITHOUT ANY WARRANTY; without even the implied warranty of
00013   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00014   GNU General Public License for more details.
00015   
00016   You should have received a copy of the GNU General Public License
00017   along with this program; if not, write to the Free Software
00018   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
00019 */
00020 
00021 #include <iostream>
00022 #include <boost/filesystem/operations.hpp>
00023 #include <boost/format.hpp>
00024 #include <boost/lexical_cast.hpp>
00025 
00026 #include "sqlitepp/sqlitepp.h"
00027 
00028 #include "phrasehunter/corpus.h"
00029 #include "phrasehunter/indexer.h"
00030 #include "phrasehunter/tokenizer.h"
00031 #include "phrasehunter/searchengine.h"
00032 #include "phrasehunter/statistics.h"
00033 #include "phrasehunter/contextreader.h"
00034 
00035 
00036 using boost::lexical_cast;
00037 using namespace SQLitePP;
00038 namespace filesystem = boost::filesystem;
00039 
00040 namespace PhraseHunter {
00041 
00042 const int CorpusManager::Repository::Format = 4;
00043 const std::string CorpusManager::Repository::TextDirectory("texts");
00044 const std::string CorpusManager::Repository::InfoFile("info");
00045 const std::string CorpusManager::Repository::IndexFile((boost::format("index-%||.db") % Format).str());
00046 
00047 CorpusManager::CorpusManager(const std::string& corpus_repository, CorpusManager::Mode mode)
00048     throw(Exceptions::NoSuchCorpusError)
00049     : m_mode(mode), 
00050       m_directory(corpus_repository),
00051       m_searchEngine(NULL),
00052       m_statistics(NULL),
00053       m_contextReader(NULL),
00054       m_db(indexFile().string())
00055 {
00056     if(!filesystem::exists(textDir().string()) || !filesystem::exists(indexFile().string()))
00057         throw Exceptions::NoSuchCorpusError(corpus_repository);
00058 
00059     m_indexer = (mode & Write) ? new IndexManager(m_db) : NULL;
00060 }
00061 
00062 CorpusManager::~CorpusManager()
00063 {
00064     // it's safe to call delete on a NULL pointer
00065     delete m_indexer;
00066     delete m_searchEngine;
00067     delete m_statistics;
00068 }
00069 
00070 SearchEngine* CorpusManager::searchEngine()
00071 {
00072     if(m_searchEngine == NULL) {
00073         m_searchEngine = new SearchEngine(m_db);
00074     }
00075     return m_searchEngine;
00076 }
00077 
00078 StatisticsEngine* CorpusManager::statisticsEngine()
00079 {
00080     if(m_statistics == NULL) {
00081         m_statistics = new StatisticsEngine(searchEngine(), contextReader(), m_db);
00082     }
00083     return m_statistics;
00084 }
00085 
00086 ContextReader* CorpusManager::contextReader()    
00087 {
00088     if(m_contextReader == NULL) {
00089         m_contextReader = new ContextReader(textDir());
00090     }
00091     return m_contextReader;
00092 }
00093 
00094 std::string CorpusManager::documentName(DocID docID)
00095 {
00096     Statement::Pointer dn = m_db.statement("select description from docs where docid = ?");
00097     
00098     ResultIterator ri = (*dn)
00099         .bindArgs(docID)
00100         .query();
00101     
00102     return (ri.hasMoreRows()) 
00103         ? ri.get<std::string>(0)
00104         : "";
00105 }
00106 
00107     
00108 void CorpusManager::addFile(const std::string& filename, const std::string& info)
00109     throw (Exceptions::FileError, Exceptions::NoIndexerError, std::out_of_range)
00110 {
00111     if (m_indexer == NULL)
00112         throw Exceptions::NoIndexerError();
00113     
00114 
00115     
00116     int newDocID = 
00117         (*m_db.statement("INSERT INTO docs (description) VALUES (?)"))
00118         .bindArgs(info)
00119         .exec()
00120         .lastInsertId();
00121     
00122     filesystem::path newname = textDir() / lexical_cast<std::string>(newDocID);
00123     
00124     (*m_db.statement("UPDATE docs SET filename = ? WHERE id = ?"))
00125         .bindArgs(newname.string(), newDocID)
00126         .exec();
00127     
00128     TextSaver ts(filename, newname.string());
00129     m_indexer->addToIndex(ts, newDocID);
00130 }
00131 
00132 void CorpusManager::removeFile(DocID docID) throw (Exceptions::NoIndexerError)
00133 {
00134     if(m_indexer == NULL)
00135         throw Exceptions::NoIndexerError();
00136     filesystem::remove(textDir() / lexical_cast<std::string>(docID));
00137     
00138     (*m_db.statement("DELETE FROM docs WHERE docid = ?"))
00139         .bindArgs(docID)
00140         .exec();
00141     
00142     m_indexer->removeFromIndex(docID);
00143 }
00144 
00145 }

Generated on Thu Dec 21 16:14:40 2006 for The Phrasehunter by  doxygen 1.5.1