The Phrasehunter: phrasehunter/lib/indexer.cpp Source File

00001 /*
00002   Phrasehunter - index and query text corpora
00003   Copyright (C) 2006  Torsten Marek (shlomme@gmx.de) &
00004   Armin Schmidt (armin.sch@gmail.com)
00005   
00006   This program is free software; you can redistribute it and/or
00007   modify it under the terms of the GNU General Public License
00008   as published by the Free Software Foundation; either version 2
00009   of the License, or (at your option) any later version.
00010   
00011   This program is distributed in the hope that it will be useful,
00012   but WITHOUT ANY WARRANTY; without even the implied warranty of
00013   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00014   GNU General Public License for more details.
00015   
00016   You should have received a copy of the GNU General Public License
00017   along with this program; if not, write to the Free Software
00018   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
00019 */
00020 
00021 #include <fstream>
00022 #include <tr1/unordered_map>
00023 
00024 
00025 #include "sqlitepp/sqlitepp.h"
00026 
00027 #include "phrasehunter/indexer.h"
00028 #include "phrasehunter/tokenizer.h"
00029 #include "support/unicodehelpers.h"
00030 
00031 using namespace SQLitePP;
00032 
00033 namespace PhraseHunter {
00034 
00035 void IndexManager::removeFromIndex(DocID docID) 
00036 {
00037     (*m_db.statement("DELETE FROM occurrences WHERE docid = ?"))
00038         .bindArgs(docID)
00039         .exec();
00040 }
00041 
00042 
00043 typedef std::tr1::unordered_map<std::string, std::pair<TokenID, PositionList> > idx_map;
00044 
00045 void IndexManager::addToIndex(TextSaver& token_stream, DocID docID) 
00046     throw(Exceptions::FileError)
00047 {
00048     char buf[1024];
00049     idx_map tokens;
00050     
00051     TextSaver::TokenInformation ti;
00052     
00053     m_db.begin();
00054     
00055     Statement::Pointer getid = m_db.statement("SELECT id FROM tokens WHERE word = ?");
00056     Statement::Pointer insertTok = m_db.statement("INSERT INTO tokens (word) VALUES (?)");
00057     Statement::Pointer insertPos = m_db.statement("\
00058 INSERT INTO occurrences (wordid, docid, positions) VALUES (?,?,?)");
00059     
00060     UErrorCode status;
00061     
00062     while(token_stream.hasMoreTokens()) {
00063         ti = token_stream.nextToken();
00064       
00065         ti.token->toLower();
00066         status = U_ZERO_ERROR;
00067 
00068         ti.token->extract(buf, 1023, schma::UTF8Converter(), status);
00069 
00070         idx_map::iterator it = tokens.find(buf);
00071 
00072         if(it == tokens.end()) {
00073             int id;
00074             getid->bindArg(buf);
00075             ResultIterator ri = getid->query();
00076           
00077             if(ri.hasMoreRows()) {
00078                 id = ri.get<int>(0);
00079             } else {
00080                 id = (*insertTok)
00081                     .bindArg(buf)
00082                     .exec()
00083                     .lastInsertId();
00084 
00085                 insertTok->reset();
00086             }
00087             getid->reset();
00088             PositionList v;
00089             v.push_back(ti.offset);
00090           
00091             tokens[buf] = std::make_pair(id, v);
00092         } else {
00093             it->second.second.push_back(ti.offset);
00094         }
00095     }
00096     
00097     for(idx_map::iterator it = tokens.begin();
00098         it != tokens.end();
00099         ++it) {
00100         (*insertPos)
00101             .bindArgs(it->second.first, docID, 
00102                       Blob(reinterpret_cast<const void*>(&it->second.second[0]), 
00103                            it->second.second.size() * sizeof(PositionList::value_type)))
00104             .exec()
00105             .reset();
00106     }
00107     m_db.commit();
00108 }
00109 
00110 IndexManager::~IndexManager() 
00111 {
00112     (*m_db.statement("\
00113 UPDATE tokens SET \
00114 frequency = (SELECT SUM(LENGTH(positions)) / ? FROM occurrences WHERE wordid = tokens.id ), \
00115 documentfrequency = (SELECT DISTINCT docid FROM occurrences WHERE wordid = tokens.id)"))
00116         .bindArgs(sizeof(IdxPos))
00117         .exec();
00118     
00119     
00120     m_db.statement("UPDATE tokens SET rank = NULL")->exec();
00121     
00122     Statement::Pointer getByFrequency = 
00123         m_db.statement("SELECT id FROM tokens ORDER BY frequency DESC");
00124 
00125     Statement::Pointer setRank = 
00126         m_db.statement("UPDATE tokens SET rank = ? WHERE id = ?");
00127     
00128     int rank = 1;
00129     for(ResultIterator ri = getByFrequency->query();
00130         ri.hasMoreRows();
00131         ri.next()) {
00132         (*setRank)
00133             .bindArgs(rank++, ri.get<int>(0))
00134             .exec()
00135             .reset();
00136     }
00137 }
00138 
00139 
00140 } // PhraseHunter