# ****************************************************
# Name des Moduls: mod_ResultCollector
# Name des Projekts: TaxoSearch
#
# Autor(en):
#        Thorsten Beinhorn, Vesna Cvoro,
#        Khaled Dhaoui und Christian Pretzsch 
#
# Aufgaben des Moduls: siehe Code Dokumentation TaxoSearch
# 
#
# Datum der letzten Aenderung: 26.11.2003
# ****************************************************


import urllib
import threading
import time
from mod_DocumentObjects import *

#Begin of class PageCollector
#this thread collects the page content of a given URL
class PageCollector(threading.Thread):
    """Worker thread that downloads the content of a single URL.

    The outcome is published through plain instance attributes so that the
    owner can poll them:

    done -- 0 while the download is running, 1 once run() has finished
    err  -- 1 if the download raised an exception, otherwise 0
    page -- the data read from the URL; only meaningful when err is 0
    """

    def __init__(self, URL):
        threading.Thread.__init__(self)
        self.done = 0
        self.err = 0
        self.URL = URL
        self.page = ""

    def run(self):
        """Fetch self.URL and record the result in page / err / done."""
        try:
            try:
                handle = urllib.urlopen(self.URL)
                try:
                    self.page = handle.read()
                finally:
                    # Close the handle even when read() fails, so a broken
                    # transfer does not leak the underlying connection.
                    handle.close()
            except Exception:
                # Best effort: any download problem is reported through the
                # err flag instead of terminating the thread with a traceback.
                self.err = 1
        finally:
            # Always signal completion last; the owner polls this flag and
            # would wait forever if it were never set.
            self.done = 1
#End of class PageCollector

#Begin of class ResultCollector
#this class manages the PageCollector threads and passes the retrieved page
#contents to the DocumentCollection it was created with
class ResultCollector:
    """Download the pages of all documents in a DocumentCollection.

    One PageCollector thread is started per key of
    DocumentCollection.dicDocumentCollection (the keys are used as URLs).
    At most MAX_THREADS downloads are in flight at the same time.  Results
    are handed back to the collection via UpdateDocumentPage(); failed
    downloads are reported via DeleteDocument().
    """

    def __init__(self, DocumentCollection, max_threads=20):
        """Prepare a collector for DocumentCollection.

        max_threads -- upper bound for concurrently running downloads
                       (default 20, must be at least 1).

        Raises ValueError if max_threads is smaller than 1, because run()
        could then never start a download and would loop forever.
        """
        if max_threads < 1:
            raise ValueError("max_threads must be at least 1")
        self.MAX_THREADS = max_threads
        self.numThreads = 0        # number of entries in lstThreadList
        self.lstThreadList = []    # PageCollector threads not yet harvested
        self.queue = []            # URLs that still have to be started
        self.DocumentCollection = DocumentCollection

    def run(self):
        """Fetch all pages; returns once every download has been harvested."""
        self.queue = self.CreateQueue(self.DocumentCollection)
        print("ResultCollector: Collecting page contents...")
        while self.queue or self.lstThreadList:
            # Top up the pool until the limit is reached or no URL is left.
            while self.queue and self.numThreads < self.MAX_THREADS:
                self.PageCollect(self.queue.pop())
            self.CheckThreads()

    def CheckThreads(self):
        """Harvest finished threads; sleep one second if none has finished."""
        finished = [pcThread for pcThread in self.lstThreadList
                    if pcThread.done]
        if not finished:
            # Nothing to harvest yet: wait instead of busy-looping.
            time.sleep(1)
            return
        for pcThread in finished:
            if pcThread.err:
                print("%s %s" % ("ResultCollector: Page failure on ",
                                 pcThread.URL))
                # presumably drops the unreachable document from the
                # collection -- confirm against mod_DocumentObjects
                self.DocumentCollection.DeleteDocument(pcThread.URL)
            else:
                self.DocumentCollection.UpdateDocumentPage(pcThread.URL,
                                                           pcThread.page)
                # Only announce success for pages that were really fetched.
                print("ResultCollector: page retrieved")
            self.lstThreadList.remove(pcThread)
            self.numThreads = self.numThreads - 1

    def PageCollect(self, URL):
        """Start a PageCollector thread for URL and register it."""
        pcThread = PageCollector(URL)
        pcThread.start()
        self.lstThreadList.append(pcThread)
        self.numThreads = self.numThreads + 1

    def CreateQueue(self, DocumentCollection):
        """Return a new list with the keys of dicDocumentCollection.

        A copy is taken so that documents can be removed from the collection
        while the queue is being processed.
        """
        return list(DocumentCollection.dicDocumentCollection.keys())
