# ****************************************************
# Name des Moduls: mod_DocumentProcessor
# Name des Projekts: TaxoSearch
#
# Autor(en):
#        Thorsten Beinhorn, Vesna Cvoro,
#        Khaled Dhaoui und Christian Pretzsch 
#
# Aufgaben des Moduls: siehe Code Dokumentation TaxoSearch
# 
#
# Datum der letzten Aenderung: 26.11.2003
# ****************************************************

import sys,string,re,time
import MontyTagger
import threading
import urllib
import time

# ****************************************************
# Name des Moduls: mod_DocumentProcessor
# Name des Projekts: TaxoSearch
#
# Autor(en):
#        Thorsten Beinhorn, Vesna Cvoro,
#        Khaled Dhaoui und Christian Pretzsch 
#
# Aufgaben des Moduls: siehe Code Dokumentation TaxoSearch
# 
#
# Datum der letzten Aenderung: 26.11.2003
# ****************************************************

from mod_DocumentObjects import *
from wntools import *
from types import *

#Begin of class DocumentProcessor
class DocumentProcessor:
    """Run one PageProcessor thread per document of a collection.

    At most MAX_THREADS workers are active at a time.  Each finished worker
    has its document handed to DocumentCollection.UpdateDocument().
    """

    def __init__(self, DocumentCollection, lstQuery):
        """Store the collection and query and create the shared tagger."""
        self.MAX_THREADS = 5
        self.numThreads = 0        # number of workers currently tracked
        self.lstThreadList = []    # workers that were started and not yet reaped
        self.queue = []            # documents still waiting for a worker
        self.DocumentCollection = DocumentCollection
        # One tagger instance is created here and passed to every worker.
        self.Tagger = MontyTagger.MontyTagger()
        self.lstQuery = lstQuery

    def run(self):
        """Process the whole collection; returns when no work is left."""
        self.queue = self.CreateQueue(self.DocumentCollection)
        while self.lstThreadList or self.queue:
            # Start workers until the limit is reached or the queue is empty.
            while self.queue and self.MAX_THREADS > self.numThreads:
                self.PageProcess(self.queue.pop())
            self.CheckThreads()

    def CheckThreads(self):
        """Reap workers whose ``done`` flag is set; sleep 1s if none was."""
        numReaped = 0
        # Iterate over a copy because finished workers are removed below.
        for worker in list(self.lstThreadList):
            if not worker.done:
                continue
            self.DocumentCollection.UpdateDocument(worker.DocumentObject)
            self.lstThreadList.remove(worker)
            self.numThreads -= 1
            numReaped += 1
        if numReaped == 0:
            # Nothing finished in this pass: wait before polling again.
            time.sleep(1)

    def CreateQueue(self, DocumentCollection):
        """Return the collection's document objects as a new list."""
        return list(DocumentCollection.dicDocumentCollection.itervalues())

    def PageProcess(self, DocumentObject):
        """Start a PageProcessor for DocumentObject and track it."""
        worker = PageProcessor(DocumentObject, self.Tagger, self.lstQuery)
        worker.start()
        self.lstThreadList.append(worker)
        self.numThreads += 1
#End of class DocumentProcessor


#Begin of class PageProcessor
class PageProcessor(threading.Thread):
    """Worker thread that turns one document's HTML into a word-count vector.

    ``run`` reads ``Page`` and ``Title`` from the document object and writes
    ``PageContent``, ``dicDocumentVector``, ``numWordCount`` and the cleaned
    ``Title`` back onto it.  ``done`` becomes 1 when ``run`` has ended.
    """

    # Patterns used by DeHTML, compiled once instead of on every call.
    _RE_STYLE = re.compile('<style.*?>.*?</style>', re.I | re.S)
    _RE_SCRIPT = re.compile('<script.*?>.*?</script>', re.I | re.S)
    _RE_COMMENT = re.compile('<!--.*?-->', re.I | re.S)
    _RE_TAG = re.compile('<.*?>', re.S)
    _RE_NBSP = re.compile('&nbsp;', re.S)
    # Part-of-speech tags starting with "N" (nouns) or "V" (verbs).
    _RE_NOUN_OR_VERB = re.compile("^[NV]")

    def __init__(self, DocumentObject, MyTagger, lstQuery):
        """Remember the document, the tagger to use and the query words."""
        threading.Thread.__init__(self)
        self.Tagger = MyTagger
        self.done = 0
        self.DocumentObject = DocumentObject
        self.lstQuery = lstQuery

    def run(self):
        """Fill in the derived fields of the document object.

        ``done`` is set in a ``finally`` clause: a caller that polls the flag
        would otherwise wait forever if any step raised.  The exception
        itself still propagates.
        """
        Doc = self.DocumentObject
        try:
            Doc.PageContent = self.DeHTML(Doc.Page)
            Doc.dicDocumentVector = self.CreateDocumentVector(Doc.PageContent)
            Doc.numWordCount = self.GetWordCount(Doc.dicDocumentVector)
            Doc.Title = self.DeHTML(Doc.Title)
            print("DocumentProcessor: Document ready")
        finally:
            self.done = 1

    def DeHTML(self, page):
        """Return ``page`` without markup, with whitespace normalised.

        <style> and <script> elements, comments and all remaining tags are
        replaced by a space, as is ``&nbsp;``.  Every run of whitespace is
        then collapsed to one space and the result is stripped, so words
        separated only by a tab or a non-breaking space stay separate.
        """
        text = self._RE_STYLE.sub(' ', page)
        text = self._RE_SCRIPT.sub(' ', text)
        text = self._RE_COMMENT.sub(' ', text)
        text = self._RE_TAG.sub(' ', text)
        text = self._RE_NBSP.sub(' ', text)
        # split() without arguments drops leading/trailing whitespace and
        # treats any whitespace run as a single separator.
        return ' '.join(text.split())

    def CreateDocumentVector(self, PageContent):
        """Return a dict mapping word -> number of occurrences.

        Each sentence is lower-cased and passed to ``self.Tagger.tag``.  A
        token is counted when its 'pos' value starts with "N" or "V", or when
        the word is contained in ``self.lstQuery``.
        """
        dicDocumentVector = {}
        for sentence in self.Sentenize(PageContent):
            sentence = sentence.lower().replace("\n", " ")
            lstTaggedText = self.Tagger.tag(sentence, 0)
            # The first and last entries are skipped (the original code
            # describes them as boundary tokens).
            for dicWords in lstTaggedText[1:-1]:
                # NOTE(review): kept from the original; looks unnecessary
                # unless the tagger reuses these dicts - confirm.
                del dicWords["all_pos"]
                Word = dicWords['word']
                POS = dicWords['pos']
                if self._RE_NOUN_OR_VERB.search(POS) or Word in self.lstQuery:
                    dicDocumentVector[Word] = dicDocumentVector.get(Word, 0) + 1
        return dicDocumentVector

    def Sentenize(self, PageContent):
        """Split ``PageContent`` into a list of sentences.

        A sentence ends at ".", "!", "?" or ";" (the terminator stays with
        the sentence).  A "." does not end a sentence when it stands between
        two digits ("3.14") or between an alphanumeric character and a
        lower-case alphanumeric one ("www.example.com").  Text after the last
        terminator is returned as a final sentence unless it is blank.
        """
        lstSentence = []
        StartValue = 0
        LastIndex = len(PageContent) - 1
        for i in range(len(PageContent)):
            Char = PageContent[i]
            if Char not in ".!?;":
                continue
            # Neighbours are only inspected when both exist, so index -1
            # never wraps around to the end of the text.
            if Char == "." and 0 < i < LastIndex:
                Before = PageContent[i - 1]
                After = PageContent[i + 1]
                if Before.isdigit() and After.isdigit():
                    continue
                if Before.isalnum() and After.isalnum() and After.islower():
                    continue
            lstSentence.append(PageContent[StartValue:i + 1])
            StartValue = i + 1
        Rest = PageContent[StartValue:]
        if Rest.strip():
            lstSentence.append(Rest)
        return lstSentence

    def GetWordCount(self, dicDocumentVector):
        """Return the sum of all counts stored in ``dicDocumentVector``."""
        return sum(dicDocumentVector.values())
#End of class PageProcessor

#Begin of class PostProcessor
class PostProcessor:
    """Post-processing of the document vectors of a document collection."""

    def __init__(self, DocumentCollection):
        """Remember the collection that receives the updated documents."""
        self.DocumentCollection = DocumentCollection

    def SingularizeDC(self, DocumentCollection):
        """Merge the counts of words that share the same base form.

        For every document in ``DocumentCollection`` each word of its
        ``dicDocumentVector`` is replaced by the result of
        ``morphy(word, "noun")``, else ``morphy(word, "verb")``, else the
        word itself, and the counts of words that map to the same key are
        added up.  The new vector is stored on the document, which is then
        passed to ``self.DocumentCollection.UpdateDocument``.

        ``morphy`` is not defined in this module; it is presumably supplied
        by the ``wntools`` star import.
        """
        print("DocumentProcessor: performing DocumentVector post processing...")
        # Iterate over a snapshot: UpdateDocument is called inside the loop.
        lstDocuments = list(DocumentCollection.dicDocumentCollection.values())
        for DocumentObject in lstDocuments:
            dicMerged = {}
            for Word, numCount in DocumentObject.dicDocumentVector.items():
                strBase = morphy(Word, "noun") or morphy(Word, "verb") or Word
                # Always accumulate, including for words without a base
                # form, so the result does not depend on iteration order.
                dicMerged[strBase] = dicMerged.get(strBase, 0) + numCount
            DocumentObject.dicDocumentVector = dicMerged
            self.DocumentCollection.UpdateDocument(DocumentObject)
