"""
Modul AcronmyDecomposer
Programmier-Gesellenstueck
Author: Branimira Nikolova
braniad@yahoo.com
SS 2006

This module surches text for Akronym-Expansion Pairs and trys to match
the them to each other.

"""


import re
from AD_token import Token
                       
class Acro(Token):
    """
    Detects acronym/expansion pairs in tagged text.

    Attributes:
    acronym: the acronym itself
    expansion: the expansion found for the acronym
    candidate: the acronym candidate currently being examined
    expCandidate: the expansion candidate, a list of tokens in reverse
                  text order
    position: index of the anchor token used to compute the search space
    digitDict: words that a digit inside an acronym may stand for
    """

    def __init__(self):
        # NOTE(review): Token.__init__ is not invoked here -- confirm that
        # the base class needs no per-instance setup.
        self.acronym = ''
        self.expansion = ''
        self.candidate = ''
        self.expCandidate = ''
        self.position = 0
        # Every value must be a list: checkDigitDict uses "in", which would
        # degrade to a substring test on a bare string.
        self.digitDict = {'2': ['to', 'two', 'second'],
                          '3': ['third'],
                          '4': ['for', 'fourth']}

    def __str__(self):
        """
        Printable representation: "<acronym>, <expansion>".
        """
        return '%s, %s' % (self.acronym, self.expansion)

    def findAcroCandidate(self, text):
        '''
        Searches for acronym candidates: capitalized or uppercase noun
        tokens of 2 to 9 characters that appear
        - in parentheses
        - in front of parentheses
        - in front of ", or"
        - after ", or"

        The text is tagged with self.tagTextWithMT (not defined in this
        class; presumably inherited from Token). For every candidate an
        expansion candidate is collected and handed to checkTrueExp.

        Returns the list of Acro instances produced by checkTrueExp.
        '''
        newToken = Token(token=None, tag=None)
        tagSetDict = newToken.tagSetDict
        tokenList = self.tagTextWithMT(text)
        total = len(tokenList)

        regex = re.compile('([A-Z][a-zA-Z]+)')
        resultList = []

        for it in range(total):
            current = tokenList[it]
            word = current.token

            if not (word.istitle() or word.isupper() or regex.match(word)):
                continue
            # Length filter. The former "not 2 > len >= 10" was a chained
            # comparison that could never be true, so nothing was rejected.
            if len(word) < 2 or len(word) >= 10:
                continue
            if current.tag not in tagSetDict["noun"]:
                continue

            # Explicit bounds: a negative index would silently wrap to the
            # end of the list, an index past the end would raise.
            hasPrev = it >= 1
            hasNext = it + 1 < total
            result = None

            # acronym in parentheses: "expansion ( ACR )"
            if hasPrev and tokenList[it-1].tag == tagSetDict["open"]:
                if hasNext and tokenList[it+1].tag == tagSetDict['close']:
                    self.candidate = word
                    self.position = it
                    self.expCandidate = self.computeSearchSpace(
                        self.candidate, self.position, tokenList, tagSetDict)
                    result = self.checkTrueExp(
                        self.candidate, self.expCandidate, tagSetDict)

            # acronym in front of parentheses: "ACR ( expansion )"
            elif hasNext and tokenList[it+1].tag == tagSetDict["open"]:
                # A single token in parentheses is not treated as expansion.
                if it + 3 < total and tokenList[it+3].tag != tagSetDict["close"]:
                    self.expCandidate = []
                    if current.tag != tagSetDict["digit"]:
                        self.candidate = word
                        # The scan starts at the opening parenthesis itself,
                        # exactly as the original loop did.
                        self.expCandidate = self._collectForward(
                            tokenList, it + 1, tagSetDict, True)
                        if len(self.expCandidate) >= 2:
                            self.expCandidate.reverse()
                            result = self.checkTrueExp(
                                self.candidate, self.expCandidate, tagSetDict)

            # acronym follows ", or": "expansion , or ACR"
            elif (it >= 2 and tokenList[it-1].token == 'or'
                  and tokenList[it-2].token == ','):
                self.candidate = word
                # anchor on the "or" so the search space starts before ","
                self.position = it - 1
                self.expCandidate = self.computeSearchSpace(
                    self.candidate, self.position, tokenList, tagSetDict)
                result = self.checkTrueExp(
                    self.candidate, self.expCandidate, tagSetDict)

            # acronym in front of ", or": "ACR , or expansion"
            elif (it + 2 < total and tokenList[it+1].token == ','
                  and tokenList[it+2].token == 'or'):
                self.candidate = word
                self.position = it
                self.expCandidate = self._collectForward(
                    tokenList, it + 3, tagSetDict, False)
                if len(self.expCandidate) >= 2:
                    self.expCandidate.reverse()
                    result = self.checkTrueExp(
                        self.candidate, self.expCandidate, tagSetDict)

            if result:
                resultList.append(result)

        return resultList

    def _collectForward(self, tokenList, first, tagSetDict, stopAtClose):
        '''
        Collects tokens from index first onwards until a delimiter tag, the
        end of tokenList or (if stopAtClose is true) a closing-parenthesis
        tag is reached.

        Returns the collected tokens in text order.
        '''
        collected = []
        index = first
        while index < len(tokenList):
            tag = tokenList[index].tag
            if stopAtClose and tag == tagSetDict['close']:
                break
            if tag in tagSetDict['delimiters']:
                break
            collected.append(tokenList[index])
            index += 1
        return collected

    def computeSearchSpace(self, candidate, position, tokenList, tagSetDict):
        '''
        Computes the search space for an acronym candidate:
        - if the candidate has 5 or more characters the search space is
          the length of the candidate + 5 tokens;
        - otherwise it is the length of the candidate * 2 tokens.

        The search starts two tokens before position and walks backwards.
        It stops at a delimiter tag, at the beginning of tokenList or when
        the search space is exhausted.

        Returns the expansion candidate as a list of tokens in reverse
        text order.
        '''
        if len(candidate) >= 5:
            searchSpace = len(candidate) + 5
        else:
            searchSpace = len(candidate) * 2

        start = position - 2
        # Never go below index 0: negative indices would wrap around to the
        # end of the token list.
        end = max(start - searchSpace, -1)

        expCandidate = []
        for index in range(start, end, -1):
            if tokenList[index].tag in tagSetDict['delimiters']:
                break
            expCandidate.append(tokenList[index])

        return expCandidate

    def checkTrueExp(self, candidate, expCandidate, tagSetDict):
        '''
        Compares the acronym candidate and the expansion candidate backwards.
        Each character of the acronym candidate must appear in one of the
        tokens of the expansion candidate, in the same order as in the
        acronym candidate; the first character of the acronym candidate must
        match the initial character of the first word of the expansion.

        candidate: the acronym candidate string
        expCandidate: list of tokens in reverse text order
        tagSetDict: tag set dictionary ('stopTags' and 'digit' are used)

        Returns a new Acro instance (which is also printed), or 0 if the
        candidates do not match.
        '''
        # acronym characters, last character first
        clist = list(candidate)
        clist.reverse()
        total = len(expCandidate)
        if not clist or not total:
            return 0

        lastChar = len(clist) - 1
        itc = 0            # index into clist
        its = 0            # index into expCandidate
        match = ''         # last (character, token) pair that matched
        found = 0          # matches counted towards the 3-per-token limit
        expansion = []     # tokens matched so far
        expansionEnd = 0   # index of the token that starts the expansion

        while 1:
            # Out of tokens before every character was matched: no match.
            if its >= total:
                return 0
            current = expCandidate[its]

            # last character of the reversed list = first acronym character
            if itc == lastChar:
                # no prepositions and conjunctions as first token
                if current.tag in tagSetDict['stopTags']:
                    its += 1
                    continue
                if clist[itc].lower() == current.token[:1].lower():
                    expansion.append(current.token)
                    expansionEnd = its
                    break
                its += 1
                if its > total - 1:
                    break

            elif clist[itc].lower() in current.token.lower():
                # if the same character already matched in the same token,
                # move one token forward
                if match == (clist[itc], current):
                    its += 1
                else:
                    expansion.append(current.token)
                    match = (clist[itc], current)
                    found += 1
                    itc += 1
                    # max 3 matches in the same token allowed
                    if found >= 3:
                        its += 1
                        found = 0

            elif clist[itc].isdigit():
                # the digit is looked up in the token after the current one
                its += 1
                if its >= total:
                    return 0
                current = expCandidate[its]

                if current.tag == tagSetDict['digit']:
                    if clist[itc] == current.token:
                        expansion.append(current.token)
                        itc += 1
                        its += 1

                elif self.checkDigitDict(clist[itc], current.token):
                    expansion.append(current.token)
                    itc += 1
                    its += 1

                elif self.countFirstChar(clist, itc, expCandidate, its):
                    # the digit counts words: take that many tokens at once
                    its += 1
                    step = int(clist[itc])
                    for tok in expCandidate[its:(its + step)]:
                        expansion.append(tok.token)
                    itc += 1
                    its += step - 1
                    if itc == lastChar:
                        # its may have run past the list; keep it in range
                        expansionEnd = min(its, total - 1)
                        break
                else:
                    break

            else:
                its += 1
                if its >= total:
                    break

        if len(expansion) < len(candidate):
            return 0

        # Build the expansion string. The tokens are stored in reverse text
        # order, so walk from expansionEnd down to 0. Each word is preceded
        # by a blank, so the string starts with a blank (kept as before).
        words = [tok.token for tok in expCandidate[:expansionEnd + 1]]
        words.reverse()
        full = ''.join([' %s' % word for word in words])

        newA = Acro()
        newA.acronym = candidate
        newA.expansion = full

        print(newA)
        return newA

    def checkDigitDict(self, ac, ec):
        '''
        Checks if the digit in the acronym stands for a word like
        "for"(4) or "to"(2).

        ac: the digit character from the acronym
        ec: the token string from the expansion candidate

        Returns 1 if ec (lowercased) is listed for ac in digitDict,
        otherwise 0.
        '''
        if ec.lower() in self.digitDict.get(ac, ()):
            return 1
        return 0

    def countFirstChar(self, ac, acPos, ec, ecPos):
        '''
        If the acronym starts with a digit, or a digit is inside of the
        acronym, counts the words in the expansion which start with the
        character next to the digit.

        ac: the acronym characters; checkTrueExp passes them reversed, so
            the last index is the first character of the acronym
        acPos: index of the digit in ac
        ec: the expansion candidate tokens
        ecPos: index in ec from which to count when the digit is inside

        Returns 1 if the number of such words equals the digit, else 0.
        '''
        count = 0
        ecEnd = len(ec) - 1

        # Decide by position, not by value: comparing ac[acPos] with ac[-1]
        # also fired when a digit inside merely equalled the first character.
        if acPos == len(ac) - 1:
            # acronym starts with the digit: use the character following it
            neighbour = ac[acPos - 1]
            scope = ec
        else:
            # digit inside the acronym: use the character preceding it
            neighbour = ac[acPos + 1]
            # NOTE(review): the slice excludes the last token of ec --
            # confirm whether that is intended.
            scope = ec[ecPos:ecEnd]

        # NOTE(review): the comparison is case-sensitive, unlike the
        # matching in checkTrueExp -- confirm.
        for it in scope:
            if neighbour == it.token[:1]:
                count += 1

        if count == int(ac[acPos]):
            return 1
        return 0
                
if __name__=="__main__":

    import sys, os
    
    if len(sys.argv) == 1:
        filename = "\\res\\TestFile.txt"
        try:
            fh=open(filename, 'r')
            text=fh.read()
            fh.close()
            newAcro=Acro()   
            result=newAcro.findAcroCandidate(text)
            
        except:
            print "%s not found!" %(filename)
        
    elif len(sys.argv)>1:
        result = []
        for f in os.listdir(sys.argv[1]): 
            filename=os.path.join(sys.argv[1], f)
            print filename
            try:
                fh=open(filename, 'r')
                text=fh.read()
                fh.close()
                newAcro=Acro()   
                result=newAcro.findAcroCandidate(text)
            
            except:
                print "Error while reading %s!" %(filename)
            
           
        num=0
        if not result:
            print "There was no Acronym-Expansion Pairs in this document!"
        else:
            for item in result:
                num+=1
                print num,item

