mò
0™ÜDc           @   s   d  Z  d k Z d e f d „  ƒ  YZ e d j ojd k Z d k Z e e i ƒ d j o^ d Z	 yA e
 e	 d ƒ Z e i ƒ  Z e i ƒ  e ƒ  Z e i e ƒ Z Wn d e	 GHn Xná e e i ƒ d j oÊ g  Z x“ e i e i d ƒ D]{ Z e i i e i d e ƒ Z	 e	 GHyA e
 e	 d ƒ Z e i ƒ  Z e i ƒ  e ƒ  Z e i e ƒ Z Wn d	 e	 GHn Xqë Wd
 Z x! e D] Z e d 7Z e Ge GHqwWn n d S(   s¸   
Modul AcronmyDecomposer
Programmier-Gesellenstueck
Author: Branimira Nikolova
SS 2006

This module surches text for Akronym-Expansion Pairs and trys to match
the them to each other.

Nt   Acroc           B   sG   t  Z d „  Z d „  Z d „  Z d „  Z d „  Z d „  Z d „  Z RS(   Nc         C   sd   d |  _ d |  _ d |  _ d |  _ d |  _ h  d d d d g <d d <d	 d
 d g <|  _ d  S(   Nt    i    t   2t   tot   twot   secondt   3t   thirdt   4t   fort   fourth(   t   selft   acronymt	   expansiont	   candidatet   expCandidatet   positiont	   digitDict(   R   (    (    t   C:\AD\src\acronymDecomposer.pyt   __init__   s    					c         C   s   d |  i |  i f S(   Ns   %s, %s(   R   R   R   (   R   (    (    R   t   __str__   s    c   	      C   s«  t  d d d d ƒ } |  i | ƒ } g  } d } d } t
 i | ƒ } x_t t | ƒ ƒ D]K} | | i i ƒ  p+ | | i i ƒ  p | i | | i ƒ od t | | i ƒ j o
 d j n p×| | i | i d j o¸| | d i | i d j oš | | d i | i d	 j ow | | i |  _ | |  _ |  i |  i |  i | | i ƒ |  _ |  i |  i |  i | i ƒ } | o | i | ƒ q˜q—q›| | d i | i d j o+| | d
 i | i d	 j og  |  _ | | i | i d j o qX n | | i |  _ xf | | d i | i d	 j oF | | d i | i d j o Pq| d 7} |  i i | | ƒ qWt |  i ƒ d j  o qX qâ|  i i ƒ  |  i |  i |  i | i ƒ } | o | i | ƒ qâq—q›| | d i d j o“ | | d i d j o{ | | i |  _ | d |  _ |  i |  i |  i | | i ƒ |  _ |  i |  i |  i | i ƒ } | o | i | ƒ q—q›| | d i d j oî | | d i d j oÖ g  |  _ | | i |  _ | |  _ | d
 7} xC | | i | i d j o Pqð|  i i | | ƒ | d 7} qðWt |  i ƒ d j  o qX n |  i i ƒ  |  i |  i |  i | i ƒ } | o | i | ƒ q—q›qŸq£qX qX W| S(   sé   
        Surches for Acronym-candidates: uppercase or capitalized tokens
        - in parentheses
        - infront of parentheses
        - inftont of ", or" 
        - after ", or"
        
        Returns Candidates list.
        t   tokent   tags   ([A-Z][a-zA-Z]+)i   i
   t   nouni   t   opent   closei   t   digitt
   delimiterst   ort   ,N(   t   Tokent   Nonet   newTokenR   t   tagTextWithMTt   textt	   tokenListt
   resultListt   resultt   mixt   ret   compilet   regext   ranget   lent   itR   t   istitlet   isuppert   matchR   t
   tagSetDictR   R   t   computeSearchSpaceR   t   checkTrueExpt   appendt   reverse(	   R   R"   R)   R&   R$   R    R#   R%   R,   (    (    R   t   findAcroCandidate   s~    	  ?+	$	 
0$0		
  %c   	      C   s®   g  } t |  i ƒ d j o t |  i ƒ d } n t |  i ƒ d } |  i d } | | } xJ | | j o< | | i	 | d j o Pn | i | | ƒ | d 8} q` W| S(   s[  
        Computes the search space for Acronym-Candidates:
        - if the Acronym-Candidate is longer than 5 characters the searchspace is definedto be the lenght of the Acronym-Candidate+5;
        - if it is shorter than 5 characters the searchspace is the lenght*2

        Returns the Acronym-Candiate as list of AD_Token instances.
        i   i   R   i   N(   R   R+   R   R   t   searchSpaceR   t   startt   endR#   R   R0   R3   (	   R   R   R   R#   R0   R8   R   R6   R7   (    (    R   R1   Š   s     
 c         C   sT  d } x$ |  i D] } d | | i f } q Wd } d } g  } d }
 d } g  } d }	 x |  i D] } | i | ƒ qa W| i ƒ  x;| t | ƒ d j o¬ |  i | i | d j o | d 7} q… q£|  i | i } | | i ƒ  | d i ƒ  j o" | i |  i | i ƒ | }	 Pq£| d 7} | t |  i ƒ d j o Pq£n\| | i ƒ  |  i | i i ƒ  j o“ |
 | | |  i | f j o | d 7} q£| i |  i | i ƒ | | |  i | f }
 | d 7} | d 7} | d j o | d 7} d } q£n¢| | i ƒ  ok| d 7} |  i | i | d j oN | | |  i | i j o/ | i |  i | i ƒ | d 7} | d 7} qzq£|  i | | |  i | i ƒ o/ | i |  i | i ƒ | d 7} | d 7} q£|  i | | |  i | ƒ o‡ | d 7} t | | ƒ } x, |  i | | | !D] } | i | i ƒ q!W| d 7} | | d 7} | t | ƒ d j o | }	 Pqzq£Pn& | d 7} | t |  i ƒ j o Pn | t | ƒ j o Pq… q… Wt | ƒ t |  i ƒ j  o d Snm |	 } d } x6 | d j o( d | |  i | i f } | d 8} qóWt ƒ  } |  i | _ | | _
 | GH| Sd S(	   s¹  
        Compares the Acronym-Candidate and the Expansion-Candidate backwards.
        Each character from the Acronym-Candidate must appear in one of the tokens in the Expansion-Candidate
        in the same order as in the Acronym-Candidate; the first character of the Acronym-Candidate must match
        a character in the initial position of the first word in the Expansion-Candidate.

        Returns an Acro instance.        
        R   s   %s %si    i   t   stopTagsi   R   N(   R"   R   R   t   iR   t   itct   itst   clistR/   t   foundR   t   expansionEndR   t	   characterR3   R4   R+   R   R0   t   firstt   lowert   isdigitt   checkDigitDictt   countFirstChart   intt   stepR,   t   fullR    t   newAR   (   R   R   R   R0   R"   R;   R<   R@   R=   R?   R/   RH   R   R,   RG   R:   RI   R>   RA   (    (    R   R2   «   s¢     
 
 
  
!
	'




!

 
	

 		c         C   sD   | |  i i ƒ  j o* | i ƒ  |  i | j o d Sq@ d Sn d S(   s^   Check if the digit in the acronym stands for
        a word like "for"(4) or "to"(2).
        i   i    N(   t   acR   R   t   keyst   ecRB   (   R   RJ   RL   (    (    R   RD   2  s
     c         C   s  d } t | ƒ d } | | | d j oP | | GHxª | D]8 } | | d | i d j o | i GH| d 7} q; q; Wng | | d oW | | GHxK | | | !D]8 } | | d | i d j o | i GH| d 7} q¡ q¡ Wn | t	 | | ƒ j o d Sn d Sd S(   s   If the acronym starts with a digit, or some digit is inside
        of the acronym, count the words in the expansion, which start with
        the character preceding or following the digit and if the number of the
        characters is equal to the digit return 1.
        i    i   iÿÿÿÿN(
   t   countR+   RL   t   ecEndRJ   t   acPosR,   R   t   ecPosRF   (   R   RJ   RO   RL   RP   RM   RN   R,   (    (    R   RE   ?  s(     	 	 (	   t   __name__t
   __module__R   R   R5   R1   R2   RD   RE   (    (    (    R   R       s   				k	!	‡	t   __main__i   s   \t\testFile.txtt   rs   %s not found!s   Error while reading %s!i    (   t   __doc__R'   R   R    RQ   t   syst   osR+   t   argvt   filenameR   t   fht   readR"   R   t   newAcroR5   R%   t   listdirt   ft   patht   joint   numt   item(   R\   R^   RY   RV   R'   Ra   R%   Rb   R"   RZ   RW   R    (    (    R   t   ?
   sD   	ÿ R
	 
	 
