#!/usr/bin/env python
"""An integrated semantic tagger and TNT2XML-Converter.
semTagger reads the output of the part-of-speech tagger TNT by Thorsten Brants
<http://www.coli.uni-sb.de/~thorsten/tnt/> from <STDIN> , adds some coarse
semantic information to some of the tokens and writes the output in an XML
format to <STDOUT>.

INSTALLATION

    After unpacking the files into a directory of your choice, please make sure
    that these external components are installed:

    Python <http://www.python.org/>

        semTagger has been implemented and tested using Python 2.2.1.
        If you run into problems under Python 1.5, this could be the reason.

    TNT <http://www.coli.uni-sb.de/~thorsten/tnt/>

        The tagger is free of charge for research purposes. There is a shell
        script in the semTagger folder which might be useful (tntg.sh).

USAGE

    [python] [./]paramark.py < text.txt | tntg.sh | [python] [./]semTagger.py
    or
    [bash] [./]sem.sh text.txt

INPUT

    TNT's output will be read from <STDIN>.
    Example:
    
        Mein                    PPOSAT
        Feuer                   NN
        brennt                  VVFIN
        lichterloh              ADJD
        .                       $.

    The input may also contain linebreak markers: <LINEBREAK/>.
    Use paramark.py as the first pipe stage to mark the text.

    Important: the input must be encoded in ISO 8859-1. Use a good text editor
    to save your input data in the right encoding (Western European (ISO)).
    http://www.editplus.com is capable of saving in different encodings.

OUTPUT

    The output will be written to <STDOUT> with one token per line.
    Each token has two attributes: pos and sem.
    Example:
        <?xml version="1.0" standalone="yes"?>
        <TEXT>
        <token pos="APPRART" sem="">Am</token>
        <token pos="NN" sem="wtag">Samstag</token>
        <token pos="COM" sem="">,</token>
        <token pos="ART" sem="">den</token>
        <token pos="CARD_PER" sem="datum">31.8.</token>
        <token pos="APPR" sem="">um</token>
        <token pos="CARD_NN" sem="zeit">14.30 Uhr</token>
        <token pos="VVFIN" sem="">findet</token>
        <token pos="PIS" sem="">nichts</token>
        <token pos="PTKVZ" sem="">statt</token>
        <token pos="PER" sem="">.</token>
        <LINEBREAK/>
        <token pos="ADV" sem="">Leider</token>
        <token pos="PER" sem="">.</token>
        <LINEBREAK/>

    ATTRIBUTES:

        pos: please refer to the STTS tag set table
             <http://www.ims.uni-stuttgart.de/projekte/corplex/TagSets/stts-table.html>
        sem:
            "ort": locations (ex: "Feuerwache", "Hinterzimmer", "Brunnengasse")
            "stadt": german cities (ex: "Heidelberg", "Hamburg", "Celle")
            "veranstalter": organizers (ex: "Geschichtsverein", "B-M-Gruppe")
            "datum": dates (ex: "3.11.", "2. November")
            "jahr:  years (ex: "2002", "1978")
            "zeit": time (ex: "12 Uhr", "17:45")
            "wtag": week days (ex: "Montag")

AUXILIARY SCRIPTS

    paramark.py        Inserts placeholders for paragraph and line breaks

FILES

    germanCities.txt:    a list of German cities
    months.txt:          a list of month names
    locIndicators.txt:   a list of indicators found in location names
    orgIndicators.txt:   a list of indicators found in organizer names
    wDays.txt:           a list of week days
    iso8859Entities.txt: a list of ASCII codes and their XML entities
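
    All list files are plain text with one entry per line. Week day and
    month lookups are lower-cased in the code, so wDays.txt and months.txt
    are expected to contain lowercase entries (e.g. "montag"), while
    germanCities.txt is matched case-sensitively. iso8859Entities.txt holds
    whitespace-separated pairs of an ASCII code and its XML entity.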

"""

__author__ = 'Oezguer Demir <oeze@gmx.de>'
__date__ = 'Fri Dec 13, 2002   08:19 PM'
__version__ = '1.8'
__project__ = 'terminparser' 

import string
import sys
import re

WDAYS = {} # the week days (string:0)
MONTHS = {} # the months (string:0)
GERMANCITIES = {} # the 1457 biggest German cities (string:0)
LOCINDICATORS = [] # location indicator strings (list of strings)
ORGINDICATORS = [] # organizer indicator strings (list of strings)
ISO8859XML = {} # all ISO 8859 XML entities (ascii_value_integer:xml_ent_string)

def init_global_vars():
    '''Initialize all global variables.'''
    global WDAYS, MONTHS, LOCINDICATORS, ORGINDICATORS, GERMANCITIES, ISO8859XML
    #  WDAYS
    wdaysFile = open('wDays.txt', 'r')
    wdays = wdaysFile.readlines ()
    wdaysFile.close()
    for wday in wdays:
        WDAYS[string.strip(wday)] = 0
    # MONTHS
    monthsFile = open('months.txt', 'r')
    months = monthsFile.readlines ()
    monthsFile.close()
    for month in months:
         MONTHS[string.strip(month)] = 0
    # LOCINDICATORS
    locIndicatorsFile = open('locIndicators.txt', 'r')
    locIndicators = locIndicatorsFile.readlines ()
    locIndicatorsFile.close()
    for locIndicator in locIndicators:
        LOCINDICATORS.append(string.strip(locIndicator))
    # ORGINDICATORS
    orgIndicatorsFile = open('orgIndicators.txt', 'r')
    orgIndicators = orgIndicatorsFile.readlines ()
    orgIndicatorsFile.close()
    for orgIndicator in orgIndicators:
        ORGINDICATORS.append(string.strip(orgIndicator))
    # GERMANCITIES
    citiesFile = open('germanCities.txt', 'r')
    germanCities = citiesFile.readlines ()
    citiesFile.close()
    for city in germanCities:
        GERMANCITIES[string.strip(city)] = 0
    # ISO8859XML
    # (iso8859Entities.txt holds whitespace-separated pairs: an ASCII code
    # followed by its XML entity)
    iso8859File = open('iso8859Entities.txt', 'r')
    iso8859List = iso8859File.read().split()
    iso8859File.close()
    i = 0
    for key in iso8859List:
        if i%2==0:
            ISO8859XML[string.atoi(key)] = iso8859List[i+1]
        i+=1

class Word:
    '''A container class holding a token's surface form, POS tag, semantic
    tag and index.'''
    def __init__(self, inWord, inPosTag, inSemTag, inIndex):
        self.surface = inWord
        self.posTag = inPosTag
        self.semTag = inSemTag
        self.index = inIndex
    
def read_input():
    '''Read and clean the input, return a single string'''
    return replace_dollar(
        add_pseudo_postags(
        clean_xml_string(
        remove_comments(
        sys.stdin.read()))))

def clean_xml_string(inText):
    '''Replace all special characters with &xmlEntities;'''
    global ISO8859XML
    outText = []
    for char in inText:
        if ISO8859XML.has_key(ord(char)):
            char = ISO8859XML[ord(char)]
        outText.append(char)
    outText = string.join(outText, "")
    return outText
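
# Example (assuming iso8859Entities.txt maps the codes for '<' (60) and
# '>' (62) to "&lt;" and "&gt;", which the later LINEBREAK handling relies on):
#
#     clean_xml_string('<LINEBREAK/>')  ->  '&lt;LINEBREAK/&gt;'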

def add_pseudo_postags(inText):
    '''Help TNT by tagging each <LINEBREAK/> as pos:NONE'''
    outText = string.replace(inText, '&lt;LINEBREAK/&gt;',
                            '&lt;LINEBREAK/&gt;\t\tNONE')
    return outText
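
# Example: the escaped line-break marker gets the pseudo POS tag "NONE"
# appended, so the word/tag alternation expected in main() is preserved:
#
#     add_pseudo_postags('&lt;LINEBREAK/&gt;')
#         ->  '&lt;LINEBREAK/&gt;\t\tNONE'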

def replace_dollar(inText):
    '''Replace "$," "$." and "$(" with "COM" "PER" and "INTP"'''
    inText = string.replace(inText, '$,', 'COM')
    inText = string.replace(inText, '$.', 'PER')
    inText = string.replace(inText, '$(', 'INTP')
    return inText
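
# Example:
#
#     replace_dollar('Mein\tPPOSAT\n.\t$.')  ->  'Mein\tPPOSAT\n.\tPER'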

def remove_comments(inText):
    '''Remove comment lines (starting with %%)'''
    inText = re.sub(r'%%.*\n', '', inText)
    return inText
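
# Example:
#
#     remove_comments('%% a comment line\nMein\tPPOSAT\n')
#         ->  'Mein\tPPOSAT\n'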

def get_sem(word, allWords):
    '''Return the semantic tag if a heuristic applies. Return "" if not.'''
    # Don't do redundant work
    if word.semTag != '': return word.semTag
    # init vars
    global WDAYS, MONTHS, GERMANCITIES, LOCINDICATORS, ORGINDICATORS
    WDAY = "wtag"
    DATE = "datum"
    YEAR = "jahr"
    TIME = "zeit"
    LOCATION = "ort"
    CITY = "stadt"
    ORGANIZER = "veranstalter"
    # go go
    #######
    # sem = "wtag"
    ##############
    if WDAYS.has_key(word.surface.lower()): return WDAY
    # sem = "datum" / sem = "zeit"
    ##############################
    if word.posTag == "CARD":
        dateCand = word.surface.split('.')
        # 12.01.98 / 12.01.1998 / 12.01.2002
        if len(dateCand) == 3: return DATE
        # 12.1
        if len(dateCand) == 2:
            if len(dateCand[1]) == 1:
                # 31.8. (w/ . as the next token)
                if word.index+1 < len(allWords) and \
                   allWords[word.index+1].surface == '.':
                    allWords[word.index+1].semTag = DATE
                return DATE
            try: #(try to perform atoi(). do nothing if it fails)
                # 12.1, 12.01 -> a date; 12.14 -> not a date
                if string.atoi(dateCand[1]) <= 12 and dateCand[1]!='00':
                    # 31.8. (w/ . as the next token)
                    if allWords[word.index+1].surface == '.':
                        allWords[word.index+1].semTag = DATE
                    return DATE
                if string.atoi(dateCand[0]) < 24 \
                   and string.atoi(dateCand[1]) < 60:
                    if allWords[word.index+1].surface.lower() == 'h' or \
                       allWords[word.index+1].surface.lower() == 'uhr':
                        allWords[word.index+1].semTag = TIME
                    return TIME
            except:
                pass
        if len(dateCand) == 1:
            # two-digit years [85, 98, 99]; 00-15 is left untagged because it
            # is ambiguous with hours ("12 Uhr")
            try: yearCand = string.atoi(dateCand[0])
            except: yearCand = -1
            if len(dateCand[0]) == 2 and yearCand >= 85: return YEAR
            # four-digit years
            if yearCand >= 1985 and yearCand <= 2015: return YEAR
        timeCand = word.surface.split(':')
        try:
            # 15:30, 16:24, ...
            if string.atoi(timeCand[1]) <= 12 and timeCand[1]!='00':
                if allWords[word.index+1].surface.lower() == 'h' or \
                   allWords[word.index+1].surface.lower() == 'uhr':
                    allWords[word.index+1].semTag = TIME
                return TIME
            if string.atoi(timeCand[0]) < 24 \
               and string.atoi(timeCand[1]) < 60:
                if allWords[word.index+1].surface.lower() == 'h' or \
                   allWords[word.index+1].surface.lower() == 'uhr':
                    allWords[word.index+1].semTag = TIME
                return TIME
        except:
            pass
        # 16h, 4h, 14H
        if (len(word.surface) == 3 or len(word.surface) == 2) \
           and word.surface[-1].lower() == 'h':
            return TIME
        # 16 Uhr, 3 uhr
        if word.index != len(allWords)-1:
            if allWords[word.index+1].surface.lower() == 'h' or \
               allWords[word.index+1].surface.lower() == 'uhr':
                allWords[word.index+1].semTag = TIME
                return TIME
        # 12 Uhr -->15<--
        if word.index > 0 and \
           (allWords[word.index-1].surface.lower() == 'h' or
            allWords[word.index-1].surface.lower() == 'uhr'):
            allWords[word.index-1].semTag = TIME
            return TIME
    # end (if posTag == CARD)
    # 12. Dezember
    if word.posTag[0] == 'N':
        if MONTHS.has_key(word.surface.lower()):
            if word.index > 0 and allWords[word.index-1].posTag == 'ADJA':
                allWords[word.index-1].semTag = DATE
                return DATE
    # sem = "ort"/"city"
    ####################
    if word.posTag == 'NE':
        # check if it is a german city -> "city"
        if GERMANCITIES.has_key(word.surface): return CITY
    if word.posTag == 'NN' or word.posTag == 'NE':
        # check if one of the indicators like "-saal", "-hof", ... applies
        # -> "ort"
        lenWord = len(word.surface)
        for locIndicator in LOCINDICATORS:
            if lenWord >= len(locIndicator):
                if word.surface.lower().rfind(locIndicator) \
                   == lenWord-len(locIndicator):
                    return LOCATION
    # sem = "veranstalter"
    #####################
    if word.posTag == 'NN' or word.posTag == 'NE':
        # check if one of the indicators like "-verein", "-gruppe", ... applies
        # -> "veranstalter"
        lenWord = len(word.surface)
        for orgIndicator in ORGINDICATORS:
            if lenWord >= len(orgIndicator):
                if word.surface.lower().rfind(orgIndicator) \
                   == lenWord-len(orgIndicator):
                    return ORGANIZER
    # nothing applies -> return the empty string ""
    return ""

def merge_words(inWord1, inWord2):
    '''Combine two word objects to one.'''
    if inWord1.surface == '.' or inWord2.surface == '.':
        surface = inWord1.surface + inWord2.surface
    else:
        surface = inWord1.surface + ' ' + inWord2.surface
    posTag = inWord1.posTag + '_' + inWord2.posTag
    semTag = inWord1.semTag
    index = inWord1.index
    outWord = Word(surface, posTag, semTag, index)
    return outWord
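
# Example:
#
#     merge_words(Word('14.30', 'CARD', 'zeit', 6), Word('Uhr', 'NN', 'zeit', 7))
#
# yields a Word with surface '14.30 Uhr', posTag 'CARD_NN', semTag 'zeit' and
# index 6 (the CARD_NN token in the module docstring's OUTPUT example).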

def shrink_stack(inWordStack):
    '''Look for tokens in a row with the same semTag and merge them.'''
    _index = 0
    outWordStack = []
    oldWord = Word(-1, -1, -1, -1)
    for word in inWordStack:
        if word.semTag != "":
            if word.semTag == oldWord.semTag:
                addWord = merge_words(oldWord, word)
                outWordStack.pop()
            else:
                addWord = word
        else:
            addWord = word
        outWordStack.append(addWord)
        oldWord = addWord
    # correct the indices (remember we've removed some items)
    for word in outWordStack:
        word.index = _index
        _index += 1
    return outWordStack
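
# Example: "31.8" (CARD, sem "datum") followed by "." (PER, sem "datum") is
# merged into the single token <token pos="CARD_PER" sem="datum">31.8.</token>
# of the OUTPUT example; the indices of all following words are renumbered.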
    
def main():
    # Go Go
    init_global_vars()
    # Get the input tokens
    tokens = string.split(read_input())
    i = 0; wordStack = []; surface = ''; _word = Word('', '', '', 0)
    # Build a list of Word objects
    for token in tokens:
        #if token is word (not a pos tag)
        if i%2 == 0:
            _word.surface = token
            _word.index = i/2
        # token is a pos tag
        else:
            _word.posTag = token
            wordStack.append(Word(_word.surface,_word.posTag, '',_word.index))
        i+=1
    # Assign a semantic attribute where possible
    for _word in wordStack:
        _word.semTag = get_sem(_word, wordStack)
    # Shrink the stack (merge multiple tokens in a row with the same sem-type)
    wordStack = shrink_stack(wordStack)
    # Write the XML file
    print '<?xml version="1.0" standalone="yes"?>'
    print '<TEXT>'
    for _word in wordStack:
        if (_word.surface == '&lt;LINEBREAK/&gt;'):
            print '<LINEBREAK/>'
        else:
            print '<token pos="' + _word.posTag + '" sem="'\
                  + _word.semTag + '">' + _word.surface + '</token>'
    print '</TEXT>'
    # GAME OVER

#
# MAIN
#
if __name__ == '__main__':
    main()
