#!python2.4.4
# -*- coding: iso-8859-1 -*-
"""
Project: LRApy
Author: Max Jakob (max.jakob@web.de)
Module: synonym
Module Description: This module contains functions and a parser to get
synonyms from an online similarity thesaurus. In LRApy this is used to
search for alternate word pairs.

Version: 1.0
Last change: 2007-01-23

Copyright 2007 by Max Jakob.
This code is released under the GNU GPL. See the accompanying LICENSE file.

Embedded documentation can be translated with the Python pydoc module.
"""

import re

# The module was written for Python 2; the fallbacks below let the very same
# code run on Python 3 without changing anything for Python 2 users.
try:
    import HTMLParser
    import urllib
    _HTMLParserBase = HTMLParser.HTMLParser
    _urlencode = urllib.urlencode
    _urlopen = urllib.urlopen
except ImportError:
    from html.parser import HTMLParser as _HTMLParserBase
    from urllib.parse import urlencode as _urlencode
    from urllib.request import urlopen as _urlopen


class DependencyBasedSimilarityHTMLParser(_HTMLParserBase):
    """(SAX-like) HTML-Parser for synonym pages from
    http://www.cs.ualberta.ca/~lindek/demos/depsim.htm

    Synonyms are inside <font> tags. After these tags, the similarity
    scores occur for that synonym. The rest of the data is ignored.
    The parser is independent of the input word and merges all POS's:
    if a synonym shows up several times, its highest score is kept.
    The method getCompleteSynonymList() returns synonyms decreasing in
    similarity score.

    Usage:
        p = DependencyBasedSimilarityHTMLParser()
        p.feed(htmlCode)
        p.close()
        synList = p.getCompleteSynonymList()
    """

    def __init__(self):
        # synonym -> highest similarity score seen so far
        self.synonymDict = {}
        # synonym that has been read and is still waiting for its score
        self.currentSynonym = ""
        # True while the most recently opened tag was a <font> tag
        self.synonymAheadFlag = False
        _HTMLParserBase.__init__(self)

    def handle_starttag(self, tag, attrs):
        """Remember whether the tag that was just opened is a <font> tag."""
        self.synonymAheadFlag = (tag == "font")

    def handle_data(self, data):
        """Collect synonyms and the scores that follow them.

        While a synonym is pending, the next non-blank text chunk is read
        as its score (commas are removed before parsing). A chunk that is
        not a number discards the pending synonym instead of aborting the
        whole parse with a ValueError.
        """
        if self.currentSynonym:
            text = data.replace(",", "").strip()
            if not text:
                # Nothing but whitespace/commas so far; keep waiting for
                # the chunk that carries the score.
                return
            try:
                score = float(text)
            except ValueError:
                score = None
            if (score is not None and
                    self.synonymDict.get(self.currentSynonym, 0) < score):
                self.synonymDict[self.currentSynonym] = score
            self.currentSynonym = ""
        elif self.synonymAheadFlag:
            if self.isProperWord(data):
                self.currentSynonym = data
            else:
                self.synonymAheadFlag = False

    def isProperWord(self, word):
        """Returns False if <word>
            - is shorter than 4 letters
            - contains non-alphabetical characters
            - contains capitalized letters
            - contains a hyphen
            - contains whitespace (multi-word phrases)
            - consists of just whitespace
        and returns True otherwise.
        """
        # Every rule above boils down to "at least four characters, all of
        # them in a-z"; whitespace-only input already fails that check.
        return len(word) > 3 and re.search("[^a-z]", word) is None

    def getCompleteSynonymList(self):
        """Returns a list of synonyms, decreasing in similarity to the word
        whose HTML-code was fed. Multiple POS's are merged. Synonyms with
        equal scores are listed in reverse alphabetical order.
        """
        ranked = sorted([(score, word)
                         for word, score in self.synonymDict.items()],
                        reverse=True)
        return [word for score, word in ranked]


def _getSynonymsHTMLPage(word):
    """Returns the HTML-code for <word> of Prof. Lin's Dependency-based
    Word Similarity online-demo as a string. It also corrects some formal
    syntax-errors inside the <a> tags. Some information, that is
    unimportant to the following parsing process (links to other
    synonyms), is lost due to this adjustment.

    Errors raised while opening or reading the URL are passed on to the
    caller; the connection is closed in either case.
    """
    simDemoURL = "http://armena.cs.ualberta.ca/cgi-bin/getsim.sh?%s"
    params = _urlencode({'word': word,
                         'simdb': '/data/web/LaTaT/dict/sims'})
    page = _urlopen(simDemoURL % params)
    try:
        htmlCode = page.read()
    finally:
        page.close()
    if not isinstance(htmlCode, str):
        # Python 3 delivers bytes. NOTE(review): the charset of the page is
        # assumed to be iso-8859-1 (it decodes any byte) -- confirm.
        htmlCode = htmlCode.decode("iso-8859-1")
    # Blank out every href value so that the parser does not trip over them.
    return re.sub('href=[^>]+>', 'href="">', htmlCode)


def getSynonymList(word, numSim=10):
    """Returns a list of the top <numSim> synonyms to <word>, using the
    Dependency-based Word Similarity online-demo. Multiple POS's are
    merged.
    """
    simParser = DependencyBasedSimilarityHTMLParser()
    simParser.feed(_getSynonymsHTMLPage(word))
    simParser.close()
    return simParser.getCompleteSynonymList()[:numSim]


def test():
    """Interactive demo: prints the top 10 synonyms for a word that is
    taken from the command line or, if none was given, asked for.
    """
    import sys
    try:
        readLine = raw_input
    except NameError:
        # Python 3 renamed raw_input to input.
        readLine = input
    if len(sys.argv) == 2:
        w = sys.argv[1]
    else:
        w = readLine("Enter a word: ")
    print("Top 10 synonyms for '%s':\n%s" % (w, ", ".join(getSynonymList(w))))


# This is an interactive demo that needs network access, not a unit test;
# keep test collectors (nose, pytest) from running it.
test.__test__ = False


if __name__ == "__main__":
    test()