#!python2.4.4
# -*- coding: iso-8859-1 -*-

"""
Project: LRApy
Author: Max Jakob (max.jakob@web.de)
Module: synonym
Module Description: This module contains functions and a parser to get
synonyms from an online similarity thesaurus. In LRApy this is used to
search for alternate word pairs.

Version: 1.0
Last change: 2007-01-23

Copyright 2007 by Max Jakob.
This code is released under the GNU GPL. See the accompanying LICENSE file.

Embedded documentation can be translated with the Python pydoc module.
"""


import re, urllib, HTMLParser

class DependencyBasedSimilarityHTMLParser(HTMLParser.HTMLParser):
	"""(SAX-like) HTML parser for synonym pages from
	http://www.cs.ualberta.ca/~lindek/demos/depsim.htm

	Synonyms are read from the text that follows a <font> start tag. The
	first text chunk after an accepted synonym is read as its similarity
	score. The rest of the data is ignored.
	The parser is independent of the input word and merges all POS's: when
	a synonym occurs more than once, only its highest score is kept.
	The method getCompleteSynonymList() returns the synonyms in order of
	decreasing similarity score.

	Usage:
		p = DependencyBasedSimilarityHTMLParser()
		p.feed(htmlCode)
		p.close()
		synList = p.getCompleteSynonymList()
	"""
	def __init__(self):
		# Maps each accepted synonym to the highest score seen for it.
		self.synonymDict = {}
		# Synonym that is waiting for its score; "" when none is pending.
		self.currentSynonym = ""
		# True after a <font> start tag, until another start tag or an
		# unacceptable word clears it.
		self.synonymAheadFlag = False
		HTMLParser.HTMLParser.__init__(self)

	def handle_starttag(self, tag, attrs):
		"""Remembers whether the most recent start tag was <font>."""
		self.synonymAheadFlag = (tag == "font")

	def handle_data(self, data):
		"""Consumes one chunk of text between tags.

		If a synonym is pending, <data> is taken as its score (commas are
		removed before the conversion). Text that cannot be converted to a
		number drops the pending synonym instead of aborting the parse
		with a ValueError. Otherwise, if a <font> tag was just opened,
		<data> becomes the pending synonym when isProperWord() accepts it.
		"""
		if self.currentSynonym:
			synonym = self.currentSynonym
			# Whatever happens below, this chunk uses up the pending word.
			self.currentSynonym = ""
			try:
				score = float(data.replace(",", ""))
			except ValueError:
				# Not a score (e.g. whitespace or stray text): skip the
				# entry rather than crash on one malformed item.
				return
			# Only scores above the stored one (default 0) are recorded,
			# so the best score over all POS sections wins.
			if self.synonymDict.get(synonym, 0) < score:
				self.synonymDict[synonym] = score
		elif self.synonymAheadFlag:
			if self.isProperWord(data):
				self.currentSynonym = data
			else:
				self.synonymAheadFlag = False

	def isProperWord(self, word):
		"""Returns False if <word>
		 - is shorter than 4 letters
		 - contains non-alphabetical characters
		 - contains capitalized letters
		 - contains a hyphen
		 - contains whitespace (multi-word phrases)
		 - consists of just whitespace
		and returns True otherwise.
		"""
		# Everything listed above except the length rule is a character
		# outside a-z, so a single regular expression covers those cases.
		return len(word) > 3 and re.search("[^a-z]", word) is None

	def getCompleteSynonymList(self):
		"""Returns a list of synonyms, decreasing in similarity to the
		word whose HTML code was fed. Multiple POS's are merged. Synonyms
		with equal scores appear in reverse alphabetical order.
		"""
		ranked = sorted(
			[(score, word) for word, score in self.synonymDict.items()],
			reverse=True)
		return [word for score, word in ranked]


def _getSynonymsHTMLPage(word):
	"""Returns the HTML-code for <word> of Prof. Lin's Dependency-based
	Word Similarity online-demo as a string.

	Every href attribute, up to the closing '>' of its tag, is replaced by
	an empty href="". According to the original author this corrects
	formal syntax errors inside the <a> tags. The link targets (links to
	other synonyms) are lost, but they are not needed for parsing.

	Errors raised by urllib while opening or reading the page are passed
	on to the caller; the connection is closed in either case.
	"""
	simDemoURL = "http://armena.cs.ualberta.ca/cgi-bin/getsim.sh?%s"
	params = urllib.urlencode({'word': word,
		'simdb': '/data/web/LaTaT/dict/sims'})
	page = urllib.urlopen(simDemoURL % params)
	try:
		htmlCode = page.read()
	finally:
		# Release the connection even when reading fails half-way.
		page.close()
	return re.sub('href=[^>]+>', 'href="">', htmlCode)

def getSynonymList(word, numSim=10):
	"""Looks up <word> in the Dependency-based Word Similarity online-demo
	and returns its <numSim> most similar synonyms, best match first.
	Multiple POS's are merged.
	"""
	htmlCode = _getSynonymsHTMLPage(word)
	parser = DependencyBasedSimilarityHTMLParser()
	parser.feed(htmlCode)
	parser.close()
	rankedSynonyms = parser.getCompleteSynonymList()
	return rankedSynonyms[:numSim]


def test():
	"""Small command line demo: prints the synonyms found for one word.

	The word is taken from the command line when exactly one argument is
	given; otherwise the user is asked to type it in.
	"""
	import sys
	arguments = sys.argv[1:]
	if len(arguments) == 1:
		w = arguments[0]
	else:
		w = raw_input("Enter a word: ")
	synonyms = ", ".join(getSynonymList(w))
	print("Top 10 synonyms for '%s':\n%s" % (w, synonyms))

# Run the command line demo only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
	test()
