#!python2.4.4
# -*- coding: iso-8859-1 -*-

"""
Project: LRApy
Author: Max Jakob (max.jakob@web.de)
Module: matrix
Module Description: This module provides a matrix data structure, entropy
and SVD calculations, and a cosinus similarity function.

Version: 1.1
Last change: 2007-02-04

Copyright 2007 by Max Jakob.
This code is released under the GNU GPL. See the accompanying LICENSE file.

Embedded documentation can be translated with the Python pydoc module.
"""


import numpy, math

def cosinus(vector1, vector2):
	"""Returns the cosinus similarity for two one-dimensional vectors of
	the same length as a float."""
	if len(vector1) != len(vector2):
		raise ValueError, "the two vectors must be of same length"
	enumerator, denominator1, denominator2 = 0, 0, 0
	for idx in xrange(len(vector1)):
		enumerator += vector1[idx] * vector2[idx]
		denominator1 += vector1[idx] ** 2
		denominator2 += vector2[idx] ** 2
	denominator = math.sqrt(denominator1) * math.sqrt(denominator2)
	if not denominator: return 0
	return float(enumerator) / (denominator)

class LRAMatrix:
	"""A LRAMatrix instance represents a matrix inside which the
	relational similarity between word pairs can be computed.
	<matrix> is a two-dimensional numpy-array.
	<rows> is a mapping from word pairs to their row number in <matrix>,
	so that the word pairs and their vectors can be found easily.
	"""
	def __init__(self):
		self.matrix = numpy.array([], dtype="float")
		self.rows = {}  # mapping of word pair tuples to row numbers
		self._longestWordPair = 0  # for string formatting

	def __str__(self):
		"""Returns an unsorted string of the self.matrix, in which the
		rows are labeled with the actual word pairs.
		"""
		rowTemplate = "%"+str(self._longestWordPair)+"s %s\n"
		res = ""
		for pair in self.rows.iterkeys():
			res += rowTemplate%(":".join(pair), self.getRowVector(pair))
		return res[:-1]

	def setWordPairVector(self, wordPairTuple, vector):
		"""Appends <vector> to the matrix and adds a mapping of
		<wordPairTuple> to the right row number to the self.rows
		dictionary. The length of the two word pairs is also checked,
		for string formatting porpuses.
		"""
		self.rows[wordPairTuple] = len(self.matrix)
		if self.matrix.shape == (0,):
			self.matrix = numpy.array([vector], dtype="float")
		else:
			self.matrix = numpy.concatenate([self.matrix, [vector]])
		if len(":".join(wordPairTuple)) > self._longestWordPair:
			self._longestWordPair = len(":".join(wordPairTuple))

	def getRowNumber(self, wordPairTuple):
		"""Returns the row number of the word pair of <wordPairTuple>.
		"""
		try:
			return self.rows[wordPairTuple]
		except KeyError:
			raise KeyError, "word pair '%s' is not assigned to a row"

	def getRowVector(self, wordPairTuple):
		"""Returns the row vector of the word pair of <wordPairTuple>.
		"""
		return self.matrix[self.getRowNumber(wordPairTuple)]

	def applyEntropy(self):
		"""Apply log and entropy transformations to the matrix.
		"""
		if self.matrix.shape == (0,): return
		rowCount, columnCount = self.matrix.shape
		for columnIdx in xrange(columnCount):
			entropy, columnSum = 0, 0
			for rowIdx1 in xrange(rowCount):
				columnSum += self.matrix[rowIdx1,columnIdx]
			for rowIdx2 in xrange(rowCount):
				if columnSum:
					prob = float(self.matrix[rowIdx2,columnIdx]/columnSum)
					if prob <= 0: continue
					entropy += prob * math.log(prob)
			weight = 1 - (- (entropy/math.log(rowCount) ) )
			for rowIdx3 in xrange(rowCount):
				logTrans = math.log(self.matrix[rowIdx3,columnIdx] + 1)
				self.matrix[rowIdx3,columnIdx] = float(weight * logTrans)

	def applySVD(self, k):
		"""Apply Singular Valus Decomposition (SVD) to the matrix, and project
		to <k> columns. Sigma is a one-dimensional array of non-zero diagonal
		elements of sigma in descending order. It is converted in a kxk array
		to be able to multiply the matrices.
		We drop vt (V transposed), because it can be shown that is not important 
		for the cosinus similarity.
		"""
		if self.matrix.shape == (0,): return
		try:
			u, sigma, vt = numpy.linalg.svd(self.matrix)
		except numpy.linalg.linalg.LinAlgError:
			print "  SVD did not converge -> skipping SVD"
			return				
		if k > len(sigma): k = len(sigma)
		truncatedSigma = numpy.zeros((k,k))  # --> diagonal matrix Sigma_k
		for i in xrange(k):
			truncatedSigma[i,i] = sigma[i]
		self.matrix = numpy.dot(u[:,:k], truncatedSigma)  # U_k * Sigma_k
	#	self.matrix = numpy.dot(u[:,:k],
	#		numpy.dot(truncatedSigma, vt[:k]))  # U_k * Sigma_k * Vt_k


def getEmptyVector(vectorLen):
	"""Returns an empty one-dimensional numpy.array with length <vectorLen>.
	"""
	return numpy.empty(vectorLen, dtype="float")

def mergeVectors(vector1, vector2):
	"""Merges two vectors of the same dimension.
	"""
	if len(vector1) != len(vector2):
		raise ValueError, "the two vectors must be of same length"
	mergeResult = getEmptyVector(len(vector1)*2)
	idx = 0
	for j in xrange(len(vector1)):
		mergeResult[idx] = vector1[j]
		mergeResult[idx+1] = vector2[j]
		idx += 2
	return mergeResult
