#!python2.4.4
# -*- coding: iso-8859-1 -*-

"""
Project: LRApy
Author: Max Jakob (max.jakob@web.de)
Module: search
Module Description: This module contains the SearchEngine class for
context searching in the corpus and a function to compute patterns
from a phrase list.

Version: 1.0
Last change: 2007-02-03

Copyright 2007 by Max Jakob.
This code is released under the GNU GPL. See the accompanying LICENSE file.

Embedded documentation can be translated with the Python pydoc module.
"""


import os, re, corpusInterface

def getPatterns(phraseList, numPatterns):
	"""Returns a list of patterns. <phraseList> is a list of lists.
	The lists inside, contain words and form a phrase together.
	A pattern is recursively constructed by replacing any or all or none
	of those words with a wildcard ("*"):
	  Example (phraseList=[["it"], ["works", "well"]]):
	  ["it", "*", "works well", "works *", "* well", "* *"]
	
	Only the <numPatterns> most frequent patterns are kept.
	"""
	def getPatternList(wordList):
		if len(wordList) == 1:
			return [wordList[0], "*"]
		res = []
		for pat in getPatternList(wordList[1:]):
			res.append(wordList[0]+" "+pat)
			res.append("* "+pat)
		return res

	patFreqDict = {}
	for phrase in phraseList:
		# phrase is a list of words
		for pattern in getPatternList(phrase):
			patFreqDict[pattern] = patFreqDict.get(pattern, 0) + 1
	patList = [(frq,ptrn) for ptrn,frq in patFreqDict.iteritems()]
	patList.sort(reverse=True)
	return [ptrn for frq,ptrn in patList[:numPatterns]]


class SearchEngine:
	"""This class provides specific search funtionalities for the LRA
	system. That involves methods to count word pairs in a certain scope,
	to search for phrases in a certain scope (with a minimum and a maximum
	of intervening words), and to count pattern matchings in phrases that
	were found before (caching).

	The class uses the CorpusInterface of the corpus module to perform
	context search operations.
	"""
	def __init__(self, directory):
		self.corpusInterface = corpusInterface.CorpusInterface(directory)
		self.pairPhraseDict = {}

	def _getStemmedIndex(self, word, wordList):
		"""Returns the first position of <word>* in <wordList>.
		That means that	if <word> appears with a suffix, it's found too,
		i.e. suffixes are ignored.
		"""
		for idx, wordOutOfList in enumerate(wordList):
			if wordOutOfList.lower().startswith(word):
				return idx
		return -1

	def countWordPairInScope(self, word1, word2, scope):
		"""Returns the number of occurrences of two words in a certain scope,
		while saving the intervening phrases in the cache.
		"""
		# one intervening word minimum!
		self.cacheInterPhrases(word1, word2, 1, scope-2)
		return len(self.getCachedPhrasesForWordPair(self, word1, word2))
		
		"""old:
		count = 0
		w1, w2 = self.corpusInterface.getLesserWorkOrder(word1, word2)
		for context in self.corpusInterface.getWordContextes(w1, scope):
			if w2 in context:
				count += 1
		return count
		"""

	def cacheInterPhrases(self, word1, word2, minInter, maxInter):
		"""Caches a list of phrases, that intervene between two words.
		The phrase length must be at least <minInter>, and must not be
		bigger than <maxInter>. The occurrence order of the two words is
		irrelevant.	Suffixes are ignored when searching for the words.
		"""
		w1, w2 = self.corpusInterface.getLesserWorkOrder(word1, word2)
		for context in self.corpusInterface.getWordContextes(
			w1, maxInter+2, doStemming=True):
			idx2 = self._getStemmedIndex(w2, context)
			if idx2 == -1:
				continue
			idx1 = self._getStemmedIndex(w1, context)
			if idx1 < idx2:
				interNr = idx2 - idx1 - 1
				if minInter <= interNr and interNr <= maxInter:
					self.pairPhraseDict.setdefault(
						(w1, w2), []).append(context[idx1+1:idx2])
			elif idx1 > idx2:
				interNr = idx1 - idx2 - 1
				if minInter <= interNr and interNr <= maxInter:
					self.pairPhraseDict.setdefault(
						(w2, w1), []).append(context[idx2+1:idx1])

	def countPatternForWordPair(self, pattern, word1, word2):
		"""Returns the number of matches of <pattern> in the the corpus for
		<word1> <pattern> <word2>.
		<pattern> is a string containing words or wildcards for words ("*")
		that are split by whitespace.
		The pairPhraseDict is used to count the patterns, thus the method
		cacheInterPhrases must be called before using this method.
		"""
		count = 0
		patList = pattern.split()
		for phrase in self.pairPhraseDict.get((word1, word2), []):
			# phrase is a list of words
			if len(patList) != len(phrase):
				continue
			match = True
			for idx in xrange(len(patList)):
				if patList[idx] != "*" and patList[idx] != phrase[idx]:
					match = False
					break
			if match:
				count += 1
		return count

	def getCachedPhrasesForWordPair(self, word1, word2):
		"""Returns all intervening phrases of a word pair in given and
		flipped order.
		The pairPhraseDict is used to get the phrases, thus the method
		cacheInterPhrases must be called before using this method.
		"""
		order12 = self.pairPhraseDict.get((word1, word2), [])
		order21 = self.pairPhraseDict.get((word2, word1), [])
		return order12 + order21

	def getCachedPhrases(self):
		"""Returns all intervening phrases of all word pairs previously
		searched for.
		The pairPhraseDict is used to get the phrases, thus the method
		cacheInterPhrases must be called before using this method.
		"""
		allPhrases = []
		for phrasesOfOneWordPair in self.pairPhraseDict.itervalues():
			allPhrases.extend(phrasesOfOneWordPair)
		return allPhrases