#!python2.4.4
# -*- encoding: iso-8859-1 -*-

"""
Project: LRApy
Author: Max Jakob (max.jakob@web.de)
Module: corpusInterface
Module Description: This Module provides an interface for word context
search in a prepared corpus. To prepare a corpus see the indexCorpus module.

Version: 1.1
Last change: 2007-02-03

Copyright 2007 by Max Jakob.
This code is released under the GNU GPL. See the accompanying LICENSE file.

Embedded documentation can be translated with the Python pydoc module.
"""


MAX_WORD_LENGTH = 15  # to know how many chars to read from a file


import os, re

try:
	from Looker import Looker
except ImportError:
	raise ImportError, "please run 'python setup.py build' to install Looker"


class CorpusInterface:
	"""This class is an interface to a corpus. It is expected that
	indexCorpus.py has been run at some point in advance, so that two files
	exist in the corpus' root directory: one file to map all complete file
	paths to indices ('files.list'), and one file to see all the occurrences
	of all words in the files with their position ('words.index').

	A corpus directory must be specified when instantiating.
	The method getWordContextes provides context search functionality, with
	or without stemming. Stemming means simply that suffixes are ignored.
	The method getLesserWorkOrder estimates for which of two words it is
	more efficient to search the corpus and get the contextes.
	"""
	def __init__(self, directory, files="files.list", words="words.index"):
		# Lookers do look-ups in the two sorted index files produced by
		# indexCorpus.py.
		self.fileLooker = Looker(os.path.join(directory, files))
		self.wordLooker = Looker(os.path.join(directory, words))

	def getLesserWorkOrder(self, word1, word2):
		"""Returns a tuple, with that word first for which there are fewer
		entries in the word-index-file, and for which therefore look-up-work
		in the corpus is less. More efficiency is expected from this, but
		there was no sufficient testing of this claim.
		If either word is not found at all, the original order is returned.
		"""
		lookRes1 = self.wordLooker.look(word1)
		lookRes2 = self.wordLooker.look(word2)
		if not lookRes1 or not lookRes2:
			return word1, word2
		# Each index line is "word<TAB>occ occ ...": the work for a word is
		# the total occurrence count over all its lines. sum() handles the
		# empty case gracefully (reduce() would raise TypeError).
		workFor1 = sum([len(line.split("\t")[1].split())
			for line in lookRes1.replace("\r", "").split("\n")
			if line])
		workFor2 = sum([len(line.split("\t")[1].split())
			for line in lookRes2.replace("\r", "").split("\n")
			if line])
		if workFor1 < workFor2:
			return word1, word2
		else:
			return word2, word1

	def _getWordSurroundingsString(self, occurrence, wordScope,
		maxWordLen=None):
		"""Returns a raw chunk of corpus text around <occurrence> (a
		"fileIndex:bytePosition" string), sized to hold <wordScope> words
		on either side assuming no word exceeds <maxWordLen> characters
		(defaults to MAX_WORD_LENGTH, looked up at call time).
		"""
		if maxWordLen is None:
			maxWordLen = MAX_WORD_LENGTH
		fileNameIdx, wordPosInFile = occurrence.split(":")
		# file index lines are "0000042<TAB>/path/to/file"
		fNameLine = self.fileLooker.look(
			"%07i"%int(fileNameIdx)).replace("\r","").replace("\n","")
		start = int(wordPosInFile) - wordScope*maxWordLen
		if start < 0: start = 0
		f = open(fNameLine.split("\t")[1], "rb") # Win/Unix file-ending indep.
		try:
			f.seek(start, 0)
			surroundings = f.read(wordScope * maxWordLen * 2 + maxWordLen)
		finally:
			# close the handle even when seek/read fails
			f.close()
		return surroundings

	def _getCleanWordList(self, aString):
		"""Adjusts <aString> for XML-mark-up and all chars other than letters
		and numbers, and returns a list of lowered words.
		"""
		# (?L) makes \W locale-dependent so 8-bit letters of the corpus
		# encoding count as word characters (Python 2 semantics).
		return re.sub(r"(?L)\W+", " ",
			re.sub("<[^>]*>", " ", aString)
			).lower().split()

	def _getContext(self, word, indexLine, scope, doStemming):
		"""Generator that yields one context tuple per occurrence listed
		in <indexLine>. Occurrences whose word cannot be relocated in the
		cleaned surroundings are skipped silently.
		"""
		wordInIndex, occurrences = indexLine.split("\t")
		if not doStemming and wordInIndex != word:
			# no stemming, wrong word (different suffix): this index line
			# must contribute no contexts at all
			return
		for occ in occurrences.split():
			wordList = self._getCleanWordList(
				self._getWordSurroundingsString(occ, scope))
			try:
				wordPos = wordList.index(wordInIndex)
			except ValueError:
				# 1. rare case that the word is not in the context
				#    (for unknown reason)
				# 2. file contains a char of weird encoding
				#    (UnicodeError is a subclass of ValueError)
				continue
			start = wordPos - scope + 1
			if start < 0: start = 0
			yield tuple(wordList[start:wordPos+scope])

	def getWordContextes(self, word, scope, doStemming=False):
		"""Returns a list of word tuples of up to 2*<scope>-1 words. Every
		word tuple contains <word> with up to <scope>-1 words left and
		right of one of its occurrences in the corpus.

		  Example (word="are",scope=2):
		  [
		   ("simply","are","very"),
		   ("you","are","nice"),
		   ("are","these"),
		   ("they","are")
		  ]
		  In the last two elements, "are" is the first and the last word
		  of the file respectively.

		If <doStemming> is True, suffixes are ignored when searching
		for <word>. An unknown word yields an empty list.
		"""
		lookRes = self.wordLooker.look(word)
		if not lookRes:
			# word not in the index (guards against an empty/None result,
			# consistent with getLesserWorkOrder)
			return []
		contextes = []
		for line in lookRes.replace("\r","").split("\n"):
			if not line:
				continue
			for context in self._getContext(word, line, scope, doStemming):
				contextes.append(context)
		return contextes
