#!python2.4.4
# -*- encoding: iso-8859-1 -*-

"""
Project: LRApy
Author: Max Jakob (max.jakob@web.de)
Module: corpusInterface
Module Description: This Module provides an interface for word context
search in a prepared corpus. To prepare a corpus see the indexCorpus module.

Version: 1.1
Last change: 2007-02-03

Copyright 2007 by Max Jakob.
This code is released under the GNU GPL. See the accompanying LICENSE file.

Embedded documentation can be translated with the Python pydoc module.
"""


MAX_WORD_LENGTH = 15  # to know how many chars to read from a file


import os, re

try:
	from Looker import Looker
except ImportError:
	raise ImportError, "please run 'python setup.py build' to install Looker"


class CorpusInterface:
	"""This class is an interface to a corpus. It is expected that
	indexCorpus.py has been run at some point in advance, so that two files
	exist in the corpus' root directory: one file to map all complete file
	paths to indices ('files.list'), and one file to see all the occurrences
	of all words in the files with their position ('words.index').

	A corpus directory must be specified when instantiating.
	The method getWordContextes provides context search functionality, with
	or without stemming. Stemming means simply that suffixes are ignored.
	The method getLesserWorkOrder estimates for which of two words it is
	more efficient to search the corpus and get the contextes.
	"""
	def __init__(self, directory, files="files.list", words="words.index"):
		# Lookers do look-ups in the two sorted index files produced by
		# indexCorpus.py.
		self.fileLooker = Looker(os.path.join(directory, files))
		self.wordLooker = Looker(os.path.join(directory, words))

	def getLesserWorkOrder(self, word1, word2):
		"""Returns a tuple, with that word first for which there are fewer
		entries in the word-index-file, and for which therefore look-up-work
		in the corpus is less. More efficiency is expected from this, but
		there was no sufficient testing of this claim.
		If either word is not found at all, the original order is returned.
		"""
		lookRes1 = self.wordLooker.look(word1)
		lookRes2 = self.wordLooker.look(word2)
		if not lookRes1 or not lookRes2:
			return word1, word2
		# Each index line is "word<TAB>occ occ ...": the work for a word is
		# the total occurrence count over all its lines. sum() handles the
		# empty case gracefully (reduce() would raise TypeError).
		workFor1 = sum([len(line.split("\t")[1].split())
			for line in lookRes1.replace("\r", "").split("\n")
			if line])
		workFor2 = sum([len(line.split("\t")[1].split())
			for line in lookRes2.replace("\r", "").split("\n")
			if line])
		if workFor1 < workFor2:
			return word1, word2
		else:
			return word2, word1

	def _getWordSurroundingsString(self, occurrence, wordScope,
		maxWordLen=None):
		"""Returns a raw chunk of corpus text around <occurrence> (a
		"fileIndex:bytePosition" string), sized to hold <wordScope> words
		on either side assuming no word exceeds <maxWordLen> characters
		(defaults to MAX_WORD_LENGTH, looked up at call time).
		"""
		if maxWordLen is None:
			maxWordLen = MAX_WORD_LENGTH
		fileNameIdx, wordPosInFile = occurrence.split(":")
		# file index lines are "0000042<TAB>/path/to/file"
		fNameLine = self.fileLooker.look(
			"%07i"%int(fileNameIdx)).replace("\r","").replace("\n","")
		start = int(wordPosInFile) - wordScope*maxWordLen
		if start < 0: start = 0
		f = open(fNameLine.split("\t")[1], "rb") # Win/Unix file-ending indep.
		try:
			f.seek(start, 0)
			surroundings = f.read(wordScope * maxWordLen * 2 + maxWordLen)
		finally:
			# close the handle even when seek/read fails
			f.close()
		return surroundings

	def _getCleanWordList(self, aString):
		"""Adjusts <aString> for XML-mark-up and all chars other than letters
		and numbers, and returns a list of lowered words.
		"""
		# (?L) makes \W locale-dependent so 8-bit letters of the corpus
		# encoding count as word characters (Python 2 semantics).
		return re.sub(r"(?L)\W+", " ",
			re.sub("<[^>]*>", " ", aString)
			).lower().split()

	def _getContext(self, word, indexLine, scope, doStemming):
		"""Generator that yields one context tuple per occurrence listed
		in <indexLine>. Occurrences whose word cannot be relocated in the
		cleaned surroundings are skipped silently.
		"""
		wordInIndex, occurrences = indexLine.split("\t")
		if not doStemming and wordInIndex != word:
			# no stemming, wrong word (different suffix): this index line
			# must contribute no contexts at all
			return
		for occ in occurrences.split():
			wordList = self._getCleanWordList(
				self._getWordSurroundingsString(occ, scope))
			try:
				wordPos = wordList.index(wordInIndex)
			except ValueError:
				# 1. rare case that the word is not in the context
				#    (for unknown reason)
				# 2. file contains a char of weird encoding
				#    (UnicodeError is a subclass of ValueError)
				continue
			start = wordPos - scope + 1
			if start < 0: start = 0
			yield tuple(wordList[start:wordPos+scope])

	def getWordContextes(self, word, scope, doStemming=False):
		"""Returns a list of word tuples of up to 2*<scope>-1 words. Every
		word tuple contains <word> with up to <scope>-1 words left and
		right of one of its occurrences in the corpus.

		  Example (word="are",scope=2):
		  [
		   ("simply","are","very"),
		   ("you","are","nice"),
		   ("are","these"),
		   ("they","are")
		  ]
		  In the last two elements, "are" is the first and the last word
		  of the file respectively.

		If <doStemming> is True, suffixes are ignored when searching
		for <word>. An unknown word yields an empty list.
		"""
		lookRes = self.wordLooker.look(word)
		if not lookRes:
			# word not in the index (guards against an empty/None result,
			# consistent with getLesserWorkOrder)
			return []
		contextes = []
		for line in lookRes.replace("\r","").split("\n"):
			if not line:
				continue
			for context in self._getContext(word, line, scope, doStemming):
				contextes.append(context)
		return contextes
