#!python2.4.4
# -*- encoding: iso-8859-1 -*-

"""
Project: LRApy
Author: Max Jakob (max.jakob@web.de)
Module: indexCorpus
Module Description: This Module contains functions to prepare a corpus of
uncompressed XML-Files of the Reuters Corpus or text files for retrieval.

Version: 0.9
Last change: 2007-02-01

Copyright 2007 by Max Jakob.
This code is released under the GNU GPL. See the accompanying LICENSE file.

Embedded documentation can be translated with the Python pydoc module.
"""


STOPWORDS_LIST = "stopwords_en.txt"


import sys, os, shutil, re, fnmatch, xml.sax

class ReutersContentHandler(xml.sax.handler.ContentHandler):
	"""SAX Content Handler for XML-files of the Reuters Corpus.
	usage:
	  import xml.sax
	  parser = xml.sax.make_parser()
	  thisHandler = ReutersContentHandler()
	  parser.setContentHandler(thisHandler)
	  try:
	    parser.parse(someXmlFileName)
	  except SAXException:
	    return thisHandler.getArticleText()

	A SAXException is raised when a </text> tag is encountered, because
	the article data is finished at that point, and the parser can stop.
	Headlines and text body are returned in a string when calling the
	getArticleText method.
	"""
	def __init__(self):
		self.capture = False  # True while inside <headline> or <text>
		# character chunks are collected in a list and joined on demand;
		# repeated string += is quadratic for long articles
		self.articleChunks = []

	def startElement(self, name, attrs):
		# start capturing at the headline as well as at the article body
		if name == "headline" or name == "text":
			self.capture = True

	def endElement(self, name):
		if name == "headline":
			self.capture = False
		elif name == "text":
			# article data is complete -- stop the parser via an exception
			# (raise E(msg) instead of py2-only "raise E, msg")
			raise xml.sax.SAXException("end of article data")

	def characters(self, content):
		# SAX may deliver the text of one element in several chunks
		if self.capture:
			self.articleChunks.append(content)

	def getArticleText(self):
		"""Returns all captured headline and body text as one string."""
		return "".join(self.articleChunks)

def getReutersArticleString(xmlFileName):
	"""Returns the headline and the article text of <xmlFileName>, which
	is a file of the Reuters Corpus, as a string. A SAX-Parser is used.
	The parser raises a SAXException when the closing text tag is
	encountered, because it can stop parsing at this point.

	BUG FIX: the captured text is now also returned when the parse
	finishes WITHOUT raising (e.g. no closing </text> tag); previously
	the function fell off the end and returned None, which crashed the
	caller (_getWordsPositionDictFromFile passes the result to re.sub).
	"""
	parser = xml.sax.make_parser()
	reutersHandler = ReutersContentHandler()
	parser.setContentHandler(reutersHandler)
	try:
		parser.parse(xmlFileName)
	except xml.sax.SAXException:
		pass  # expected: the handler aborts the parse at </text>
	return reutersHandler.getArticleText()

# Load the stopword list once at import time.  Stopwords are stored as
# dict keys (value 1) for fast membership tests via stopwords.get().
stopwords = {}
try:
	stopwordfile = open(STOPWORDS_LIST)
except IOError:
	# no stopword file available: index without stopword filtering
	pass
else:
	for line in stopwordfile:
		# BUG FIX: strip the trailing newline -- the raw line "word\n"
		# could never match the newline-free words that are looked up
		# with stopwords.get() during indexing
		word = line.strip()
		if word:
			stopwords[word] = 1
	stopwordfile.close()

def _getWordsPositionDictFromFile(fileName):
	"""Returns a dictionary with the words in <fileName> as keys, and their
	position in the file (number of bytes from the start) as values.
	Words that only consist of numbers are ignored

	One-letter words, words found in the global 'stopwords' dict and
	words containing digits or underscores are skipped.  For '.xml'
	files only the Reuters article text is tokenized, but the positions
	are byte offsets into the RAW file content, so they can be used to
	seek into the original file later.
	"""
	# NOTE(review): the file handle is never closed explicitly; CPython's
	# refcounting closes it on collection, other interpreters may not.
	fileStr = open(fileName, "rb").read()  # Win/Unix file-ending independent
	if fileName.endswith(".xml"):
		# Reuters corpus file: index only headline and article body.
		# NOTE(review): getReutersArticleString can return None when the
		# parser finishes without raising -- re.sub below would then
		# fail; presumably all corpus files contain a </text> tag.
		data = getReutersArticleString(fileName)
	else:
		data = fileStr
	words = {}
	position = 0  # position in file, from which forward will be indexed
	# (?L) makes \W locale-dependent; every run of non-word chars
	# becomes a single space before splitting into words
	for word in re.sub(r"(?L)\W+"," ",data).split():
		try:
			# find the byte offset of this occurrence in the raw file,
			# searching only forward from the previous occurrence
			wordPos = fileStr.index(word, position)
		except ValueError:
			# 1. rare case that the word is not in the context
			#    (for unknown reason)
			# 2. file contains a char of weird encoding
			#    (UnicodeError is a subclass of ValueError)
			continue
		if len(word) > 1 and not stopwords.get(word):
			if not re.search("[0-9_]", word):
				words.setdefault(word.lower(), []).append(wordPos)
		# advance past this occurrence even if the word was filtered out
		position = wordPos + len(word)
	return words


## ----------------------------------------------- functions using harddisk


def _getFilesForOneIndex(indexPath, portionNr):
	"""Returns how many corpus files go into one portion index so that
	<portionNr> portion files cover the whole corpus.

	<portionNr> must be a power of two (the pairwise merge in
	mergeToOneWordIndexFile halves the file count each pass), otherwise
	a ValueError is raised.  The highest file index is read from the
	last line of the (already existing) 'files.list'.
	"""
	# integer power-of-two test; replaces the fragile floating point
	# check math.log(portionNr, 2) % 1 of the original version
	if portionNr < 1 or portionNr & (portionNr - 1):
		raise ValueError("number of portions must be in 2**int(n)")
	lastIdx = 0
	for line in open(os.path.join(indexPath, "files.list")):
		lastIdx = line.replace("\n","").split("\t")[0]
	return int(int(lastIdx) / float(portionNr)) + 1

def _saveOnePortionFile(invDict, fileIdx, indexPath):
	"""Dumps the inverted dictionary <invDict> alphabetically sorted into
	the portion file 'words<fileIdx>.index' below <indexPath>/wordsIndices.
	"""
	wordsDir = os.path.join(indexPath, "wordsIndices")
	outFile = open(os.path.join(wordsDir, "words"+str(fileIdx)+".index"), "w")
	# word-occurrences splitter: "\t"
	# occurrence-occurrence splitter: " "
	# fileindex-position splitter: ":"
	for word in sorted(invDict.keys()):
		occParts = []
		for docIdx, wordIndices in invDict[word]:
			occParts.append(
				" ".join(["%d:%d"%(docIdx, wIdx) for wIdx in wordIndices]))
		outFile.write("%s\t%s\n"%(word, " ".join(occParts)))
	outFile.close()

def savePortionWordFiles(indexPath, portions=32):
	"""Creates <portions> files with word indices.
	"""
	os.mkdir(os.path.join(indexPath, "wordsIndices"))
	filesPerIndex = _getFilesForOneIndex(indexPath, portions)
	invDict = {}
	readCount = 0
	writtenCount = 0
	# files.list must already exist
	for line in open(os.path.join(indexPath, "files.list")):
		readCount += 1
		idx, filePath = line.replace("\n","").split("\t")
		posDict = _getWordsPositionDictFromFile(filePath)
		for word in posDict:
			invDict.setdefault(word,[]).append((int(idx), posDict[word]))
		if readCount >= filesPerIndex:
			# portion complete: flush it and start collecting the next one
			_saveOnePortionFile(invDict, writtenCount, indexPath)
			invDict = {}
			readCount = 0
			writtenCount += 1
			sys.stdout.write("%i/%i files written\r"%
				(writtenCount,portions))
	if invDict:
		# flush the last, possibly smaller portion
		_saveOnePortionFile(invDict, writtenCount, indexPath)
		writtenCount += 1
	print("total number of written files: %i"%(writtenCount))


def _writeRemainder(out, firstLine, srcFile):
	"""Writes <firstLine> and every remaining line of <srcFile> to <out>."""
	out.write(firstLine)
	for line in srcFile:
		out.write(line)

def _mergeTwoSortedFiles(f1, f2, out, displayNr):
	"""Merges the two alphabetically sorted word index files <f1> and
	<f2> into <out>.  Lines sharing the same word are combined into one
	line whose occurrence lists are joined with a space.  <displayNr> is
	only used for the console progress display.
	"""
	lineF1 = f1.next()
	lineF2 = f2.next()
	# little console spinner so the user sees the merge is alive
	workingDisplay = {"-":"\\","\\":"|","|":"/","/":"-"}
	state = "/"
	i = 0
	while 1:
		i += 1
		if i == 500:
			state = workingDisplay[state]
			sys.stdout.write("writing nr %i %s\r"%(displayNr+1,state))
			i = 0
		w1 = lineF1.split("\t")[0]
		w2 = lineF2.split("\t")[0]
		if w1 == w2:
			# same word in both files: concatenate the occurrence lists
			completeLine = w1 + "\t"
			completeLine += lineF1.split("\t")[1].replace("\n","")
			completeLine += " " + lineF2.split("\t")[1]
			out.write(completeLine)
			try:
				lineF1 = f1.next()
			except StopIteration:
				# f1 exhausted; lineF2 was already merged above, so copy
				# only the lines of f2 AFTER it.  (BUG FIX: the old code
				# wrote the already merged lineF2 a second time here,
				# duplicating that word in the output.)
				try:
					lineF2 = f2.next()
				except StopIteration:
					break
				_writeRemainder(out, lineF2, f2)
				break
			try:
				lineF2 = f2.next()
			except StopIteration:
				# f2 exhausted; the freshly read lineF1 is still unwritten
				_writeRemainder(out, lineF1, f1)
				break
		elif w1 < w2:
			out.write(lineF1)
			try:
				lineF1 = f1.next()
			except StopIteration:
				_writeRemainder(out, lineF2, f2)
				break
		else:
			out.write(lineF2)
			try:
				lineF2 = f2.next()
			except StopIteration:
				_writeRemainder(out, lineF1, f1)
				break

def mergeToOneWordIndexFile(indexPath):
	"""Merges all the files created by savePortionWordFiles into one
	'words.index' file in <indexPath>.

	Works in passes: each pass merges the files pairwise into a new
	'merged' subdirectory, halving the file count, until only one file
	is left.  The number of portion files must therefore be a power of
	two; otherwise a ValueError is raised.
	"""
	wordsIndexPath = os.path.join(indexPath, "wordsIndices")
	filenames = getCompleteFilesList(wordsIndexPath, "words*.index")
	while len(filenames) > 1:
		if len(filenames) % 2 != 0:
			raise ValueError("odd number of files")
		print("merging %i to %i files..."%(len(filenames),len(filenames)//2))
		half = len(filenames)//2
		fileCount = 0
		for fName1, fName2 in zip(filenames[:half], filenames[half:]):
			f1 = open(fName1, "r", 1)
			f2 = open(fName2, "r", 1)
			mergedPath = os.path.join(wordsIndexPath, "merged")
			if not os.path.isdir(mergedPath):
				os.mkdir(mergedPath)
			out = open(os.path.join(
				mergedPath, "words"+str(fileCount)+".index"), "a")
			_mergeTwoSortedFiles(f1, f2, out, fileCount)
			out.close()
			f1.close()
			f2.close()
			# the inputs are merged; free the disk space
			os.remove(fName1)
			os.remove(fName2)
			fileCount += 1
		# the next pass works on the files just written
		wordsIndexPath = mergedPath
		filenames = getCompleteFilesList(wordsIndexPath, "words*.index")
	_moveResultToRoot(indexPath)

def _moveResultToRoot(rootDir):
	"""Moves the final merge result to <rootDir>/words.index and removes
	the temporary 'wordsIndices' directory tree.

	The final result sits in the most deeply nested 'merged' directory,
	i.e. it is the file with the longest complete path.
	"""
	wordsIndexPath = os.path.join(rootDir, "wordsIndices")
	fileNames = getCompleteFilesList(wordsIndexPath, "words*.index")
	# pick the first file with maximal path length; replaces the old
	# sorted(..., cmp-function, ...) call, whose positional cmp argument
	# is a Python-2-only idiom (same result, version independent)
	longestFName = fileNames[0]
	for fName in fileNames[1:]:
		if len(fName) > len(longestFName):
			longestFName = fName
	shutil.move(longestFName, os.path.join(rootDir, "words.index"))
	shutil.rmtree(wordsIndexPath)

def saveFilesList(directory, pat="*"):
	"""Scans through <directory> and all its subdirectories for files
	matching the shell pattern <pat> and saves a list of complete paths.
	  fileindex-complete path splitter: "\t"
	"""
	realDir = os.path.realpath(directory)
	pathHead = os.path.split(realDir)[0]
	out = open(os.path.join(directory, "files.list"), "a")
	fileCount = 0
	for dirName, _subDirs, fileNames in os.walk(realDir):
		for fName in fileNames:
			# never index the list file itself
			if fName == "files.list" or not fnmatch.fnmatch(fName, pat):
				continue
			fullPath = os.path.join(pathHead, os.path.join(dirName, fName))
			out.write("%07i\t%s\n"%(fileCount, fullPath))
			fileCount += 1
	out.close()


## ---------------------------------------------------- in memory functions


def getCompleteFilesList(directory, pat="*"):
	"""Scans through <directory> and all its subdirectories for files
	matching the shell pattern <pat> and returns a list of complete paths.
	"""
	if directory[-1] == os.sep:
		directory = directory[:-1]
	pathHead, _pathTail = os.path.split(os.path.realpath(directory))
	fileList = []
	for dirName, _subDirs, fileNames in os.walk(os.path.realpath(directory)):
		for fName in fileNames:
			# BUG FIX: <pat> was accepted and documented but never
			# applied; callers pass e.g. "words*.index" and rely on it
			if fnmatch.fnmatch(fName, pat):
				fileList.append(
					os.path.join(pathHead, os.path.join(dirName, fName)))
	return fileList

def getInvertedWordIndexDict(fileList):
	"""Returns a dictionary with the words of all files in fileList as keys.
	The values are tuples. The first element is the index of the file, in
	which the word occurrs, and the second element is the position in
	which the word occurrs (number of bytes from the start).
	"""
	invIndex = {}
	fIndex = 0
	for fName in fileList:
		posDict = _getWordsPositionDictFromFile(fName)
		for word in posDict:
			invIndex.setdefault(word, []).append((fIndex, posDict[word]))
		fIndex += 1
	return invIndex

def saveBothIndices(fileList, invDict, indexPath):
	"""Dumps <fileList> and <invDict> into <indexPath> in appropriate
	formats.
	"""
	listFile = open(os.path.join(indexPath, "files.list"), "w")
	# fileindex-complete path splitter: "\t"
	idx = 0
	for fName in fileList:
		listFile.write("%07i\t%s\n"%(idx, fName))
		idx += 1
	listFile.close()

	indexFile = open(os.path.join(indexPath, "words.index"), "w")
	# word-occurrences splitter: "\t"
	# occurrence-occurrence splitter: " "
	# fileindex-position splitter: ":"
	for word in sorted(invDict.keys()):
		occStrings = []
		for docIdx, wrdInds in invDict[word]:
			occStrings.append(
				" ".join(["%d:%d"%(docIdx, wIdx) for wIdx in wrdInds]))
		indexFile.write("%s\t%s\n"%(word, " ".join(occStrings)))
	indexFile.close()

## ---------------------------------------------------- top level functions

def _checkExistingIndexFiles(indexPath):
	"""Aborts the program (exit code 1) if 'words.index', 'wordsIndices'
	or 'files.list' already exist in <indexPath>.  Must be run before
	indexing a corpus, because the data of these files would otherwise
	become part of the corpus itself, and you never want that.
	"""
	wordsIndex = os.path.join(indexPath, "words.index")
	if os.path.isfile(wordsIndex):
		sys.stderr.write("ERROR: Please delete existing 'words.index'!\n")
		sys.exit(1)
	wordsIndicesDir = os.path.join(indexPath, "wordsIndices")
	if os.path.isdir(wordsIndicesDir):
		sys.stderr.write("ERROR: Please delete existing dir 'wordsIndices',")
		sys.stderr.write("\n       or adjust indexing task!\n")
		sys.exit(1)
	filesList = os.path.join(indexPath, "files.list")
	if os.path.isfile(filesList):
		sys.stderr.write("ERROR: Please delete existing 'files.list',")
		sys.stderr.write("\n       or adjust indexing task!\n")
		sys.exit(1)

def makeIndex(indexPath):
	"""Creates two index files in <indexPath>, holding everything in
	memory until both are written.

	The first file ('files.list') maps indices to the complete paths of
	all files in <indexPath> and its subdirectories:
	  00	/home/usr/corpus/herText.txt
	  01	/home/usr/corpus/myText.txt
	  02	/home/usr/corpus/someText.txt
	  ...

	The second file ('words.index') maps every word of the corpus to the
	files and byte positions where it occurs.  The words are sorted
	alphabetically, so that the Unix look command, or an equivalent
	binary search, can use it:
	  all	0:45 2:14
	  belong	0:3 0:187 1:38
	  count	2:21
	  ...
	"""
	_checkExistingIndexFiles(indexPath)
	sys.stdout.write("indexing corpus... ")
	completeFiles = getCompleteFilesList(indexPath)
	saveBothIndices(
		completeFiles, getInvertedWordIndexDict(completeFiles), indexPath)
	print("index created")

def makeIndex_lowMemory(indexPath):
	"""Like makeIndex, but memory friendly: first stores the files list
	on disk, then 128 portion files of the words index, which are
	finally merged into one 'words.index'.
	"""
	_checkExistingIndexFiles(indexPath)
	print("indexing corpus...")
	saveFilesList(indexPath)
	savePortionWordFiles(indexPath, portions=128)
	mergeToOneWordIndexFile(indexPath)
	print("\nindex created")


def printUsage():
	"""Prints the command line usage of this script to stdout."""
	# single indentation level (the original body was double-indented);
	# parenthesized single-argument print behaves identically under
	# Python 2 (prints the parenthesized expression)
	print("usage: python indexCorpus.py <corpusDir>")
	print("  CAUTION: For big corpora or lack of memory use")
	print("  python indexCorpus.py -big <corpusDir>")

if __name__=="__main__":
	argCount = len(sys.argv)
	# one argument: in-memory indexing of the given directory
	if argCount == 2 and os.path.isdir(sys.argv[1]):
		makeIndex(sys.argv[1])
	# "-big" flag plus directory: low-memory indexing
	elif (argCount == 3 and sys.argv[1].lower() == "-big"
			and os.path.isdir(sys.argv[2])):
		makeIndex_lowMemory(sys.argv[2])
	else:
		printUsage()
