#!python2.4.4
# -*- coding: iso-8859-1 -*-

"""
Project: LRApy
Author: Max Jakob (max.jakob@web.de)
Module: makesamplecorpus
Module Description: Creates a small sample corpus, including index files,
for testing purposes.

Version: 1.0.1
Last change: 2007-01-23

Copyright 2007 by Max Jakob.
This code is released under the GNU GPL. See the accompanying LICENSE file.

Embedded documentation can be translated with the Python pydoc module.
"""


CORPUS_DIR = "samplecorpus"  # name of the directory to save the corpus


import random, sys, os, indexCorpus

def getCorpusString(taskFile, corpusSize):
	"""Returns a random corpus string for testing the LRA, built from the
	word pairs listed in <taskFile>.

	Every non-blank line of <taskFile> must have the form "word1:word2";
	any "*" characters in the line are dropped before it is split. A line
	that does not contain exactly one ":" raises a ValueError.

	For each word pair, <corpusSize> records are generated. A record holds
	the two words in random order with zero to four intervening filler
	words (drawn from three kinds of non-sense tokens) between them, and is
	followed by a fixed separator line. The records of all word pairs are
	concatenated in file order and returned as one string; the result is
	the empty string if there are no word pairs or <corpusSize> is not
	positive.
	"""
	# something to separate the word pairs, "context" if you like:
	pairsSplitter = "\nyada yada yada yada yada yada yada yada\n"
	fillers = ["aaa ", "bbb ", "ccc "]

	# read the whole task file and close the handle deterministically
	taskHandle = open(taskFile)
	try:
		lines = taskHandle.read().split("\n")
	finally:
		taskHandle.close()

	# collect the records in a list and join once at the end instead of
	# growing one big string record by record
	records = []
	for line in lines:
		if not line.strip():
			continue
		word1, word2 = line.replace("*", "").split(":")
		# range (not xrange) so the loop works on Python 2 and Python 3
		for _i in range(corpusSize):
			# the filler count is drawn first, then the fillers themselves,
			# then the word order
			fillerCount = random.randint(0, 4)
			inter = "".join([random.choice(fillers) for _j in range(fillerCount)])
			if random.randint(0, 1):
				first, second = word1, word2
			else:
				first, second = word2, word1
			records.append("%s %s %s %s" % (first, inter, second, pairsSplitter))
	return "".join(records)

def makeSampleCorpus(corpusDir, taskFile, corpusSize):
	"""Makes a testing corpus in <corpusDir>, using the word pairs in
	<taskFile> as input. <corpusSize> is passed to getCorpusString.

	The generated text is written to "corpus.txt" inside <corpusDir>
	(the directory is created if it does not exist yet, an existing
	"corpus.txt" is overwritten). Afterwards indexCorpus.makeIndex is
	called on <corpusDir> to build the index files.
	"""
	# Generate the text before touching the file system, so that a missing
	# or malformed task file cannot leave an empty, truncated corpus behind.
	corpusText = getCorpusString(taskFile, corpusSize)

	if not os.path.isdir(corpusDir):
		os.mkdir(corpusDir)

	corpusFile = open(os.path.join(corpusDir, "corpus.txt"), "w")
	try:
		corpusFile.write(corpusText)
	finally:
		# close (and flush) even if the write fails
		corpusFile.close()

	# the corpus file must be closed before it is indexed
	indexCorpus.makeIndex(corpusDir)

# Command line entry point: 'python makesamplecorpus.py <taskFile> <corpusSize>'
# builds the sample corpus in CORPUS_DIR.
if __name__ == "__main__":
	usage = "usage: 'python makesamplecorpus.py <taskFile> <corpusSize>'"
	if len(sys.argv) == 3:
		taskFile = sys.argv[1]
		try:
			corpusSize = int(sys.argv[2])
		except ValueError:
			# a non-numeric <corpusSize> gets an explanation instead of a
			# bare traceback; sys.exit with a string writes it to stderr
			# and exits with status 1
			sys.exit("<corpusSize> must be an integer\n" + usage)
		makeSampleCorpus(CORPUS_DIR, taskFile, corpusSize)
	else:
		# same output as the print statement, without depending on it
		sys.stdout.write(usage + "\n")
