#!/usr/bin/python
import sys, random
from Numeric import *

def get_term_vectors(doc_file_names):
	term_vectors = {}
	doc_idx = 0
	dims = len(doc_file_names)
	for doc in doc_file_names:
		print "Processing %s..." % (doc,)
		terms = {}
		words = map(lambda a:a.strip(), open(doc,'r').readlines())
		for word in words:
			terms[word] = terms.get(word,0.0) + 1
		doc_len = len(words)	
		for term in terms.keys():
			term_vectors.setdefault(term, zeros(dims,Float))[doc_idx] = terms[term] / doc_len
		doc_idx += 1
	return map(lambda key,d=term_vectors:(d[key],key), term_vectors.keys())
	
def k_means(k, term_vectors, dims):
	print "Initialisation..."
	print "computing initial means..."
	means = []
	for i in range(k):
		means.append([])
	i = 0;
	for vector in term_vectors:
		means[i%k].append(vector[0])
		i+=1
	for i in range(k):
		m_temp = zeros(dims,Float)
		for vector in means[i]:
			m_temp += vector
		means[i] = m_temp / len(means[i])
	iter = 1
	while(iter):
		print '%d. iteration:' % (iter)
		iter +=1
		vector_clusters = []
		term_clusters = []
		for i in range(k):
			vector_clusters.append([])
			term_clusters.append([])
		print "computing distances..."
		n=0
		for t in term_vectors:
			term_vector = t[0]
			term = t[1]
			next_mean_id = 0
			min_dist = 2
			tmp_dist = 2
			for i in range(k):
				tmp_dist = sum((means[i] - term_vector)**2)**(1.0/2.0) #eucidian metric
				if tmp_dist < min_dist:
					next_mean_id,min_dist = i,tmp_dist
			term_clusters[next_mean_id].append(term)
			vector_clusters[next_mean_id].append(term_vector)
		print "computing new means..."
		old_means = means
		means = []
		for i in range(k):
			if len(vector_clusters[i])==0:
				means.append(old_means[i])
				continue
			new_mean = zeros(dims,Float) 
			for term_vector in vector_clusters[i]:
				new_mean += term_vector
			new_mean /= len(vector_clusters[i])
			means.append(new_mean)
		something_changed = 0
		for i in range(len(means)):
			if means[i] != old_means[i]:
				something_changed = 1
		if not something_changed:
			return term_clusters

if __name__=="__main__":
#def main():
	print "Getting file names..."
	doc_file_names = map(lambda a:a.strip(), sys.stdin.readlines())
	#doc_file_names = map(lambda a:a.strip(), open('filenames').readlines())
	dims = len(doc_file_names)
	print "Computing vectors..."
	term_vectors = get_term_vectors(doc_file_names)
	print str(len(term_vectors)), ' ', 'vectors generated.'
	print 'K-Means:'
	term_clusters = k_means(180,term_vectors,dims)
	file = open("cluster","w")
	for i in range(180):
		file.write("<========= %d =========>\n"%(i,))
		for term in term_clusters[i]:
			file.write( "%d %s\n" % (i,term))

def profile():
	import profile
	profile.run("k_means2.main()")
