=begin rdoc
Clustering: iCookWare clusterer class using some algorithm similar to k-means

Author:: Nicola Kaiser (kaiser@cl.uni-heidelberg.de), Ana Kovatcheva (kovatch@cl.uni-heidelberg.de)
Project:: iCookWare
Copyright:: iCookWare Team 2005 (Nicola Kaiser, Ana Kovatcheva, Olga Mordvinova, Stephanie Schuldes)
Embedded Documentation Tool:: rdoc
=end

require 'utilities'
require 'icw_row'

module ICW
  # clusters a matrix of vectors (in this context ingerdients) using k-means
  class Clustering
    # +matrix+:: matrix of vectors to cluster (icw_matrix), required
    # +k+:: number of clusters (integer), optional, default: 5th of elements to cluster
    # +c+:: initial cluster (array of arrays of 2 arrays), optional, mainly used for testcases 
    attr_reader :k, :c_new, :matrix
    attr_writer :c_new # only for testcases

	  def initialize matrix, k=nil, c=nil
		  @matrix=matrix
      @dim=matrix[0].length
		  if not k
			  @k=@matrix.length/5
		  else
			  @k=k
		  end
      if not c
        @c_new=Array.new
        @k.times { | i | @c_new << [[],[]] } 
      else
        @c_new=c 
      end
    end

    # builds a row object from vector, here used for centroids
    # needs to be done to apply distance method, which belongs to class Row   
    # +v+:: aray containing preferably floats or integers
    def buildRowObject v
      colnames=Array.new v.length, "x"
       newRow=ICW::Row.new "c", colnames
       v.each { |value| newRow.push value }
       return newRow
    end
 
    # determins the maximum margin of a set of vectors
    # needed to define the vector space in which to place random centroids
    def maxMargin
      maxV=Array.new
      @dim.times do |i|
        maxV << (@matrix.max{|a,b|a[i]<=>b[i]})[i] 
      end
      return (buildRowObject maxV)
    end
  
    # determins the minimum margin of a set of vectors
    # needed to define the vector space in which to place random centroids
    def minMargin
      minV=Array.new
      @dim.times do |i|
        minV << (@matrix.min{|a,b|a[i]<=>b[i]})[i]                 
      end
      return (buildRowObject minV)
    end

    # computes a new random centroid in a vector space defined by minV and maxV
    # +minV+:: array of minimum values which a vector may contain
    # +maxV+:: array of maximum values which a vector may contain
    def newCentroid minV, maxV
      centroid=Array.new
      minV.each_with_index do |mv, i|
        centroid.push(mv+rand(maxV[i]-mv))
      end
      return (buildRowObject centroid)
    end

    # initializes k random centroids somewhere in the vectorspace used by
    # the vectors of @matrix
    def initializeCentroids
      maxV=maxMargin
      minV=minMargin
      new_centroid=Array.new
      old_centroid=Array.new
      minDist= (1- (minV.dist maxV))/(@k*2)
      old_centroid=maxV
      @k.times do |i|
        new_centroid=newCentroid minV, maxV
        while (1- (old_centroid.dist new_centroid)<minDist) do
          new_centroid=newCentroid minV, maxV
        end
        @c_new[i][0]=new_centroid
        old_centroid=new_centroid
      end
    end
    
    # AK - choosing k random data points from the set of points which is better 
    # alternative Version to initializeCentroids
    def initializeExistingCentroids
      new_centroid=Array.new
      centroid_indexes=Array.new
      @k.times do |i|
        new_c_index=rand(@matrix.length)
        # to prevent duplicate centroids
        while centroid_indexes.include?(new_c_index) do
          new_c_index=rand(@matrix.length)
        end
        centroid_indexes.push(new_c_index)
        new_centroid=@matrix[new_c_index].deepcopy      
        @c_new[i][0]=new_centroid.deepcopy
      end
    end
	 
    # computes centroids given a cluster distribution
    # centroid= average value of all vectors in a cluster
    def computeCentroids
      @c_new.each_with_index do |cluster, c_i| 
        c=Array.new(@dim, 0)
        cluster[1].each do |element|
          for v_i in 0...element.length do
           if element[v_i]==nil
              element[v_i]=0
           end            
           c[v_i]+=element[v_i].to_f
          end
        end
        begin
          l=cluster[1].length
          c.each_with_index{ |value, i| c[i]=value/l}          
        rescue ZeroDivisionError
        end
        @c_new[c_i][0]=c
       end
    end
 
    # after new centroids are computed, removes all elements of a cluster
    def clearC_new
      @c_new.each_with_index do |e, i|
        @c_new[i][1]=[]
      end
    end
    
    # assigns element to it's closest centroid
    # +elem+:: Row object
    # +d_max+:: closest centroid
    def toCluster elem, d_max
      @c_new.each do |centroid, cluster|
        if centroid==d_max
          cluster.push(elem)
        end
      end
    end
    
    # reassigns every vector of @matrix to it's closest centroid
    # steps: removes all elements from all clusters
    #        computes for every vector of matrix distances to all new centroids
    #        assingns vector to cluster with closest centroid
    def updateClusters
      clearC_new
      @matrix.each do |v|
        distances=Array.new
        @c_new.each do |c, e|
          distances.push([v.dist(c), c])
        end
        toCluster v, distances.max{|a,b| a[0]<=>b[0]}[1]
      end
    end
	 

    # prints content of all clusters using names of all Row objects
    # (useful for testing and debugging)
    def displayClusters
      @c_new.each_with_index do |cluster, i|
        puts "############### Cluster: "+ i.to_s+ " ########################"
        cluster[1].each do |v|
          puts v.name
        end
      end  
    end 
  
    # function to determine wether two set of clusters are identical
    def compareClusters c_old
      if c_old.sort==@c_new.sort
        return 1
      else
        return 0
      end
    end 
   
    def checkForDuplicates
      #checks wehter there are identical clusters
      duplicates=Array.new
      @c_new.each_with_index do | c, i|
        @c_new.each_with_index do |c2, i2|
          if not i==i2
            if c[1].sort==c2[1].sort
              if not duplicates.assoc(c[1].sort)
                duplicates.push([c[1], [i,i2]])
              else
                cur=duplicates.index(duplicates.assoc(c[1].sort))
                if not duplicates[cur][1].include?(i)
                  duplicates[cur][1].push(i)
                end
                if not duplicates[cur][1].include?(i2)
                  duplicates[cur][1].push(i2)
                end
              end
            end
          end
        end
      end
      #puts "found #{duplicates.length} duplicates:"
      #duplicates.each_with_index do |dup, i|
      #  puts "duplicate no #{i}: #{dup[1].inspect}"
      #end
      # ... and removes all but the first version
      duplicates.each do |dup|
        for i in 1...dup[1].length
          @c_new.delete_at(dup[1][i])
        end
      end
    end

    def removeEmpty
      # before returning finished cluster: removes all empty clusters
      @c_new.each_with_index do |cluster, i|
        if cluster[1].empty?
          @c_new.delete_at(i)
        end
      end
    end
 
    
    # cluster function using initializeCentroids
    def cluster
		  initializeCentroids
      iter=0 #counter for interations
      c_old=Array.new
      while compareClusters(c_old)==0
        iter+=1 #counts number of iterations (for testing)
			  c_old=@c_new.deepcopy
			  updateClusters
			  computeCentroids
		  end
		  updateClusters
      displayClusters #just for testing
      checkForDuplicates
      removeEmpty
		  return @c_new # , iter
	  end    

    # cluster function using initializeExistingCentroids
    def cluster_withExistingCentroids
		  initializeExistingCentroids
      iter=0 #counter for interations
      c_old=Array.new
      while compareClusters(c_old)==0
        iter+=1 #counts number of iterations (for testing)
			  c_old=@c_new.deepcopy
			  updateClusters
			  computeCentroids
		  end
		  updateClusters
      #displayClusters #just for testing
      checkForDuplicates
      removeEmpty
		  return @c_new #, iter
	  end

    # reduces @c_new to a datastructure consisting only of the ingredient names 
    def reduceCluster
      namesOnlyCl=Array.new
      #@_new.length.times { | i | namesOnlyCl << [] } 
      @c_new.each do |c, cl|
        tmpCl=Array.new
        cl.each do |v|
          tmpCl.push v.name
        end
        namesOnlyCl.push tmpCl
      end
      return namesOnlyCl
    end   
  end
end 

############ DONE ################
# * alle leeren cluster und duplikate nachträglich elimineren
#   -> warum sind die duplicate nicht in jedem fall weg????
############ TO DO ###############
# * welche variante der zentroide funktioniert besser?
# * the evil of duplicates
#   -> bei random centroids gibt es nie duplikate
#   -> wenn man schon existierende punkte nimmt, seht wohl!