| import random
|
| import math
|
| import sys
|
| import statistics
|
|
|
| def mahattan_distance_one_dimension(x1,x2):
|
| return abs(x1-x2)
|
|
|
| def minkowski_distance(x1, x2, power):
|
| return ((abs(x1[0] - x2[0]))**power + (abs(x1[1] - x2[1])**power)) **(1.0/power)
|
|
|
| def chebyshev_distance(x1, x2, power=0):
|
| return max(abs(x1[0]-x2[0]),abs(x1[1]-x2[1]))
|
|
|
| one_dimensional_data_points = [5,13,4,6,15,13,32,14,6,10,12,31,12,41,13]
|
| data_points = [(1,7),(1,-11),(3,17),(7,18),(-8,4),(8,12),(11,-7),(12,14),(13,71),(-16,11),(13,1),(-9,2),(-5,3),(0,12)]
|
|
|
| def k_means_clustering(K,func,power=0):
|
| print("==============================")
|
| print("K: {0}".format(K))
|
|
|
|
|
| C = random.sample(data_points,K)
|
|
|
|
|
| iteration = 1000000
|
|
|
|
|
| min_decrease_sse = 1e-5
|
|
|
|
|
| previous_sse = float("inf")
|
| current_iteration = 0
|
|
|
| while(True):
|
| current_iteration +=1
|
| current_sse = 0.0
|
|
|
|
|
| cms_dict = {}
|
| for c in C:
|
| cms_dict[c] = []
|
|
|
| for x in data_points:
|
| max_dist = float("inf")
|
| closest = ()
|
|
|
| for c in C:
|
| d= func(x,c,power)
|
| if(d <= max_dist):
|
| max_dist = d
|
| closest = c
|
|
|
|
|
| cms_dict[closest].append(x)
|
|
|
|
|
| C = []
|
| for cm in cms_dict:
|
| cm_total_distance = 0.0
|
| new_m_x = 0.0
|
| new_m_y = 0.0
|
|
|
|
|
| for x in cms_dict[cm]:
|
| new_m_x += x[0]
|
| new_m_y += x[1]
|
| new_m_x /= len(cms_dict[cm])
|
| new_m_y /= len(cms_dict[cm])
|
| C.append((new_m_x,new_m_y))
|
|
|
|
|
| for x in cms_dict[cm]:
|
| cm_total_distance += func(x,(new_m_x,new_m_y),power)**2
|
| current_sse += cm_total_distance
|
|
|
|
|
| if(previous_sse - current_sse <= min_decrease_sse or current_iteration > iteration):
|
| print("Final SSE: {0}".format(current_sse))
|
| print("Final Iteration: {0}".format(current_iteration))
|
| i = 0
|
| silhouetee_coefficent = 0.0
|
|
|
| for cm in cms_dict:
|
| i+=1
|
| abs = []
|
| if(len(cms_dict[cm]) != 1):
|
| if(K > 1):
|
| m = 0
|
| for xi in cms_dict[cm]:
|
| total_distance = 0
|
| for xj in cms_dict[cm]:
|
| if(xi != xj):
|
| total_distance += func(xi,xj,power)
|
| ai = total_distance//(len(cms_dict[cm])-1)
|
| bi = None
|
| for cm2 in cms_dict:
|
| if( cm !=cm2 ):
|
| total_distance = 0
|
| for xj in cms_dict[cm2]:
|
| total_distance += func(xi,xj,power)
|
| average = total_distance//len(cms_dict[cm2])
|
| if(bi is None):
|
| bi = average
|
| else:
|
| if(average < bi ):
|
| bi = average
|
| si = float(bi - ai) / max(ai,bi)
|
| silhouetee_coefficent += si
|
| dict ={}
|
| dict["a{0}".format(m)] = ai
|
| dict["b{0}".format(m)] = bi
|
| dict["s{0}".format(m)] = si
|
| abs.append(dict)
|
| m+=1
|
| else:
|
| dict ={}
|
| dict["a0"] = "Undefined"
|
| dict["b0"] = "Undefined"
|
| dict["s0"] = "0 by defintion"
|
| abs.append(dict)
|
| print("Cluster {0}: {1}".format(i,cms_dict[cm]))
|
| print(abs)
|
| print("------------------------------------------------")
|
| if( K > 1):
|
| print("Average Silhouetee Coefficent:{0}".format(silhouetee_coefficent/len(data_points)))
|
| else:
|
| print("Silhouetee Coefficent is not defined for K = 1")
|
| break
|
| else:
|
| print("Current SSE: {0}".format(current_sse))
|
| previous_sse = current_sse
|
|
|
|
|
| def k_median_clustering(K):
|
| print("==============================")
|
| print("K: {0}".format(K))
|
|
|
|
|
| C = random.sample(one_dimensional_data_points,K)
|
|
|
|
|
| iteration = 1000000
|
|
|
|
|
| min_decrease_se = 1e-5
|
|
|
|
|
| previous_se = float("inf")
|
| current_iteration = 0
|
|
|
| while(True):
|
| current_iteration +=1
|
| current_se = 0.0
|
|
|
|
|
| cms_dict = {}
|
| for c in C:
|
| cms_dict[c] = []
|
|
|
| for x in one_dimensional_data_points:
|
| max_dist = float("inf")
|
| closest = None
|
|
|
| for c in C:
|
| d = mahattan_distance_one_dimension(x,c,)
|
| if(d <= max_dist):
|
| max_dist = d
|
| closest = c
|
|
|
|
|
| cms_dict[closest].append(x)
|
|
|
|
|
| C = []
|
| for cm in cms_dict:
|
| cm_total_distance = 0.0
|
| new_m_x = 0.0
|
|
|
|
|
| new_m_x = statistics.median(cms_dict[cm])
|
| C.append(new_m_x)
|
|
|
|
|
| for x in cms_dict[cm]:
|
| cm_total_distance += mahattan_distance_one_dimension(x,(new_m_x))
|
| current_se += cm_total_distance
|
|
|
|
|
| if(previous_se - current_se <= min_decrease_se or current_iteration > iteration):
|
| print("Final SSE: {0}".format(current_se))
|
| print("Final Iteration: {0}".format(current_iteration))
|
| i = 0
|
| silhouetee_coefficent = 0.0
|
|
|
| for cm in cms_dict:
|
| i+=1
|
| abs = []
|
| if(len(cms_dict[cm]) != 1):
|
| if(K > 1):
|
| m = 0
|
| for xi in cms_dict[cm]:
|
| total_distance = 0
|
| for xj in cms_dict[cm]:
|
| if(xi != xj):
|
| total_distance += mahattan_distance_one_dimension(xi,xj)
|
| ai = total_distance//(len(cms_dict[cm])-1)
|
| bi = None
|
| for cm2 in cms_dict:
|
| if( cm !=cm2 ):
|
| total_distance = 0
|
| for xj in cms_dict[cm2]:
|
| total_distance += mahattan_distance_one_dimension(xi,xj)
|
| average = total_distance//len(cms_dict[cm2])
|
| if(bi is None):
|
| bi = average
|
| else:
|
| if(average < bi ):
|
| bi = average
|
| si = float(bi - ai) / max(ai,bi)
|
| silhouetee_coefficent += si
|
| dict ={}
|
| dict["a{0}".format(m)] = ai
|
| dict["b{0}".format(m)] = bi
|
| dict["s{0}".format(m)] = si
|
| abs.append(dict)
|
| m+=1
|
| else:
|
| dict ={}
|
| dict["a0"] = "Undefined"
|
| dict["b0"] = "Undefined"
|
| dict["s0"] = "0 by defintion"
|
| abs.append(dict)
|
| print("Cluster {0}: {1}, Median: {2}".format(i,cms_dict[cm],statistics.median(cms_dict[cm])))
|
| print(abs)
|
| print("------------------------------------------------")
|
| if( K > 1):
|
| print("Average Silhouetee Coefficent:{0}".format(silhouetee_coefficent / len(data_points)))
|
| else:
|
| print("Silhouetee Coefficent is not defined for K = 1")
|
| break
|
| else:
|
| print("Current SSE: {0}".format(current_se))
|
| previous_se = current_se
|
|
|
| def k_medoids_clustering(K,func,power=0):
|
| print("==============================")
|
| print("K: {0}".format(K))
|
|
|
|
|
| C = random.sample(data_points,K)
|
|
|
|
|
| iteration = 1000000
|
|
|
|
|
| min_decrease_sse = 1e-5
|
|
|
|
|
| previous_sse = float("inf")
|
| current_iteration = 0
|
|
|
| while(True):
|
| current_iteration +=1
|
| current_sse = 0.0
|
|
|
|
|
| cms_dict = {}
|
| for c in C:
|
| cms_dict[c] = []
|
|
|
| for x in data_points:
|
| max_dist = float("inf")
|
| closest = ()
|
|
|
| for c in C:
|
| d= func(x,c,power)
|
| if(d <= max_dist):
|
| max_dist = d
|
| closest = c
|
|
|
|
|
| cms_dict[closest].append(x)
|
|
|
|
|
| C = []
|
| for m in cms_dict:
|
| max_dist = float("inf")
|
| new_c = None
|
|
|
| for x1 in cms_dict[m]:
|
| cm_total_distance = 0.0
|
| for x2 in cms_dict[m]:
|
| cm_total_distance += func(x1, x2, power)
|
| if(cm_total_distance <= max_dist):
|
| max_dist = cm_total_distance
|
| new_c = x1
|
| C.append(new_c)
|
|
|
|
|
| cm_total_distance = 0.0
|
| for x in cms_dict[m]:
|
| cm_total_distance += func(x,new_c,power)**2
|
| current_sse += cm_total_distance
|
|
|
|
|
| if(previous_sse - current_sse <= min_decrease_sse or current_iteration > iteration):
|
| print("Final SSE: {0}".format(current_sse))
|
| print("Final Iteration: {0}".format(current_iteration))
|
| silhouetee_coefficent = 0.0
|
| i = 0
|
|
|
| for cm in cms_dict:
|
| i+=1
|
| abs = []
|
| if(len(cms_dict[cm]) != 1):
|
| if(K > 1):
|
| m = 0
|
| for xi in cms_dict[cm]:
|
| total_distance = 0
|
| for xj in cms_dict[cm]:
|
| if(xi != xj):
|
| total_distance += func(xi,xj,power)
|
| ai = total_distance//(len(cms_dict[cm])-1)
|
| bi = None
|
| for cm2 in cms_dict:
|
| if( cm !=cm2 ):
|
| total_distance = 0
|
| for xj in cms_dict[cm2]:
|
| total_distance += func(xi,xj,power)
|
| average = total_distance//len(cms_dict[cm2])
|
| if(bi is None):
|
| bi = average
|
| else:
|
| if(average < bi ):
|
| bi = average
|
| si = float(bi - ai) / max(ai,bi)
|
| silhouetee_coefficent += si
|
| dict ={}
|
| dict["a{0}".format(m)] = ai
|
| dict["b{0}".format(m)] = bi
|
| dict["s{0}".format(m)] = si
|
| abs.append(dict)
|
| m+=1
|
| else:
|
| dict ={}
|
| dict["a0"] = "Undefined"
|
| dict["b0"] = "Undefined"
|
| dict["s0"] = "0 by defintion"
|
| abs.append(dict)
|
| print("Cluster {0}: {1}".format(i,cms_dict[cm]))
|
| print(abs)
|
| print("------------------------------------------------")
|
| if( K > 1):
|
| print("Average Silhouetee Coefficent:{0}".format(silhouetee_coefficent/len(data_points)))
|
| else:
|
| print("Silhouetee Coefficent is not defined for K = 1")
|
| break
|
|
|
| else:
|
| print("Current SSE: {0}".format(current_sse))
|
| previous_sse = current_sse
|
|
|
|
|
| def main(argv):
|
| algo = {
|
| "euclidean": lambda k,pow: k_means_clustering(k,minkowski_distance,2),
|
| "minkowski": lambda k,pow: k_means_clustering(k,minkowski_distance,pow),
|
| "chebyshev": lambda k,pow,: k_means_clustering(k,chebyshev_distance),
|
| "median": lambda k,pow: k_median_clustering(k),
|
| "medoids": lambda k,pow: k_medoids_clustering(k,minkowski_distance,pow)
|
| }
|
| for k in range(1,int(argv[1])+1):
|
| algo[str(argv[0])](k,float(argv[1]) if (len(argv) > 2 and argv[2].replace('.','',1).isdigit()) else 2 )
|
| print("===========================================================================")
|
|
|
| if __name__ == "__main__":
|
| if(len(sys.argv) == 1):
|
| print("Missing Arugments")
|
| else:
|
| main(sys.argv[1:]) |