import numpy as np
import copy
import matplotlib.pyplot as plt
class Hierarchical_cluster:
def __init__(self,k=None,p=2,dis_way='min',c_way='agg'):
self.k = k
self.p = p
self.dis_way = dis_way
self.c_way = c_way
def cauclate_dis(self, x1, x2):
return np.sum(abs(x1 - x2) ** self.p) ** (1 / self.p)
def create_D_matrix(self):
self.D = np.zeros((self.n,self.n))
for i in range(self.n):
for j in range(self.n):
if i==j:
self.D[i][j] = 0
if i>j:
self.D[i][j] = self.cauclate_dis(self.x[i],self.x[j])
self.D[j][i] = self.cauclate_dis(self.x[i],self.x[j])
def cauclate_cluster_dis(self,c1,c2):
if self.dis_way == 'min':
min_ = 999999999
for i in c1:
for j in c2:
min_ = min(min_,self.D[i][j])
return min_
if self.dis_way == 'mean':
mean1 = np.mean([self.x[i] for i in c1],axis=0)
mean2 = np.mean([self.x[i] for i in c2],axis=0)
return self.cauclate_dis(mean1,mean2)
def split_C(self,C):
C1 = []
C2 = []
max_ = 0
for i in C:
for j in C:
if j >= i:
if self.D[i][j] >= max_:
max_ = self.D[i][j]
max_i = i
max_j = j
C1.append(max_i)
C2.append(max_j)
for c in C:
if c == max_i or c == max_j:
continue
elif self.D[max_i][c] >= self.D[max_j][c]:
C2.append(c)
else:
C1.append(c)
return C1, C2
def fit(self,x):
self.x = np.array(x)
self.n = len(x)
self.create_D_matrix()
if self.c_way == 'agg':
C_way = []
start_c = []
for i in range(self.n):
start_c.append([i])
C_way.append(start_c)
iter = 0
while len(C_way[iter]) > 1:
num_c = len(C_way[iter])
c_temp = []
dis_temp = []
for c1 in range(num_c):
for c2 in range(num_c):
if c1 > c2:
c_temp.append([c1,c2])
dis_temp.append(self.cauclate_cluster_dis(C_way[iter][c1],C_way[iter][c2]))
min_dis = min(dis_temp)
min_index = dis_temp.index(min_dis)
c1 = c_temp[min_index][0]
c2 = c_temp[min_index][1]
new_c = copy.deepcopy(C_way[iter])
new_c.append(new_c[c1]+new_c[c2])
del new_c[c1]
del new_c[c2]
C_way.append(new_c)
iter += 1
self.result = C_way
if self.c_way == 'div':
C_way = []
new_c = list(range(self.n))
C_way.append([new_c])
iter = 0
while len(C_way[iter]) < self.n:
print(C_way)
new_c = []
for C in C_way[iter]:
if len(C) == 1 :
new_c.append(C)
else:
C1,C2 = self.split_C(C)
new_c.append(C1)
new_c.append(C2)
C_way.append(new_c)
iter+=1
self.result = C_way
def predict(self):
final = np.zeros(len(self.x))
if self.c_way == 'div':
result = self.result[self.k//2+1]
if self.c_way == 'agg':
result = self.result[-self.k]
print(result)
for i in range(len(result)):
for j in result[i]:
final[j] = i
return list(final)
def main():
x = [[0,2],[0,0],[1,0],[5,0],[5,2],[1,5],
[2,4],[3,5],[1,1],[1,4],[5,4],[5,3],[0,1],
[-1,0],[1,1],[4,4],[3,4]]
k = 3
HC = Hierarchical_cluster(k,dis_way='mean',c_way='agg')
HC.fit(x)
result = HC.predict()
print(result)
color_list =['r','k','b','g']
for i,c in zip(range(len(result)),color_list[:len(result)]):
positive_ = np.array(x)[np.array(result) == i]
plt.scatter([k[0] for k in positive_],[k[1] for k in positive_] , c=c)
plt.show()
if __name__ == '__main__':
main()
#-----result-----------------
/usr/bin/python3 /Users/zhengyanzhao/PycharmProjects/tongjixuexi/shixian2/hierarchical_clustering.py
[[3], [2, 1, 14, 8, 12, 0, 13], [16, 6, 7, 9, 5, 15, 10, 11, 4]]
[1.0, 1.0, 1.0, 0.0, 2.0, 2.0, 2.0, 2.0, 1.0, 2.0, 2.0, 2.0, 1.0, 1.0, 1.0, 2.0, 2.0]
K_means聚类算法
import numpy as np
import matplotlib.pyplot as plt
class K_menas:
def __init__(self,k=None,p=2):
self.k = k
self.p = p
def cauclate_dis(self, x1, x2):
return np.sum(abs(x1 - x2) ** self.p) ** (1 / self.p)
def fit(self,x):
self.x = np.array(x)
self.n = len(x)
initial_c = np.random.choice(list(range(self.n)),self.k,replace=False)
self.mean_list = [self.x[i] for i in initial_c]
while True:
self.c_list = [[] for _ in range(self.k)]
self.x_list = [[] for _ in range(self.k)]
for i in range(self.n):
min_dis = 9999999
for j in range(self.k):
dis = self.cauclate_dis(self.x[i], self.mean_list[j])
if min_dis > dis:
min_dis = dis
c = j
self.c_list[c].append(i)
self.x_list[c].append(self.x[i])
mean_list_ = np.array([np.mean(c,0) for c in self.x_list])
if (mean_list_ == self.mean_list).all():
break
self.mean_list = mean_list_
def predict(self):
dict_ = {}
for i in range(len(self.c_list)):
for j in self.c_list[i]:
dict_[j]=i
return [dict_[i] for i in range(self.n)]
def compute_d(self,c):
max_ = -1
for c1 in range(len(c)):
for c2 in range(len(c)):
if c1 >= c2:
max_ = max(max_,self.cauclate_dis(c[c1],c[c2]))
return max_
def compute_mean_d(self):
return np.mean([self.compute_d(i) for i in self.x_list])
def main():
x = [[0,2],[0,0],[1,0],[5,0],[5,2],[1,5],
[2,4],[3,5],[1,1],[1,4],[5,4],[5,3],[0,1],
[-1,0],[1,1],[4,4],[3,4]]
k_ = 5
KM = K_menas(k_)
KM.fit(x)
result = KM.predict()
print(result)
k_list=[]
for k in [1,2,3,4,5,6,7]:
KM = K_menas(k)
KM.fit(x)
k_list.append([k,KM.compute_mean_d()])
print(k_list)
color_list =['r','k','b','g']
for i,c in zip(range(len(result)),color_list[:len(result)]):
positive_ = np.array(x)[np.array(result) == i]
plt.scatter([k[0] for k in positive_],[k[1] for k in positive_] , c=c)
plt.show()
if __name__ == '__main__':
main()
#----------result----------------
/usr/bin/python3 /Users/zhengyanzhao/PycharmProjects/tongjixuexi/shixian2/K_means.py
[3, 3, 3, 4, 2, 0, 0, 1, 3, 0, 2, 2, 3, 3, 3, 1, 1]
[[1, 7.211102550927978], [2, 4.319596107466319], [3, 3.7863912010391503], [4, 1.9715873793431686], [5, 1.612899020449196], [6, 1.0107491837076632], [7, 1.0337239967856842]]