Statistical Learning Methods, Chapter 14 Homework: Clustering - Hierarchical (Agglomerative/Divisive) and K-means Algorithms, Code Implementation

Hierarchical clustering: agglomerative and divisive algorithms

import numpy as np
import copy
import matplotlib.pyplot as plt

class Hierarchical_cluster:

    def __init__(self,k=None,p=2,dis_way='min',c_way='agg'):
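        # k: number of clusters to return; p: order of the Minkowski distance
        # dis_way: inter-cluster distance, 'min' (single linkage) or 'mean' (centroid distance)
        # c_way: 'agg' for agglomerative clustering, 'div' for divisive clustering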
        self.k = k
        self.p = p
        self.dis_way = dis_way
        self.c_way = c_way

    def cauclate_dis(self, x1, x2):
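        # Minkowski distance of order p between two samples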
        return np.sum(abs(x1 - x2) ** self.p) ** (1 / self.p)

    def create_D_matrix(self):
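        # symmetric n x n matrix of pairwise sample distances (diagonal is zero)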
        self.D = np.zeros((self.n,self.n))
        for i in range(self.n):
            for j in range(self.n):
                if i==j:
                    self.D[i][j] = 0
                if i>j:
                    d = self.cauclate_dis(self.x[i],self.x[j])
                    self.D[i][j] = d
                    self.D[j][i] = d

    def cauclate_cluster_dis(self,c1,c2):
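        # inter-cluster distance: 'min' is single linkage, 'mean' is the distance between cluster centroids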
        if self.dis_way == 'min':
            min_ = float('inf')
            for i in c1:
                for j in c2:
                    min_ = min(min_,self.D[i][j])
            return min_

        if self.dis_way == 'mean':
            mean1 = np.mean([self.x[i] for i in c1],axis=0)
            mean2 = np.mean([self.x[i] for i in c2],axis=0)
            return self.cauclate_dis(mean1,mean2)

    def split_C(self,C):
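        # split cluster C around its two farthest points; every other point joins whichever of the two is closer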
        C1 = []
        C2 = []
        max_ = 0
        for i in C:
            for j in C:
                if j >= i:
                    if self.D[i][j] >= max_:
                        max_ = self.D[i][j]
                        max_i = i
                        max_j = j
        C1.append(max_i)
        C2.append(max_j)
        for c in C:
            if c == max_i or c == max_j:
                continue
            elif self.D[max_i][c] >= self.D[max_j][c]:
                C2.append(c)
            else:
                C1.append(c)
        return C1, C2

    def fit(self,x):
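        # build the pairwise distance matrix, then merge ('agg') or split ('div') clusters
        # level by level, recording every level in self.result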
        self.x = np.array(x)
        self.n = len(x)
        self.create_D_matrix()

        if self.c_way == 'agg':
            C_way = []
            start_c = []
            for i in range(self.n):
                start_c.append([i])
            C_way.append(start_c)
            iter = 0
            while len(C_way[iter]) > 1:
                num_c = len(C_way[iter])
                c_temp = []
                dis_temp = []
                for c1 in range(num_c):
                    for c2 in range(num_c):
                        if c1 > c2:
                            c_temp.append([c1,c2])
                            dis_temp.append(self.cauclate_cluster_dis(C_way[iter][c1],C_way[iter][c2]))
                min_dis = min(dis_temp)
                min_index = dis_temp.index(min_dis)
                c1 = c_temp[min_index][0]
                c2 = c_temp[min_index][1]
                new_c = copy.deepcopy(C_way[iter])
                new_c.append(new_c[c1]+new_c[c2])
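                # c1 > c2 by construction, so deleting index c1 first leaves index c2 valid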
                del new_c[c1]
                del new_c[c2]
                C_way.append(new_c)
                iter += 1
            self.result = C_way


        if self.c_way == 'div':
            C_way = []
            new_c = list(range(self.n))
            C_way.append([new_c])
            iter = 0
            while len(C_way[iter]) < self.n:
                print(C_way)
                new_c = []
                for C in C_way[iter]:
                    if len(C) == 1 :
                        new_c.append(C)
                    else:
                        C1,C2 = self.split_C(C)
                        new_c.append(C1)
                        new_c.append(C2)
                C_way.append(new_c)
                iter+=1
            self.result = C_way

    def predict(self):
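        # pick the level of self.result containing k clusters (agg) or at least k clusters (div)
        # and return one label per sample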
        final = np.zeros(len(self.x))
        if self.c_way == 'div':
            # each divisive round splits every non-singleton cluster, so a level with exactly
            # k clusters may not exist; take the first level with at least k clusters
            result = next(c for c in self.result if len(c) >= self.k)
        if self.c_way == 'agg':
            result = self.result[-self.k]
        print(result)
        for i in range(len(result)):
            for j in result[i]:
                final[j] = i
        return list(final)

def main():
    x = [[0,2],[0,0],[1,0],[5,0],[5,2],[1,5],
         [2,4],[3,5],[1,1],[1,4],[5,4],[5,3],[0,1],
         [-1,0],[1,1],[4,4],[3,4]]
    k = 3
    HC = Hierarchical_cluster(k,dis_way='mean',c_way='agg')
    HC.fit(x)
    result = HC.predict()
    print(result)
    color_list = ['r', 'k', 'b', 'g']
    for i, c in zip(range(k), color_list):
        cluster_points = np.array(x)[np.array(result) == i]
        plt.scatter(cluster_points[:, 0], cluster_points[:, 1], c=c)
    plt.show()

if __name__ == '__main__':
    main()

#-----result-----------------
/usr/bin/python3 /Users/zhengyanzhao/PycharmProjects/tongjixuexi/shixian2/hierarchical_clustering.py
[[3], [2, 1, 14, 8, 12, 0, 13], [16, 6, 7, 9, 5, 15, 10, 11, 4]]
[1.0, 1.0, 1.0, 0.0, 2.0, 2.0, 2.0, 2.0, 1.0, 2.0, 2.0, 2.0, 1.0, 1.0, 1.0, 2.0, 2.0]
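
For a quick cross-check, the same agglomerative grouping can be reproduced with SciPy's hierarchical clustering utilities. The sketch below is only a comparison under assumptions: it requires SciPy and uses single linkage (method='single'), which corresponds to dis_way='min' rather than the centroid distance used in main() above, and fcluster returns 1-based labels, so the numbering may differ even when the grouping agrees.

import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster

x = np.array([[0,2],[0,0],[1,0],[5,0],[5,2],[1,5],
              [2,4],[3,5],[1,1],[1,4],[5,4],[5,3],[0,1],
              [-1,0],[1,1],[4,4],[3,4]])

# single-linkage dendrogram, comparable to Hierarchical_cluster(dis_way='min', c_way='agg')
Z = linkage(x, method='single')
# cut the dendrogram so that at most 3 clusters remain
labels = fcluster(Z, t=3, criterion='maxclust')
print(labels)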

K-means clustering algorithm

import numpy as np
import matplotlib.pyplot as plt

class K_means:
    def __init__(self,k=None,p=2):
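        # k: number of clusters; p: order of the Minkowski distance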
        self.k = k
        self.p = p

    def cauclate_dis(self, x1, x2):
        return np.sum(abs(x1 - x2) ** self.p) ** (1 / self.p)

    def fit(self,x):
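        # pick k random samples as initial centroids, then alternate assignment and
        # centroid updates until the centroids stop changing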
        self.x = np.array(x)
        self.n = len(x)
        initial_c = np.random.choice(list(range(self.n)),self.k,replace=False)
        self.mean_list = [self.x[i] for i in initial_c]

        while True:
            self.c_list = [[] for _ in range(self.k)]
            self.x_list = [[] for _ in range(self.k)]
            for i in range(self.n):
                min_dis = float('inf')
                for j in range(self.k):
                    dis = self.cauclate_dis(self.x[i], self.mean_list[j])
                    if min_dis > dis:
                        min_dis = dis
                        c = j
                self.c_list[c].append(i)
                self.x_list[c].append(self.x[i])
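            # recompute each centroid as the mean of its assigned points (assumes no cluster ends up empty)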
            mean_list_ = np.array([np.mean(c,0) for c in self.x_list])
            if (mean_list_ == self.mean_list).all():
                break
            self.mean_list = mean_list_

    def predict(self):
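        # map each sample index to its cluster label and return the labels in input order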
        dict_ = {}
        for i in range(len(self.c_list)):
            for j in self.c_list[i]:
                dict_[j]=i
        return [dict_[i] for i in range(self.n)]

    def compute_d(self,c):
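        # diameter of one cluster: the largest pairwise distance between its points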
        max_ = -1
        for c1 in range(len(c)):
            for c2 in range(len(c)):
                if c1 >= c2:
                    max_ = max(max_,self.cauclate_dis(c[c1],c[c2]))
        return max_

    def compute_mean_d(self):
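        # average diameter over all k clusters, useful for comparing different choices of k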
        return np.mean([self.compute_d(i) for i in self.x_list])

def main():
    x = [[0,2],[0,0],[1,0],[5,0],[5,2],[1,5],
         [2,4],[3,5],[1,1],[1,4],[5,4],[5,3],[0,1],
         [-1,0],[1,1],[4,4],[3,4]]
    k_ = 5
    KM = K_means(k_)
    KM.fit(x)
    result = KM.predict()
    print(result)

    k_list=[]
    for k in [1,2,3,4,5,6,7]:
        KM = K_means(k)
        KM.fit(x)
        k_list.append([k,KM.compute_mean_d()])
    print(k_list)

    color_list = ['r', 'k', 'b', 'g', 'm']
    for i, c in zip(range(k_), color_list):
        cluster_points = np.array(x)[np.array(result) == i]
        plt.scatter(cluster_points[:, 0], cluster_points[:, 1], c=c)
    plt.show()

if __name__ == '__main__':
    main()

#----------result----------------
/usr/bin/python3 /Users/zhengyanzhao/PycharmProjects/tongjixuexi/shixian2/K_means.py
[3, 3, 3, 4, 2, 0, 0, 1, 3, 0, 2, 2, 3, 3, 3, 1, 1]
[[1, 7.211102550927978], [2, 4.319596107466319], [3, 3.7863912010391503], [4, 1.9715873793431686], [5, 1.612899020449196], [6, 1.0107491837076632], [7, 1.0337239967856842]]
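
The k_list printed above pairs each k with the average cluster diameter from compute_mean_d; the diameter shrinks as k grows, and the bend ("elbow") in that curve is a common heuristic for picking k. A minimal plotting sketch for that curve follows; the diameter values are taken (rounded) from the run above and will vary between runs because of the random centroid initialization.

import matplotlib.pyplot as plt

k_values = [1, 2, 3, 4, 5, 6, 7]
# average cluster diameters from the k_list output above (rounded); rerunning gives slightly different numbers
mean_diameters = [7.211, 4.320, 3.786, 1.972, 1.613, 1.011, 1.034]

plt.plot(k_values, mean_diameters, marker='o')
plt.xlabel('k')
plt.ylabel('mean cluster diameter')
plt.show()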