层次聚类【python,机器学习,算法】
import matplotlib.pyplot as plt
import numpy as np
# Seed the global NumPy RNG so that every run draws exactly the same samples.
np.random.seed(0)
# Three blobs of 30 two-dimensional points each; every blob is random noise
# shifted to its own centre. The blobs are drawn in order: 1, 2, 3.
blob_centres = ([0, 7], [8, 0], [8, 8])
cluster1, cluster2, cluster3 = [
    np.random.randn(30, 2) + np.array(centre) for centre in blob_centres
]
# Stack the blobs row-wise (vertically) into one (90, 2) data set.
data = np.vstack((cluster1, cluster2, cluster3))
# 1. Start with every data point in its own single-member cluster.
def initialize_clusters(data):
    """Wrap each element of ``data`` in a one-element list.

    Args:
        data: Iterable of points.

    Returns:
        A list holding one single-member cluster (itself a list) per point,
        in iteration order.
    """
    singletons = []
    for sample in data:
        singletons.append([sample])
    return singletons
# 2. Pairwise distances between cluster centroids.
def compute_distances(clusters):
    """Return the matrix of Euclidean distances between cluster centroids.

    Args:
        clusters: Sequence of clusters; each cluster is a non-empty sequence
            of coordinate vectors that all have the same length.

    Returns:
        A ``(len(clusters), len(clusters))`` float array whose ``[i][j]``
        entry is the Euclidean distance between the centroids (per-axis
        means) of clusters ``i`` and ``j``. The diagonal is zero.
    """
    count = len(clusters)
    if count == 0:
        # Nothing to compare; keep the empty (0, 0) result shape.
        return np.zeros((0, 0))
    # Average each cluster exactly once instead of re-computing both
    # centroids for every (i, j) pair.
    centroids = np.array([np.mean(members, axis=0) for members in clusters])
    # Broadcasting yields every pairwise difference in one step:
    # shape (count, count, dims).
    deltas = centroids[:, np.newaxis, :] - centroids[np.newaxis, :, :]
    distances = np.sqrt(np.sum(deltas ** 2, axis=-1))
    # A cluster's distance to itself is defined as 0, even if a centroid
    # holds non-finite values.
    np.fill_diagonal(distances, 0.0)
    return distances
# Locate the two clusters that are closest to each other.
def find_closest_clusters(distances):
    """Return the index pair of the smallest off-diagonal distance.

    Args:
        distances: Square, indexable matrix of pairwise distances.

    Returns:
        A tuple ``(i, j)`` with ``i != j`` for the first (row-major) entry
        holding the smallest value, or ``None`` when no off-diagonal entry
        is smaller than infinity (for example, fewer than two clusters).
    """
    best_pair = None  # indices of the closest pair seen so far
    best_value = np.inf
    size = len(distances)
    for row, row_values in enumerate(distances):
        for col in range(size):
            if row == col:
                # The diagonal is a cluster compared with itself; skip it.
                continue
            candidate = row_values[col]
            # Strict comparison keeps the earliest pair when there are ties.
            if candidate < best_value:
                best_value = candidate
                best_pair = (row, col)
    return best_pair
# 3. Merge the two closest clusters into a single new cluster.
def merge_clusters(clusters, closest_clusters):
    """Return a new cluster list with the two chosen clusters combined.

    Args:
        clusters: List of clusters, each a list of points.
        closest_clusters: Pair ``(i, j)`` of indices into ``clusters``.

    Returns:
        A new list containing every untouched cluster in its original
        order, followed by the concatenation ``clusters[i] + clusters[j]``.
        The input list is not modified.
    """
    first, second = closest_clusters
    combined = clusters[first] + clusters[second]
    # Carry over the clusters that are not part of this merge.
    survivors = []
    for position, members in enumerate(clusters):
        if position != first and position != second:
            survivors.append(members)
    survivors.append(combined)
    return survivors
def hierarchical_clustering(data, k):
    """Agglomeratively merge points until at most ``k`` clusters remain.

    Every point starts as its own cluster; on each pass the two clusters
    whose centroids are closest are merged into one.

    Args:
        data: Iterable of points (coordinate vectors).
        k: Target number of clusters; must be at least 1.

    Returns:
        A list of clusters, each a list of the points assigned to it. When
        ``data`` holds no more than ``k`` points, no merging takes place
        and every point stays in its own cluster.

    Raises:
        ValueError: If ``k`` is less than 1.
    """
    if k < 1:
        # With k < 1 the loop below would keep going once a single cluster
        # is left; no pair exists then, the closest-pair search returns
        # None, and the merge step fails with an opaque unpacking
        # TypeError. Reject the request up front instead.
        raise ValueError(f"k must be at least 1, got {k!r}")
    # Start with every data point as an independent cluster.
    clusters = initialize_clusters(data)
    # Repeatedly merge the two most similar clusters.
    while len(clusters) > k:
        # Centroid distances, then the closest pair of clusters.
        distances = compute_distances(clusters)
        closest_clusters = find_closest_clusters(distances)
        # Replace the pair with their union; each merge removes one cluster.
        clusters = merge_clusters(clusters, closest_clusters)
    return clusters
# Run the hierarchical clustering algorithm on the generated data.
k = 4
clusters = hierarchical_clustering(data, k)
# Print the clustering result, labelling the clusters from 1.
for number, members in enumerate(clusters, start=1):
    print(f"Cluster {number}: ", members)
# Plot the clustering result, one colour per cluster.
plt.figure(figsize=(8, 6))
colors = ["red", "green", "blue", "yellow"]
# Iterate over the clusters that actually exist rather than range(k), so a
# result with fewer than k clusters does not raise IndexError.
for i, cluster_points in enumerate(clusters):
    # Draw the whole cluster with a single scatter call instead of one call
    # (and one artist) per point.
    points = np.asarray(cluster_points)
    # Cycle through the palette so more clusters than colours still works.
    cluster_color = colors[i % len(colors)]
    plt.scatter(x=points[:, 0], y=points[:, 1], color=cluster_color)
plt.xlabel("X")
plt.ylabel("Y")
plt.title("Hierarchical Clustering")
plt.show()