K-means algorithm for iris data clustering

K-means 是一种经典的聚类算法,简单好用,火得一塌糊涂,对于刚刚入坑的小白们有着重要的学习价值。好了,不瞎扯淡了,上代码。

The iris dataset has 150 rows and 4 columns and contains 3 classes; each sample has 4 features. OK, let's start coding.
调包,预处理数据集

import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
import numpy as np

# Load the iris data set.
iris = load_iris()
# Feature vectors, kept in the order the data set provides them.
X = iris.data
# Class labels.
lable = iris.target
# Preprocessing: sepal area (column 0 * column 1) is used as the x axis and
# petal area (column 2 * column 3) as the y axis.
arr = np.array(X)
hua_e = np.multiply(arr[:, 0], arr[:, 1])
hua_ban = np.multiply(arr[:, 2], arr[:, 3])

主程序(其中调用的各个函数在下文依次定义)

# Main program: cluster into 3 groups, so k = 3.
k = 3
# Upper bound on the number of passes. The loop below stops on exact equality
# of two consecutive error values; the cap guarantees termination even if
# the error never settles.
max_iter = 300
b = init_data(k)
# Use the randomly drawn samples (in ascending index order) as the initial
# centroids.
test_hua_e = [hua_e[i] for i in range(len(hua_e)) if (i in b)]
test_hua_ban = [hua_ban[i] for i in range(len(hua_ban)) if (i in b)]
# True labels of the drawn samples; not used by the clustering below.
test_lable = [lable[i] for i in range(len(lable)) if (i in b)]
x = hua_e
y = hua_ban
x0 = test_hua_e
y0 = test_hua_ban
# Pass 1: assign points to the random initial centroids.
n = 0
ds = getDistance(x, y, x0, y0, k)
temp = cluster(ds, x)
temp1 = EDistance(x, y, x0, y0, k)
n = n + 1
# Pass 2: move the centroids to the cluster means and reassign.
center = cent(temp)
x0 = center[0]
y0 = center[1]
ds = getDistance(x, y, x0, y0, k)
temp = cluster(ds, x)
temp2 = EDistance(x, y, x0, y0, k)
n = n + 1
# Compare the error of two consecutive passes; keep iterating while it still
# changes and the iteration cap has not been reached.
while np.abs(temp2 - temp1) != 0 and n < max_iter:
    temp1 = temp2
    center = cent(temp)
    x0 = center[0]
    y0 = center[1]
    ds = getDistance(x, y, x0, y0, k)
    temp = cluster(ds, x)
    temp2 = EDistance(x, y, x0, y0, k)
    n = n + 1
    print(n, temp2)
if np.abs(temp2 - temp1) != 0:
    # Only reachable when the cap stopped the loop before the error settled.
    print("警告: 达到最大迭代次数, 误差仍未稳定")
# Visualise the result.
print("迭代次数: ", n)  # total number of passes
print('质心位置:', x0, y0)
plt.scatter(x0, y0, color='r', s=50, marker='s')
plt.scatter(x, y, c=temp, s=25, marker='o')
plt.show()

初始化数据

# Randomly pick k distinct sample indices.
def init_data(k, n_samples=150):
    """Draw ``k`` distinct random indices to use as initial centroids.

    Args:
        k: Number of distinct indices to draw.
        n_samples: Indices are drawn from ``[0, n_samples)``. Defaults to
            150, the range this function previously hard-coded.

    Returns:
        A set containing ``k`` distinct integers in ``[0, n_samples)``.

    Raises:
        ValueError: If ``k`` is larger than ``n_samples``; that many distinct
            indices do not exist, so the sampling loop could never finish.
    """
    if k > n_samples:
        raise ValueError(
            "cannot draw %d distinct indices from %d samples" % (k, n_samples)
        )
    b = set()
    # A set drops duplicate draws, so keep sampling until it holds k items.
    while len(b) < k:
        b.add(np.random.randint(0, n_samples))
    return b

每个点到中心点距离

def getDistance(point_x, point_y, cent_x, cent_y, k):
    """Compute the Euclidean distance from every point to every centroid.

    Args:
        point_x: Sequence of point x coordinates.
        point_y: Sequence of point y coordinates, parallel to ``point_x``.
        cent_x: Sequence of centroid x coordinates (at least ``k`` items).
        cent_y: Sequence of centroid y coordinates (at least ``k`` items).
        k: Number of centroids to measure against.

    Returns:
        A list with one entry per point; entry ``i`` is a list of ``k``
        distances, where position ``j`` is the distance from point ``i`` to
        centroid ``j``.
    """
    ds = []
    for px, py in zip(point_x, point_y):
        row = []
        for j in range(k):
            dx = px - cent_x[j]
            dy = py - cent_y[j]
            # Distances are kept at full precision: rounding them would
            # create artificial ties and could make the nearest-centroid
            # lookup pick a centroid that is actually farther away.
            row.append(np.sqrt(dx * dx + dy * dy))
        ds.append(row)
    return ds

计算每次迭代的距离误差

def EDistance(point_x, point_y, cent_x, cent_y, k):
    """Compute the squared-error of a clustering for the given centroids.

    Every point contributes the squared Euclidean distance to its nearest
    centroid, i.e. the within-cluster sum of squares under nearest-centroid
    assignment.

    Args:
        point_x: Sequence of point x coordinates.
        point_y: Sequence of point y coordinates, parallel to ``point_x``.
        cent_x: Sequence of centroid x coordinates (at least ``k`` items).
        cent_y: Sequence of centroid y coordinates (at least ``k`` items).
        k: Number of centroids to consider.

    Returns:
        The summed squared distance as a float; ``0.0`` when there are no
        points or ``k`` is not positive.
    """
    total = 0.0
    for px, py in zip(point_x, point_y):
        # Only the nearest centroid counts: adding the distance to every
        # centroid would not be the k-means objective. Terms are not
        # rounded, so small improvements between iterations stay visible.
        total += min(
            (
                (px - cent_x[j]) * (px - cent_x[j])
                + (py - cent_y[j]) * (py - cent_y[j])
                for j in range(k)
            ),
            default=0.0,
        )
    return total

计算中心点和更新中心点

# Compute the cluster centroids.
def cent(lable, k=3, point_x=None, point_y=None):
    """Return the mean position of the points in each cluster.

    Args:
        lable: Sequence of cluster indices, one per point.
        k: Number of clusters. Defaults to 3, the value this function
            previously hard-coded.
        point_x: Point x coordinates. When omitted, the module-level ``x``
            is used, as before.
        point_y: Point y coordinates. When omitted, the module-level ``y``
            is used, as before.

    Returns:
        ``[mean_x, mean_y]``: two lists of length ``k`` holding the x and y
        coordinate of each cluster's centroid.

    Raises:
        ZeroDivisionError: If some cluster has no points, so its mean is
            undefined.
    """
    if point_x is None:
        point_x = x
    if point_y is None:
        point_y = y
    mean_x = []
    mean_y = []
    for i in range(k):
        total_x = 0
        total_y = 0
        count = 0
        for label, px, py in zip(lable, point_x, point_y):
            if label == i:
                count += 1
                total_x = total_x + px
                total_y = total_y + py
        if count == 0:
            # Same exception type as the bare division would give, but with
            # a message that names the actual problem.
            raise ZeroDivisionError(
                "cluster %d has no points; cannot compute its centroid" % i
            )
        # Updated centroid of cluster i.
        mean_x.append(total_x / count)
        mean_y.append(total_y / count)
    return [mean_x, mean_y]

按K值依次聚类

def cluster(ds, x):
    """Assign each point to the centroid it is closest to.

    Args:
        ds: Per-point distance lists; ``ds[i][j]`` is the distance from
            point ``i`` to centroid ``j``.
        x: Sequence of points; only its length is used, to decide how many
            rows of ``ds`` to process.

    Returns:
        A list with one cluster index per point. When several centroids
        share the minimum distance, the lowest index is chosen.
    """
    return [ds[row].index(min(ds[row])) for row in range(len(x))]

迭代结果:
(图:迭代结果输出;原图说明文字为 "K-means algorithm for iris data clustering")
可视化结果:
(图:聚类结果散点图;原图说明文字为 "K-means algorithm for iris data clustering")

总的代码:

import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
import numpy as np
# Load the iris data set.
iris = load_iris()
# Feature vectors, kept in the order the data set provides them.
X = iris.data
# Class labels.
lable = iris.target
# Preprocessing: sepal area (column 0 * column 1) is used as the x axis and
# petal area (column 2 * column 3) as the y axis.
arr = np.array(X)
hua_e = np.multiply(arr[:, 0], arr[:, 1])
hua_ban = np.multiply(arr[:, 2], arr[:, 3])

# Randomly pick k distinct sample indices.
def init_data(k, n_samples=150):
    """Draw ``k`` distinct random indices to use as initial centroids.

    Args:
        k: Number of distinct indices to draw.
        n_samples: Indices are drawn from ``[0, n_samples)``. Defaults to
            150, the range this function previously hard-coded.

    Returns:
        A set containing ``k`` distinct integers in ``[0, n_samples)``.

    Raises:
        ValueError: If ``k`` is larger than ``n_samples``; that many distinct
            indices do not exist, so the sampling loop could never finish.
    """
    if k > n_samples:
        raise ValueError(
            "cannot draw %d distinct indices from %d samples" % (k, n_samples)
        )
    b = set()
    # A set drops duplicate draws, so keep sampling until it holds k items.
    while len(b) < k:
        b.add(np.random.randint(0, n_samples))
    return b
# Distance from every point to every centroid.
def getDistance(point_x, point_y, cent_x, cent_y, k):
    """Compute the Euclidean distance from every point to every centroid.

    Args:
        point_x: Sequence of point x coordinates.
        point_y: Sequence of point y coordinates, parallel to ``point_x``.
        cent_x: Sequence of centroid x coordinates (at least ``k`` items).
        cent_y: Sequence of centroid y coordinates (at least ``k`` items).
        k: Number of centroids to measure against.

    Returns:
        A list with one entry per point; entry ``i`` is a list of ``k``
        distances, where position ``j`` is the distance from point ``i`` to
        centroid ``j``.
    """
    ds = []
    for px, py in zip(point_x, point_y):
        row = []
        for j in range(k):
            dx = px - cent_x[j]
            dy = py - cent_y[j]
            # Distances are kept at full precision: rounding them would
            # create artificial ties and could make the nearest-centroid
            # lookup pick a centroid that is actually farther away.
            row.append(np.sqrt(dx * dx + dy * dy))
        ds.append(row)
    return ds
# Squared-error of the current clustering.
def EDistance(point_x, point_y, cent_x, cent_y, k):
    """Compute the squared-error of a clustering for the given centroids.

    Every point contributes the squared Euclidean distance to its nearest
    centroid, i.e. the within-cluster sum of squares under nearest-centroid
    assignment.

    Args:
        point_x: Sequence of point x coordinates.
        point_y: Sequence of point y coordinates, parallel to ``point_x``.
        cent_x: Sequence of centroid x coordinates (at least ``k`` items).
        cent_y: Sequence of centroid y coordinates (at least ``k`` items).
        k: Number of centroids to consider.

    Returns:
        The summed squared distance as a float; ``0.0`` when there are no
        points or ``k`` is not positive.
    """
    total = 0.0
    for px, py in zip(point_x, point_y):
        # Only the nearest centroid counts: adding the distance to every
        # centroid would not be the k-means objective. Terms are not
        # rounded, so small improvements between iterations stay visible.
        total += min(
            (
                (px - cent_x[j]) * (px - cent_x[j])
                + (py - cent_y[j]) * (py - cent_y[j])
                for j in range(k)
            ),
            default=0.0,
        )
    return total

# Compute the cluster centroids.
def cent(lable, k=3, point_x=None, point_y=None):
    """Return the mean position of the points in each cluster.

    Args:
        lable: Sequence of cluster indices, one per point.
        k: Number of clusters. Defaults to 3, the value this function
            previously hard-coded.
        point_x: Point x coordinates. When omitted, the module-level ``x``
            is used, as before.
        point_y: Point y coordinates. When omitted, the module-level ``y``
            is used, as before.

    Returns:
        ``[mean_x, mean_y]``: two lists of length ``k`` holding the x and y
        coordinate of each cluster's centroid.

    Raises:
        ZeroDivisionError: If some cluster has no points, so its mean is
            undefined.
    """
    if point_x is None:
        point_x = x
    if point_y is None:
        point_y = y
    mean_x = []
    mean_y = []
    for i in range(k):
        total_x = 0
        total_y = 0
        count = 0
        for label, px, py in zip(lable, point_x, point_y):
            if label == i:
                count += 1
                total_x = total_x + px
                total_y = total_y + py
        if count == 0:
            # Same exception type as the bare division would give, but with
            # a message that names the actual problem.
            raise ZeroDivisionError(
                "cluster %d has no points; cannot compute its centroid" % i
            )
        # Updated centroid of cluster i.
        mean_x.append(total_x / count)
        mean_y.append(total_y / count)
    return [mean_x, mean_y]


# Assign points to clusters.
def cluster(ds, x):
    """Assign each point to the centroid it is closest to.

    Args:
        ds: Per-point distance lists; ``ds[i][j]`` is the distance from
            point ``i`` to centroid ``j``.
        x: Sequence of points; only its length is used, to decide how many
            rows of ``ds`` to process.

    Returns:
        A list with one cluster index per point. When several centroids
        share the minimum distance, the lowest index is chosen.
    """
    return [ds[row].index(min(ds[row])) for row in range(len(x))]

# Main program: cluster into 3 groups, so k = 3.
k = 3
# Upper bound on the number of passes. The loop below stops on exact equality
# of two consecutive error values; the cap guarantees termination even if
# the error never settles.
max_iter = 300
b = init_data(k)
# Use the randomly drawn samples (in ascending index order) as the initial
# centroids.
test_hua_e = [hua_e[i] for i in range(len(hua_e)) if (i in b)]
test_hua_ban = [hua_ban[i] for i in range(len(hua_ban)) if (i in b)]
# True labels of the drawn samples; not used by the clustering below.
test_lable = [lable[i] for i in range(len(lable)) if (i in b)]
x = hua_e
y = hua_ban
x0 = test_hua_e
y0 = test_hua_ban
# Pass 1: assign points to the random initial centroids.
n = 0
ds = getDistance(x, y, x0, y0, k)
temp = cluster(ds, x)
temp1 = EDistance(x, y, x0, y0, k)
n = n + 1
# Pass 2: move the centroids to the cluster means and reassign.
center = cent(temp)
x0 = center[0]
y0 = center[1]
ds = getDistance(x, y, x0, y0, k)
temp = cluster(ds, x)
temp2 = EDistance(x, y, x0, y0, k)
n = n + 1
# Compare the error of two consecutive passes; keep iterating while it still
# changes and the iteration cap has not been reached.
while np.abs(temp2 - temp1) != 0 and n < max_iter:
    temp1 = temp2
    center = cent(temp)
    x0 = center[0]
    y0 = center[1]
    ds = getDistance(x, y, x0, y0, k)
    temp = cluster(ds, x)
    temp2 = EDistance(x, y, x0, y0, k)
    n = n + 1
    print(n, temp2)
if np.abs(temp2 - temp1) != 0:
    # Only reachable when the cap stopped the loop before the error settled.
    print("警告: 达到最大迭代次数, 误差仍未稳定")
# Visualise the result.
print("迭代次数: ", n)  # total number of passes
print('质心位置:', x0, y0)
plt.scatter(x0, y0, color='r', s=50, marker='s')
plt.scatter(x, y, c=temp, s=25, marker='o')
plt.show()


上一篇:Task05 使用sklearn构建完整的分类项目


下一篇:机器学习算法(一): 基于逻辑回归的分类预测