导入库
import time
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
读入数据
data = pd.read_csv('./mushrooms.csv') # 5 rows × 22 columns
'''
查看数据基本情况
data.head()
data.info()
data.describe()
'''
画图
plt.figure(figsize=(30, 150))
column_per_line = 3 # 每一列三张图
columns = len(data.columns)
for i, attr in enumerate(data.columns):
plt.subplot(8,column_per_line,i+1)
plt.hist(data.iloc[:,i])
plt.title(attr.capitalize())
font = {'family' : 'Times New Roman',
'weight' : 'normal',
'size' : 16,
}
plt.show()
veil-type
列属性只有一个值,不具备区分度,删除该列
data.drop('veil-type',axis=1,inplace=True)
对表格内其他内容进行编码
# 二元属性编码
le = LabelEncoder()
for col in data.columns:
if len(set(data[col]))==2: # 若改列只有两个值,则对其进行转换
data[col] = le.fit_transform(data[col])
# 多元属性编码
data = pd.get_dummies(data)
'''
# 查看每列有哪几种元素
for i in data.columns:
print(i,data[i].unique())
'''
对数据进行聚类
def cluster(X):
score = -1
duration = 0
start = time.time()
for clusters_num in range(2, ?): # 聚类数目2-? 其中,?自定义
kmeans = KMeans(n_clusters=clusters_num).fit(data) # KMeans聚类
test_preds = kmeans.predict(data)
test_score = silhouette_score(data,test_preds) # 计算轮廓系数
print("component score is ", test_score)
if (score < test_score):
best_clustersnum = clusters_num
score = test_score
best_clusters=kmeans
end = time.time()
duration += end - start
print('耗时: {:.4f}s'.format(duration))
print('当k为{}时,轮廓系数最高为{:.3f}'.format(best_clustersnum, score))
return best_clusters, score
clu, score = cluster(data)
降维
pca=PCA(n_components=20)
pca.fit(data)
components = pd.DataFrame(np.round(pca.components_, 4), columns = [data.keys()])
components.plot.bar(figsize=(25, 5))
plt.show()
'''
维度达到一百多,画出来的图很可怕
'''
计算各主成分的方差解释比
var_ratio = pca.explained_variance_ratio_
print(sum(var_ratio))
逐个计算var_ratio中的各元素之和,当至少取前多少个元素时,和大于0.7?
var_sum=0
for comp in range(data.shape[1]):
pca_comp=PCA(n_components=comp+1)
pca_comp.fit(data)
var_ratio = pca_comp.explained_variance_ratio_
var_sum=sum(var_ratio)
print(comp+1,var_sum)
'''
1 0.1716352224987523
2 0.2936478973280815
3 0.3816348269514529
...
N 0.71xxxxxxxxxxxxxx
N即为所求
'''
聚类效果绘制
import visuals as vs # 需使用visuals.py
preds = clu.predict(data)
centers = KMeans(n_clusters=?).fit(data_pca).cluster_centers_ # n_cluster=? 上一步轮廓系数最高的数目
vs.cluster_results(data_pca,preds,centers)
结果示意图
使用数据下载 https://download.csdn.net/download/weixin_42038760/19646606