数据挖掘与分析练习——蘑菇(涉及聚类、降维)

导入库

import time
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

读入数据

data = pd.read_csv('./mushrooms.csv') # 5 rows × 22 columns
'''
查看数据基本情况
data.head()
data.info()
data.describe()
'''

画图

plt.figure(figsize=(30, 150))
column_per_line = 3 # 每一列三张图
columns = len(data.columns)

for i, attr in enumerate(data.columns):
    plt.subplot(8,column_per_line,i+1)
    plt.hist(data.iloc[:,i])
    plt.title(attr.capitalize())

font = {'family' : 'Times New Roman',
        'weight' : 'normal',
        'size'   : 16,
        }

plt.show()

veil-type列属性只有一个值,不具备区分度,删除该列

data.drop('veil-type',axis=1,inplace=True)

对表格内其他内容进行编码

# 二元属性编码
le = LabelEncoder()
for col in data.columns:
    if len(set(data[col]))==2: # 若改列只有两个值,则对其进行转换
        data[col] = le.fit_transform(data[col])
# 多元属性编码
data = pd.get_dummies(data)

'''
# 查看每列有哪几种元素
for i in data.columns:
    print(i,data[i].unique())
'''

对数据进行聚类

def cluster(X):
    score = -1
    duration = 0
    start = time.time()
    for clusters_num in range(2, ?): # 聚类数目2-? 其中,?自定义
        kmeans = KMeans(n_clusters=clusters_num).fit(data) # KMeans聚类
        test_preds = kmeans.predict(data)
        test_score = silhouette_score(data,test_preds) # 计算轮廓系数
        print("component score is ", test_score)

        if (score < test_score):
            best_clustersnum = clusters_num
            score = test_score
            best_clusters=kmeans

    end = time.time()
    duration += end - start

    print('耗时: {:.4f}s'.format(duration))
    print('当k为{}时,轮廓系数最高为{:.3f}'.format(best_clustersnum, score))

    return best_clusters, score

clu, score = cluster(data)

降维

pca=PCA(n_components=20)
pca.fit(data)

components = pd.DataFrame(np.round(pca.components_, 4), columns = [data.keys()])
components.plot.bar(figsize=(25, 5))
plt.show()
'''
维度达到一百多,画出来的图很可怕
'''

计算各主成分的方差解释比

var_ratio =  pca.explained_variance_ratio_
print(sum(var_ratio))

逐个计算var_ratio中的各元素之和,当至少取前多少个元素时,和大于0.7?

var_sum=0
for comp in range(data.shape[1]):
    pca_comp=PCA(n_components=comp+1)
    pca_comp.fit(data)
    var_ratio =  pca_comp.explained_variance_ratio_
    var_sum=sum(var_ratio)
    print(comp+1,var_sum)
'''
1 0.1716352224987523
2 0.2936478973280815
3 0.3816348269514529
...
N 0.71xxxxxxxxxxxxxx
N即为所求
'''

聚类效果绘制

import visuals as vs # 需使用visuals.py
preds = clu.predict(data)
centers = KMeans(n_clusters=?).fit(data_pca).cluster_centers_ # n_cluster=? 上一步轮廓系数最高的数目
vs.cluster_results(data_pca,preds,centers)

结果示意图
数据挖掘与分析练习——蘑菇(涉及聚类、降维)

使用数据下载 https://download.csdn.net/download/weixin_42038760/19646606

上一篇:【Java题解】面试题 03.02. 栈的最小值


下一篇:232. 用栈实现队列