以商圈为例,进行层次聚类,k-means聚类,T-SNE,PCA降维
data的地址:链接: https://pan.baidu.com/s/1A95Dg1O3ovnkpST1c1EFLw 密码: i3cc
层次聚类,k-means聚类,T-SNE,PCA降维
1.读取数据
import pandas as pd
# Load the business-circle spreadsheet into the DataFrame that every later
# step (normalisation, clustering, t-SNE, PCA) reads from.
df = pd.read_excel(io="business_circle.xls")
1.1归一化
# Z-score standardise the columns at positions 1-4 and store each result in a
# new column labelled with the integer position (1, 2, 3, 4); the rest of the
# script selects the features through those labels (df[[1, 2, 3, 4]]).
# NOTE(review): assumes the sheet has at least five columns and that the
# columns at positions 1-4 are numeric - confirm against business_circle.xls.
for i in range(1, 5):
    column = df.iloc[:, i]
    # Series.std() uses the sample standard deviation (ddof=1) by default.
    df[i] = (column - column.mean()) / column.std()
2.层次聚类
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.cluster.hierarchy as sch #用于进行层次聚类,画层次聚类图的工具包
import scipy.spatial.distance as ssd
from scipy.cluster.vq import vq,kmeans,whiten
import numpy as np
# Hierarchical clustering on the four standardised feature columns.
# pdist returns the condensed matrix of pairwise Euclidean distances between
# rows. It is called through the scipy.spatial.distance alias (ssd) imported
# above rather than through the undocumented `sch.distance` re-export.
disMat = ssd.pdist(df[[1, 2, 3, 4]], 'euclidean')
# Build the linkage matrix with average linkage.
Z = sch.linkage(disMat, method='average')
# Draw the dendrogram on the current matplotlib axes. P receives the value
# returned by dendrogram(); nothing is written to disk by this call.
P = sch.dendrogram(Z)
# Derive flat cluster labels from the linkage matrix Z using the
# 'inconsistent' criterion with threshold t=1.
cluster = sch.fcluster(Z, t=1, criterion='inconsistent')
print("Original cluster by hierarchy clustering:\n", cluster)
3.k-means聚类
# Choose the number of clusters: fit K-Means for k = 2..9 and record the
# inertia of each fit so the values can be plotted against k afterwards.
from sklearn.cluster import KMeans

list_for_best = []    # inertia of the fitted model for each k
x_list_for_plot = []  # the k values, in the same order as list_for_best
for i in range(2, 10):
    # The n_jobs argument was removed from KMeans in scikit-learn 1.0 and
    # raises TypeError there, so it is no longer passed.
    kmeans = KMeans(n_clusters=i, max_iter=3000, n_init=40, init='k-means++')
    kmeans.fit(df[[1, 2, 3, 4]])
    print("inertia: {}".format(kmeans.inertia_))
    list_for_best.append(kmeans.inertia_)
    x_list_for_plot.append(i)
import matplotlib.pyplot as plt
# Elbow plot used to pick the number of clusters: inertia (list_for_best)
# against the cluster counts that were tried (x_list_for_plot).
plt.figure(figsize=(18, 4))
plt.plot(x_list_for_plot, list_for_best, label="error", color="red", linewidth=1)
# The x axis holds the number of clusters tried, not a feature count, so it
# is labelled accordingly.
plt.xlabel("n_clusters")
plt.ylabel("error")
plt.legend()
plt.show()
# Per the original author's reading of the inertia plot, the elbow is at
# k = 3, so the final model uses three clusters.
# (Original author's note: also take a look at k = 10 and k = 7.)
kmeans = KMeans(n_clusters=3).fit(df[[1, 2, 3, 4]])
label_pred = kmeans.labels_          # cluster label of every row
centroids = kmeans.cluster_centers_  # cluster centres
inertia = kmeans.inertia_            # value of the clustering criterion
label_pred
4.TSNE降维
'''
6、可视化
'''
import matplotlib.pyplot as plt
# TSNE was used below without ever being imported; import it here so the
# block does not fail with NameError.
from sklearn.manifold import TSNE

# Reduce the four standardised features to two dimensions with t-SNE.
# (Original author's note: more accurate than PCA, but slower.)
tsne = TSNE(n_components=2)
decomposition_data = tsne.fit_transform(df[[1, 2, 3, 4]])
# Split the embedding into its two coordinate columns.
x = list(decomposition_data[:, 0])
y = list(decomposition_data[:, 1])
fig = plt.figure(figsize=(10, 10))
ax = plt.axes()
# Colour every point by the label of the K-Means model held in `kmeans`.
plt.scatter(x, y, c=kmeans.labels_, marker="x")
# Hide the tick marks on both axes.
plt.xticks(())
plt.yticks(())
# The figure is written to disk instead of being shown interactively.
# `aspect` is not a savefig() argument (current matplotlib raises TypeError
# for it), so it is dropped; call ax.set_aspect(1) before saving if an equal
# aspect ratio is actually wanted.
plt.savefig('./sample.png')
5.PCA降维
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline
# make_blobs is imported from sklearn.datasets: the private
# sklearn.datasets.samples_generator module was removed in scikit-learn 0.24
# and importing from it raises ImportError on current releases.
from sklearn.datasets import make_blobs
from sklearn.decomposition import PCA

# Fit a two-component PCA on the four standardised feature columns.
pca = PCA(n_components=2)
pca.fit(df[[1, 2, 3, 4]])
5.1解释的比例
# Share of the variance explained by each fitted principal component; as a
# bare expression this is only displayed when run as the last line of a
# notebook cell.
pca.explained_variance_ratio_