<第一周> city中国城市聚类 testdata学生上网聚类 例子

### 中国城市聚类

# -*- coding: utf-8 -*-
kmeans算法
"""
Created on Thu May 18 22:55:45 2017 @author: sfzyk
"""
import numpy as np
#import sklearn as skl
from sklearn.cluster import KMeans
import os
# Switch the working directory to the folder holding the data files, so the
# relative file name passed to loaddata() later in this script resolves.
# NOTE(review): hard-coded, machine-specific Windows path -- adjust locally.
os.chdir(r"D:\mechine_learning\mooc_data")
def loaddata(file):
    """Read a comma-separated table into row labels and numeric feature rows.

    Every non-blank line is split on ``","``; the first field is taken as the
    row label and all remaining fields are converted to ``float``.

    Args:
        file: Path of the text file to read. It is opened with the platform
            default encoding, exactly as before.

    Returns:
        A ``(city_name, city_data)`` tuple: the list of first-column labels
        and a parallel list holding one ``list[float]`` per row.

    Raises:
        ValueError: If a field after the first cannot be parsed as a float.
    """
    city_name = []
    city_data = []
    # The context manager closes the handle even if parsing fails midway.
    with open(file) as fr:
        for line in fr:
            # Skip blank lines (e.g. a trailing empty line at end of file);
            # they would otherwise add a "\n" label and an empty feature row.
            if not line.strip():
                continue
            name, *values = line.split(",")
            city_name.append(name)
            # float() ignores surrounding whitespace, including the "\n"
            # still attached to the last field of the line.
            city_data.append([float(value) for value in values])
    return city_name, city_data
# Load the consumption table and partition the rows into 10 k-means clusters.
city_name, city_data = loaddata("31省市居民家庭消费水平-city.txt")
km = KMeans(n_clusters=10)
label = km.fit_predict(city_data)

# Sum every feature of each cluster centre to get one total per cluster.
expenses = np.sum(km.cluster_centers_, axis=1)

# One bucket per cluster, seeded with that cluster's total so the buckets can
# be ordered by it afterwards; member names are appended behind the total.
city_cluster = [[expenses[idx]] for idx in range(km.n_clusters)]
for name, cluster_id in zip(city_name, label):
    city_cluster[cluster_id].append(name)

# Print the clusters from the lowest to the highest centre total.
city_cluster.sort(key=lambda bucket: bucket[0])
for bucket in city_cluster:
    print(bucket)

### 学生上网数据聚类

DBSCAN算法

# -*- coding: utf-8 -*-
"""
Created on Mon May 22 16:24:53 2017 @author: sfzyk
"""
import numpy as np
import sklearn as skl
from sklearn import metrics
import matplotlib.pyplot as plt
# Import the submodule explicitly: a bare `import sklearn` is not guaranteed
# to expose `sklearn.cluster` as an attribute of the package.
from sklearn.cluster import DBSCAN

# Maps each MAC address to the index of its record in `onlinetimes`.
mac2id = {}
onlinetimes = []
# The encoding has to be passed to open() itself; the `# -*- coding -*-` line
# at the top of a script only declares the encoding of the source file.
with open("学生月上网时间分布-TestData.txt", encoding='utf-8') as f:
    for line in f:
        fields = line.split(',')
        mac = fields[2]
        onlinetime = int(fields[6])
        # Text between the first space and the first colon of field 4
        # (presumably the hour of a "<date> HH:MM:SS" timestamp -- confirm).
        starttime = int(fields[4].split(' ')[1].split(':')[0])
        if mac not in mac2id:
            mac2id[mac] = len(onlinetimes)
            onlinetimes.append((starttime, onlinetime))
        else:
            # Overwrite with the latest record for this MAC. Store a flat
            # tuple like the branch above; wrapping it in a list made the
            # rows ragged and broke the array conversion below.
            onlinetimes[mac2id[mac]] = (starttime, onlinetime)

real_X = np.array(onlinetimes).reshape((-1, 2))
# Cluster on the first column only (start time), kept 2-D as DBSCAN expects.
X = real_X[:, 0:1]
dbscan = DBSCAN(eps=0.03, min_samples=20).fit(X)
labels = dbscan.labels_

# DBSCAN marks noise samples with the label -1.
ratio = len(labels[labels[:] == -1]) / len(labels)
print("noise ratio %f"%ratio)

# The noise label is not a cluster, so leave it out of the count.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
print("Estimated number of clusters:%d "%n_clusters_)
print("Silhouette coefficient:%0.3f"%metrics.silhouette_score(X, labels))

for i in range(n_clusters_):
    print("Clusters ", i, ":")
    # flatten() turns the (n, 1) column into a flat sequence for printing.
    print(list(X[labels == i].flatten()))

# Histogram of the clustered column with 24 bins.
# NOTE(review): outside an interactive/inline backend, plt.show() is needed
# for the figure to appear.
plt.hist(X, 24)

这里的 Silhouette coefficient 即轮廓系数,用于评价聚类效果:取值范围为 [-1, 1],越接近 1 表示聚类效果越好。

上一篇:使用K近邻算法改进约会网站的配对效果


下一篇:mysql查看字段注释(帮助信息)指令