# See "Machine Learning in Action" (《机器学习实战》).
# -*- coding:cp936 -*-
#===============================================================================
# 设计KNN最近邻分类器:
# 找出每个元素在数据集中的最近邻的K个数据,统计这K个数据所属的类,所属类最多的那个类就是该元素所属的类
#===============================================================================
import numpy as np def loadHaiLunData(f_name):
with open(f_name) as fHandle:
fLines = fHandle.readlines()
dataLines = len(fLines)
label = []
dataSetMat = np.zeros((dataLines,3))
for i in range(dataLines):
lineList = fLines[i].strip().split('\t')
dataSetMat[i,:] = lineList[0:3]
label.append(int(lineList[-1]))
return dataSetMat,label def dataNorm(dataSet):
numOfEle = dataSet.shape[0]
minEle = dataSet.min(0)
maxEle = dataSet.max(0)
normedData = (dataSet-np.tile(minEle,(numOfEle,1)))/np.tile(maxEle-minEle,(numOfEle,1))
return normedData def classifyKnn(inX, dataSet, label, k):
#===========================================================================
# inX:输入向量
# dataSet:保存数据特征的数组,每一行为若干个特征的参数,与label对应
# label:表明当前这个数据集中的每一个元素属于哪一类
# k:设定最近邻的个数
#=========================================================================== #首先对数据集进行归一化
# dataSet = dataNorm(dataSet)
numOfEle = dataSet.shape[0]
index = 0
diffDistance = dataSet - np.tile(inX, (numOfEle,1))
diffDistance = diffDistance**2
squareDistance = diffDistance.sum(1)
# squareDistance = squareDistance**0.5
knnIndex = squareDistance.argsort()
#统计最近的k个近邻的label,看哪个label类别最多就可将该训练元素判为对应类
staticDict = {}
for i in range(k):
staticDict[label[knnIndex[i]]]=staticDict.get(label[knnIndex[i]],0)+1
itemList = staticDict.items()
argmax = np.argmax(itemList, axis = 0)
return itemList[argmax[1]][0] def testHaiLunClassify(k = 3, hRatio = 0.5):
dataSet,label = loadHaiLunData('datingTestSet2.txt')
# hRatio = 0.5
totalNum = dataSet.shape[0]
testNum = int(totalNum*hRatio)
dataNormed = dataNorm(dataSet)
errorClass = 0
for i in range(testNum):
classRes = classifyKnn(dataNormed[i,:], dataNormed[testNum:,:], label[testNum:], k)
if classRes != label[i]:
errorClass += 1
# print "classify error, No. %d should be label %d but got %d"%(i, label[i],classRes)
errorRate = errorClass/float(testNum)
# print "Error rate: %f"%(errorRate)
return errorRate if __name__ == '__main__':
errorList = []
kRange = range(1,50,1)
for k in kRange:
errorList.append(testHaiLunClassify(k))
print errorList
import matplotlib.pyplot as plt
fig = plt.figure(1)
# ax = fig.add_subplot(111)
plt.plot(kRange, errorList,'rs-')
plt.show()