# See 《机器学习实战》 ("Machine Learning in Action").
1 # -*- coding:cp936 -*- 2 #=============================================================================== 3 # 设计KNN最近邻分类器: 4 # 找出每个元素在数据集中的最近邻的K个数据,统计这K个数据所属的类,所属类最多的那个类就是该元素所属的类 5 #=============================================================================== 6 import numpy as np 7 8 def loadHaiLunData(f_name): 9 with open(f_name) as fHandle: 10 fLines = fHandle.readlines() 11 dataLines = len(fLines) 12 label = [] 13 dataSetMat = np.zeros((dataLines,3)) 14 for i in range(dataLines): 15 lineList = fLines[i].strip().split(‘\t‘) 16 dataSetMat[i,:] = lineList[0:3] 17 label.append(int(lineList[-1])) 18 return dataSetMat,label 19 20 21 def dataNorm(dataSet): 22 numOfEle = dataSet.shape[0] 23 minEle = dataSet.min(0) 24 maxEle = dataSet.max(0) 25 normedData = (dataSet-np.tile(minEle,(numOfEle,1)))/np.tile(maxEle-minEle,(numOfEle,1)) 26 return normedData 27 28 def classifyKnn(inX, dataSet, label, k): 29 #=========================================================================== 30 # inX:输入向量 31 # dataSet:保存数据特征的数组,每一行为若干个特征的参数,与label对应 32 # label:表明当前这个数据集中的每一个元素属于哪一类 33 # k:设定最近邻的个数 34 #=========================================================================== 35 36 #首先对数据集进行归一化 37 # dataSet = dataNorm(dataSet) 38 numOfEle = dataSet.shape[0] 39 index = 0 40 diffDistance = dataSet - np.tile(inX, (numOfEle,1)) 41 diffDistance = diffDistance**2 42 squareDistance = diffDistance.sum(1) 43 # squareDistance = squareDistance**0.5 44 knnIndex = squareDistance.argsort() 45 #统计最近的k个近邻的label,看哪个label类别最多就可将该训练元素判为对应类 46 staticDict = {} 47 for i in range(k): 48 staticDict[label[knnIndex[i]]]=staticDict.get(label[knnIndex[i]],0)+1 49 itemList = staticDict.items() 50 argmax = np.argmax(itemList, axis = 0) 51 return itemList[argmax[1]][0] 52 53 def testHaiLunClassify(k = 3, hRatio = 0.5): 54 dataSet,label = loadHaiLunData(‘datingTestSet2.txt‘) 55 # hRatio = 0.5 56 totalNum = dataSet.shape[0] 57 testNum = int(totalNum*hRatio) 58 dataNormed = dataNorm(dataSet) 59 errorClass = 0 60 for 
i in range(testNum): 61 classRes = classifyKnn(dataNormed[i,:], dataNormed[testNum:,:], label[testNum:], k) 62 if classRes != label[i]: 63 errorClass += 1 64 # print "classify error, No. %d should be label %d but got %d"%(i, label[i],classRes) 65 errorRate = errorClass/float(testNum) 66 # print 67 # print "Error rate: %f"%(errorRate) 68 return errorRate 69 70 if __name__ == ‘__main__‘: 71 errorList = [] 72 kRange = range(1,50,1) 73 for k in kRange: 74 errorList.append(testHaiLunClassify(k)) 75 print errorList 76 import matplotlib.pyplot as plt 77 fig = plt.figure(1) 78 # ax = fig.add_subplot(111) 79 plt.plot(kRange, errorList,‘rs-‘) 80 plt.show() 81