Machine Learning in Action 读书笔记
第2章 k-近邻算法
文章目录
一、k-近邻分类算法
简单地说,k-近邻算法采用测量不同特征值之间距离的方法进行分类。
(1)方法概述
- 优点:精度高、对异常值不敏感、无数据输入假定
- 缺点:计算复杂度高、空间复杂度高
- 适用数据范围:数值型和标称型
(2)k-近邻算法python代码
def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its k nearest rows of ``dataSet``.

    Args:
        inX: Input vector to classify.
        dataSet: 2-D array of training samples, one sample per row.
        labels: Sequence of class labels; ``labels[i]`` belongs to ``dataSet[i]``.
        k: Number of nearest neighbours that take part in the vote.

    Returns:
        The label that received the most votes among the k nearest samples.
    """
    # Euclidean distance from inX to every training row (inX is broadcast
    # across the rows of dataSet).
    deltas = inX - dataSet
    distances = ((deltas ** 2).sum(axis=1)) ** 0.5
    # argsort() returns the row indices ordered from nearest to farthest.
    nearestFirst = distances.argsort()

    votes = {}
    for rank in range(k):
        label = labels[nearestFirst[rank]]
        if label in votes:
            votes[label] += 1
        else:
            votes[label] = 1

    # Stable descending sort by vote count: labels with equal counts keep the
    # order in which they were first seen while walking nearest-first.
    ranking = list(votes.items())
    ranking.sort(key=lambda pair: pair[1], reverse=True)
    return ranking[0][0]
二、示例
使用k-近邻算法改进约会网站的配对效果
(1)准备数据:从文本文件中解析数据
def file2matrix(filename):
    """Parse a tab-separated text file into a feature matrix and a label list.

    Every non-blank line is expected to hold at least three numeric feature
    columns, with an integer class label in the last column.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(returnMat, classLabelVector)``: ``returnMat`` is an
        ``(n, 3)`` float array with the first three columns of each record,
        ``classLabelVector`` is a list of the ``n`` integer labels.

    Raises:
        ValueError: If a feature or label field cannot be converted to a number.
    """
    # The context manager closes the file as soon as the lines have been read.
    with open(filename) as fr:
        # strip() removes surrounding whitespace/newlines; blank lines (for
        # example a trailing empty line) are skipped instead of being parsed.
        records = [line.strip() for line in fr if line.strip()]

    returnMat = zeros((len(records), 3))
    classLabelVector = []
    for index, record in enumerate(records):
        listFromLine = record.split('\t')
        # Row `index`, all columns: the first three fields are the features.
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector
样本包含三个特征:每年获得的飞行常客里程数、玩视频游戏所耗时间百分比、每周消费的冰激凌公升数;最后一个数据为对此样本的喜欢程度(不喜欢、魅力一般、极具魅力)
(2)分析数据:使用Matplotlib创建散点图
# Analyse the data: draw a scatter plot of the dating samples with matplotlib.
fig = plt.figure()
ax = fig.add_subplot(111)  # 1x1 grid of axes, select the first (only) cell
# Plot feature columns 0 and 1; swap in columns 1 and 2 to look at the other
# feature pair. Scaling the class label by 15 drives both the marker size (s)
# and the marker colour (c), so each class is drawn differently.
ax.scatter(datingDataMat[:, 0], datingDataMat[:, 1],
           s=15.0 * array(datingLabels), c=15.0 * array(datingLabels))
plt.show()
根据样本特征:每年获得的飞行常客里程数、玩视频游戏所耗时间百分比 制作的散点图
根据样本特征:玩视频游戏所耗时间百分比、每周消费的冰激凌公升数 制作的散点图(分类更明显)
(3)准备数据:归一化数值
从样本可以看出,飞行常客里程数的值远大于其他特征值,但是这三种特征值的权重被认为同等重要,所以飞行常客里程数严重影响了计算结果,而在处理这种不同取值范围的特征值时,通常采用数值归一化方法,如将取值范围处理为0~1或-1~1之间。
def autoNorm(dataSet):
    """Rescale every feature column of ``dataSet`` to the range 0..1.

    Applies ``newValue = (oldValue - min) / (max - min)`` column by column.

    Args:
        dataSet: 2-D numeric array, one sample per row, one feature per column.

    Returns:
        A tuple ``(normDataSet, ranges, minVals)``: the normalised array, the
        per-column ``max - min`` and the per-column minimum. A column whose
        values are all equal has range 0 and is normalised to 0.
    """
    minVals = dataSet.min(0)  # axis 0: minimum of each column, not of each row
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # A constant column has range 0; divide by 1 there so the column maps to 0
    # instead of producing NaN from 0/0.
    divisors = where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column vectors to every row; the division
    # is element-wise.
    normDataSet = (dataSet - minVals) / divisors
    return normDataSet, ranges, minVals
(4)测试算法:验证分类器
def datingClassTest(hoRatio=0.10, filename='input/datingTestSet2.txt', k=3):
    """Run a hold-out test of the k-NN classifier on the dating data set.

    The first ``hoRatio`` share of the normalised records is classified
    against the remaining records, and every prediction is printed next to
    the real label.

    Args:
        hoRatio: Fraction of the records held out for testing.
        filename: Path of the tab-separated data file to load.
        k: Number of neighbours used by the classifier.

    Returns:
        The error rate (misclassified / tested) as a float, or 0.0 when the
        hold-out set is empty.
    """
    datingDataMat, datingLabels = file2matrix(filename)
    normMat, _, _ = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    errorCount = 0.0
    for i in range(numTestVecs):
        # Rows [0, numTestVecs) are test vectors; the rest is the training set.
        classifierResult = classify0(normMat[i, :], normMat[numTestVecs:m, :],
                                     datingLabels[numTestVecs:m], k)
        print("the classifier came back with:%d,the real answer is:%d"
              % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    # Guard against an empty hold-out set (too few records or hoRatio of 0).
    errorRate = errorCount / float(numTestVecs) if numTestVecs else 0.0
    print("the total error rate is:%f" % errorRate)
    # Returned so that callers can use the rate instead of only reading it
    # from standard output.
    return errorRate
(5)使用算法:构建完整可用系统
def classifyPerson():
    """Prompt for one person's three features and print the predicted liking level.

    Reads three numbers from standard input, normalises them with the minimum
    and range of the training data, classifies the result with k = 3 and
    prints the matching description.
    """
    likingLevels = ['not at all', 'in small doses', 'in large doses']
    gameShare = float(input("percentage of time spend playing video games?"))
    flierMiles = float(input("frequent flier miles earned per year?"))
    iceCreamLiters = float(input("liters of ice cream consumed per year?"))

    samples, sampleLabels = file2matrix('input/datingTestSet2.txt')
    normalised, spans, lows = autoNorm(samples)

    # The query is scaled exactly like the training data before classification.
    query = (array([flierMiles, gameShare, iceCreamLiters]) - lows) / spans
    predicted = classify0(query, normalised, sampleLabels, 3)
    # The label is shifted by one to index the description list
    # (presumably labels run 1..3 - confirm against the data file).
    print("you will probably like this person:", likingLevels[predicted - 1])
三、约会网站配对分类完整代码
from numpy import *
import operator
import matplotlib
import matplotlib.pyplot as plt
def createDataSet():
    """Return a tiny hand-made data set for trying out the classifier.

    Returns:
        A tuple ``(group, labels)``: a 4x2 array of points and the list of
        their class labels, in the same order.
    """
    samples = [
        ([1.0, 1.1], 'A'),
        ([1.0, 1.0], 'A'),
        ([0, 0], 'B'),
        ([0, 0.1], 'B'),
    ]
    group = array([point for point, _ in samples])
    labels = [label for _, label in samples]
    return group, labels
def classify0(inX, dataSet, labels, k):
    """k-nearest-neighbours: classify ``inX`` by majority vote of its k nearest rows.

    Args:
        inX: Input vector to classify.
        dataSet: 2-D array of training samples, one sample per row.
        labels: Sequence of class labels; ``labels[i]`` belongs to ``dataSet[i]``.
        k: Number of nearest neighbours that take part in the vote.

    Returns:
        The label that received the most votes among the k nearest samples.
    """
    # Euclidean distance from inX to every training row (inX is broadcast
    # across the rows of dataSet).
    deltas = inX - dataSet
    distances = ((deltas ** 2).sum(axis=1)) ** 0.5
    # argsort() returns the row indices ordered from nearest to farthest.
    nearestFirst = distances.argsort()

    votes = {}
    for rank in range(k):
        label = labels[nearestFirst[rank]]
        if label in votes:
            votes[label] += 1
        else:
            votes[label] = 1

    # Stable descending sort by vote count: labels with equal counts keep the
    # order in which they were first seen while walking nearest-first.
    ranking = list(votes.items())
    ranking.sort(key=lambda pair: pair[1], reverse=True)
    return ranking[0][0]
def file2matrix(filename):
    """Parse a tab-separated text file into a feature matrix and a label list.

    Every non-blank line is expected to hold at least three numeric feature
    columns, with an integer class label in the last column.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(returnMat, classLabelVector)``: ``returnMat`` is an
        ``(n, 3)`` float array with the first three columns of each record,
        ``classLabelVector`` is a list of the ``n`` integer labels.

    Raises:
        ValueError: If a feature or label field cannot be converted to a number.
    """
    # The context manager closes the file as soon as the lines have been read.
    with open(filename) as fr:
        # strip() removes surrounding whitespace/newlines; blank lines (for
        # example a trailing empty line) are skipped instead of being parsed.
        records = [line.strip() for line in fr if line.strip()]

    returnMat = zeros((len(records), 3))
    classLabelVector = []
    for index, record in enumerate(records):
        listFromLine = record.split('\t')
        # Row `index`, all columns: the first three fields are the features.
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector
def autoNorm(dataSet):
    """Rescale every feature column of ``dataSet`` to the range 0..1.

    Applies ``newValue = (oldValue - min) / (max - min)`` column by column.

    Args:
        dataSet: 2-D numeric array, one sample per row, one feature per column.

    Returns:
        A tuple ``(normDataSet, ranges, minVals)``: the normalised array, the
        per-column ``max - min`` and the per-column minimum. A column whose
        values are all equal has range 0 and is normalised to 0.
    """
    minVals = dataSet.min(0)  # axis 0: minimum of each column, not of each row
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # A constant column has range 0; divide by 1 there so the column maps to 0
    # instead of producing NaN from 0/0.
    divisors = where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column vectors to every row; the division
    # is element-wise.
    normDataSet = (dataSet - minVals) / divisors
    return normDataSet, ranges, minVals
def datingClassTest(hoRatio=0.10, filename='input/datingTestSet2.txt', k=3):
    """Run a hold-out test of the k-NN classifier on the dating data set.

    The first ``hoRatio`` share of the normalised records is classified
    against the remaining records, and every prediction is printed next to
    the real label.

    Args:
        hoRatio: Fraction of the records held out for testing.
        filename: Path of the tab-separated data file to load.
        k: Number of neighbours used by the classifier.

    Returns:
        The error rate (misclassified / tested) as a float, or 0.0 when the
        hold-out set is empty.
    """
    datingDataMat, datingLabels = file2matrix(filename)
    normMat, _, _ = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    errorCount = 0.0
    for i in range(numTestVecs):
        # Rows [0, numTestVecs) are test vectors; the rest is the training set.
        classifierResult = classify0(normMat[i, :], normMat[numTestVecs:m, :],
                                     datingLabels[numTestVecs:m], k)
        print("the classifier came back with:%d,the real answer is:%d"
              % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    # Guard against an empty hold-out set (too few records or hoRatio of 0).
    errorRate = errorCount / float(numTestVecs) if numTestVecs else 0.0
    print("the total error rate is:%f" % errorRate)
    # Returned so that callers can use the rate instead of only reading it
    # from standard output.
    return errorRate
def classifyPerson():
    """Prompt for one person's three features and print the predicted liking level.

    Reads three numbers from standard input, normalises them with the minimum
    and range of the training data, classifies the result with k = 3 and
    prints the matching description.
    """
    likingLevels = ['not at all', 'in small doses', 'in large doses']
    gameShare = float(input("percentage of time spend playing video games?"))
    flierMiles = float(input("frequent flier miles earned per year?"))
    iceCreamLiters = float(input("liters of ice cream consumed per year?"))

    samples, sampleLabels = file2matrix('input/datingTestSet2.txt')
    normalised, spans, lows = autoNorm(samples)

    # The query is scaled exactly like the training data before classification.
    query = (array([flierMiles, gameShare, iceCreamLiters]) - lows) / spans
    predicted = classify0(query, normalised, sampleLabels, 3)
    # The label is shifted by one to index the description list
    # (presumably labels run 1..3 - confirm against the data file).
    print("you will probably like this person:", likingLevels[predicted - 1])
if __name__ == "__main__":
    # Try the classifier on the tiny hand-made data set.
    group, labels = createDataSet()
    classify = classify0([0, 0], group, labels, 3)

    # Parse the dating records from the text file into a matrix and labels.
    datingDataMat, datingLabels = file2matrix('input/datingTestSet2.txt')

    # Analyse the data: scatter plot of feature columns 0 and 1. The class
    # label, scaled by 15, sets both marker size (s) and marker colour (c).
    # Columns 1 and 2 can be plotted the same way to compare the other pair.
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(datingDataMat[:, 0], datingDataMat[:, 1],
               s=15.0 * array(datingLabels), c=15.0 * array(datingLabels))
    plt.show()

    # Normalise the feature values.
    normMat, ranges, minVals = autoNorm(datingDataMat)

    # Evaluate the classifier and print what the test function returns.
    errorRate = datingClassTest()
    print(errorRate)

    # Use the classifier interactively.
    classifyPerson()
四、本章小结
- k-近邻算法是分类数据最简单有效的算法
- 使用算法时我们必须有接近实际数据的训练样本数据
- k-近邻算法必须保存全部数据集,如果训练数据集很大,必须使用大量的存储空间
- 由于必须对数据集中的每个数据计算距离值,实际使用时可能非常耗时
- k-近邻算法无法给出任何数据的基础结构信息