Simple handwritten digit recognition with KNN
Dataset: see Source/数据集/手写数字 at main · ziwenhahaha/Source (github.com)
Import packages
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
import numpy as np
Read one image to take a look
img_arr = plt.imread('./digist/5/5_1.bmp')
plt.imshow(img_arr)
Check the image shape (each training image is 28×28)
img_arr.shape
Iterate over the directories, appending each image's pixel array to feature and its label to target
feature = []
target = []
for i in range(10):
    for j in range(1,501):
        # e.g. ./digist/5/5_1.bmp
        img_path = f"./digist/{i}/{i}_{j}.bmp"
        #print(img_path)
        img_arr = plt.imread(img_path)
        feature.append(img_arr)
        target.append(i)
Convert to numpy arrays so they can be flattened to 2-D later
feature = np.array(feature)
target = np.array(target)
Check the shape
feature.shape
Flatten to 2-D
#reshape feature into 2-D
feature = feature.reshape((5000,28*28))
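As a side note, the same flattening can be written without hard-coding the sample count of 5000; a minimal alternative (not part of the original notebook):
# equivalent flattening that infers the number of samples from the array itself
feature = feature.reshape((feature.shape[0], -1))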
Split the dataset
#split the dataset
x_train,x_test,y_train,y_test = train_test_split(feature,target,test_size=0.1,random_state=2021)
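Optionally, stratifying the split keeps the ten digit classes equally represented in the train and test sets; a hedged variant of the call above (not in the original):
# stratified variant: preserves the per-digit class proportions in both splits
x_train, x_test, y_train, y_test = train_test_split(
    feature, target, test_size=0.1, random_state=2021, stratify=target)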
Search for the optimal hyperparameter
#search for the optimal k
ks = np.arange(5,150,5)
scores = []
for k in ks:
    knn = KNeighborsClassifier(k)
    %time score = cross_val_score(knn,x_train,y_train,cv=5).mean()
    scores.append(score)
    print(k)
plt.plot(ks,scores)
scores = np.array(scores)
max_value_index = np.argmax(scores)
print('Best k:',ks[max_value_index])
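The same search can be delegated to scikit-learn's GridSearchCV, which runs the cross-validation loop and tracks the best score for you; a minimal sketch over the same candidate k values (an alternative to the manual loop above, not the original code):
from sklearn.model_selection import GridSearchCV

# grid search over the same k values with 5-fold cross-validation
grid = GridSearchCV(KNeighborsClassifier(), param_grid={'n_neighbors': ks}, cv=5)
grid.fit(x_train, y_train)
print('best k:', grid.best_params_['n_neighbors'])
print('best CV accuracy:', grid.best_score_)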
Build the model and train it with the optimal hyperparameter (k = 5 here)
#build the model and train it with the optimal hyperparameter
knn = KNeighborsClassifier(5)
knn.fit(x_train,y_train)
print(knn.score(x_test,y_test))
print('Model predictions:',knn.predict(x_test))
print('True labels:',y_test)
pd.Series(np.array(knn.predict(x_test))==np.array(y_test)).value_counts()
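For a per-digit view of the errors rather than a single correct/incorrect count, scikit-learn's metrics module can be used; a short optional addition:
from sklearn.metrics import confusion_matrix, classification_report

# per-digit error breakdown on the test set
y_pred = knn.predict(x_test)
print(confusion_matrix(y_test, y_pred))       # rows = true digit, columns = predicted digit
print(classification_report(y_test, y_pred))  # precision / recall / F1 for each digit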
Use the trained model on a new image
#apply the trained model to unseen data
img_arr = plt.imread('./123.jpg')
plt.imshow(img_arr)
Crop out one digit
#crop out the 5
five_img_arr = img_arr[300:435,185:295]
plt.imshow(five_img_arr)
The crop is clearly not 28×28, so it needs to be scaled down proportionally
#scale five_img_arr down to 28×28 pixels, then flatten it
import scipy.ndimage as ndimage
five_img_arr_zoom = ndimage.zoom(five_img_arr,zoom=(28/135,28/110))
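One caveat: if 123.jpg loads as an RGB array (shape H×W×3), the two-factor zoom and the later reshape to 784 will fail, since the training images are single-channel 28×28. A hedged sketch that collapses the color channels first, only needed when the crop is 3-dimensional (the original image was presumably already grayscale):
# if the crop carries three color channels, average them down to a 2-D grayscale array
if five_img_arr.ndim == 3:
    five_img_arr = five_img_arr.mean(axis=2)
five_img_arr_zoom = ndimage.zoom(five_img_arr, zoom=(28/five_img_arr.shape[0], 28/five_img_arr.shape[1]))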
Predict
x_train[1].shape #inspect one sample: each has 784 features
knn.predict([five_img_arr_zoom.reshape(784,)])
#output: array([5]); the prediction is correct, the image is a 5
Full code
#import packages
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
import numpy as np
#read one image to take a look
img_arr = plt.imread('./digist/5/5_1.bmp')
plt.imshow(img_arr)
#check the image shape
img_arr.shape
#iterate over the directories, appending each image's pixel array to feature and its label to target
feature = []
target = []
for i in range(10):
    for j in range(1,501):
        # e.g. ./digist/5/5_1.bmp
        img_path = f"./digist/{i}/{i}_{j}.bmp"
        #print(img_path)
        img_arr = plt.imread(img_path)
        feature.append(img_arr)
        target.append(i)
#convert to numpy arrays so they can be flattened to 2-D later
feature = np.array(feature)
target = np.array(target)
#check the shape
feature.shape
#flatten to 2-D
feature = feature.reshape((5000,28*28))
#split the dataset
x_train,x_test,y_train,y_test = train_test_split(feature,target,test_size=0.1,random_state=2021)
#search for the optimal hyperparameter
ks = np.arange(5,150,5)
scores = []
for k in ks:
    knn = KNeighborsClassifier(k)
    %time score = cross_val_score(knn,x_train,y_train,cv=5).mean()
    scores.append(score)
    print(k)
plt.plot(ks,scores)
scores = np.array(scores)
max_value_index = np.argmax(scores)
print('Best k:',ks[max_value_index])
#build the model and train it with the optimal hyperparameter (k = 5 here)
knn = KNeighborsClassifier(5)
knn.fit(x_train,y_train)
print(knn.score(x_test,y_test))
print('Model predictions:',knn.predict(x_test))
print('True labels:',y_test)
pd.Series(np.array(knn.predict(x_test))==np.array(y_test)).value_counts()
#use the trained model on a new image
img_arr = plt.imread('./123.jpg')
plt.imshow(img_arr)
#crop out one digit (the 5)
five_img_arr = img_arr[300:435,185:295]
plt.imshow(five_img_arr)
#the crop is clearly not 28×28, so it must be scaled down proportionally
#scale five_img_arr down to 28×28 pixels, then flatten it
import scipy.ndimage as ndimage
five_img_arr_zoom = ndimage.zoom(five_img_arr,zoom=(28/135,28/110))
#predict
x_train[1].shape #inspect one sample: each has 784 features
knn.predict([five_img_arr_zoom.reshape(784,)])
#output: array([5]); the prediction is correct, the image is a 5