下面是训练集和测试集的部分图像
SVM模型代码(进行了调参):
#!/usr/bin/python
# -*- coding:utf-8 -*-
import numpy as np
import pandas as pd
from sklearn import svm
import matplotlib.colors
import matplotlib.pyplot as plt
from PIL import Image
from sklearn.metrics import accuracy_score
import os
from sklearn.model_selection import GridSearchCV
from time import time
def show_accuracy(a, b, tip):
    """Print the element-wise match rate between two label arrays.

    Both inputs are flattened before comparison, so shapes only need to
    contain the same number of elements.
    """
    matches = (a.ravel() == b.ravel())
    rate = 100 * np.mean(matches)
    print(tip + '正确率:%.2f%%' % rate)
def save_image(im, i):
    """Save one 8x8 optdigits matrix as a 100x100 grayscale PNG.

    Parameters
    ----------
    im : 2-D array with pixel values in 0..16 (optdigits scale).
    i  : integer index used as the output file name (``<i>.png``).

    The image is rescaled to ~0..255 and inverted so digits render as
    dark strokes on a light background.
    """
    # Out-of-place multiply: the in-place form (`im *=`) would mutate the
    # caller's array and raises on integer dtypes under NumPy's casting rules.
    im = im * (256.0 / 17)
    im = 255 - im  # invert grayscale
    a = im.astype(np.uint8)
    output_path = './HandWritten'
    # makedirs(exist_ok=True) is race-free, unlike the exists()+mkdir pair.
    os.makedirs(output_path, exist_ok=True)
    # os.path.join instead of a hard-coded backslash: '\\%d.png' produces a
    # file literally named 'HandWritten\0.png' on POSIX systems.
    Image.fromarray(a).resize(size=(100, 100)).save(
        os.path.join(output_path, '%d.png' % i))
if __name__ == "__main__":
    # ---- Load training data ----
    print('Load Training File Start...')
    data = pd.read_csv('optdigits.tra', header=None)
    x, y = data[list(range(64))], data[64]  # 64 pixel columns + label column
    x, y = x.values, y.values               # DataFrame -> NumPy arrays
    images = x.reshape(-1, 8, 8)            # each row is a flattened 8x8 digit
    print('images.shape = ', images.shape)
    # np.int was removed in NumPy 1.24; use the builtin int instead.
    y = y.ravel().astype(int)

    # ---- Load test data ----
    print('Load Test Data Start...')
    # np.float was removed in NumPy 1.24; use the builtin float instead.
    data = np.loadtxt('optdigits.tes', dtype=float, delimiter=',')
    # Split off the last column (axis=1) as the label vector.
    x_test, y_test = np.split(data, (-1,), axis=1)
    print(y_test.shape)
    images_test = x_test.reshape(-1, 8, 8)
    y_test = y_test.ravel().astype(int)
    print('Load Data OK...')

    # ---- Preview a few training/test images ----
    matplotlib.rcParams['font.sans-serif'] = ['SimHei']  # render Chinese titles
    matplotlib.rcParams['axes.unicode_minus'] = False
    plt.figure(figsize=(15, 9), facecolor='w')
    for index, image in enumerate(images[:16]):
        plt.subplot(4, 8, index + 1)
        plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
        plt.title('训练图片: %i' % y[index])
    for index, image in enumerate(images_test[:16]):
        plt.subplot(4, 8, index + 17)
        plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
        save_image(image.copy(), index)
        plt.title('测试图片: %i' % y_test[index])
    plt.tight_layout()
    plt.show()

    # ---- Grid-search an RBF-kernel SVM over C and gamma ----
    params = {'C': np.logspace(0, 3, 7), 'gamma': np.logspace(-5, 0, 11)}
    model = GridSearchCV(svm.SVC(kernel='rbf'), param_grid=params, cv=3)
    print('Start Learning...')
    t0 = time()
    model.fit(x, y)
    t = time() - t0
    print('训练+CV耗时:%d分钟%.3f秒' % (int(t / 60), t - 60 * int(t / 60)))
    print('最优参数:\t', model.best_params_)
    print('Learning is OK...')
    print('训练集准确率:', accuracy_score(y, model.predict(x)))
    y_hat = model.predict(x_test)
    # Reuse y_hat instead of re-running predict on the whole test set.
    print('测试集准确率:', accuracy_score(y_test, y_hat))
    print(y_hat)
    print(y_test)

    # ---- Show up to 12 misclassified test images ----
    err_mask = y_test != y_hat
    err_images = images_test[err_mask]
    err_y_hat = y_hat[err_mask]
    err_y = y_test[err_mask]
    print(err_y_hat)
    print(err_y)
    plt.figure(figsize=(10, 8), facecolor='w')
    for index, image in enumerate(err_images[:12]):
        plt.subplot(3, 4, index + 1)
        plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
        plt.title('错分为:%i,真实值:%i' % (err_y_hat[index], err_y[index]))
    plt.tight_layout()
    plt.show()
结果:
训练+CV耗时:4分钟40.544秒
最优参数: {'C': 10.0, 'gamma': 0.001}
训练集准确率: 1.0
测试集准确率: 0.9827490261547023
下面是识别错误例子(人都看不出来是啥数字。。。。):
XGBOOST模型(进行了调参):
import pandas as pd
import xgboost as xgb
import numpy as np
from time import time
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
def show_accuracy(a, b, tip):
    """Print the fraction of positions where the two flattened arrays agree."""
    hits = a.ravel() == b.ravel()
    print(tip + '正确率:%.2f%%' % (np.mean(hits) * 100))
if __name__ == '__main__':
    # ---- Load training data ----
    print('Load Training File Start...')
    data = pd.read_csv('optdigits.tra', header=None)
    x, y = data[list(range(64))], data[64]  # 64 pixel columns + label column
    x, y = x.values, y.values               # DataFrame -> NumPy arrays
    images = x.reshape(-1, 8, 8)            # per-row 8x8 digit matrices
    print('images.shape = ', images.shape)
    # np.int was removed in NumPy 1.24; use the builtin int instead.
    y = y.ravel().astype(int)

    # ---- Load test data ----
    print('Load Test Data Start...')
    # np.float was removed in NumPy 1.24; use the builtin float instead.
    data = np.loadtxt('optdigits.tes', dtype=float, delimiter=',')
    # Split off the last column (axis=1) as the label vector.
    x_test, y_test = np.split(data, (-1,), axis=1)
    print(y_test.shape)
    images_test = x_test.reshape(-1, 8, 8)
    y_test = y_test.ravel().astype(int)
    print('Load Data OK...')

    t0 = time()
    # Base XGBoost parameters.
    # NOTE(review): 'silent' is deprecated/removed in recent xgboost
    # releases ('verbosity' replaced it) — confirm against the installed
    # xgboost version.
    params = {'objective': 'multi:softmax',  # multi-class classification
              'num_class': 10,               # ten digit classes
              'eta': 0.1,                    # learning rate
              'silent': 1}                   # 1 = suppress intermediate output
    # Grid to search: learning rate and number of boosting rounds.
    cv_params = {'eta': [0.1, 0.01],
                 'n_estimators': np.linspace(100, 600, 20, dtype=int)}
    gbm = xgb.XGBClassifier(**params)
    # 3-fold cross-validated grid search; larger cv multiplies runtime.
    opt_clf = GridSearchCV(estimator=gbm, param_grid=cv_params, cv=3)
    opt_clf.fit(x, y)
    t = time() - t0
    print('训练模型耗时:%d分钟%.3f秒' % (int(t / 60), t - 60 * int(t / 60)))
    print('最优参数:\t', opt_clf.best_params_)
    print('训练集准确率: ', accuracy_score(y, opt_clf.predict(x)))
    print('测试集准确率: ', accuracy_score(y_test, opt_clf.predict(x_test)))
结果:
训练模型耗时:29分钟59.371秒
最优参数: {'eta': 0.1, 'n_estimators': 284}
训练集准确率: 1.0
测试集准确率: 0.9671675013912076
总结:
从最后的运行结果可以看出SVM比xgboost的效果好些,并且svm运行时间也快于xgboost。
xgboost耗时较多主要是因为调参:若不进行调参,则很快就能训练出模型;但由于使用了GridSearchCV()对n_estimators等参数进行网格搜索,运行时间大大增加。因此参数cv的值最好调小一些,否则运行时间太长——本次实验中即使将cv设为3,也需要跑半个小时才能出结果,而且最终效果还不如SVM。