手写数字识别——SVM和XGBOOST

下面是训练集和测试集的部分图像

手写数字识别——SVM和XGBOOST

SVM模型代码(进行了调参):

#!/usr/bin/python
# -*- coding:utf-8 -*-

import numpy as np
import pandas as pd
from sklearn import svm
import matplotlib.colors
import matplotlib.pyplot as plt
from PIL import Image
from sklearn.metrics import accuracy_score
import os
from sklearn.model_selection import GridSearchCV
from time import time


def show_accuracy(a, b, tip):
    """Print how often two arrays agree, as a percentage.

    Both inputs are flattened with ``ravel()`` and compared element by
    element; the share of equal positions is printed with two decimals.

    Args:
        a: Array of values (e.g. true labels).
        b: Array of values to compare against ``a`` (e.g. predictions).
        tip: Text placed in front of the printed accuracy.
    """
    flat_a = a.ravel()
    flat_b = b.ravel()
    hit_rate = np.mean(flat_a == flat_b)
    message = '正确率:%.2f%%' % (100 * hit_rate)
    print(tip + message)


def save_image(im, i):
    """Write one digit image to ``./HandWritten/<i>.png``.

    The pixel values are multiplied by ``256 / 17``, inverted
    (``255 - value``), cast to ``uint8``, and the picture is resized to
    100x100 before it is saved. The output directory is created if missing.

    Args:
        im: Numpy array holding one image (the script passes 8x8 digit
            images; values are presumably in 0..16 - confirm against the
            data set). The array is not modified.
        i: Integer used as the file name.
    """
    # Build a new array instead of ``im *= ...``: the in-place form changed
    # the caller's data and raised a casting error for integer arrays.
    scaled = np.asarray(im) * (256 / 17)
    pixels = (255 - scaled).astype(np.uint8)

    output_path = './HandWritten'
    os.makedirs(output_path, exist_ok=True)

    # os.path.join picks the platform separator; the previous hard-coded
    # backslash only produced a file inside the directory on Windows.
    target = os.path.join(output_path, '%d.png' % i)
    Image.fromarray(pixels).resize(size=(100, 100)).save(target)


if __name__ == "__main__":
    # Script: load the optdigits data, tune an RBF SVM with a grid search,
    # report train/test accuracy and plot the first misclassified digits.

    # Training set: columns 0..63 are pixels, column 64 is the label.
    print('Load Training File Start...')
    data = pd.read_csv('optdigits.tra', header=None)
    x, y = data[list(range(64))], data[64]
    x, y = x.values, y.values  # DataFrame / Series -> numpy arrays
    images = x.reshape(-1, 8, 8)  # one 8x8 picture per sample
    print('images.shape = ', images.shape)
    # np.int / np.float were removed in NumPy 1.24; the builtins are the
    # exact equivalents.
    y = y.ravel().astype(int)

    # Test set: same layout, split off the last column as the label.
    print('Load Test Data Start...')
    data = np.loadtxt('optdigits.tes', dtype=float, delimiter=',')
    x_test, y_test = np.split(data, (-1,), axis=1)
    print(y_test.shape)
    images_test = x_test.reshape(-1, 8, 8)
    y_test = y_test.ravel().astype(int)
    print('Load Data OK...')

    # SimHei is a CJK font, needed for the Chinese plot titles below.
    matplotlib.rcParams['font.sans-serif'] = ['SimHei']
    matplotlib.rcParams['axes.unicode_minus'] = False

    # Preview: 16 training digits in the top two rows, 16 test digits in
    # the bottom two rows; the test digits are also written to disk.
    plt.figure(figsize=(15, 9), facecolor='w')
    for index, image in enumerate(images[:16]):
        plt.subplot(4, 8, index + 1)
        plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
        plt.title('训练图片: %i' % y[index])
    for index, image in enumerate(images_test[:16]):
        plt.subplot(4, 8, index + 17)
        plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
        save_image(image.copy(), index)
        plt.title('测试图片: %i' % y_test[index])
    plt.tight_layout()
    plt.show()

    # Grid search over C (7 values) and gamma (11 values) with 3-fold CV.
    # To skip the search, a plain svm.SVC(C=1, kernel='rbf', gamma=0.001)
    # can be fitted instead (best_params_ is then unavailable).
    params = {'C': np.logspace(0, 3, 7), 'gamma': np.logspace(-5, 0, 11)}
    model = GridSearchCV(svm.SVC(kernel='rbf'), param_grid=params, cv=3)
    print('Start Learning...')
    t0 = time()
    model.fit(x, y)
    t = time() - t0
    minutes, seconds = divmod(t, 60)
    print('训练+CV耗时:%d分钟%.3f秒' % (minutes, seconds))
    print ('最优参数:\t', model.best_params_)
    print('Learning is OK...')

    print('训练集准确率:', accuracy_score(y, model.predict(x)))
    # Predict once and reuse the result for the score and the error plot.
    y_hat = model.predict(x_test)
    print('测试集准确率:', accuracy_score(y_test, y_hat))
    print(y_hat)
    print(y_test)

    # Collect the misclassified test samples with a single boolean mask.
    wrong = y_test != y_hat
    err_images = images_test[wrong]
    err_y_hat = y_hat[wrong]
    err_y = y_test[wrong]
    print(err_y_hat)
    print(err_y)

    # Show at most the first 12 errors in a 3x4 grid.
    plt.figure(figsize=(10, 8), facecolor='w')
    for index, image in enumerate(err_images[:12]):
        plt.subplot(3, 4, index + 1)
        plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
        plt.title('错分为:%i,真实值:%i' % (err_y_hat[index], err_y[index]))
    plt.tight_layout()
    plt.show()

结果:

训练耗时:4分钟40.544秒
最优参数:     {'C': 10.0, 'gamma': 0.001}
训练集准确率: 1.0
测试集准确率: 0.9827490261547023

 

下面是识别错误的例子(有些图片连人都看不出来是什么数字……):

 

手写数字识别——SVM和XGBOOST

 

XGBOOST模型(进行了调参):

import pandas as pd
import xgboost as xgb
import numpy as np
from time import time
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
def show_accuracy(a, b, tip):
    """Print the percentage of matching elements between ``a`` and ``b``.

    Args:
        a: Array of values; flattened with ``ravel()`` before comparing.
        b: Array of values compared element-wise with ``a``.
        tip: Prefix printed before the accuracy text.
    """
    is_equal = (a.ravel() == b.ravel())
    percentage = 100 * np.mean(is_equal)
    print(tip + '正确率:%.2f%%' % percentage)

if __name__ == '__main__':
    # Script: load the optdigits data, tune an XGBoost classifier with a
    # grid search and report train/test accuracy.

    # Training set: columns 0..63 are pixels, column 64 is the label.
    print('Load Training File Start...')
    data = pd.read_csv('optdigits.tra', header=None)
    x, y = data[list(range(64))], data[64]
    x, y = x.values, y.values  # DataFrame / Series -> numpy arrays
    images = x.reshape(-1, 8, 8)  # one 8x8 picture per sample
    print('images.shape = ', images.shape)
    # np.int / np.float were removed in NumPy 1.24; the builtins are the
    # exact equivalents.
    y = y.ravel().astype(int)

    # Test set: same layout, split off the last column as the label.
    print('Load Test Data Start...')
    data = np.loadtxt('optdigits.tes', dtype=float, delimiter=',')
    x_test, y_test = np.split(data, (-1,), axis=1)
    print(y_test.shape)
    y_test = y_test.ravel().astype(int)
    print('Load Data OK...')

    t0 = time()
    # Base XGBoost parameters.
    params = {'objective': 'multi:softmax',  # multi-class classification
              'num_class': 10,  # number of classes
              'eta': 0.1,  # learning rate
              # 'silent' was replaced by 'verbosity' in XGBoost; 0 = silent.
              'verbosity': 0,
              }
    # Grid: 2 learning rates x 20 tree counts, evaluated with 3-fold CV.
    cv_params = {'eta': [0.1, 0.01],
                 'n_estimators': np.linspace(100, 600, 20, dtype=int)}
    gbm = xgb.XGBClassifier(**params)
    # Tune the parameters and fit the model.
    opt_clf = GridSearchCV(estimator=gbm, param_grid=cv_params, cv=3)
    opt_clf.fit(x, y)
    t = time() - t0
    minutes, seconds = divmod(t, 60)
    print('训练模型耗时:%d分钟%.3f秒' % (minutes, seconds))
    print('最优参数:\t', opt_clf.best_params_)
    print('训练集准确率: ', accuracy_score(y, opt_clf.predict(x)))
    print('测试集准确率: ',accuracy_score(y_test, opt_clf.predict(x_test)))

    # #
    # t0 = time()
    # #n_estimators的值已调出最优值 1390
    # cv_params = {'n_estimators': np.linspace(100, 1000, 10, dtype=int)}
    # regress_model = xgb.XGBRegressor(max_depth=5, learning_rate=0.1, n_estimators=187, silent=False, objective='multi:softmax')
    # model = GridSearchCV(regress_model, param_grid=cv_params, verbose=2, refit=True, cv=5, n_jobs=-1)
    # model.fit(x,y)
    #
    # t1 = time()
    # t = t1 - t0
    # print('训练模型耗时:%d分钟%.3f秒' % (int(t / 60), t - 60 * int(t / 60)))
    # print('最优参数:\t', model.best_params_)
    # # 对测试集进行预测
    # y_hat = model.predict(x)
    # show_accuracy(y,y_hat,"训练集")
    #
    # y_hat_test = model.predict(x_test)
    # show_accuracy(y_test, y_hat_test, "测试集")
    # #print('训练集准确率:', accuracy_score(y, model.predict(x)))
    # #print('测试集准确率:', accuracy_score(y_test, model.predict(x_test)))




结果:

训练模型耗时:29分钟59.371秒
最优参数:     {'eta': 0.1, 'n_estimators': 284}
训练集准确率:  1.0
测试集准确率:  0.9671675013912076

 

总结:

从最后的运行结果可以看出,SVM的效果比XGBoost好一些(测试集准确率约98.27%对96.72%),并且SVM的运行时间(约4分41秒)也明显短于XGBoost(约30分钟)。

XGBoost耗时较多主要是调参造成的:若不进行调参,很快就能训练出模型;但这里使用了GridSearchCV()对eta(2个取值)和n_estimators(20个取值)进行网格搜索,共40组参数,cv=3时就需要训练120次模型,所以运行时间大大增加。因此参数cv的值(以及参数网格的规模)最好设小一些,不然运行时间太长——在此次实验中将cv设为3,仍然需要跑约半个小时才能得到结果,而且最后的效果还不如SVM。

上一篇:高级算法梳理——XGBoost


下一篇:XGBoost的参数说明