模型训练及调参

1.模型选择

根据二分类问题进行模型训练

# Candidate models for the binary-classification comparison below.
# (Fixed: LogisticRegression was imported twice in the original.)
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier

# Hold out 25% of the data for evaluation; fixed seed for reproducibility.
# NOTE(review): X and y must be defined earlier in the notebook — confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=12)

# The seven classifiers compared below. SVC is imported but deliberately not
# listed; add SVC(probability=True) if an SVM with predict_proba is wanted.
classifiers = [
    KNeighborsClassifier(n_neighbors=5),
    LogisticRegression(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    XGBClassifier(),
]

# Store every model's accuracy and AUC; the summary ends up in `result`.
cols = ["Classifier", "Accuracy", "AUC"]
rows = []

# Fit each of the candidate classifiers in turn, print its accuracy and AUC,
# and record both for the final comparison table.
# (Fixed: the original comment claimed 9 classifiers; the list holds 7.)
for clf in classifiers:
    clf.fit(X_train, y_train)
    name = clf.__class__.__name__
    
    print("="*30)
    print(name)
    
    print('****Results****')
    y_pred = clf.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print("Accuracy: {:.2%}".format(acc))
    
    # Use the positive-class probability (column 1) for the ROC curve;
    # renamed from y_pred to avoid shadowing the hard predictions above.
    y_proba = clf.predict_proba(X_test)
    fpr, tpr, thresholds = roc_curve(y_test, y_proba[:, 1])
    auc_value = auc(fpr, tpr)
    print("AUC: {0:0.2f}".format(auc_value))
    
    # Accuracy is stored as a percentage, matching the original output.
    rows.append([name, acc * 100, auc_value])

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# build the frame once from the accumulated rows instead.
result = pd.DataFrame(rows, columns=cols)

print("="*30)

选择 AUC 值最大的算法,作为后续调参的基础模型

2.调参

(1)网格搜索(GridSearchCV)

from sklearn.model_selection import GridSearchCV
# Candidate values for each hyper-parameter of the boosting model.
n_estimators=[80,100]
learning_rate = [0.01, 0.1, 0.2, 0.3]
max_depth = [3,5,8]

# Search space as a dict: parameter name -> list of candidate values
# (2 * 4 * 3 = 24 combinations, each scored with 5-fold CV on ROC AUC).
param_grid = dict(n_estimators=n_estimators,learning_rate=learning_rate,max_depth=max_depth)
# NOTE(review): `model` must be defined earlier — presumably the estimator
# with the best AUC from the comparison above; confirm before running.
clf = GridSearchCV(model, param_grid, scoring="roc_auc", cv=5)
clf.fit(X_train, y_train)

# Print the best cross-validated AUC and the parameter combination that achieved it.
print("Best: %f using %s" % (clf.best_score_, clf.best_params_))

选取最佳参数训练模型

(2)自动化机器学习(tpot)

参考 https://blog.csdn.net/anshuai_aw1/article/details/82498947
https://blog.csdn.net/hgy0403/article/details/81291307

# Minimal TPOT workflow on the digits dataset: search, score, export.
from tpot import TPOTClassifier
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

# Load the data and carve out a 75/25 train/test split.
digits = load_digits()
X_train, X_test, y_train, y_test = train_test_split(
    digits.data, digits.target, train_size=0.75, test_size=0.25)

# Genetic search: 5 generations of 20 pipelines, 5-fold CV, fixed seed,
# verbosity=2 to show a progress bar per generation.
pipeline_optimizer = TPOTClassifier(
    generations=5, population_size=20, cv=5, random_state=42, verbosity=2)
pipeline_optimizer.fit(X_train, y_train)

# Held-out accuracy of the best pipeline, then dump it as runnable code.
print(pipeline_optimizer.score(X_test, y_test))
pipeline_optimizer.export('tpot_exported_pipeline.py')

例子

from tpot import TPOTClassifier
from sklearn.datasets import load_iris
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

# NOTE(review): despite the variable name, this loads the DIGITS dataset,
# not iris — swap in load_iris() if iris was actually intended.
iris = load_digits()
iris.data[0:5], iris.target
print(iris)  # fixed: Python-3 print() — the original `print iris` is a SyntaxError
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target,
                                                    train_size=0.75, test_size=0.25)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

# Lightweight TPOT run: "TPOT light" config, capped at 2 minutes,
# small population with high mutation / low crossover rates.
tpot = TPOTClassifier(verbosity=2, max_time_mins=2, config_dict="TPOT light",
                      population_size=10, mutation_rate=0.9, crossover_rate=0.1)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))

3.模型训练

# Train and evaluate the final model.
# Fixed two defects in the original:
#  1) LinearRegression was never imported, and a regressor is the wrong
#     estimator for a binary-classification task — use LogisticRegression
#     (already imported above).
#  2) The original fitted `lr` but then called predict on a stale `clf`
#     left over from the comparison loop; evaluate the model actually trained.
lr = LogisticRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_test)
acc = accuracy_score(y_test, y_pred)

# Positive-class probabilities for the ROC curve / AUC.
y_proba = lr.predict_proba(X_test)
fpr, tpr, thresholds = roc_curve(y_test, y_proba[:, 1])
auc_value = auc(fpr, tpr)
print(acc, auc_value)

4.保存模型

# Persist the trained model.
# Fixed: sklearn.externals.joblib was deprecated in scikit-learn 0.21 and
# removed in 0.23 — import joblib directly (it ships with scikit-learn).
import joblib

# NOTE(review): `model` must be the fitted estimator from the steps above.
joblib.dump(model, 'lr.model')

# Reload the model. Fixed: the original loaded 'l.model', a file that was
# never written — the name must match the one passed to dump(). Also renamed
# the misleading `xgb` variable (this is not necessarily an XGBoost model).
loaded_model = joblib.load('lr.model')
上一篇:关于树的重心的一个奇妙性质


下一篇:概率论与数理统计基础