1. Model selection
Train a set of candidate classifiers for the binary classification problem and compare them.
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier
# X, y: the feature matrix and binary labels prepared beforehand
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=12)
classifiers = [
    KNeighborsClassifier(n_neighbors=5),
    LogisticRegression(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    XGBClassifier()]
# Store each model's accuracy and AUC in a pandas DataFrame; collect all results in `result`
cols=["Classifier", "Accuracy", "AUC"]
result = pd.DataFrame(columns=cols)
# Loop over the classifiers, print the accuracy and AUC of each, and append the results to `result`
for clf in classifiers:
    clf.fit(X_train, y_train)
    name = clf.__class__.__name__
    print("="*30)
    print(name)
    print('****Results****')
    y_pred = clf.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print("Accuracy: {:.2%}".format(acc))
    y_proba = clf.predict_proba(X_test)
    fpr, tpr, thresholds = roc_curve(y_test, y_proba[:, 1])
    auc_value = auc(fpr, tpr)
    print("AUC: {0:0.2f}".format(auc_value))
    # Save the current classifier's results in the DataFrame result_clf
    result_clf = pd.DataFrame([[name, acc*100, auc_value]], columns=cols)
    # Merge result_clf into result (DataFrame.append is removed in recent pandas)
    result = pd.concat([result, result_clf], ignore_index=True)
print("="*30)
Select the algorithm with the highest AUC value.
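A minimal sketch of picking the winner from the comparison table, assuming `result` was filled by the loop above:
result = result.sort_values("AUC", ascending=False)
print(result)
# The top row is the candidate to carry into the tuning step
best_name = result.iloc[0]["Classifier"]
print("Best classifier by AUC:", best_name)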
2. Hyperparameter tuning
(1) Grid search (GridSearchCV)
from sklearn.model_selection import GridSearchCV
# model is the estimator chosen in step 1, e.g. a gradient-boosting model such as XGBClassifier()
n_estimators = [80, 100]
learning_rate = [0.01, 0.1, 0.2, 0.3]
max_depth = [3, 5, 8]
# The search space is passed as a dict of parameter lists
param_grid = dict(n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth)
clf = GridSearchCV(model, param_grid, scoring="roc_auc", cv=5)
clf.fit(X_train, y_train)
# Print the best score and the parameter combination that achieved it
print("Best: %f using %s" % (clf.best_score_, clf.best_params_))
Train the model with the best parameters.
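A minimal sketch of that step, assuming the GridSearchCV object `clf` from above and an XGBClassifier as the base estimator:
# With refit=True (the default), GridSearchCV already refits the best combination on the whole training set
best_model = clf.best_estimator_
# Equivalently, build a fresh estimator from the best parameters and retrain it
best_model = XGBClassifier(**clf.best_params_)
best_model.fit(X_train, y_train)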
(2) Automated machine learning (TPOT)
References:
https://blog.csdn.net/anshuai_aw1/article/details/82498947
https://blog.csdn.net/hgy0403/article/details/81291307
from tpot import TPOTClassifier
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
digits = load_digits()
X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,
train_size=0.75, test_size=0.25)
pipeline_optimizer = TPOTClassifier(generations=5, population_size=20, cv=5,
random_state=42, verbosity=2)
pipeline_optimizer.fit(X_train, y_train)
print(pipeline_optimizer.score(X_test, y_test))
pipeline_optimizer.export('tpot_exported_pipeline.py')
Example
from tpot import TPOTClassifier
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
digits = load_digits()
print(digits.data[0:5], digits.target)
X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,
                                                    train_size=0.75, test_size=0.25)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
tpot = TPOTClassifier(verbosity=2, max_time_mins=2, config_dict="TPOT light",
                      population_size=10, mutation_rate=0.9, crossover_rate=0.1)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
3. Model training
# Train the final model (LogisticRegression is used here as an example; substitute the tuned model from step 2)
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
acc = accuracy_score(y_test, y_pred)
y_proba = lr.predict_proba(X_test)
fpr, tpr, thresholds = roc_curve(y_test, y_proba[:, 1])
auc_value = auc(fpr, tpr)
print(acc, auc_value)
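Since cross_val_score is already imported in step 1, a cross-validated AUC check is a natural complement to the single train/test split; a minimal sketch, assuming the estimator `lr` and the full X, y from above:
# 5-fold cross-validated AUC, as a stability check on the final model
cv_auc = cross_val_score(lr, X, y, scoring="roc_auc", cv=5)
print("CV AUC: %.3f +/- %.3f" % (cv_auc.mean(), cv_auc.std()))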
4. Saving the model
# sklearn.externals.joblib has been removed from scikit-learn; import joblib directly
import joblib
# Persist the trained model from step 3
joblib.dump(lr, 'lr.model')
# Load the model back for reuse
loaded_model = joblib.load('lr.model')
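A quick usage check of the reloaded model, under the same assumptions as above:
# The reloaded model predicts exactly like the original
y_pred = loaded_model.predict(X_test)
print(accuracy_score(y_test, y_pred))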