机器学习算法系列(5)模型融合

一、算法原理

模型融合:Voting(投票法)、Blending(线性混合)、Stacking(堆叠)。

模型融合,不是说随意的融合能够达到好的效果。进行融合时,所需集成个体应该好而不同。好指的是个体学习器的性能要好,不同指的是个体模型的类别不同。

(1)Voting

这里举个西瓜书的例子,在介绍例子之前,首先提前介绍简单投票法,以分类问题为例,就是每个分类器对样例进行投票,哪个类别得到的票数最多的就是融合后模型的结果。
机器学习算法系列(5)模型融合
在上面的例子中,采用的就是简单的投票法。图b各个模型输出都一样,因此没有什么效果。图c每个分类器的精度只有33%,融合后反而更糟。也就是说,想要模型融合有效果,个体学习器要有一定的准确率,并且要有多样性,学习器之间具有差异,即好而不同。

(2)Stacking
机器学习算法系列(5)模型融合
(3)Blending

第一步:把train先分成两部分,比如70%的数据作为newtrain,剩下30%的数据作为validation。
第二步:第一层,使用newtrain训练多个模型,然后去预测validation的label。
第三步:第二层,使用validation在第一层训练的模型上的预测结果作为新特征,继续训练第二层的模型。
第四步:使用test,带入第一层模型训练预测得到预测值,把预测值当做新的特征带入第二层模型进行训练,得到最终结果。

二、算法案例

# Core numerical / dataframe / plotting libraries.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")# silence warnings so notebook output stays readable

# Jupyter magic: render matplotlib figures inline in the notebook.
%matplotlib inline

# Fix the RNG seed so every run of the notebook is reproducible.
SEED = 222
np.random.seed(SEED)
# Load the dataset (assumes input.csv is in the working directory).
df = pd.read_csv('input.csv')
df.head()

机器学习算法系列(5)模型融合
1.数据预处理

# Binary target: 1 if the candidate's party affiliation is Republican ('REP'), else 0.
y = 1*(df.cand_pty_affiliation == 'REP')
X = df.drop(['cand_pty_affiliation'],axis = 1)
# One-hot encode the categorical features (sparse to save memory),
# then drop constant columns (zero standard deviation carries no signal).
X = pd.get_dummies(X,sparse = True)
X.drop(X.columns[X.std() == 0],axis = 1,inplace = True)
# Label distribution: share of donations per party affiliation.
df.cand_pty_affiliation.value_counts(normalize = True).plot(kind = 'bar',title = 'Share of No.donations')
plt.show()

机器学习算法系列(5)模型融合
2.构建模型

# Train/test split; test_size = 0.95 keeps the training set tiny so the
# demo models fit quickly (not a setting you would use in practice).
from sklearn.model_selection import train_test_split
xtrain,xtest,ytrain,ytest = train_test_split(X,y,test_size = 0.95,random_state = SEED)

(1)模型融合

#导入库
from sklearn.svm import SVC,LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.kernel_approximation import Nystroem
from sklearn.kernel_approximation import RBFSampler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import roc_auc_score
#写好模型训练、预测、评分的函数
def get_models():
    """Build the library of diverse base learners.

    Returns:
        dict[str, estimator]: display name -> unfitted scikit-learn
        classifier. The models span different families (kernel, linear,
        instance-based, probabilistic, neural net, tree ensembles) so the
        ensemble members are "good and different".
    """
    nb = GaussianNB()
    svc = SVC(C = 100,probability = True)  # probability=True enables predict_proba
    knn = KNeighborsClassifier(n_neighbors = 3)
    lr = LogisticRegression(C = 100,random_state = SEED)
    nn = MLPClassifier((80,10),early_stopping = False,random_state = SEED)
    gb = GradientBoostingClassifier(n_estimators = 10,random_state = SEED)
    rf = RandomForestClassifier(n_estimators = 10,max_features = 3,random_state = SEED)

    # Fixed display-label typo: 'logisitic' -> 'logistic' (keys are used only
    # as column names / printed labels in this file).
    models = {'svm':svc,'knn':knn,'naive':nb,'mlp-nn':nn,'random forest':rf,'gbm':gb,'logistic':lr}
    return models

def train_predict(model_list):
    """Fit every model and collect its test-set positive-class probabilities.

    Args:
        model_list (dict[str, estimator]): display name -> unfitted classifier.

    Returns:
        pd.DataFrame: one column per model (named after the model) holding
        P(class=1) for each test sample.

    NOTE(review): reads the module-level splits xtrain/ytrain/xtest/ytest.
    """
    P = np.zeros((ytest.shape[0],len(model_list)))
    P = pd.DataFrame(P)

    print('Fitting models.')
    cols = list()
    # BUG FIX: iterate over the `model_list` argument; the original looped over
    # the global `models`, silently ignoring whatever the caller passed in.
    for i,(name,m) in enumerate(model_list.items()):
        print('%s...'% name,end = '',flush = False)
        m.fit(xtrain,ytrain)
        # Binary task: column 1 of predict_proba is the positive class.
        P.iloc[:,i] = m.predict_proba(xtest)[:,1]
        cols.append(name)
        print('done')
    P.columns = cols
    print('Done.\n')
    return P

def score_models(P,y):
    """Print the ROC-AUC of every prediction column in P against labels y.

    Args:
        P (pd.DataFrame): one column of positive-class probabilities per model.
        y: true binary labels aligned with P's rows.
    """
    print('Scoring models.')
    # DataFrame.items() yields (column name, prediction Series) pairs.
    for name, preds in P.items():
        print('%-26s:%.3f' % (name, roc_auc_score(y, preds)))
    print('Done.\n')
%%time
# (cell magic above: Jupyter times this cell)

# Build all base learners, fit them, collect test probabilities,
# and report each model's individual ROC-AUC.
models = get_models()
P = train_predict(models)
score_models(P,ytest)

机器学习算法系列(5)模型融合

# Simple-average ensemble: mean of all models' probability columns.
print("Ensemble ROC-AUC score: %.3f" % roc_auc_score(ytest,P.mean(axis=1)))

机器学习算法系列(5)模型融合

#画roc_curve线模板
from sklearn.metrics import roc_curve

def plot_roc_curve(ytest, P_base_learners, P_ensemble, labels, ens_label):
    """Draw ROC curves for every base learner plus the ensemble.

    Args:
        ytest: true binary labels.
        P_base_learners: (n_samples, n_learners) array of probabilities.
        P_ensemble: ensemble probabilities, one per sample.
        labels: per-column names for the base learners.
        ens_label: legend label for the ensemble curve.
    """
    n_learners = P_base_learners.shape[1]

    plt.figure(figsize=(10, 8))
    # Diagonal reference line: performance of a random classifier.
    plt.plot([0, 1], [0, 1], 'k--')

    # One rainbow colour per curve; index 0 is reserved for the ensemble.
    palette = [plt.cm.rainbow(t) for t in np.linspace(0, 1.0, n_learners + 1)]

    for col in range(n_learners):
        fpr, tpr, _ = roc_curve(ytest, P_base_learners[:, col])
        plt.plot(fpr, tpr, label=labels[col], c=palette[col + 1])

    fpr, tpr, _ = roc_curve(ytest, P_ensemble)
    plt.plot(fpr, tpr, label=ens_label, c=palette[0])

    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC curve')
    plt.legend(frameon=False)
    plt.show()


plot_roc_curve(ytest, P.values, P.mean(axis=1), list(P.columns), "ensemble")

机器学习算法系列(5)模型融合
(2)Blending

# Blending setup: fresh (unfitted) base learners plus a gradient-boosting
# meta learner.
base_learners = get_models()
# Many shallow, heavily subsampled trees with a tiny learning rate: a slow,
# conservative meta learner, less likely to overfit the blend features.
meta_learner = GradientBoostingClassifier(n_estimators = 1000,
                                          loss = 'exponential',
                                          max_features = 4,
                                          max_depth = 3,
                                          subsample = 0.5,
                                          learning_rate = 0.005, 
                                          random_state = SEED)
# Split the training data 50/50: one half fits the base learners, the other
# half produces their out-of-sample predictions used to train the meta learner.
xtrain_base,xpred_base,ytrain_base,ypred_base = train_test_split(xtrain,ytrain,test_size = 0.5,random_state = SEED)
def train_base_learners(base_learners,inp,out,verbose=True):
    """Fit every base learner in-place on (inp, out).

    Args:
        base_learners (dict[str, estimator]): name -> unfitted classifier;
            each is mutated by calling .fit().
        inp: training features.
        out: training labels.
        verbose: when True, print progress per learner.
    """
    if verbose: print("Fitting models.")
    for name, learner in base_learners.items():
        if verbose: print("%s..." % name, end=" ", flush=False)
        learner.fit(inp, out)
        if verbose: print("done")

def predict_base_learners(pred_base_learners,inp,verbose=True):
    """Collect each fitted base learner's positive-class probabilities.

    Args:
        pred_base_learners (dict[str, estimator]): name -> fitted classifier.
        inp: feature array; must expose .shape[0].
        verbose: when True, print progress per learner.

    Returns:
        np.ndarray: (n_samples, n_learners); column order follows dict order.
    """
    if verbose: print("Generating base learner predictions.")
    P = np.zeros((inp.shape[0], len(pred_base_learners)))
    for col, (name, learner) in enumerate(pred_base_learners.items()):
        if verbose: print("%s..." % name, end=" ", flush=False)
        # Binary task: keep only the probability of the positive class.
        P[:, col] = learner.predict_proba(inp)[:, 1]
        if verbose: print("done")

    return P
%%time
# (cell magic above: Jupyter times this cell)

# Blending steps 1-2: fit the base learners on one half of the training data,
# then predict probabilities for the held-out half (out-of-sample features).
train_base_learners(base_learners,xtrain_base,ytrain_base)
P_base = predict_base_learners(base_learners, xpred_base)

机器学习算法系列(5)模型融合

%%time
# (cell magic above: Jupyter times this cell)
# Blending step 3: fit the meta learner on the base learners' held-out predictions.
meta_learner.fit(P_base,ypred_base)

机器学习算法系列(5)模型融合

def ensemble_predict(base_learners, meta_learner,inp,verbose = True):
    """Run the full two-stage ensemble on `inp`.

    Returns:
        tuple: (base-learner probability matrix, meta learner's
        positive-class probabilities computed from that matrix).
    """
    base_P = predict_base_learners(base_learners, inp, verbose=verbose)
    final_p = meta_learner.predict_proba(base_P)[:, 1]
    return base_P, final_p
%%time
# (cell magic above: Jupyter times this cell)
# Blending step 4: push the test set through both layers and score the blend.
P_pred,p = ensemble_predict(base_learners,meta_learner,xtest)
print("\nEnsemble ROC-AUC score: %.3f" % roc_auc_score(ytest,p))

机器学习算法系列(5)模型融合
(3)Stacking

from sklearn.base import clone

def stacking(base_learners,meta_learner,X,y,generator):
    """Simple training routine for stacking.

    Args:
        base_learners (dict[str, estimator]): name -> unfitted classifier.
        meta_learner: estimator to fit on the cross-validated predictions.
        X, y: full training arrays (numpy; indexed as X[idx, :] / y[idx]).
        generator: CV splitter exposing .split(X), e.g. KFold.

    Returns:
        tuple: (base_learners refitted on all of X/y, fitted meta_learner).
    """

    # Train final base learners on ALL data — these are the ones used at
    # prediction time (the per-fold clones below exist only to build
    # leak-free meta-features).
    print("Fitting final base learners...", end="")
    train_base_learners(base_learners,X,y,verbose=False)
    print("done")

    # Generate predictions for training meta learners
    # Outer loop: one pass per CV fold.
    print("Generating cross-validated predictions...")
    cv_preds,cv_y = [],[]
    for i,(train_idx, test_idx) in enumerate(generator.split(X)):

        fold_xtrain, fold_ytrain = X[train_idx, :], y[train_idx]
        fold_xtest, fold_ytest = X[test_idx, :], y[test_idx]

        # Inner loop: clone so each fold trains fresh, un-leaked models.
        fold_base_learners = {name: clone(model)
                              for name, model in base_learners.items()}
        train_base_learners(
            fold_base_learners, fold_xtrain, fold_ytrain, verbose=False)

        # Out-of-fold probabilities become the meta learner's features.
        fold_P_base = predict_base_learners(
            fold_base_learners, fold_xtest, verbose=False)

        cv_preds.append(fold_P_base)
        cv_y.append(fold_ytest)
        print("Fold %i done" % (i + 1))

    print("CV-predictions done")
    
    # Be careful to get rows in the right order: predictions and labels are
    # stacked fold-by-fold, so they stay aligned with each other.
    cv_preds = np.vstack(cv_preds)
    cv_y = np.hstack(cv_y)

    # Train meta learner on the leak-free, cross-validated meta-features.
    print("Fitting meta learner...", end="")
    meta_learner.fit(cv_preds, cv_y)
    print("done")

    return base_learners, meta_learner
%%time
# (cell magic above: Jupyter times this cell)

from sklearn.model_selection import KFold

# Train the stacked ensemble: 2-fold CV predictions feed the meta learner,
# then score the whole stack on the untouched test set.
cv_base_learners, cv_meta_learner = stacking(
    get_models(), clone(meta_learner), xtrain.values, ytrain.values, KFold(2))

P_pred, p = ensemble_predict(cv_base_learners, cv_meta_learner, xtest, verbose=False)
print("\nEnsemble ROC-AUC score: %.3f" % roc_auc_score(ytest, p))

机器学习算法系列(5)模型融合

%%time
# (cell magic above: Jupyter times this cell)

from mlens.ensemble import SuperLearner

# ML-Ensemble's SuperLearner automates the stacking workflow: here with
# 10-fold CV and multiprocessing across base learners.
sl = SuperLearner(folds = 10,random_state = SEED,verbose = 2,backend = "multiprocessing")

# Layer 1: base learners emit class probabilities (proba=True);
# Layer 2: the meta learner also outputs probabilities.
sl.add(list(base_learners.values()),proba=True) 
sl.add_meta(meta_learner,proba=True)

# Fit the whole stack on the training data.
sl.fit(xtrain,ytrain)

# Probability predictions for the test set.
p_sl = sl.predict_proba(xtest)

# Column 1 is the positive-class probability.
print("\nSuper Learner ROC-AUC score: %.3f" % roc_auc_score(ytest,p_sl[:, 1]))

机器学习算法系列(5)模型融合

plot_roc_curve(ytest,p.reshape(-1, 1),P.mean(axis=1),["Simple average"],"Super Learner")

机器学习算法系列(5)模型融合
三、算法总结

Stacking与Blending相比,Blending的优势在于:

(1)Blending比较简单,而Stacking相对比较复杂;

(2)能够防止信息泄露:generalizers和stackers使用不同的数据;

(3)不需要和你的队友分享你的随机种子。

Stacking与Blending相比,Blending的缺点在于:

(1)只用了整体数据的一部分;

(2)最终模型可能对留出集(holdout set)过拟合;

(3)Stacking多次交叉验证要更加稳健。

上一篇:第六章——模型融合


下一篇:Histopathologic Cancer Detection(densenet169)学习笔记