Model ensembling can often yield good results. Common ensemble methods include:
- Voting: simple majority voting or weighted averaging of the base models' predictions
- Stacking: in short, training a meta-model on the base models' predictions to produce the final prediction
We start with a Stacking ensemble (see the voting sketch below for contrast), using LogisticRegression, SVC, GaussianNB, SGDClassifier, RandomForestClassifier, and HistGradientBoostingClassifier as base classifiers.
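For comparison, a voting ensemble takes only a few lines with sklearn's VotingClassifier. A minimal soft-voting sketch (the base estimators and weights here are illustrative assumptions, not the tuned models below):

from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

# Soft voting averages the predicted class probabilities of the base models;
# the optional `weights` argument gives stronger models more influence.
voting = VotingClassifier(
    estimators=[
        ('logit', LogisticRegression(max_iter=1000)),
        ('nb', GaussianNB())
    ],
    voting='soft',
    weights=[2, 1]
)
# Usage: voting.fit(X_train, y_train); voting.predict_proba(X_valid)[:, 1]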
Import the necessary packages
import pandas as pd
import numpy as np
import copy
import json
import pickle
import joblib
import lightgbm as lgb
import optuna
import warnings
import gc
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import roc_auc_score
from sklearn.base import clone
import matplotlib.pyplot as plt
import seaborn as sns
# Setting configuration.
pd.set_option('display.float_format', lambda x: '%.5f' %x)
warnings.filterwarnings('ignore')
sns.set_style('whitegrid')
optuna.logging.set_verbosity(optuna.logging.WARNING)
SEED = 42
Create the dataset
print('Loading data...')
path = '../datasets/Home-Credit-Default-Risk/selected_data.csv'
df = pd.read_csv(path, index_col='SK_ID_CURR')
Loading data...
# Split data into training and testing sets
X_train, X_valid, y_train, y_valid = train_test_split(
    df.drop(columns="TARGET"),
    df["TARGET"],
    test_size=0.25,
    random_state=SEED
)
print("X_train shape:", X_train.shape)
print('train:', y_train.value_counts(), sep='\n')
print('valid:', y_valid.value_counts(), sep='\n')
X_train shape: (230633, 835)
train:
TARGET
0 211999
1 18634
Name: count, dtype: int64
valid:
TARGET
0 70687
1 6191
Name: count, dtype: int64
The raw label encoding of unordered categorical features is usable by tree-ensemble models (like XGBoost), but linear models (like Lasso or LogisticRegression) require one-hot re-encoding. We therefore re-encode the data first.
# Specify feature names and categorical features
feature_name = X_train.columns.tolist()
categorical_feature = X_train.select_dtypes(object).columns.tolist()
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
# Encode categorical features
encoder = make_column_transformer(
    (OneHotEncoder(
        drop='if_binary',
        min_frequency=0.02,
        max_categories=20,
        sparse_output=False,
        handle_unknown='ignore'
    ), categorical_feature),
    remainder='passthrough',
    verbose_feature_names_out=False,
    verbose=True
)
print('fitting...')
encoder.fit(X_train)
print('encoding...')
train_dummies = encoder.transform(X_train)
valid_dummies = encoder.transform(X_valid)
print('original train data shape:', X_train.shape)
fitting...
[ColumnTransformer] . (1 of 2) Processing onehotencoder, total= 4.7s
[ColumnTransformer] ..... (2 of 2) Processing remainder, total= 0.0s
encoding...
original train data shape: (230633, 835)
del df, X_train, X_valid
gc.collect()
2948
Create the optimizer
First, define an evaluation function.
# Define a cross-validation strategy.
# We use sklearn's cross_val_score function. It does not shuffle the data itself,
# so we pass it a KFold splitter that shuffles the dataset prior to cross-validation.
def evaluate(model, X, y, n_folds=5, verbose=True):
    kf = KFold(n_folds, shuffle=True, random_state=SEED)
    scores = cross_val_score(
        model,
        X,
        y,
        scoring="roc_auc",
        cv=kf
    )
    if verbose:
        print(f"valid auc: {scores.mean():.3f} +/- {scores.std():.3f}")
    return scores.mean()
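The helper can be called directly on any estimator. For example (a sketch; this runs a full 5-fold CV on the encoded training data, so it takes a while):

# 5-fold CV AUC of a plain logistic-regression baseline
evaluate(LogisticRegression(class_weight='balanced', max_iter=200), train_dummies, y_train)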
Next, we define an objective class to tune the hyperparameters of these base classifiers.
class Objective:
    estimators = (
        LogisticRegression,
        SGDClassifier,
        GaussianNB,
        RandomForestClassifier,
        HistGradientBoostingClassifier
    )

    def __init__(self, estimator, X, y):
        # assert isinstance(estimator, self.estimators), f"estimator must be one of {self.estimators}"
        self.model = estimator
        self.X = X
        self.y = y

    def __call__(self, trial):
        # Create the hyperparameter search space for the given estimator
        if isinstance(self.model, LogisticRegression):
            search_space = dict(
                class_weight = 'balanced',
                C = trial.suggest_float('C', 0.01, 100.0, log=True),
                # l1_ratio is the elastic-net mixing parameter (0 <= l1_ratio <= 1);
                # it only takes effect with penalty='elasticnet', which requires the saga solver.
                penalty = 'elasticnet',
                solver = 'saga',
                l1_ratio = trial.suggest_float('l1_ratio', 0.0, 1.0)
            )
        elif isinstance(self.model, SGDClassifier):
            search_space = dict(
                class_weight = 'balanced',
                loss = trial.suggest_categorical('loss', ['hinge', 'log_loss', 'modified_huber']),
                alpha = trial.suggest_float('alpha', 1e-5, 10.0, log=True),
                penalty = 'elasticnet',
                l1_ratio = trial.suggest_float('l1_ratio', 0.0, 1.0),
                early_stopping = True
            )
        elif isinstance(self.model, GaussianNB):
            search_space = dict(
                priors = None
            )
        elif isinstance(self.model, RandomForestClassifier):
            search_space = dict(
                class_weight = 'balanced',
                n_estimators = trial.suggest_int('n_estimators', 50, 500, step=50),
                max_depth = trial.suggest_int('max_depth', 2, 20),
                max_features = trial.suggest_float('max_features', 0.2, 0.9),
                random_state = SEED
            )
        elif isinstance(self.model, HistGradientBoostingClassifier):
            search_space = dict(
                class_weight = 'balanced',
                learning_rate = trial.suggest_float('learning_rate', 1e-3, 10.0, log=True),
                max_iter = trial.suggest_int('max_iter', 50, 500, step=50),
                max_depth = trial.suggest_int('max_depth', 2, 20),
                max_features = trial.suggest_float('max_features', 0.2, 0.9),
                l2_regularization = trial.suggest_float('l2_regularization', 1e-3, 10.0, log=True),
                random_state = SEED,
                verbose = 0
            )
        # Set the sampled hyperparameters
        self.model.set_params(**search_space)
        # Evaluate with 5-fold CV
        score = evaluate(self.model, self.X, self.y)
        return score
Hyperparameter optimization
Run the Bayesian optimization in parallel.
def timer(func):
    import time
    import functools

    # Format an elapsed time in seconds as hours/minutes/seconds
    def strfdelta(tdelta, fmt):
        hours, remainder = divmod(tdelta, 3600)
        minutes, seconds = divmod(remainder, 60)
        return fmt.format(hours, minutes, seconds)

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        click = time.time()
        result = func(*args, **kwargs)
        delta = strfdelta(time.time() - click, "{:.0f} hours {:.0f} minutes {:.0f} seconds")
        print(f"{func.__name__} took {delta}")
        return result
    return wrapper
# Hyperparameter tuning for one base estimator
@timer
def tuning(model, X, y):
    # Create a study object
    study = optuna.create_study(direction="maximize")
    # Invoke optimization of the objective function.
    objective = Objective(model, X, y)
    study.optimize(
        objective,
        n_trials = 50,
        timeout = 2400,
        gc_after_trial = True,
        show_progress_bar = True
    )
    print(model, 'best score:', study.best_value)
    return study
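Note that study.optimize runs trials sequentially by default. To actually run the Bayesian optimization in parallel, Optuna accepts an n_jobs argument for thread-based concurrency. A sketch (not used in the runs below):

# Inside tuning: evaluate up to 4 trials concurrently; each trial still runs its own 5-fold CV
study.optimize(
    objective,
    n_trials = 50,
    n_jobs = 4,
    gc_after_trial = True
)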
Objective.estimators
(sklearn.linear_model._logistic.LogisticRegression,
sklearn.linear_model._stochastic_gradient.SGDClassifier,
sklearn.naive_bayes.GaussianNB,
sklearn.ensemble._forest.RandomForestClassifier,
sklearn.ensemble._hist_gradient_boosting.gradient_boosting.HistGradientBoostingClassifier)
# opt_results = []
# for model in Objective.estimators:
#     study = tuning(model(), train_dummies, y_train)
#     opt_results.append(study)
#     print(model)
#     print(study.best_trial.params)
Model training
Ensemble model tuning
# Define the search space and the objective function
def stacking_obj(trial):
    stacking = StackingClassifier(
        # The `estimators` parameter is the list of base estimators to be stacked.
        estimators = [
            ('Logit', LogisticRegression(
                class_weight = 'balanced',
                C = trial.suggest_float('Logit__C', 0.01, 100.0, log=True),
                # l1_ratio is the elastic-net mixing parameter (0 <= l1_ratio <= 1);
                # it only takes effect with penalty='elasticnet', which requires the saga solver.
                penalty = 'elasticnet',
                solver = 'saga',
                l1_ratio = trial.suggest_float('Logit__l1_ratio', 0.0, 1.0)
            )),
            ('SGD', SGDClassifier(
                class_weight = 'balanced',
                loss = trial.suggest_categorical('SGD__loss', ['hinge', 'log_loss', 'modified_huber']),
                alpha = trial.suggest_float('SGD__alpha', 1e-5, 10.0, log=True),
                penalty = 'elasticnet',
                l1_ratio = trial.suggest_float('SGD__l1_ratio', 0.0, 1.0),
                early_stopping = True
            )),
            ('GaussianNB', GaussianNB())
        ],
        # The final_estimator uses the base estimators' predictions as input
        final_estimator = LogisticRegression(
            class_weight = 'balanced',
            C = trial.suggest_float('final__C', 0.01, 100.0, log=True),
            # The elastic-net mixing parameter, with 0 <= l1_ratio <= 1.
            penalty = 'elasticnet',
            solver = 'saga',
            l1_ratio = trial.suggest_float('final__l1_ratio', 0.0, 1.0)
        ),
        verbose = 1
    )
    score = evaluate(stacking, train_dummies, y_train, n_folds = 3)
    return score
# create a study object.
study = optuna.create_study(
    study_name = 'stacking-study',  # Unique identifier of the study.
    direction = 'maximize'
)
# Invoke optimization of the objective function.
study.optimize(
    stacking_obj,
    n_trials = 100,
    timeout = 3600,
    gc_after_trial = True,
    show_progress_bar = True
)
valid auc: 0.676 +/- 0.017
valid auc: 0.669 +/- 0.021
valid auc: 0.673 +/- 0.016
valid auc: 0.451 +/- 0.121
valid auc: 0.592 +/- 0.045
valid auc: 0.666 +/- 0.017
valid auc: 0.675 +/- 0.014
valid auc: 0.666 +/- 0.021
valid auc: 0.672 +/- 0.016
valid auc: 0.667 +/- 0.021
valid auc: 0.672 +/- 0.012
joblib.dump(study, "../datasets/Home-Credit-Default-Risk/stacking-study.pkl")
study = joblib.load("../datasets/Home-Credit-Default-Risk/stacking-study.pkl")
print("Best trial until now:")
print(" Value: ", study.best_trial.value)
print(" Params: ")
for key, value in study.best_trial.params.items():
    print(f" {key}: {value}")
Best trial until now:
Value: 0.6761396385434888
Params:
Logit__C: 0.020329668727865235
Logit__l1_ratio: 0.5165207006926232
SGD__loss: modified_huber
SGD__alpha: 1.6638099778831132
SGD__l1_ratio: 0.7330208370976262
final__C: 14.1468564043383
final__l1_ratio: 0.4977751012657087
stacking = StackingClassifier(
    # The `estimators` parameter is the list of base estimators to be stacked.
    estimators = [
        ('Logit', LogisticRegression(
            class_weight = 'balanced',
            C = 0.020329668727865235,
            # l1_ratio is the elastic-net mixing parameter (0 <= l1_ratio <= 1);
            # it only takes effect with penalty='elasticnet', which requires the saga solver.
            penalty = 'elasticnet',
            solver = 'saga',
            l1_ratio = 0.5165207006926232
        )),
        ('SGD', SGDClassifier(
            class_weight = 'balanced',
            loss = 'modified_huber',
            alpha = 1.6638099778831132,
            penalty = 'elasticnet',
            l1_ratio = 0.7330208370976262,
            early_stopping = True
        )),
        ('GaussianNB', GaussianNB())
    ],
    # The final_estimator uses the base estimators' predictions as input
    final_estimator = LogisticRegression(
        class_weight = 'balanced',
        C = 14.1468564043383,
        # The elastic-net mixing parameter, with 0 <= l1_ratio <= 1.
        penalty = 'elasticnet',
        solver = 'saga',
        l1_ratio = 0.4977751012657087
    ),
    verbose = 1
)
score = evaluate(stacking, train_dummies, y_train)
valid auc: 0.674 +/- 0.009
stacking.fit(train_dummies, y_train)
train_auc = roc_auc_score(y_train, stacking.predict_proba(train_dummies)[:, 1])
valid_auc = roc_auc_score(y_valid, stacking.predict_proba(valid_dummies)[:, 1])
print('train auc:', train_auc)
print('valid auc:', valid_auc)
train auc: 0.6753919322392181
valid auc: 0.6752015627178207
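Finally, the fitted ensemble can be persisted with joblib in the same way as the study, so it can be reloaded later for inference (the file name here is an assumption):

# Save / reload the fitted stacking model with joblib
joblib.dump(stacking, "../datasets/Home-Credit-Default-Risk/stacking-model.pkl")
# stacking = joblib.load("../datasets/Home-Credit-Default-Risk/stacking-model.pkl")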