文章目录
本文主要给出使用 Optuna 对 XGBoost 和 LightGBM 进行调参的实现代码。整个寻优过程比较快,并且可以指定不同的测试集;只需给定各参数的寻优范围,即可实现自动寻优。
XGBoost
主要需要更改的地方:
- 在 `param` 字典中指定各参数的搜索范围
- 用 `**param` 将参数字典解包后传入 `XGBClassifier`
- 通过 `n_trials` 设置需要测试的轮数
# xgboost
# Seeded TPE sampler so repeated runs explore the same sequence of trials.
sampler = TPESampler(seed=10)

# Training split: load the raw records, then turn them into (features, labels).
trainrecords = pd.read_csv(
    r"./traindata/traindata_combine.csv",
    encoding="utf-8",
)
X_train, y_train = wash_data(trainrecords)

# Dev split used to score each trial. Per the original author's note,
# wash_data handles missing values and normalises the field order.
devrecords = pd.read_csv(
    r"./traindata/devdata_combine.csv",
    encoding="utf-8",
)
X_test, y_test = wash_data(devrecords)

# For a quick smoke test, uncomment to train on the first 100 rows only:
# X_train = X_train.iloc[:100,]
# y_train = y_train.iloc[:100,]
def objective(trial):
    """Optuna objective for XGBoost: fit one classifier and score it.

    Searches ``alpha``, ``lambda`` and ``colsample_bytree``; every other
    setting is held fixed. Reads the module-level ``X_train``, ``y_train``,
    ``X_test`` and ``y_test``.

    Args:
        trial: The Optuna trial used to sample hyper-parameters.

    Returns:
        Accuracy of the fitted model on ``(X_test, y_test)``; the study
        maximises this value.
    """
    param = {
        'objective': 'binary:logistic',
        # XGBoost names these `eval_metric` and `booster`. The LightGBM-style
        # keys `metric` / `boosting_type` are not XGBoost parameters.
        'eval_metric': 'logloss',
        'booster': 'gbtree',
        'verbosity': 3,  # 3 = debug output; lower it for quieter runs
        # L1 / L2 regularisation, sampled on a log scale.
        'alpha': trial.suggest_float('alpha', 1e-8, 10.0, log=True),
        'lambda': trial.suggest_float('lambda', 1e-8, 10.0, log=True),
        # Fixed values; replace with a trial.suggest_* call to search them.
        'learning_rate': 0.05,  # e.g. trial.suggest_float('learning_rate', 0.01, 0.1, log=True)
        'n_estimators': 2000,  # e.g. trial.suggest_int('n_estimators', 1000, 3000)
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'subsample': 0.8,  # e.g. trial.suggest_float('subsample', 0.5, 1.0)
        'max_depth': 8,  # e.g. trial.suggest_int('max_depth', 5, 15)
    }
    # The sampled dict must be unpacked into the estimator, otherwise the
    # trial's suggestions never reach the model.
    model = XGBClassifier(**param)
    model.fit(X_train, y_train)
    # XGBClassifier.predict already returns class labels, so no rounding.
    return accuracy_score(y_test, model.predict(X_test))
# Start the timer right before the search so the elapsed time printed below
# covers this study; no assignment to t0 is visible in this section.
t0 = time()
study = optuna.create_study(direction='maximize', sampler=sampler)
study.optimize(objective, n_trials=10)  # number of trials to run
# Report the best hyper-parameters found by the study.
print(f"最好的参数{study.best_params}")
print(f"用时{time()-t0}")
LightGBM
# Iris demo data: hold out 30% of the samples, with a fixed split seed.
iris = load_iris()
X, y = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=14,
)
# Seeded TPE sampler so the search is reproducible.
sampler = TPESampler(seed=10)
def objective(trial):
    """Optuna objective for LightGBM: train one booster and score it.

    The labels come from the iris data set (three classes), so the model is
    trained with the multiclass objective and predictions are decoded with an
    arg-max over the per-class probabilities. Reads the module-level
    ``X_train``, ``y_train``, ``X_test`` and ``y_test``.

    Args:
        trial: The Optuna trial used to sample hyper-parameters.

    Returns:
        Accuracy of the trained booster on ``(X_test, y_test)``; the study
        maximises this value.
    """
    # LightGBM expects multiclass labels in [0, num_class).
    n_classes = int(np.max(y_train)) + 1
    # A fresh Dataset is built for every trial, as in the original code.
    train_set = lgb.Dataset(X_train, label=y_train)
    param = {
        # 'binary' cannot model the third class, and 'logloss' is not a
        # LightGBM metric name.
        'objective': 'multiclass',
        'num_class': n_classes,
        'metric': 'multi_logloss',
        'boosting_type': 'gbdt',
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 2, 512),
        'learning_rate': trial.suggest_float('learning_rate', 1e-8, 1.0, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 700, 3000),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
    }
    gbm = lgb.train(param, train_set)
    # Multiclass predict() yields one probability column per class.
    class_proba = gbm.predict(X_test)
    return accuracy_score(y_test, np.argmax(class_proba, axis=1))
# Restart the timer so the elapsed time printed below covers only this study.
t0 = time()
study = optuna.create_study(direction='maximize', sampler=sampler)
study.optimize(objective, n_trials=2)
# Report the best hyper-parameters found by the study.
print(f"最好的参数{study.best_params}")
print(f"用时{time()-t0}")
# Prediction
# Retrain once with fixed hyper-parameters and report accuracy on the
# held-out split. NOTE(review): the numeric values below look like they were
# copied from an earlier study.best_params; refresh them after a new search.
dtrain = lgb.Dataset(X_train, label=y_train)
param = {
    # LightGBM rejects num_class != 1 with a non-multiclass objective, so the
    # three-class problem must use 'multiclass' with a matching metric.
    'objective': 'multiclass',
    'metric': 'multi_logloss',
    'verbosity': -1,
    'boosting_type': 'gbdt',
    'num_class': 3,
    'lambda_l1': 0.0234448167468032,
    'lambda_l2': 7.075730911992614e-07,
    'num_leaves': 173,
    'learning_rate': 4.887601625186522e-05,
    'n_estimators': 1824,
    'feature_fraction': 0.9712805361251421,
    'bagging_fraction': 0.8498709168727996,
    'bagging_freq': 2,
    'min_child_samples': 17,
}
gbm = lgb.train(param, dtrain)
# Per-class probabilities, one column per class; computed once and reused.
gbm_pred = gbm.predict(X_test)
# The predicted label is the class with the highest probability.
print(accuracy_score(y_test, np.argmax(gbm_pred, axis=1)))