天池案例-二手车交易价格预测-回归(xgb+lgb)

1 案例描述

赛题以预测二手车的交易价格为任务,评价标准为MAE(Mean Absolute Error,平均绝对误差),MAE越小说明模型预测越准确。

2 代码详情

# 基础工具
import numpy as np
import pandas as pd
import warnings
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.special import jn
from IPython.display import display, clear_output
import time

# 模型预测的
from sklearn import linear_model
from sklearn import preprocessing
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# 数据降维处理的
from sklearn.decomposition import PCA, FastICA, FactorAnalysis, SparsePCA

import lightgbm as lgb
import xgboost as xgb

from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, train_test_split

# Step 1:选定评价指标
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Step 2: load the data and take a first look
# Both files are read with a single space as the field separator.
Train_data = pd.read_csv('used_car_train_20200313.csv', sep=' ')
TestA_data = pd.read_csv('used_car_testA_20200313.csv', sep=' ')

# Report the size of each data set
print('Train data shape:', Train_data.shape)
print('TestA data shape:', TestA_data.shape)

# Step 3: feature and label construction
# 1) Split the column names by dtype (non-object vs. object)
numerical_cols = Train_data.select_dtypes(exclude='object').columns
print(numerical_cols)
categorical_cols = Train_data.select_dtypes(include='object').columns
print(categorical_cols)

# 2) Build the training and test samples
# Hand-picked feature columns: drop the listed columns (ids, dates, the
# label itself, ...) and every column whose name contains 'Type'.
excluded_cols = {
    'SaleID', 'name', 'regDate', 'creatDate', 'price',
    'model', 'brand', 'regionCode', 'seller',
}
feature_cols = [
    col for col in numerical_cols
    if col not in excluded_cols and 'Type' not in col
]

# Feature matrix and label for training; feature matrix for the test set
X_data = Train_data[feature_cols]
Y_data = Train_data['price']

X_test = TestA_data[feature_cols]

print('X train shape:', X_data.shape)
print('X test shape:', X_test.shape)


# Small statistics helper used for the summaries printed later on
def Sta_inf(data):
    """Print basic summary statistics of ``data``.

    One line is printed per statistic, in this order: minimum, maximum,
    mean, peak-to-peak range, standard deviation and variance, each
    computed with the corresponding NumPy function.

    Args:
        data: array-like of numeric values accepted by the NumPy reductions.
    """
    # (label, reduction) pairs; the labels are printed verbatim.
    summary = (
        ('_min', np.min),
        ('_max:', np.max),
        ('_mean', np.mean),
        ('_ptp', np.ptp),
        ('_std', np.std),
        ('_var', np.var),
    )
    for label, stat in summary:
        print(label, stat(data))


# 3) Summarise the label distribution
print('Sta of label:')
Sta_inf(Y_data)
plt.hist(Y_data)  # histogram of the label, to inspect its distribution
plt.show()
plt.close()

# 4) Fill missing values with -1
X_data = X_data.fillna(-1)
X_test = X_test.fillna(-1)

# Step 4: model training and prediction

# 1) 5-fold cross-validation with xgb to see how the parameters perform
# (no objective is passed, so the library default is used)
xgr = xgb.XGBRegressor(n_estimators=120, learning_rate=0.1, gamma=0, subsample=0.8,
                       colsample_bytree=0.9, max_depth=7)
scores_train = []  # per-fold MAE on the training part
scores = []  # per-fold MAE on the validation part
# NOTE(review): StratifiedKFold stratifies on the label values; for a
# regression target such as price a plain KFold may be more appropriate —
# confirm before changing, as it alters the fold assignment.
sk = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
# Accumulator for the test-set predictions: one column, summed over the
# folds and averaged after the loop.
test = np.zeros((X_test.shape[0], 1))
print("test", test)
# The test matrix does not change between folds, so convert it once.
X_test_pred = X_test.values
for train_ind, val_ind in sk.split(X_data, Y_data):
    print('************************************ {} ************************************'.format(str(sk.n_splits)))
    train_x = X_data.iloc[train_ind].values
    train_y = Y_data.iloc[train_ind]

    val_x = X_data.iloc[val_ind].values
    val_y = Y_data.iloc[val_ind]

    xgr.fit(train_x, train_y)
    pred_train_xgb = xgr.predict(train_x)
    pred_xgb = xgr.predict(val_x)
    pred_xgb = pred_xgb.reshape(-1, 1)

    score_train = mean_absolute_error(train_y, pred_train_xgb)
    scores_train.append(score_train)
    score = mean_absolute_error(val_y, pred_xgb)
    # This is the MAE on the validation fold, so label it as such.
    print("预测验证集的平均绝对误差评分", score)
    scores.append(score)
    # Predict the test set with this fold's model and accumulate
    pred_test = xgr.predict(X_test_pred)
    pred_test2 = pred_test.reshape(-1, 1)
    test += pred_test2

# Average over all folds (the list, not just the last fold's scalar)
print('Train mae:', np.mean(scores_train))
print('Val mae', np.mean(scores))
# Fold-averaged test prediction
sub_Weighted = test / sk.n_splits


# 2) Define xgb and lgb model-building functions to check how the parameters perform
def build_model_xgb(x_train, y_train):
    """Fit an XGBoost regressor with fixed hyper-parameters.

    Args:
        x_train: training feature matrix passed to ``fit``.
        y_train: training labels passed to ``fit``.

    Returns:
        The fitted ``xgb.XGBRegressor``.
    """
    # No objective is passed, so the library default is used.
    xgb_params = dict(
        n_estimators=150,
        learning_rate=0.1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.9,
        max_depth=7,
    )
    regressor = xgb.XGBRegressor(**xgb_params)
    regressor.fit(x_train, y_train)
    return regressor


def build_model_lgb(x_train, y_train):
    """Grid-search the learning rate of a LightGBM regressor and fit it.

    The candidates are compared on negated mean absolute error, because MAE
    is the metric this script evaluates every model with; the estimator's
    default scorer would rank them by R^2 instead.

    Args:
        x_train: training feature matrix passed to ``fit``.
        y_train: training labels passed to ``fit``.

    Returns:
        The fitted ``GridSearchCV`` object; ``predict`` on it uses the
        estimator refitted with the best learning rate found.
    """
    estimator = lgb.LGBMRegressor(num_leaves=127, n_estimators=150)
    param_grid = {
        'learning_rate': [0.01, 0.05, 0.1, 0.2],
    }
    # scikit-learn scorers are "greater is better", hence the negated MAE.
    gbm = GridSearchCV(estimator, param_grid, scoring='neg_mean_absolute_error')
    gbm.fit(x_train, y_train)
    return gbm


# 3) Split the data (train / validation), then train, evaluate and predict
x_train, x_val, y_train, y_val = train_test_split(X_data, Y_data, test_size=0.3)

# lgb
print('Train lgb...')
model_lgb = build_model_lgb(x_train, y_train)
val_lgb = model_lgb.predict(x_val)
MAE_lgb = mean_absolute_error(y_val, val_lgb)
print('MAE of val with lgb:', MAE_lgb)

print('Predict lgb...')
# Refit on the full training data before predicting the test set
model_lgb_pre = build_model_lgb(X_data, Y_data)
subA_lgb = model_lgb_pre.predict(X_test)
print('Sta of Predict lgb:')
Sta_inf(subA_lgb)

# xgb
print('Train xgb...')
model_xgb = build_model_xgb(x_train, y_train)
val_xgb = model_xgb.predict(x_val)
MAE_xgb = mean_absolute_error(y_val, val_xgb)
print('MAE of val with xgb:', MAE_xgb)

print('Predict xgb...')
# Refit on the full training data before predicting the test set
model_xgb_pre = build_model_xgb(X_data, Y_data)
subA_xgb = model_xgb_pre.predict(X_test)
print('Sta of Predict xgb:')
Sta_inf(subA_xgb)

# Step 5: model ensembling
# 1) Weighted blend of the two models. Each model's weight is one minus its
# share of the summed validation MAE, so the model with the lower MAE gets
# the larger weight and the two weights add up to 1.
weight_lgb = 1 - MAE_lgb / (MAE_xgb + MAE_lgb)
weight_xgb = 1 - MAE_xgb / (MAE_xgb + MAE_lgb)

val_Weighted = weight_lgb * val_lgb + weight_xgb * val_xgb  # validation set
# Predictions can come out negative, but a price is never negative, so
# negative predictions are replaced with 10 as a post-correction.
val_Weighted[val_Weighted < 0] = 10
print('MAE of val with Weighted ensemble:', mean_absolute_error(y_val, val_Weighted))

sub_Weighted = weight_lgb * subA_lgb + weight_xgb * subA_xgb  # test set
# Apply the same post-correction to the submitted predictions, so the
# submission matches what was evaluated on the validation set.
sub_Weighted[sub_Weighted < 0] = 10

# Step 6: write the submission file
sub = pd.DataFrame()
sub['SaleID'] = TestA_data.SaleID
sub['price'] = sub_Weighted
sub.to_csv('sub_Weighted.csv', index=False)

3 归纳总结

上一篇:20210325_23期_心跳检测_Task04_模型调参


下一篇:LGB+XGB+CNN一般写法