1 案例描述
赛题以预测二手车的交易价格为任务,评价标准为 MAE(Mean Absolute Error,平均绝对误差),MAE 越小说明模型预测越准确。
2 代码详情
# Basic utilities
import numpy as np
import pandas as pd
import warnings
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.special import jn
from IPython.display import display, clear_output
import time

# Models used for prediction
from sklearn import linear_model
from sklearn import preprocessing
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# Dimensionality reduction
from sklearn.decomposition import PCA, FastICA, FactorAnalysis, SparsePCA

import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, train_test_split

# Step 1: choose the evaluation metric
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Step 2: load and inspect the data
# Both files are read with a single space as the column separator.
Train_data = pd.read_csv('used_car_train_20200313.csv', sep=' ')
TestA_data = pd.read_csv('used_car_testA_20200313.csv', sep=' ')

# Report the size of each data set
print('Train data shape:', Train_data.shape)
print('TestA data shape:', TestA_data.shape)

# Step 3: feature and label construction
# 1) Split column names by dtype: everything that is not `object` is treated
#    as numerical, the `object` columns as categorical.
numerical_cols = Train_data.select_dtypes(exclude='object').columns
print(numerical_cols)

categorical_cols = Train_data.select_dtypes(include='object').columns
print(categorical_cols)

# 2) Build the training and test samples
# Hand-picked feature columns: drop identifiers, dates, the label itself and a
# few other columns, then drop every column whose name contains 'Type'.
feature_cols = [col for col in numerical_cols if col not in
                ['SaleID', 'name', 'regDate', 'creatDate', 'price', 'model', 'brand', 'regionCode', 'seller']]
feature_cols = [col for col in feature_cols if 'Type' not in col]

# Feature matrix / label vector for training, feature matrix for the test set
X_data = Train_data[feature_cols]
Y_data = Train_data['price']

X_test = TestA_data[feature_cols]

print('X train shape:', X_data.shape)
print('X test shape:', X_test.shape)


def Sta_inf(data):
    """Print summary statistics of *data*.

    Prints the minimum, maximum, mean, peak-to-peak range, standard
    deviation and variance, each computed with the corresponding NumPy
    function. Returns nothing.
    """
    print('_min', np.min(data))
    print('_max:', np.max(data))
    print('_mean', np.mean(data))
    print('_ptp', np.ptp(data))
    print('_std', np.std(data))
    print('_var', np.var(data))


# 3) Basic distribution of the label
print('Sta of label:')
Sta_inf(Y_data)

# Histogram of the label to inspect its distribution
plt.hist(Y_data)
plt.show()
plt.close()

# 4) Fill missing values with the sentinel -1
X_data = X_data.fillna(-1)
X_test = X_test.fillna(-1)

# Step 4: model training and prediction
# 1) Five-fold cross-validation with XGBoost to gauge the parameter choice

# Imported here so this section is self-contained: `price` is a regression
# target, and StratifiedKFold is designed for class labels, so plain KFold is
# the appropriate splitter.
from sklearn.model_selection import KFold

xgr = xgb.XGBRegressor(n_estimators=120, learning_rate=0.1, gamma=0, subsample=0.8,
                       colsample_bytree=0.9, max_depth=7)

scores_train = []  # per-fold MAE on the training part
scores = []        # per-fold MAE on the held-out part

# Five shuffled folds with a fixed seed
sk = KFold(n_splits=5, shuffle=True, random_state=0)

# Accumulator for test-set predictions summed over the folds, shape (n_test, 1)
test = np.zeros((X_test.shape[0], 1))
print("test", test)

# The test matrix does not change between folds, so convert it once.
X_test_pred = X_test.values

for fold_no, (train_ind, val_ind) in enumerate(sk.split(X_data, Y_data), start=1):
    # Banner shows the current fold number (it used to print the constant
    # number of splits on every iteration).
    print('************************************ {} ************************************'.format(str(fold_no)))
    train_x = X_data.iloc[train_ind].values
    train_y = Y_data.iloc[train_ind]
    val_x = X_data.iloc[val_ind].values
    val_y = Y_data.iloc[val_ind]

    xgr.fit(train_x, train_y)
    pred_train_xgb = xgr.predict(train_x)
    pred_xgb = xgr.predict(val_x)
    pred_xgb = pred_xgb.reshape(-1, 1)

    score_train = mean_absolute_error(train_y, pred_train_xgb)
    scores_train.append(score_train)
    score = mean_absolute_error(val_y, pred_xgb)
    # This score is computed on the held-out fold, so label it as validation.
    print("预测验证集的平均绝对误差评分", score)
    scores.append(score)

    # Predict the test set with this fold's model and add it to the running sum
    pred_test = xgr.predict(X_test_pred)
    pred_test2 = pred_test.reshape(-1, 1)
    test += pred_test2

# Average over all folds (the training figure previously used only the last
# fold's scalar `score_train` instead of the `scores_train` list).
print('Train mae:', np.mean(scores_train))
print('Val mae', np.mean(scores))

# Fold-averaged test prediction
sub_Weighted = test / sk.n_splits


# 2) Model-building helpers for XGBoost and LightGBM
def build_model_xgb(x_train, y_train):
    """Fit an XGBoost regressor on the given data and return the fitted model."""
    model = xgb.XGBRegressor(n_estimators=150, learning_rate=0.1, gamma=0, subsample=0.8,
                             colsample_bytree=0.9, max_depth=7)
    model.fit(x_train, y_train)
    return model


def build_model_lgb(x_train, y_train):
    """Grid-search the LightGBM learning rate and return the fitted search object.

    The returned object is the fitted ``GridSearchCV`` wrapper (not the bare
    LightGBM estimator); callers use its ``predict`` method.
    """
    estimator = lgb.LGBMRegressor(num_leaves=127, n_estimators=150)
    param_grid = {
        'learning_rate': [0.01, 0.05, 0.1, 0.2],
    }
    # NOTE(review): GridSearchCV is called without `scoring`, so candidates are
    # ranked by the estimator's default score rather than MAE — confirm intent.
    gbm = GridSearchCV(estimator, param_grid)
    gbm.fit(x_train, y_train)
    return gbm


# 3) Hold out 30% of the training data for validation, then train, evaluate
#    and predict
x_train, x_val, y_train, y_val = train_test_split(X_data, Y_data, test_size=0.3)

# lgb
print('Train lgb...')
# LightGBM: fit on the training split and score on the validation split
model_lgb = build_model_lgb(x_train, y_train)
val_lgb = model_lgb.predict(x_val)
MAE_lgb = mean_absolute_error(y_val, val_lgb)
print('MAE of val with lgb:', MAE_lgb)

# LightGBM: refit on all training data and predict the test set
print('Predict lgb...')
model_lgb_pre = build_model_lgb(X_data, Y_data)
subA_lgb = model_lgb_pre.predict(X_test)
print('Sta of Predict lgb:')
Sta_inf(subA_lgb)

# XGBoost: fit on the training split and score on the validation split
print('Train xgb...')
model_xgb = build_model_xgb(x_train, y_train)
val_xgb = model_xgb.predict(x_val)
MAE_xgb = mean_absolute_error(y_val, val_xgb)
print('MAE of val with xgb:', MAE_xgb)

# XGBoost: refit on all training data and predict the test set
print('Predict xgb...')
model_xgb_pre = build_model_xgb(X_data, Y_data)
subA_xgb = model_xgb_pre.predict(X_test)
print('Sta of Predict xgb:')
Sta_inf(subA_xgb)

# Step 5: model ensembling
# 1) Simple weighted blend of the two models. Each model's weight is one minus
#    its share of the combined validation MAE, so the model with the lower MAE
#    gets the larger weight and the two weights sum to 1.
weight_lgb = 1 - MAE_lgb / (MAE_xgb + MAE_lgb)
weight_xgb = 1 - MAE_xgb / (MAE_xgb + MAE_lgb)

# Validation-set blend
val_Weighted = weight_lgb * val_lgb + weight_xgb * val_xgb
# Some predictions come out negative, which cannot happen for a real price, so
# they are replaced with the small positive value 10.
val_Weighted[val_Weighted < 0] = 10
print('MAE of val with Weighted ensemble:', mean_absolute_error(y_val, val_Weighted))

# Test-set blend
sub_Weighted = weight_lgb * subA_lgb + weight_xgb * subA_xgb
# Apply the same negative-price correction to the submitted predictions so the
# submission matches what the validation MAE above was measured on.
sub_Weighted[sub_Weighted < 0] = 10

# Step 6: write the submission file
sub = pd.DataFrame()
sub['SaleID'] = TestA_data.SaleID
sub['price'] = sub_Weighted
sub.to_csv('sub_Weighted.csv', index=False)