# XGBoost预测数据 (forecasting data with XGBoost)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
import glob
import datetime
import numpy as np

from sklearn import preprocessing
from sklearn.metrics import mean_squared_error, mean_absolute_error
# Silence NumPy's divide-by-zero and invalid-operation warnings for the
# remainder of the script.
np.seterr(divide='ignore', invalid='ignore')

# Matplotlib defaults shared by every figure drawn below.
plt.rcParams.update({
    'font.sans-serif': 'SimHei',  # font used so Chinese text can be displayed
    'axes.unicode_minus': False,  # so that minus signs are displayed
    'figure.dpi': 100,            # figure resolution
    'text.color': 'black',        # text colour
})
plt.style.use('ggplot')  # plot background style




# ---- Load the raw data ------------------------------------------------------
csv_files = glob.glob('PRSA.csv')  # list of files matching the pattern
if not csv_files:
    # Fail with a clear message instead of an IndexError on csv_files[0].
    raise FileNotFoundError("No file matching 'PRSA.csv' in the working directory")

raw = pd.read_csv(csv_files[0])

# File columns 1-4 are combined into one timestamp, as the original
# '%Y %m %d %H' parser did (year, month, day, hour).  They are assembled with
# pd.to_datetime because read_csv's `date_parser` and dict-form `parse_dates`
# are deprecated in pandas 2.x.
date_columns = list(raw.columns[1:5])
date_parts = raw[date_columns].copy()
date_parts.columns = ['year', 'month', 'day', 'hour']
timestamps = pd.to_datetime(date_parts).rename('datetime')

# Index by timestamp; the row counter 'No' and the raw date parts are dropped,
# matching the frame the original read_csv/set_index pair produced.
df = raw.drop(columns=['No'] + date_columns).set_index(timestamps)

print(df.head())
# Drop every row that contains at least one missing value.
df.dropna(axis=0, how='any', inplace=True)
df.info()  # prints a concise summary of the frame
print(df.describe())

# Keep only the temperature column (still a DataFrame, indexed by datetime).
temp = df[['TEMP']]

# ---- Train / test split -----------------------------------------------------
# Rows with a timestamp up to `split_date` are used for training, later rows
# for testing.  The string is compared against the datetime index directly.
split_date = '2016-01'
temp_train = temp.loc[temp.index <= split_date].copy()
temp_test = temp.loc[temp.index > split_date].copy()

# Overview plot of both partitions on a shared time axis.
_ = (
    temp_test.rename(columns={'TEMP': 'TEST SET'})
    .join(temp_train.rename(columns={'TEMP': 'TRAINING SET'}), how='outer')
    .plot(figsize=(8, 3), title='Temperature', style='.')
)
plt.show()


def create_features(df, label=None):
    """Derive calendar features from a frame's datetime index.

    The helper columns (``date``, ``hour``, ``dayofweek``, ``quarter``,
    ``month``, ``year``, ``dayofyear``, ``dayofmonth``) are written onto
    ``df`` itself, so the caller's frame gains them as a side effect.

    Args:
        df: DataFrame whose index holds datetime values.
        label: Optional name of the target column in ``df``.

    Returns:
        The feature frame ``X`` when ``label`` is ``None``; otherwise the
        tuple ``(X, y)`` where ``y`` is ``df[label]``.
    """
    df['date'] = df.index
    stamps = df['date'].dt

    # (feature column, attribute of the ``.dt`` accessor), in output order.
    feature_sources = (
        ('hour', 'hour'),
        ('dayofweek', 'dayofweek'),
        ('quarter', 'quarter'),
        ('month', 'month'),
        ('year', 'year'),
        ('dayofyear', 'dayofyear'),
        ('dayofmonth', 'day'),
    )
    for column, attribute in feature_sources:
        df[column] = getattr(stamps, attribute)

    X = df[[column for column, _ in feature_sources]]

    # Compare against None explicitly: a truthiness test would silently skip
    # valid but falsy column labels such as 0.
    if label is not None:
        y = df[label]
        return X, y
    return X


# Build the feature matrices and targets.  create_features also adds the
# calendar columns to temp_train / temp_test in place.
X_train, y_train = create_features(temp_train, label='TEMP')
X_test, y_test = create_features(temp_test, label='TEMP')

# At most 100 boosting rounds, i.e. up to 100 trees.  early_stopping_rounds is
# given to the constructor because passing it to fit() is deprecated and is
# rejected by current XGBoost releases (requires xgboost >= 1.6).
reg = xgb.XGBRegressor(n_estimators=100, early_stopping_rounds=50)

# NOTE(review): the test split is also the evaluation set that drives early
# stopping, so the test metrics reported later are optimistic.  Consider
# holding out a separate validation split.
reg.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    verbose=True,
)
_ = xgb.plot_importance(reg, height=0.9)
plt.show()
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), in percent.

    Entries whose true value is exactly zero are excluded, because their
    percentage error is undefined and would otherwise turn the whole result
    into ``inf`` or ``nan``.

    Args:
        y_true: Array-like of observed values.
        y_pred: Array-like of predicted values, broadcastable to ``y_true``.

    Returns:
        ``mean(|y_true - y_pred| / |y_true|) * 100`` over the entries with a
        non-zero true value, or ``nan`` when no such entry exists.
    """
    y_true, y_pred = np.broadcast_arrays(
        np.asarray(y_true, dtype=float), np.asarray(y_pred, dtype=float)
    )
    usable = y_true != 0
    if not usable.any():
        return float('nan')
    actual = y_true[usable]
    return np.mean(np.abs((actual - y_pred[usable]) / actual)) * 100
# ---- Evaluate on the held-out period ----------------------------------------
temp_test['Prediction'] = reg.predict(X_test)
mse = mean_squared_error(y_true=temp_test['TEMP'], y_pred=temp_test['Prediction'])
mae = mean_absolute_error(y_true=temp_test['TEMP'], y_pred=temp_test['Prediction'])
mape = mean_absolute_percentage_error(y_true=temp_test['TEMP'], y_pred=temp_test['Prediction'])
# Report the metrics explicitly; a plain script run shows nothing otherwise.
print('MSE: {:.4f}  MAE: {:.4f}  MAPE: {:.2f}%'.format(mse, mae, mape))

# Test rows followed by training rows; 'Prediction' is NaN for training rows
# because only temp_test carries that column.
temp_all = pd.concat([temp_test, temp_train], sort=False)
_ = temp_all[['TEMP', 'Prediction']].plot(figsize=(8, 3))
plt.show()


def _plot_forecast_window(frame, title, x_bounds, y_limits, width):
    """Plot predictions (line) against actuals (dots) for one date window.

    Args:
        frame: DataFrame with 'Prediction' and 'TEMP' columns.
        title: Figure title.
        x_bounds: ``(lower, upper)`` bounds for the x axis.
        y_limits: ``(bottom, top)`` limits for the y axis.
        width: Figure width; the height is always 5.
    """
    fig, axis = plt.subplots(1)
    fig.set_figheight(5)
    fig.set_figwidth(width)
    frame[['Prediction', 'TEMP']].plot(ax=axis, style=['-', '.'])
    axis.set_xbound(lower=x_bounds[0], upper=x_bounds[1])
    axis.set_ylim(*y_limits)
    plt.suptitle(title)
    plt.show()


_plot_forecast_window(temp_all, 'January 2016 Forecast vs Actuals',
                      ('2016-01-01', '2016-02-01'), (-20, 15), 15)
_plot_forecast_window(temp_all, 'First Week of January Forecast vs Actuals',
                      ('2016-01-01', '2016-01-08'), (-10, 20), 15)

# ---- Per-day error analysis -------------------------------------------------
temp_test['error'] = temp_test['TEMP'] - temp_test['Prediction']
temp_test['abs_error'] = temp_test['error'].abs()
# Select the columns of interest before aggregating so only they are averaged.
error_by_day = temp_test.groupby(['year', 'month', 'dayofmonth'])[
    ['TEMP', 'Prediction', 'error', 'abs_error']
].mean()

# Ten best-predicted days (smallest mean absolute error).
print(error_by_day.sort_values('abs_error', ascending=True).head(10))
_plot_forecast_window(temp_all, 'May 28, 2016 - Best Predicted Day',
                      ('2016-5-27', '2016-5-29'), (10, 40), 10)
_plot_forecast_window(temp_all, 'Mar 23, 2016 - Best Predicted Day',
                      ('2016-3-22', '2016-3-24'), (0, 20), 10)

# Ten worst-predicted days (largest mean absolute error).
print(error_by_day.sort_values('abs_error', ascending=False).head(10))
# The helper ends with plt.show(), so this last figure is displayed too.
_plot_forecast_window(temp_all, 'Jan 23, 2016 - Worst Predicted Day',
                      ('2016-1-22', '2016-1-24'), (-20, 10), 10)

# 上一篇:XGBoost算法分析与案例调参实例
#
#
# 下一篇:第113天: Python XGBoost 算法项目实战