import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
import glob
import datetime
import numpy as np
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error, mean_absolute_error
# Tell NumPy to ignore divide-by-zero and invalid-operation floating-point
# errors for the rest of the script.
np.seterr(divide='ignore', invalid='ignore')

# Matplotlib defaults shared by every figure below.
plt.rcParams['font.sans-serif'] = 'SimHei'  # font able to display Chinese text
plt.rcParams['axes.unicode_minus'] = False  # display the minus sign
plt.rcParams['figure.dpi'] = 100  # figure resolution
plt.rcParams['text.color'] = 'black'  # text colour
plt.style.use('ggplot')  # plot background style

# glob returns the list of paths matching the pattern; it is empty when the
# file does not exist, so fail with a clear message instead of an IndexError.
csv_files = glob.glob('PRSA.csv')
if not csv_files:
    raise FileNotFoundError(
        "PRSA.csv was not found in the current working directory"
    )

# NOTE(review): `date_parser` and the dict form of `parse_dates` are
# deprecated in recent pandas releases -- confirm the targeted pandas version.
df = pd.read_csv(
    csv_files[0],
    index_col='No',
    # Combine file columns 1-4 into a single 'datetime' column.
    parse_dates={'datetime': [1, 2, 3, 4]},
    date_parser=lambda x: datetime.datetime.strptime(x, '%Y %m %d %H'),
)

# Make the parsed timestamps the index, modifying df in place (no copy).
df.set_index('datetime', inplace=True)
# A bare `df.head()` shows nothing outside a notebook, so print it.
print(df.head())

# axis=0 / how='any': drop every row that contains at least one missing
# value; inplace=True modifies df directly and returns None.
df.dropna(axis=0, how='any', inplace=True)
df.info()  # prints a summary of the frame
print(df.describe())

# Double brackets keep the selection a DataFrame holding only 'TEMP'
# (indexed by the datetime index).
temp = df[['TEMP']]

# Rows up to and including the split timestamp train the model; later rows
# are held out for testing.
split_date = '2016-01'
temp_train = temp.loc[temp.index <= split_date].copy()
temp_test = temp.loc[temp.index > split_date].copy()

# Overview plot of both splits on a shared time axis.
_ = (
    temp_test.rename(columns={'TEMP': 'TEST SET'})
    .join(temp_train.rename(columns={'TEMP': 'TRAINING SET'}), how='outer')
    .plot(figsize=(8, 3), title='Temperature', style='.')
)
plt.show()
def create_features(df, label=None):
    """
    Create time series features from the datetime index of ``df``.

    The feature columns, plus a helper ``date`` column holding the index
    values, are written into ``df`` itself, so the caller's frame is
    modified in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index holds datetime values.
    label : hashable, optional
        Name of the target column in ``df``. With the default ``None`` only
        the feature frame is returned.

    Returns
    -------
    X : pandas.DataFrame
        The columns ``hour``, ``dayofweek``, ``quarter``, ``month``,
        ``year``, ``dayofyear`` and ``dayofmonth``.
    (X, y) : tuple
        Returned instead of ``X`` when ``label`` is given; ``y`` is
        ``df[label]``.
    """
    df['date'] = df.index
    stamps = df['date'].dt
    df['hour'] = stamps.hour
    df['dayofweek'] = stamps.dayofweek
    df['quarter'] = stamps.quarter
    df['month'] = stamps.month
    df['year'] = stamps.year
    df['dayofyear'] = stamps.dayofyear
    df['dayofmonth'] = stamps.day

    feature_names = [
        'hour', 'dayofweek', 'quarter', 'month', 'year',
        'dayofyear', 'dayofmonth',
    ]
    X = df[feature_names]

    # Compare against None rather than relying on truthiness so that a
    # falsy but valid column label (for example a column named 0) is honoured.
    if label is not None:
        return X, df[label]
    return X
# Build the calendar features and the TEMP target for each split.
# create_features also writes the feature columns into temp_train/temp_test.
X_train, y_train = create_features(temp_train, label='TEMP')
X_test, y_test = create_features(temp_test, label='TEMP')

# 100 boosting iterations in total, i.e. up to 100 decision trees.
reg = xgb.XGBRegressor(n_estimators=100)

# NOTE(review): newer xgboost releases expect `early_stopping_rounds` on the
# estimator constructor rather than on fit() -- confirm the targeted version.
# NOTE(review): the test split is also the last entry of eval_set, so it
# drives early stopping; the test metrics are therefore not fully held out.
reg.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    early_stopping_rounds=50,
    verbose=True,
)

# Bar chart of per-feature importance for the fitted model.
_ = xgb.plot_importance(reg, height=0.9)
plt.show()
def mean_absolute_percentage_error(y_true, y_pred):
    """Calculate MAPE (in percent) given y_true and y_pred.

    The metric is the mean of ``|y_true - y_pred| / |y_true|`` times 100.
    Entries whose true value is exactly zero have an undefined percentage
    error, so they are left out of the mean instead of turning the result
    into ``inf``/``nan``.

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values, broadcastable against ``y_true``.

    Returns
    -------
    float
        The MAPE in percent, or ``nan`` when no entry has a non-zero true
        value (including empty input).
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # Broadcast first so the zero mask lines up with the differences.
    diff, truth = np.broadcast_arrays(y_true - y_pred, y_true)
    valid = truth != 0
    if not valid.any():
        return np.nan

    # Difference between truth and prediction as a fraction of the truth,
    # taken in absolute value and averaged, then scaled to percent.
    return np.mean(np.abs(diff[valid] / truth[valid])) * 100
def _plot_forecast_window(frame, x_bounds, y_limits, title, width):
    """Plot Prediction (line) against TEMP (dots) for one time window.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame with 'Prediction' and 'TEMP' columns.
    x_bounds : tuple
        ``(lower, upper)`` passed to ``Axes.set_xbound``.
    y_limits : tuple
        ``(bottom, top)`` passed to ``Axes.set_ylim``.
    title : str
        Figure-level title.
    width : int or float
        Figure width; the height is fixed at 5.

    Returns
    -------
    tuple
        The ``(figure, axes)`` pair that was drawn and shown.
    """
    fig, axis = plt.subplots(1)
    fig.set_figheight(5)
    fig.set_figwidth(width)
    frame[['Prediction', 'TEMP']].plot(ax=axis, style=['-', '.'])
    axis.set_xbound(lower=x_bounds[0], upper=x_bounds[1])
    axis.set_ylim(*y_limits)
    plt.suptitle(title)
    plt.show()
    return fig, axis


# Predict on the test split and keep the result next to the observations.
temp_test['Prediction'] = reg.predict(X_test)

mse = mean_squared_error(y_true=temp_test['TEMP'], y_pred=temp_test['Prediction'])
mae = mean_absolute_error(y_true=temp_test['TEMP'], y_pred=temp_test['Prediction'])
mape = mean_absolute_percentage_error(y_true=temp_test['TEMP'], y_pred=temp_test['Prediction'])
# Report the metrics; bare assignments show nothing when run as a script.
print('MSE :', mse)
print('MAE :', mae)
print('MAPE:', mape)

# Stack the test and training rows; sort=False leaves the columns unsorted.
temp_all = pd.concat([temp_test, temp_train], sort=False)
_ = temp_all[['TEMP', 'Prediction']].plot(figsize=(8, 3))
plt.show()

f, ax = _plot_forecast_window(
    temp_all, ('2016-01-01', '2016-02-01'), (-20, 15),
    'January 2016 Forecast vs Actuals', width=15,
)
f, ax = _plot_forecast_window(
    temp_all, ('2016-01-01', '2016-01-08'), (-10, 20),
    'First Week of January Forecast vs Actuals', width=15,
)

# Per-row signed and absolute errors, then their daily means.
temp_test['error'] = temp_test['TEMP'] - temp_test['Prediction']
temp_test['abs_error'] = temp_test['error'].apply(np.abs)
# Select the four columns before averaging so only they are aggregated.
error_by_day = temp_test.groupby(['year', 'month', 'dayofmonth'])[
    ['TEMP', 'Prediction', 'error', 'abs_error']
].mean()

# Ten days with the smallest mean absolute error.
print(error_by_day.sort_values('abs_error', ascending=True).head(10))

f, ax = _plot_forecast_window(
    temp_all, ('2016-5-27', '2016-5-29'), (10, 40),
    'May 28, 2016 - Best Predicted Day', width=10,
)
f, ax = _plot_forecast_window(
    temp_all, ('2016-3-22', '2016-3-24'), (0, 20),
    'Mar 23, 2016 - Best Predicted Day', width=10,
)

# Ten days with the largest mean absolute error.
print(error_by_day.sort_values('abs_error', ascending=False).head(10))

# The helper calls plt.show(), so this last figure is displayed as well.
f, ax = _plot_forecast_window(
    temp_all, ('2016-1-22', '2016-1-24'), (-20, 10),
    'Jan 23, 2016 - Worst Predicted Day', width=10,
)