时间序列
数据描述
探索Apple公司股价数据
导入库
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
plt.style.use("fivethirtyeight")
%matplotlib inline
# For reading stock data from yahoo
import pandas_datareader.data as web
# For time stamps
from datetime import datetime
设置时间节点
# Define the download window: it ends now and starts two years earlier.
end = datetime.now()  # current local time closes the window
try:
    # Midnight on the same calendar day two years before `end`.
    start = datetime(end.year - 2, end.month, end.day)
except ValueError:
    # Only reachable on Feb 29: the year two years earlier is never a leap
    # year, so fall back to Feb 28 instead of crashing.
    start = datetime(end.year - 2, end.month, end.day - 1)
读取数据并存为一个名叫apple的数据框
# Download AAPL daily quotes from Yahoo for the [start, end] window.
apple = web.get_data_yahoo('AAPL', start=start, end=end)
查看每一列的数据类型
# Show the dtype of every column in the frame.
apple.dtypes
High float64
Low float64
Open float64
Close float64
Volume float64
Adj Close float64
dtype: object
查看前5条数据
# Preview the first five rows.
apple.head(n=5)
Date | High | Low | Open | Close | Volume | Adj Close |
---|---|---|---|---|---|---|
2019-05-06 | 52.209999 | 50.875000 | 51.072498 | 52.119999 | 129772400.0 | 51.050892 |
2019-05-07 | 51.855000 | 50.207500 | 51.470001 | 50.715000 | 155054800.0 | 49.674709 |
2019-05-08 | 51.334999 | 50.437500 | 50.474998 | 50.724998 | 105358000.0 | 49.684498 |
2019-05-09 | 50.419998 | 49.165001 | 50.099998 | 50.180000 | 139634400.0 | 49.150684 |
2019-05-10 | 49.712502 | 48.192501 | 49.355000 | 49.294998 | 164834800.0 | 48.469784 |
计算数据集中最早的日期和最晚的日期间隔天数
# Whole days between the earliest and the latest timestamp in the index.
date_span = apple.index.max() - apple.index.min()
date_span.days
730
统计数据中总月数
# Average every column per business-month-end bin, then count the bins.
apple_months = apple.resample(rule='BM').mean()
apple_months.shape[0]
25
按照时间顺序可视化Adj Close值
# Draw the adjusted close as a line chart and keep the returned Axes.
appl_open = apple.loc[:, 'Adj Close'].plot(title="Apple Stock Adj Close")
# Resize the figure that owns the Axes to 13.5 x 9 inches.
fig = appl_open.figure
fig.set_size_inches((13.5, 9))
Apple各维度的直方图
# One histogram per numeric column, drawn on a 12 x 12 inch figure.
apple.hist(figsize=(12,12))
array([[<AxesSubplot:title={'center':'High'}>,
<AxesSubplot:title={'center':'Low'}>],
[<AxesSubplot:title={'center':'Open'}>,
<AxesSubplot:title={'center':'Close'}>],
[<AxesSubplot:title={'center':'Volume'}>,
<AxesSubplot:title={'center':'Adj Close'}>]], dtype=object)
Apple股票的每日收益
# Day-over-day fractional change of the adjusted close, stored as a new column.
apple['Daily Return'] = apple['Adj Close'].pct_change()
plt.figure(figsize=(15,8))
# The title names the series actually plotted (daily return, not the price).
apple['Daily Return'].plot(legend=True, linestyle='--', marker='o', title='APPLE Daily Return')
<AxesSubplot:title={'center':'APPLE Adj Close'}, xlabel='Date'>
时间序列
平稳性检测
单位根检测
# Unit-root (augmented Dickey-Fuller) test on the adjusted close.
from statsmodels.tsa.stattools import adfuller as ADF
print(u'The ADF test result of the original sequence is:', ADF(apple[u'Adj Close']))
# adfuller returns: adf, pvalue, usedlag, nobs, critical values, icbest
# (plus resstore when called with store=True).
# 1st element: the test statistic (tau).
# 2nd element: the p-value.
# 3rd element: the number of lags used.
# 4th element: the number of observations used for the ADF regression and the critical values.
# 5th element: the critical values at the 1%, 5% and 10% levels.
# 6th element: the maximized information criterion (AIC or BIC) when autolag is not None.
# To judge stationarity, look at the p-value first. The null hypothesis is that a unit root exists. A p-value below 0.05 rejects the null, i.e. there is NO unit root and the series is stationary. A p-value above 0.05 fails to reject the null, so the series is treated as non-stationary.
The ADF test result of the original sequence is: (-0.5969761423413814, 0.8716638319139194, 1, 503, {'1%': -3.4434175660489905, '5%': -2.8673031724657454, '10%': -2.5698395516760275}, 2128.9282169596872)
p值大于0.05,无法拒绝“存在单位根”的原假设,说明序列存在单位根,序列不平稳
平稳性处理
对数变换
# Take the natural log of the adjusted close and plot the transformed series.
Adj_Close = apple.loc[:, "Adj Close"]
Adj_Close_log = np.log(Adj_Close)
Adj_Close_log.plot()
<AxesSubplot:xlabel='Date'>
平滑法
# Smooth the log series with a 12-observation moving average and overlay it
# on the unsmoothed log series.
f = plt.figure(facecolor='white')
rol_mean = Adj_Close_log.rolling(12).mean()
# NOTE: rolling(...).mean() is an equally weighted mean; the legend text below
# is kept exactly as the notebook wrote it.
for series, colour, tag in (
    (Adj_Close_log, 'blue', 'Original'),
    (rol_mean, 'red', 'Weighted Rolling Mean'),
):
    series.plot(color=colour, label=tag)
plt.legend(loc='best')
plt.title('Rolling Mean')
plt.show()
Date
2019-05-06 NaN
2019-05-07 NaN
2019-05-08 NaN
2019-05-09 NaN
2019-05-10 NaN
...
2021-04-29 4.895680
2021-04-30 4.895320
2021-05-03 4.894096
2021-05-04 4.890082
2021-05-05 4.885809
Name: Adj Close, Length: 505, dtype: float64
差分
# Difference the smoothed log series and discard the NaN rows that result.
num = 1  # number of periods passed to Series.diff
diff_Adj_Close = rol_mean.diff(periods=num).dropna()
diff_Adj_Close.plot()
<AxesSubplot:xlabel='Date'>
再次进行单位根检验
# Re-run the ADF test, this time on the differenced rolling-mean series;
# the message names that series rather than the original one.
print(u'The ADF test result of the differenced sequence is:', ADF(diff_Adj_Close))
The ADF test result of the original sequence is: (-3.9514817311837405, 0.001686997925420686, 18, 472, {'1%': -3.444280551073031, '5%': -2.867682902679315, '10%': -2.5700419306592934}, -4296.831982906071)
经过差分运算后,p值小于0.05,拒绝“存在单位根”的原假设,说明差分后的序列平稳
模型定阶
# Choose the model order: p = 1 autoregressive term and q = 1 moving-average term.
# statsmodels.tsa.arima_model.ARMA is deprecated and removed in later
# statsmodels releases; its replacement is statsmodels.tsa.arima.model.ARIMA.
# An ARMA(1, 1) is ARIMA with d = 0, i.e. order=(1, 0, 1).
from statsmodels.tsa.arima.model import ARIMA
model = ARIMA(diff_Adj_Close, order=(1, 0, 1))
# The new fit() has no `disp` argument and no 'css' method, so the default
# estimator is used; fitted values may differ slightly from the old CSS fit.
result_arma = model.fit()
E:\Anaconda3\lib\site-packages\statsmodels\tsa\arima_model.py:472: FutureWarning:
statsmodels.tsa.arima_model.ARMA and statsmodels.tsa.arima_model.ARIMA have
been deprecated in favor of statsmodels.tsa.arima.model.ARIMA (note the .
between arima and model) and
statsmodels.tsa.SARIMAX. These will be removed after the 0.12 release.
statsmodels.tsa.arima.model.ARIMA makes use of the statespace framework and
is both well tested and maintained.
To silence this warning and continue using ARMA and ARIMA until they are
removed, use:
import warnings
warnings.filterwarnings('ignore', 'statsmodels.tsa.arima_model.ARMA',
FutureWarning)
warnings.filterwarnings('ignore', 'statsmodels.tsa.arima_model.ARIMA',
FutureWarning)
warnings.warn(ARIMA_DEPRECATION_WARN, FutureWarning)
E:\Anaconda3\lib\site-packages\statsmodels\tsa\base\tsa_model.py:581: ValueWarning: A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.
warnings.warn('A date index has been provided, but it has no'
样本拟合
# In-sample predictions of the differenced rolling mean.
predict_ts = result_arma.predict()
# Undo the differencing. The model was fitted on rol_mean.diff(1), a single
# first-order difference, so the predicted rolling mean at t is the predicted
# difference plus the rolling mean at t-1. Nothing else is added back: a
# second shift would undo a difference that was never taken.
rol_shift_ts = rol_mean.shift(1)
diff_recover = predict_ts.add(rol_shift_ts)
# Undo the moving average: mean_t * window = x_t + (sum of the previous
# window - 1 values), so x_t = mean_t * window - that sum.
window = 12  # must equal the window used to build rol_mean
rol_sum = Adj_Close_log.rolling(window=window - 1).sum()
rol_recover = diff_recover * window - rol_sum.shift(1)
# Undo the log transform and drop rows that have no prediction.
log_recover = np.exp(rol_recover)
log_recover.dropna(inplace=True)
模型评估
# Keep only the actual prices on dates that have a recovered prediction.
Adj_Close = apple.loc[log_recover.index, 'Adj Close']
# Root-mean-square error between recovered and actual prices.
squared_error = (log_recover - Adj_Close) ** 2
rmse = np.sqrt(sum(squared_error) / Adj_Close.size)
# Overlay prediction and actual series; the RMSE is shown in the title.
plt.figure(facecolor='white')
for series, colour, tag in (
    (log_recover, 'blue', 'Predict'),
    (Adj_Close, 'red', 'Original'),
):
    series.plot(color=colour, label=tag)
plt.legend(loc='best')
plt.title('RMSE: %.4f' % rmse)
plt.show()