来自股票价格预测bilibili课程。
源自jupyter notebook文件main.ipynb。
代码基于TensorFlow 1.x编写(使用了tf.placeholder、tf.Session、tf.contrib等接口),在TensorFlow 2.x下直接运行会因这些接口被移除或迁移而报错;TF2版本的代码后续再研究。
股票价格预测
1、数据初步处理
- 导入库
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline #可以matplotlib的图表直接嵌入到Notebook之中
from sklearn.preprocessing import MinMaxScaler
import time
- 导入数据集
# Load the stock table from the CSV file located in the working directory.
data = pd.read_csv(filepath_or_buffer='data_stocks.csv')
- 一些数据处理的指令
# Per-column summary statistics (count, mean, std, min, quartiles, max).
# Printed explicitly: a bare expression is echoed by the notebook only when it
# is the last statement of a cell, and never when run as a script.
print(data.describe())
# Column dtypes, non-null counts and memory usage; info() prints its own report.
data.info()
- 数据集分割
# Remove the DATE column (axis=1 addresses columns, not rows).
data.drop(labels='DATE', axis=1, inplace=True)
# Split by row position: the first 80% of the rows train the models,
# the remaining 20% are held out for testing.
split_at = int(len(data) * 0.8)
data_train = data.iloc[:split_at]
data_test = data.iloc[split_at:]
- 归一化处理数据
# Rescale every column to the range [-1, 1]. The per-column min/max are learned
# from the training rows only and then applied unchanged to the test rows, so
# test values may fall slightly outside [-1, 1].
scaler = MinMaxScaler(feature_range=(-1, 1)).fit(data_train)
data_train, data_test = scaler.transform(data_train), scaler.transform(data_test)
2、同步预测-回归
属于回归问题:输入为500维特征(个股股价),输出为1维(大盘指数)。同步预测指用同一时刻的个股股价预测该时刻的大盘指数,主要使用多层感知机(MLP),损失函数用均方误差(MSE)。
- 划分特征和标签
# Column 0 is the regression target; every remaining column is an input feature.
X_train, y_train = data_train[:, 1:], data_train[:, 0]
X_test, y_test = data_test[:, 1:], data_test[:, 0]
- 结构定义
# Network input width: one input per feature column.
input_dim = X_train.shape[-1]
# Number of units in each of the four hidden layers (layer widths, not a layer count).
hidden_1, hidden_2, hidden_3, hidden_4 = 1024, 512, 256, 128
# A single regression output.
output_dim = 1
# Mini-batch size and number of passes over the training set.
batch_size, epochs = 256, 10
重置默认计算图,避免重复运行下面的单元格时因变量重名(例如W1已存在)而报错
# Start from an empty default graph so that re-running the cells below does not
# collide with variables (named 'W1', 'b1', ...) created by a previous run.
tf.reset_default_graph()
- 模型构建
# Graph inputs: a batch of feature rows and the matching 1-D vector of targets.
X = tf.placeholder(dtype=tf.float32, shape=[None, input_dim])
Y = tf.placeholder(dtype=tf.float32, shape=[None])


def _dense_params(index, n_in, n_out):
    """Create the weight matrix and bias vector of fully connected layer *index*.

    Weights use Xavier initialisation with a fixed seed; biases start at zero.
    The variables are named 'W<index>' and 'b<index>'.
    """
    weights = tf.get_variable(
        'W%d' % index, [n_in, n_out],
        initializer=tf.contrib.layers.xavier_initializer(seed=1))
    bias = tf.get_variable(
        'b%d' % index, [n_out], initializer=tf.zeros_initializer())
    return weights, bias


# Trainable parameters of the four hidden layers and the output layer.
W1, b1 = _dense_params(1, input_dim, hidden_1)
W2, b2 = _dense_params(2, hidden_1, hidden_2)
W3, b3 = _dense_params(3, hidden_2, hidden_3)
W4, b4 = _dense_params(4, hidden_3, hidden_4)
W5, b5 = _dense_params(5, hidden_4, output_dim)

# Each hidden layer is relu(input @ W + b).
h1 = tf.nn.relu(tf.matmul(X, W1) + b1)
h2 = tf.nn.relu(tf.matmul(h1, W2) + b2)
h3 = tf.nn.relu(tf.matmul(h2, W3) + b3)
h4 = tf.nn.relu(tf.matmul(h3, W4) + b4)
# The output layer has no activation. The (batch, 1) result is transposed to
# (1, batch) so it lines up with the 1-D target vector Y.
out = tf.transpose(tf.matmul(h4, W5) + b5)
- 损失函数计算
# Mean squared error between the prediction and the true target.
cost = tf.reduce_mean(tf.square(out - Y))
# Adam with its default hyperparameters; running `optimizer` performs one
# update step that reduces `cost`.
adam = tf.train.AdamOptimizer()
optimizer = adam.minimize(cost)
- 模型运行
# Train the MLP with mini-batch gradient descent and periodically report
# progress. The block structure (with / for / for / if) is restored here; the
# notebook export had flattened all indentation.
with tf.Session() as sess:
    # Give every variable its initial value before the first training step.
    sess.run(tf.global_variables_initializer())
    for e in range(epochs):
        # Reshuffle the training rows at the start of every epoch; features and
        # targets are permuted with the same index so they stay aligned.
        shuffle_indices = np.random.permutation(np.arange(y_train.shape[0]))
        X_train = X_train[shuffle_indices]
        y_train = y_train[shuffle_indices]
        # Only full batches are used; a trailing partial batch is skipped.
        for i in range(y_train.shape[0] // batch_size):
            start = i * batch_size
            batch_x = X_train[start : start + batch_size]
            batch_y = y_train[start : start + batch_size]
            # One optimisation step on the current mini-batch.
            sess.run(optimizer, feed_dict={X: batch_x, Y: batch_y})
            # Every 50 batches: report the loss on the full training and test
            # sets and plot the test predictions against the true values.
            if i % 50 == 0:
                print('MSE Train:', sess.run(cost, feed_dict={X: X_train, Y: y_train}))
                print('MSE Test:', sess.run(cost, feed_dict={X: X_test, Y: y_test}))
                y_pred = sess.run(out, feed_dict={X: X_test})
                # `out` has shape (1, n_test); drop the leading axis for plotting.
                y_pred = np.squeeze(y_pred)
                plt.plot(y_test, label='test')
                plt.plot(y_pred, label='pred')
                plt.title('Epoch ' + str(e) + ', Batch ' + str(i))
                plt.legend()
                plt.show()
3、利用keras实现同步预测
- 引入包;初始化定义参数
from keras.layers import Dense, Input
from keras.models import Model

# Rebuild features and targets from the scaled arrays: column 0 is the target,
# the remaining columns are the inputs.
X_train, y_train = data_train[:, 1:], data_train[:, 0]
X_test, y_test = data_test[:, 1:], data_test[:, 0]

# Network input width: one input per feature column.
input_dim = X_train.shape[-1]
# Number of units in each of the four hidden layers.
hidden_1, hidden_2, hidden_3, hidden_4 = 1024, 512, 256, 128
# A single regression output.
output_dim = 1
# Mini-batch size and number of passes over the training set.
batch_size, epochs = 256, 10
- 模型定义
# MLP with four ReLU hidden layers, built with the Keras functional API.
# `shape` describes a single sample; the batch dimension is implicit.
X = Input(shape=(input_dim,))
h = X
for units in (hidden_1, hidden_2, hidden_3, hidden_4):
    h = Dense(units, activation='relu')(h)
# Linear output layer. The targets are min-max scaled to [-1, 1], whereas a
# sigmoid can only produce values in (0, 1) and therefore could never fit the
# negative half of the target range.
Y = Dense(output_dim, activation='linear')(h)
- 模型运行
# Assemble the model and train it with MSE loss and the Adam optimiser.
model = Model(X, Y)
model.compile(optimizer='adam', loss='mean_squared_error')
# shuffle=False feeds the training rows in their stored order.
model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, shuffle=False)

# Predictions on the held-out rows, used for the plot below.
y_pred = model.predict(X_test)

# Final loss on both splits.
train_mse = model.evaluate(X_train, y_train, batch_size=batch_size)
print('MSE Train:', train_mse)
test_mse = model.evaluate(X_test, y_test, batch_size=batch_size)
print('MSE Test:', test_mse)

# Overlay the true test targets and the predictions.
for series, name in ((y_test, 'test'), (y_pred, 'pred')):
    plt.plot(series, label=name)
plt.legend()
plt.show()
4、异步预测-LSTM
异步预测是指使用历史若干时刻大盘指数,预测当前时刻的大盘指数,即[None,5,1]=>[None,1],涉及时序关系,使用Keras实现异步预测,主要使用RNN中的LSTM(Long Short-Term Memory)
- 引入包;初始化定义参数
from keras.layers import Dense, Input, LSTM
from keras.models import Model

# A single regression output.
output_dim = 1
# Mini-batch size and number of passes over the training set.
batch_size, epochs = 256, 10
# Number of consecutive past time steps used to predict the next one.
seq_len = 5
# Number of units in the LSTM layer.
hidden_size = 128
- 参数设置
# Build sliding windows over column 0 only (the same column that serves as the
# regression target): each sample holds `seq_len` consecutive values and its
# label is the value that immediately follows the window. A trailing axis of
# size 1 is appended because the LSTM expects (samples, time steps, features).
X_train = np.array([data_train[i : i + seq_len, 0] for i in range(data_train.shape[0] - seq_len)])[:, :, np.newaxis]
y_train = np.array([data_train[i + seq_len, 0] for i in range(data_train.shape[0] - seq_len)])
X_test = np.array([data_test[i : i + seq_len, 0] for i in range(data_test.shape[0] - seq_len)])[:, :, np.newaxis]
y_test = np.array([data_test[i + seq_len, 0] for i in range(data_test.shape[0] - seq_len)])
# Debug aid: print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

# Per-sample input shape is (time steps, features); the batch axis is implicit.
X = Input(shape=[X_train.shape[1], X_train.shape[2]])
h = LSTM(hidden_size, activation='relu')(X)
# Linear output layer. The targets are min-max scaled to [-1, 1], whereas a
# sigmoid can only produce values in (0, 1) and therefore could never fit the
# negative half of the target range.
Y = Dense(output_dim, activation='linear')(h)
- LSTM模型训练
# Assemble the LSTM model and train it with MSE loss and the Adam optimiser.
model = Model(X, Y)
model.compile(optimizer='adam', loss='mean_squared_error')
# shuffle=False feeds the windows in their stored order.
model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, shuffle=False)

# Predictions on the held-out windows, used for the plot below.
y_pred = model.predict(X_test)

# Final loss on both splits.
train_mse = model.evaluate(X_train, y_train, batch_size=batch_size)
print('MSE Train:', train_mse)
test_mse = model.evaluate(X_test, y_test, batch_size=batch_size)
print('MSE Test:', test_mse)

# Overlay the true test targets and the predictions.
for series, name in ((y_test, 'test'), (y_pred, 'pred')):
    plt.plot(series, label=name)
plt.legend()
plt.show()
5、异步预测-多特征量
使用历史若干时刻500支个股股价及大盘指数,预测当前大盘指数,即[None,5,501]=>[None,1]
- 引入包;初始化定义参数
from keras.layers import Dense, Input, LSTM
from keras.models import Model

# Single regression output, mini-batch size, and number of training epochs.
output_dim, batch_size, epochs = 1, 256, 10
# Window length in time steps, and number of units in the LSTM layer.
seq_len, hidden_size = 5, 128
- 参数设置
区别在于输入取了所有列的数据(大盘指数加500支个股股价,共501列),而非只取大盘指数那一列。
# Build sliding windows over ALL columns: each sample holds `seq_len`
# consecutive rows, and its label is column 0 of the row that immediately
# follows the window.
X_train = np.array([data_train[i : i + seq_len, :] for i in range(data_train.shape[0] - seq_len)])
y_train = np.array([data_train[i + seq_len, 0] for i in range(data_train.shape[0] - seq_len)])
X_test = np.array([data_test[i : i + seq_len, :] for i in range(data_test.shape[0] - seq_len)])
y_test = np.array([data_test[i + seq_len, 0] for i in range(data_test.shape[0] - seq_len)])
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
# Shapes reported by the original notebook run:
# (33007, 5, 501) (33007,) (8249, 5, 501) (8249,)

# Per-sample input shape is (time steps, features), i.e. (5, 501) for the run
# above. The batch axis is implicit and is not part of `shape`.
X = Input(shape=[X_train.shape[1], X_train.shape[2]])
h = LSTM(hidden_size, activation='relu')(X)
# Linear output layer. The targets are min-max scaled to [-1, 1], whereas a
# sigmoid can only produce values in (0, 1) and therefore could never fit the
# negative half of the target range.
Y = Dense(output_dim, activation='linear')(h)
- LSTM模型训练
# Assemble the multi-feature LSTM model and train it with MSE loss and Adam.
model = Model(X, Y)
model.compile(optimizer='adam', loss='mean_squared_error')
# shuffle=False feeds the windows in their stored order.
model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, shuffle=False)

# Predictions on the held-out windows, used for the plot below.
y_pred = model.predict(X_test)

# Final loss on both splits.
train_mse = model.evaluate(X_train, y_train, batch_size=batch_size)
print('MSE Train:', train_mse)
test_mse = model.evaluate(X_test, y_test, batch_size=batch_size)
print('MSE Test:', test_mse)

# Overlay the true test targets and the predictions.
for series, name in ((y_test, 'test'), (y_pred, 'pred')):
    plt.plot(series, label=name)
plt.legend()
plt.show()
最终结果并不理想:loss反而变大了,说明特征并非越多越好,过多的特征可能会引入噪音。