Bilibili, 刘二大人's course (portal): Recurrent Neural Networks (Basics)
Courseware link: https://pan.baidu.com/s/1vZ27gKp8Pl-qICn_p2PaSw
Extraction code: cxe4
As before, the model is trained to map the input "hello" to the output "ohlol", this time implemented with a GRU network. After building the GRU from its computation graph, I again tried adding an embedding layer as in the previous post; the result improved slightly.
GRU network structure diagram and update formulas (the slide screenshots are not reproduced here; the two formula versions on the slides are the same equations written in different notation).
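Since the screenshots are missing, here are the standard GRU update equations, written to match the six linear layers in the code below (each gate sums one x-branch linear and one h-branch linear):

$$
\begin{aligned}
r_t &= \sigma(W_{ir} x_t + b_{ir} + W_{hr} h_{t-1} + b_{hr}) \\
z_t &= \sigma(W_{iz} x_t + b_{iz} + W_{hz} h_{t-1} + b_{hz}) \\
n_t &= \tanh\big(W_{in} x_t + b_{in} + r_t \odot (W_{hn} h_{t-1} + b_{hn})\big) \\
h_t &= (1 - z_t) \odot n_t + z_t \odot h_{t-1}
\end{aligned}
$$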
Looking at the formulas, there are six linear transformations, each followed by a nonlinear operation, so the model defines six Linear layers. The input is a 4-dimensional one-hot vector (one dimension per character: e, h, l, o), and since we don't change the dimensionality, the hidden state is 4-dimensional too. There are only 5 input characters, so the batch size is set to 1. The forward pass follows the formulas and the computation graph and returns hidden so the next step can keep using it. This is much the same as the LSTM in the previous post; only the cell structure and the network output differ slightly.
import torch

input_size = 4   # one-hot size: one dimension per character (e, h, l, o)
batch_size = 1

class GRU(torch.nn.Module):
    def __init__(self):
        super(GRU, self).__init__()
        # Six linear layers: an x-branch and an h-branch for each of the
        # reset gate (r), update gate (z) and candidate hidden state (n)
        self.linearrx = torch.nn.Linear(4, 4)
        self.linearzx = torch.nn.Linear(4, 4)
        self.linearnx = torch.nn.Linear(4, 4)
        self.linearrh = torch.nn.Linear(4, 4)
        self.linearzh = torch.nn.Linear(4, 4)
        self.linearnh = torch.nn.Linear(4, 4)
        self.sigmoid = torch.nn.Sigmoid()
        self.tanh = torch.nn.Tanh()

    def forward(self, x, hidden):
        r = self.sigmoid(self.linearrx(x) + self.linearrh(hidden))   # reset gate
        z = self.sigmoid(self.linearzx(x) + self.linearzh(hidden))   # update gate
        n = self.tanh(self.linearnx(x) + r * self.linearnh(hidden))  # candidate state
        hidden = (1 - z) * n + z * hidden                            # new hidden state
        return hidden

net = GRU()
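A quick shape check (my addition, not in the original): one step of the cell maps a (batch, 4) input and a (batch, 4) hidden state to a new (batch, 4) hidden state.

x0 = torch.zeros(1, 4)    # dummy input of one-hot size, batch_size = 1
h0 = torch.zeros(1, 4)    # initial hidden state
print(net(x0, h0).shape)  # torch.Size([1, 4])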
def train():
    idx2char = ['e', 'h', 'l', 'o']  # for printing the predicted characters
    x_data = [1, 0, 2, 2, 3]         # input sequence 'hello' as indices
    y_data = [3, 1, 2, 3, 2]         # target sequence 'ohlol' as indices
    one_hot_lookup = [[1, 0, 0, 0],  # one-hot lookup table for the conversion
                      [0, 1, 0, 0],
                      [0, 0, 1, 0],
                      [0, 0, 0, 1]]
    x_one_hot = [one_hot_lookup[x] for x in x_data]  # pick rows in the order 1 0 2 2 3
    '''The result is x_one_hot = [ [0, 1, 0, 0],
                                   [1, 0, 0, 0],
                                   [0, 0, 1, 0],
                                   [0, 0, 1, 0],
                                   [0, 0, 0, 1] ]
    which matches the input indices, i.e. the characters of 'hello'.
    '''
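    # Equivalent alternative (my addition, not in the original): the same table
    # can be built with torch.nn.functional.one_hot, e.g.
    # x_one_hot = torch.nn.functional.one_hot(torch.tensor(x_data), num_classes=4).tolist()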
    inputs = torch.Tensor(x_one_hot).view(-1, batch_size, input_size)  # shape (5, 1, 4)
    labels = torch.LongTensor(y_data).view(-1, 1)  # extra dim so each step's label has shape (1,)
    # --- loss and optimizer ---
    criterion = torch.nn.CrossEntropyLoss()  # cross-entropy loss
    optimizer = torch.optim.Adam(net.parameters(), lr=0.01)
    # --- training ---
    for epoch in range(100):
        loss = 0
        optimizer.zero_grad()
        hidden = torch.zeros(batch_size, input_size)  # initial hidden state h0
        print('Predicted string: ', end='')
        for input, label in zip(inputs, labels):  # iterate over the sequence one step at a time
            hidden = net(input, hidden)
            # hidden doubles as the logits here: hidden.shape == (1, 4), label.shape == (1,)
            loss += criterion(hidden, label)
            _, idx = hidden.max(dim=1)  # index of the most probable class at this step
            print(idx2char[idx.item()], end='')  # print the character for that index
        loss.backward()
        optimizer.step()
        print(', Epoch [%d/100] loss=%.4f' % (epoch + 1, loss.item()))
Below is the training output (a screenshot in the original post, not reproduced here).
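As a quick sanity check (my own addition, not in the lecture code), the trained cell can be re-run over the sequence to greedily decode the final prediction; this sketch assumes the inputs tensor and idx2char list built inside train() are passed in:

def predict(cell, inputs, idx2char):
    # Run the trained cell over the sequence once, taking the argmax at each step
    hidden = torch.zeros(batch_size, input_size)
    chars = []
    with torch.no_grad():
        for x in inputs:                # x has shape (1, input_size)
            hidden = cell(x, hidden)
            _, idx = hidden.max(dim=1)  # greedy choice: highest logit
            chars.append(idx2char[idx.item()])
    return ''.join(chars)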
Below is the code with an embedding added. The embedding dimension is set to 10, so the x-branch linear layers must change to input 10, output 4; the hidden-state linear layers stay the same.
class emb_GRU(torch.nn.Module):
    def __init__(self):
        super(emb_GRU, self).__init__()
        # The x-branch linears now map 10 -> 4 because the inputs are
        # 10-dimensional embeddings; the h-branch linears are unchanged.
        self.linearrx = torch.nn.Linear(10, 4)
        self.linearzx = torch.nn.Linear(10, 4)
        self.linearnx = torch.nn.Linear(10, 4)
        self.linearrh = torch.nn.Linear(4, 4)
        self.linearzh = torch.nn.Linear(4, 4)
        self.linearnh = torch.nn.Linear(4, 4)
        self.sigmoid = torch.nn.Sigmoid()
        self.tanh = torch.nn.Tanh()

    def forward(self, x, hidden):
        r = self.sigmoid(self.linearrx(x) + self.linearrh(hidden))   # reset gate
        z = self.sigmoid(self.linearzx(x) + self.linearzh(hidden))   # update gate
        n = self.tanh(self.linearnx(x) + r * self.linearnh(hidden))  # candidate state
        hidden = (1 - z) * n + z * hidden
        return hidden

model = emb_GRU()
def emb_train():
    idx2char = ['e', 'h', 'l', 'o']  # for printing the predicted characters
    x_data = torch.LongTensor([[1, 0, 2, 2, 3]]).view(5, 1)  # 'hello' as an index tensor, shape (5, 1)
    y_data = [3, 1, 2, 3, 2]  # target sequence 'ohlol' as indices
    labels = torch.LongTensor(y_data).view(-1, 1)  # extra dim so each step's label has shape (1,)
    emb = torch.nn.Embedding(4, 10)  # 4 characters, embedding dimension 10
    inputs = emb(x_data)
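    # inputs.shape == (5, 1, 10): seq_len x batch x embedding_dim, so each step
    # below feeds a (1, 10) vector into the 10->4 x-branch linears.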
    # --- loss and optimizer ---
    criterion = torch.nn.CrossEntropyLoss()  # cross-entropy loss
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    # --- training ---
    for epoch in range(100):
        loss = 0
        optimizer.zero_grad()
        hidden = torch.zeros(batch_size, input_size)  # initial hidden state h0 (hidden size is still 4)
        print('Predicted string: ', end='')
        for input, label in zip(inputs, labels):  # iterate over the sequence one step at a time
            hidden = model(input, hidden)
            loss += criterion(hidden, label)
            _, idx = hidden.max(dim=1)  # index of the most probable class at this step
            print(idx2char[idx.item()], end='')  # print the character for that index
        # This line raised a runtime error at first; passing retain_graph=True,
        # as the error message below suggests, fixes it.
        loss.backward(retain_graph=True)
        '''Trying to backward through the graph a second time,
        but the saved intermediate results have already been freed.
        Specify retain_graph=True when calling backward the first time.'''
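        # Why this happens (my note, not from the lecture): emb(x_data) is computed
        # once, outside the epoch loop, so every epoch's backward() traverses that
        # same embedding subgraph, whose intermediate buffers are freed after the
        # first backward unless retain_graph=True. A cleaner fix is to recompute
        # inputs = emb(x_data) inside the loop, or to make the Embedding a layer of
        # emb_GRU (sketched at the end). Note also that emb's weights are not passed
        # to the optimizer, so this embedding stays a fixed random projection.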
        optimizer.step()
        print(', Epoch [%d/100] loss=%.4f' % (epoch + 1, loss.item()))
train()
print('--- divider between the plain GRU run and the embedding run ---')
emb_train()
Output of both runs (a screenshot in the original post, not reproduced here).
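For reference, here is a variant I sketched myself (not from the lecture) that registers the Embedding inside the module. Because model.parameters() then includes the embedding weights and the lookup is recomputed on every forward call, the optimizer actually trains the embedding and plain loss.backward() works without retain_graph=True:

class EmbGRU2(torch.nn.Module):
    def __init__(self, num_classes=4, emb_dim=10, hidden_size=4):
        super(EmbGRU2, self).__init__()
        self.emb = torch.nn.Embedding(num_classes, emb_dim)  # trained along with the rest
        self.linearrx = torch.nn.Linear(emb_dim, hidden_size)
        self.linearzx = torch.nn.Linear(emb_dim, hidden_size)
        self.linearnx = torch.nn.Linear(emb_dim, hidden_size)
        self.linearrh = torch.nn.Linear(hidden_size, hidden_size)
        self.linearzh = torch.nn.Linear(hidden_size, hidden_size)
        self.linearnh = torch.nn.Linear(hidden_size, hidden_size)

    def forward(self, x_idx, hidden):
        x = self.emb(x_idx)  # (1,) index -> (1, emb_dim), rebuilt on every call
        r = torch.sigmoid(self.linearrx(x) + self.linearrh(hidden))
        z = torch.sigmoid(self.linearzx(x) + self.linearzh(hidden))
        n = torch.tanh(self.linearnx(x) + r * self.linearnh(hidden))
        return (1 - z) * n + z * hidden

In the training loop the step becomes hidden = model(x_idx, hidden), with x_idx taken row by row from torch.LongTensor([1, 0, 2, 2, 3]).view(-1, 1), and loss.backward() needs no extra arguments.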