An ordinary LSTM processes a sequence only in the forward direction, while a bidirectional LSTM also reads it backwards, so it can use both the left and right context of each token. The results of the news classification competition show that the bidirectional LSTM does bring a measurable improvement.
Implementing a bidirectional LSTM in PyTorch is very simple: just set bidirectional=True in the parameters. Correspondingly, the h_t and c_t of the recurrent network must be defined with num_directions=2 (their first dimension becomes num_layers * 2), and the in_features parameter of the Linear layer becomes hidden_size*2. If anything is unclear, refer to the official PyTorch documentation; everything else stays the same.
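A minimal standalone sketch (all sizes here are made up for illustration) of what bidirectional=True changes in practice:
import torch
import torch.nn as nn

hidden_size, num_layers, num_directions = 16, 1, 2
rnn = nn.LSTM(input_size=8, hidden_size=hidden_size, num_layers=num_layers, bidirectional=True)
x = torch.randn(5, 1, 8)                                       # (seq_len, batch, input_size)
h0 = torch.zeros(num_layers * num_directions, 1, hidden_size)  # num_directions = 2
c0 = torch.zeros(num_layers * num_directions, 1, hidden_size)
out, (hn, cn) = rnn(x, (h0, c0))
print(out.shape)                     # torch.Size([5, 1, 32]): last dim is hidden_size*2
fc = nn.Linear(hidden_size * 2, 4)   # in_features must therefore be hidden_size*2
print(fc(out).shape)                 # torch.Size([5, 1, 4])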
Import packages
import torch
import torch.nn as nn
import torchtext
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import Vocab
from collections import Counter
from torchtext.datasets import AG_NEWS
import pandas as pd
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
Read the data
# Training data
dataset = pd.read_csv("data/train_set.csv", sep="\t")
# Training split (90% of the data)
train_dataset = dataset.sample(frac=0.9)
# Hold-out split (the remaining rows)
#test_dataset = dataset[~dataset.index.isin(train_dataset.index)]
Initialize the training data
tokenizer = get_tokenizer("basic_english")
counter = Counter()
for (label, line) in train_dataset.iloc:
    counter.update(tokenizer(line))
# Vocabulary
vocab = Vocab(counter, min_freq=1)
# Total number of words
vocab_num = len(vocab)
# Convert a sentence into a list of token IDs
text_pipeline = lambda x: [vocab[token] for token in tokenizer(x)]
# Convert a label into an integer ID
label_pipeline = lambda x: int(x)
# Convert the training sentences into ID tensors
def get_label_line_tensors(data):
    lines = []
    labels = []
    for (label, line) in data:
        lines.append(torch.tensor(text_pipeline(line)).to(device))
        labels.append(torch.tensor([label_pipeline(label)]).to(device))
    return labels, lines
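A quick sanity check of the two pipelines (the input string is made up; the exact IDs depend on the vocabulary built above):
print(text_pipeline("here is an example"))   # a list of vocab IDs, e.g. [475, 21, 30, 5297]
print(label_pipeline("3"))                   # 3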
Define the model
# Model definition
class RNN_LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, n_layers, vocab_size, dropout_p=0.1):
        super(RNN_LSTM, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.vocab_size = vocab_size
        self.dropout_p = dropout_p
        # Define the network layers
        self.embedding = nn.Embedding(vocab_size, input_size)
        self.rnn = nn.LSTM(input_size, hidden_size, n_layers, dropout=self.dropout_p, bidirectional=True)
        # The two directions are concatenated, so in_features is hidden_size*2
        self.out = nn.Linear(hidden_size * 2, output_size)
        # Log-probabilities over the class (last) dimension
        self.softmax = nn.LogSoftmax(dim=-1)

    # Forward pass
    def forward(self, input_words, hidden, cell):
        seq_len = input_words.size()[0]
        embedded = self.embedding(input_words).view(seq_len, 1, -1)
        out, (hn, cn) = self.rnn(embedded, (hidden, cell))
        out = self.softmax(self.out(out))
        return out, (hn, cn)

    # Initialize the hidden state; the first dimension is n_layers * num_directions
    def init_hidden(self):
        hidden = torch.zeros(self.n_layers * 2, 1, self.hidden_size)
        return hidden.to(device)

    # Initialize the cell state
    def init_cell_state(self):
        cell = torch.zeros(self.n_layers * 2, 1, self.hidden_size)
        return cell.to(device)
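A quick smoke test of the class (small made-up sizes, not the training settings) to confirm the shapes:
m = RNN_LSTM(input_size=8, hidden_size=16, output_size=14, n_layers=2, vocab_size=vocab_num).to(device)
h, c = m.init_hidden(), m.init_cell_state()
dummy = torch.randint(0, vocab_num, (5,)).to(device)   # a made-up 5-token sentence
o, (hn, cn) = m(dummy, h, c)
print(o.shape)    # torch.Size([5, 1, 14])
print(hn.shape)   # torch.Size([4, 1, 16]): n_layers * 2 directions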
Define the training routine
# Train on a single sample
def train(line, label, loss):
    optimizer.zero_grad()
    hidden = model.init_hidden()
    cell = model.init_cell_state()
    o, (h, c) = model(line, hidden, cell)
    #oh = torch.sum(o, dim=0)
    # Use the output at the last time step as the sentence representation
    oh = o[-1:].squeeze(0)
    l = loss(oh, label)
    l.backward()
    optimizer.step()
    # Return a plain float; appending the loss tensor itself to a list would
    # keep every computation graph alive and leak memory over many iterations
    return l.item()
Run the training
labels, lines = get_label_line_tensors(train_dataset.iloc)
import random
import time
lossNum = []
begin = time.time()
print(begin)
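The post never shows the model being constructed or the criterion being defined, even though both are used below. A minimal gap-filling sketch: output_size=14 and vocab_size=vocab_num follow from the rest of the code, but input_size, hidden_size, and n_layers are assumed values, not the author's:
# Hypothetical hyperparameters; the post does not state the values actually used
model = RNN_LSTM(input_size=128, hidden_size=60, output_size=14, n_layers=2, vocab_size=vocab_num).to(device)
# NLLLoss pairs with the model's LogSoftmax output
loss = nn.NLLLoss()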
optimizer = torch.optim.Adam(model.parameters(), lr=5e-4)
for i in range(300001):
    # Sample one training example uniformly at random
    idx = random.randint(0, len(lines) - 1)
    l = train(lines[idx], labels[idx], loss)
    lossNum.append(l)
    if i % 10000 == 0 or i == 159999:
        print(l)
        print(time.time() - begin)
Save the model
torch.save(model, "model60.pkl")
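Note that torch.save(model, ...) pickles the whole module, which ties the checkpoint to this exact class definition. Saving the state_dict is the more portable pattern (the filename here is arbitrary):
torch.save(model.state_dict(), "model60_state.pkl")
# Reload later with: model.load_state_dict(torch.load("model60_state.pkl"))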
Define a simple prediction helper
# Prediction: to get a result quickly, use a crude linear scan to find the
# index of the highest-scoring class
def get_l(g):
    id = -1
    g1 = torch.max(g)
    for i in range(14):   # the competition has 14 classes
        if g1 == g[i]:
            id = i
            break
    return id
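The loop above is just an argmax over the class scores; torch.argmax(g).item() returns the same index in a single call.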
Run prediction on the test data and save the results
# Test data for the submission
submit_dataset = pd.read_csv("data/test_a.csv", sep="\t")
t_lines_ = []
for line in submit_dataset.iloc:
    t_lines_.append(torch.tensor(text_pipeline(line["text"])).to(device))
# Predict
writes = []
with torch.no_grad():   # no gradients are needed at inference time
    for w in t_lines_:
        check_h = model.init_hidden()
        check_c = model.init_cell_state()
        check_o, (_, _) = model(w, check_h, check_c)
        writes.append(get_l(check_o[-1, -1]))
# Write the results to csv
tt = pd.DataFrame({"label": writes})
tt.to_csv("submit2.csv", sep="\t", index=False)