机器学习之阿里云天池大赛—新闻分类(二)双向长短记忆网络

普通的长短时记忆神经网络只能对正向数据进行处理,而双向长短时记忆神经网络可以处理内容的上下文,通过新闻分类大赛结果可以看到双向长短时记忆神经网络具有一定的提升。
在pytorch中实现双向长短时记忆神经网络和简单,只需在参数中设置bidirectional=True即可,同时对循环神经网络中的ht和ct的定义num_directions=2,Linear 层in_features参数也变成hidden_size*2,如果不清楚可以参考pytorch官方文档,其他内容保持不变。

导入包

import torch
import torch.nn as nn
import torchtext
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import Vocab
from collections import Counter
from torchtext.datasets import AG_NEWS
import pandas as pd
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

读取数据

# 训练数据
dataset = pd.read_csv("data/train_set.csv", sep="\t")
# 训练集
train_dataset = dataset.sample(frac=0.9)


# 测试集
#test_dataset = dataset[~dataset.index.isin(train_dataset.index)]

训练数据初始化

tokenizer = get_tokenizer("basic_english")
counter = Counter()
for (lebel, line) in train_dataset.iloc:
    counter.update(tokenizer(line))

# 词表
vocab = Vocab(counter, min_freq=1)

# 单词总数
vocab_num = len(vocab)
# 将英文句子转成ID列表
text_pipeline = lambda x: [vocab[token] for token in tokenizer(x)]

# 将标签转成ID
label_pipeline = lambda x: int(x)
# 将训练集句子转成ID向量
def get_label_line_tensors(data):
    lines = []
    labels = []
    for (label, line) in data:
        lines.append(torch.tensor(text_pipeline(line)).to(device))
        labels.append(torch.tensor([label_pipeline(label)]).to(device))
    return labels, lines

定义模型

# 定义模型
class RNN_LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size,  n_layers, vocab_size, dropout_p=0.1):
        super(RNN_LSTM, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.vocab_size = vocab_size
        self.dropout_p = dropout_p
        
        # 定义神经网络层
        self.embeding = nn.Embedding(vocab_size, input_size)
        self.rnn = nn.LSTM(input_size, hidden_size, n_layers, dropout=self.dropout_p, bidirectional=True)
        self.out = nn.Linear(hidden_size*2, output_size)
        self.softmax = nn.LogSoftmax(dim=0)
        
    # 前馈
    def forward(self, input_words, hidden, cell):
        seq_len = input_words.size()[0]
        embeding = self.embeding(input_words).view(seq_len, 1, -1)
        out, (hn, cn) = self.rnn(embeding, (hidden, cell))
        out = self.softmax(self.out(out))
        return out, (hn, cn)
        
    # 初始化隐含层
    def init_hidden(self):
        hidden = torch.zeros(self.n_layers*2, 1, self.hidden_size)
        return hidden.to(device)
    
    # 初始化 cell state
    def init_cell_state(self):
        cell = torch.zeros(self.n_layers*2, 1, self.hidden_size)
        return cell.to(device)

定义训练方法

# 数据训练
def train(line, label, loss):
    optimizer.zero_grad()
    hidden = model.init_hidden()
    cell = model.init_cell_state()
    o, (h, c)  = model(line, hidden, cell)
    #oh = torch.sum(o, dim=0)
    oh = o[-1:].squeeze(0)
    l = loss(oh, label)
    
    l.backward()
    optimizer.step()
    return l

执行训练

labels, lines = get_label_line_tensors(train_dataset.iloc)
import random
import time
lossNum = []
begin = time.time()
print(begin)
optimizer = torch.optim.Adam(model.parameters(), lr=5e-4)
for i in range(300001):
    idx = random.randint(0, 179999)
    l = train(lines[idx], labels[idx], loss)
    lossNum.append(l)
    if i % 10000==0 or i==(159999):
        print(l)
print(time.time()-begin)

模型保存

torch.save(model, "model60.pkl")

定义一个简单的预测函数

#结果预测 为了快速度得到结果,使用简单粗暴的方法获取预测结果
def get_l(g):
    id = -1
    g1 = torch.max(g)
    for i in range(14):
        if g1==g[i]:
            id = i
            break
    return id

执行测试数据预测并保存

# 训练数据
submit_dataset = pd.read_csv("data/test_a.csv", sep="\t")
t_lines_ = []
for line in submit_dataset.iloc:
    t_lines_.append(torch.tensor(text_pipeline(line["text"])).to(device))

#预测结果
writes = []
for w in t_lines_:
    check_h = model.init_hidden()
    check_c = model.init_cell_state()
    check_o, (_, _) = model(w, check_h, check_c)
    writes.append(get_l(check_o[-1, -1]))
#写入数据到csv
tt = pd.DataFrame({"label": []})
tt["label"] = writes[0]
tt.to_csv("submit2.csv", sep="\t", index=False)

机器学习之阿里云天池大赛—新闻分类(二)双向长短记忆网络

上一篇:pmon长期持有cache buffers chains导致实例hang住一例


下一篇:《UNIX技术内幕》勘误及问题解答_26