[Code Walkthrough] Attention-Based Bidirectional Long Short-Term Memory Networks for Relation Classification

First, do an initial pass over the downloaded dataset: replace the '<e1>', '</e1>', '<e2>', '</e2>' tags in the raw data with 'ENT_1_START', 'ENT_1_END', 'ENT_2_START', 'ENT_2_END', and write the processed result out as JSON files:

 1 def load_data(path):
 2     ENT_1_START = '<e1>'
 3     ENT_1_END = '</e1>'
 4     ENT_2_START = '<e2>'
 5     ENT_2_END = '</e2>'
 6 
 7     nlp = NLP()
 8     data = []
 9     with open(path) as f:
10         lines = [line.strip() for line in f]
11     for idx in range(0, len(lines), 4):
12         id = int(lines[idx].split("\t")[0])
13         relation = lines[idx + 1]
14 
15         sentence = lines[idx].split("\t")[1][1:-1]
16         sentence = sentence.strip()
17 
18         sentence = sentence.replace(ENT_1_START, ' ENT_1_START ')
19         sentence = sentence.replace(ENT_1_END, ' ENT_1_END ')
20         sentence = sentence.replace(ENT_2_START, ' ENT_2_START ')
21         sentence = sentence.replace(ENT_2_END, ' ENT_2_END ')
22 
23         sentence = nlp.word_tokenize(sentence)
24 
25         ent1 = sentence.split(' ENT_1_START ')[-1].split(' ENT_1_END ')[0]
26         ent2 = sentence.split(' ENT_2_START ')[-1].split(' ENT_2_END ')[0]
27 
28 
29         data.append({
30             'label': relation,
31             'sentence': sentence,
32             'ent1': ent1,
33             'ent2': ent2,
34             'id': id,
35         })
36 
37     return data
38 
39 
40 def split(data, dev_size=800):
41     random.shuffle(data)
42     dev = data[:dev_size]
43     train = data[dev_size:]
44     return train, dev
45 
46 
47 def save_to_json(data, file):
48     writeout = json.dumps(data, indent=4)
49     fwrite(writeout, file)
50     print('[Info] Saving {} data to {}'.format(len(data), file))
51 
52 
53 # def download():
54 #     cmd = 'mkdir data/re_semeval/raw 2>/dev/null \n' \
55 #           'wget https://raw.githubusercontent.com/SeoSangwoo/Attention-Based-BiLSTM-relation-extraction/master/SemEval2010_task8_all_data/SemEval2010_task8_testing_keys/TEST_FILE_FULL.TXT -P data/re_semeval/raw \n' \
56 #           'wget https://raw.githubusercontent.com/SeoSangwoo/Attention-Based-BiLSTM-relation-extraction/master/SemEval2010_task8_all_data/SemEval2010_task8_training/TRAIN_FILE.TXT -P data/re_semeval/raw \n'
57 #     shell(cmd)
58 
59 
60 def main():
61     #download()
62 
63     data = {}
64     data_dir = r'D:\python\wgy_jupyter\pytorch_RelationExtraction_AttentionBiLSTM-master/data/re_semeval'  # raw string avoids invalid escape warnings on Windows paths
65 
66     raw_fname = os.path.join(data_dir, 'raw', 'TRAIN_FILE.TXT')
67     nontest_data = load_data(raw_fname)
68     data['train'], data['valid'] = split(nontest_data)
69 
70     raw_fname = os.path.join(data_dir, 'raw', 'TEST_FILE_FULL.TXT')
71     data['test'] = load_data(raw_fname)
72 
73     for key, value in data.items():
74         json_fname = os.path.join(data_dir, '{}.json'.format(key))
75         save_to_json(value, json_fname)
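As a quick sanity check of what load_data does to one raw line, the same string operations can be run stand-alone (plain whitespace splitting stands in for the project's NLP().word_tokenize here, so the real output may split punctuation slightly differently):

raw = 'The clock <e1>signal</e1> was generated from an external cavity semiconductor <e2>laser</e2>.'
s = raw.replace('<e1>', ' ENT_1_START ').replace('</e1>', ' ENT_1_END ')
s = s.replace('<e2>', ' ENT_2_START ').replace('</e2>', ' ENT_2_END ')
s = ' '.join(s.split())   # stand-in for nlp.word_tokenize(sentence)
ent1 = s.split(' ENT_1_START ')[-1].split(' ENT_1_END ')[0]
ent2 = s.split(' ENT_2_START ')[-1].split(' ENT_2_END ')[0]
print(ent1, '|', ent2)    # -> signal | laser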

The resulting JSON records look like this:

{
        "label": "Cause-Effect(e2,e1)",
        "sentence": "The clock ENT_1_START signal ENT_1_END was generated from an external cavity semiconductor ENT_2_START laser ENT_2_END .",
        "ent1": "signal",
        "ent2": "laser",
        "id": 6457
    },

Next, convert the resulting JSON files to CSV format:

 1 def json2csv(json_fname, csv_fname, args):
 2     with open(json_fname) as f:
 3         data = json.load(f)
 4     csv_data = []
 5     for line in data:
 6         sentence = line['sentence']
 7         sentence = ' '.join(sentence.split()[:args.sent_max_len])
 8         if args.lower: sentence = sentence.lower()
 9 
10         csv_line = {
11             'tgt': line['label'],
12             'input': sentence,
13             'show_inp': sentence,
14             'ent1': line['ent1'],
15             'ent2': line['ent2'],
16             'id': line['id'],
17         }
18         csv_data += [csv_line]
19     with open(csv_fname, 'w', newline='') as f:   # newline='' avoids blank rows on Windows
20         writer = csv.DictWriter(f, fieldnames=csv_line.keys())
21         writer.writeheader()
22         writer.writerows(csv_data)
23     print('[Info] Writing {} data to {}'.format(len(csv_data), csv_fname))
24 
25 
26 def get_args():
27     parser = configargparse.ArgumentParser(
28         description='Options for preprocessing')
29     parser.add_argument('-lower', action='store_true', default=False,
30                         help='whether to lowercase the text')
31     parser.add_argument('-sent_max_len', default=100, type=int,
32                         help='the maximum number of words allowed in a sentence')
33     parser.add_argument('-tokenize', action='store_false', default=True,
34                         help='whether to tokenize the sentences')
35     parser.add_argument('-data_dir', default='data/re_semeval/', type=str,
36                         help='path to load data from')
37     args = parser.parse_args()
38     return args
39 
40 
41 def main():
42     args = get_args()
43     data_dir = args.data_dir
44 
45     for typ in 'train valid test'.split():
46         json_fname = os.path.join(data_dir, '{}.json'.format(typ))
47         csv_fname = os.path.join(data_dir, '{}.csv'.format(typ))
48 
49         json2csv(json_fname, csv_fname, args)

The CSV format (columns: tgt, input, show_inp, ent1, ent2, id):

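The original post showed the CSV contents in a screenshot. The row below is reconstructed from the json2csv code and the JSON record above (default flags, so no lowercasing) and is only illustrative; csv.DictWriter quotes the tgt field because it contains a comma:

tgt,input,show_inp,ent1,ent2,id
"Cause-Effect(e2,e1)",The clock ENT_1_START signal ENT_1_END was generated from an external cavity semiconductor ENT_2_START laser ENT_2_END .,The clock ENT_1_START signal ENT_1_END was generated from an external cavity semiconductor ENT_2_START laser ENT_2_END .,signal,laser,6457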

The later data handling (building the dataset, dataloader, etc.) is written with torchtext, which I am not yet familiar with, so for now I will go through the training/testing procedure and the model details that match the paper (a rough sketch of the torchtext step follows right below):
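Before run(), here is a minimal sketch of what the torchtext-based Dataset / dataloader step presumably amounts to. This is not the repo's actual Dataset class; it only mirrors the attributes used later (INPUT.vocab.vectors, TGT.vocab.itos, batch.input, batch.tgt) and assumes the legacy torchtext.data API (torchtext <= 0.8, or torchtext.legacy.data in 0.9-0.11):

from torchtext import data   # or: from torchtext.legacy import data

INPUT = data.Field(batch_first=True, lower=True)      # word field; <pad> gets index 1 by default
TGT = data.Field(sequential=False, unk_token=None)    # one relation label per example

fields = {'input': ('input', INPUT), 'tgt': ('tgt', TGT)}
train, valid, test = data.TabularDataset.splits(
    path='data/re_semeval/', format='csv',
    train='train.csv', validation='valid.csv', test='test.csv',
    fields=fields)

INPUT.build_vocab(train, max_size=100000, vectors='glove.6B.100d')  # -> INPUT.vocab.vectors
TGT.build_vocab(train)                                              # -> TGT.vocab.itos (19 labels)

train_dl, valid_dl, test_dl = data.BucketIterator.splits(
    (train, valid, test), batch_size=10,   # batch size is a placeholder
    sort_key=lambda ex: len(ex.input), sort_within_batch=True)
# each batch then exposes batch.input [bs, seq_len] and batch.tgt [bs], as used in train() below

The run() function below then wires this dataset into model construction, training, and evaluation: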

 1 def run(proc_id, n_gpus, devices, args):
 2     '''
 3     :param proc_id:   0
 4     :param n_gpus:    1
 5     :param devices:   1
 6     :param args:
 7     :return:
 8     '''
 9     set_seed(args.seed)
10     dev_id = devices[proc_id]
11 
12     if n_gpus > 1:
13         dist_init_method = 'tcp://{master_ip}:{master_port}'.format(
14             master_ip='127.0.0.1', master_port=args.tcp_port)
15         world_size = n_gpus
16         torch.distributed.init_process_group(backend="nccl",
17                                              init_method=dist_init_method,
18                                              world_size=world_size,
19                                              rank=dev_id)
20     device = torch.device(dev_id)
21 
22     dataset = Dataset(proc_id=proc_id, data_dir=args.save_dir,    #args.save_dir = tmp/
23                       train_fname=args.train_fname,             #args.train_fname = train.csv
24                       preprocessed=args.preprocessed, lower=args.lower,    #whether input data is preprocessed by spacy  default = True
25                       vocab_max_size=args.vocab_max_size, emb_dim=args.emb_dim,  #100000   100
26                       save_vocab_fname=args.save_vocab_fname, verbose=True, )   #vocab.json
27     train_dl, valid_dl, test_dl = \
28         dataset.get_dataloader(proc_id=proc_id, n_gpus=n_gpus, device=device,
29                                batch_size=args.batch_size)
30 
31     validator = Validator(dataloader=valid_dl, save_dir=args.save_dir,   #args.save_dir = tmp/
32                           save_log_fname=args.save_log_fname,            #args.save_log_fname = run_log.txt
33                           save_model_fname=args.save_model_fname,        #args.save_model_fname = model
34                           valid_or_test='valid',
35                           vocab_itos=dataset.INPUT.vocab.itos,
36                           label_itos=dataset.TGT.vocab.itos)
37     tester = Validator(dataloader=test_dl, save_log_fname=args.save_log_fname,
38                        save_dir=args.save_dir, valid_or_test='test',  ##args.save_dir = tmp/
39                        vocab_itos=dataset.INPUT.vocab.itos,
40                        label_itos=dataset.TGT.vocab.itos)
41     predictor = Predictor(args.save_vocab_fname)            #save_vocab_fnmae = vocab.json
42 
43     if args.load_model:   # path to a pretrained model; if one is given, just load it for prediction/evaluation
44         predictor.use_pretrained_model(args.load_model, device=device)
45         import pdb;
46         pdb.set_trace()
47 
48         predictor.pred_sent(dataset.INPUT)
49         tester.final_evaluate(predictor.model)
50 
51         return
52 
53     model = LSTMClassifier(emb_vectors=dataset.INPUT.vocab.vectors,
54                            emb_dropout=args.emb_dropout,        #args.emb_dropout = 0.3
55                            lstm_dim=args.lstm_dim,              #args.lstm_dim = 100
56                            lstm_n_layer=args.lstm_n_layer,      #args.lstm_n_layer = 1
57                            lstm_dropout=args.lstm_dropout,      #args.lstm_dropout = 0.3
58                            lstm_combine=args.lstm_combine,      #args.lstm_combine = add
59                            linear_dropout=args.linear_dropout,  #args.linear_dropout = 0.5
60                            n_linear=args.n_linear,              #args.n_linear = 1
61                            n_classes=len(dataset.TGT.vocab))    # 19 for SemEval-2010 Task 8 (9 relations x 2 directions + Other)
62     if args.init_xavier: model.apply(init_weights)
63     model = model.to(device)
64     args = model_setup(proc_id, model, args)
65 
66     # train() also runs evaluation; after each epoch, the best model so far is saved under self.save_model_fname
67     train(proc_id, n_gpus, model=model, train_dl=train_dl,
68           validator=validator, tester=tester, epochs=args.epochs, lr=args.lr,
69           weight_decay=args.weight_decay)
70 
71     if proc_id == 0:
72         predictor.use_pretrained_model(args.save_model_fname, device=device)  # load the best model (lowest validation loss)
73         bookkeep(predictor, validator, tester, args, dataset.INPUT)       # evaluate the model and inspect predictions on example sentences
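init_weights (used with model.apply() at line 62 when -init_xavier is set) is a project helper that is not shown here; a plausible stand-in, not the repo's actual code, would Xavier-initialize the linear layers:

import torch.nn as nn

def init_weights(m):
    # hypothetical re-implementation of the helper referenced above
    if isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight)
        if m.bias is not None:
            nn.init.zeros_(m.bias)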

Details of the train() function called at line 67 above:

 1 def train(proc_id, n_gpus, model=None, train_dl=None, validator=None,
 2           tester=None, epochs=20, lr=0.001, log_every_n_examples=1,
 3           weight_decay=0):
 4     # opt = torch.optim.SGD(filter(lambda p: p.requires_grad, model.parameters()),
 5     #                       lr=lr, momentum=0.9)
 6     opt = torch.optim.Adadelta(
 7         filter(lambda p: p.requires_grad, model.parameters()), lr=1.0, rho=0.9,
 8         eps=1e-6, weight_decay=weight_decay)
 9 
10     for epoch in range(epochs):   #100
11         if epoch - validator.best_epoch > 10:
12             return
13 
14         model.train()
15         pbar = tqdm(train_dl) if proc_id == 0 else train_dl
16         total_loss = 0
17         n_correct = 0
18         cnt = 0
19         for batch in pbar:
20             batch_size = len(batch.tgt)
21 
22             if proc_id == 0 and cnt % log_every_n_examples < batch_size:
23                 pbar.set_description('E{:02d}, loss:{:.4f}, acc:{:.4f}, lr:{}'
24                                      .format(epoch,
25                                              total_loss / cnt if cnt else 0,
26                                              n_correct / cnt if cnt else 0,
27                                              opt.param_groups[0]['lr']))
28                 pbar.refresh()
29 
30             loss, acc = model.loss_n_acc(batch.input, batch.tgt)
31             total_loss += loss.item() * batch_size
32             cnt += batch_size
33             n_correct += acc
34 
35             opt.zero_grad()
36             loss.backward()
37             clip_gradient(model, 1)
38             opt.step()
39 
40         if n_gpus > 1: torch.distributed.barrier()
41 
42         model.eval()
43         validator.evaluate(model, epoch)
44         # tester.evaluate(model, epoch)
45         if proc_id == 0:
46             summ = {
47                 'Eval': '(e{:02d},train)'.format(epoch),
48                 'loss': total_loss / cnt,
49                 'acc': n_correct / cnt,
50             }
51             validator.write_summary(summ=summ)
52             validator.write_summary(epoch=epoch)
53 
54             # tester.write_summary(epoch)

The model behind line 30 of train() (model.loss_n_acc) is constructed as follows:

  1 class LSTMClassifier(nn.Module):
  2     def __init__(self, vocab_size=50000, emb_dim=100, emb_vectors=None,
  3                  emb_dropout=0.3,
  4                  lstm_dim=256, lstm_n_layer=2, lstm_dropout=0.3,
  5                  bidirectional=True, lstm_combine='add',
  6                  n_linear=2, linear_dropout=0.5, n_classes=1,
  7                  crit=nn.CrossEntropyLoss()):
  8         super().__init__()
  9         vocab_size, emb_dim = emb_vectors.shape
 10         n_dirs = bidirectional + 1
 11         lstm_dir_dim = lstm_dim // n_dirs if lstm_combine == 'concat' else lstm_dim
 12 
 13         self.lstm_n_layer = lstm_n_layer
 14         self.n_dirs = n_dirs
 15         self.lstm_dir_dim = lstm_dir_dim
 16         self.lstm_combine = lstm_combine
 17 
 18         self.embedding_layer = nn.Embedding.from_pretrained(
 19             emb_vectors, freeze=False, padding_idx=1)
 20         # pad=1 in torchtext; weights trainable (from_pretrained is a classmethod, so its result must be assigned rather than called on an existing instance)
 21         self.embedding_dropout = nn.Dropout(p=emb_dropout)
 22 
 23         self.lstm = nn.LSTM(emb_dim, lstm_dir_dim,     #args.lstm_dim = 100
 24                             num_layers=lstm_n_layer,    #args.lstm_n_layer = 1
 25                             bidirectional=bidirectional,
 26                             batch_first=True)
 27         if lstm_n_layer > 1: self.lstm.dropout = lstm_dropout
 28         self.lstm_dropout = nn.Dropout(p=lstm_dropout)
 29 
 30         self.att_w = nn.Parameter(torch.randn(1, lstm_dim, 1))    # lstm_dim = 100 here (default 256)
 31         self.linear_layers = [nn.Linear(lstm_dim, lstm_dim) for _ in
 32                               range(n_linear - 1)]
 33         self.linear_layers = nn.ModuleList(self.linear_layers)
 34         self.linear_dropout = nn.Dropout(p=linear_dropout)
 35 
 36         self.label = nn.Linear(lstm_dim, n_classes)
 37         self.crit = crit
 38 
 39         self.opts = {
 40             'vocab_size': vocab_size,
 41             'emb_dim': emb_dim,
 42             'emb_dropout': emb_dropout,
 43             'emb_vectors': emb_vectors,
 44             'lstm_dim': lstm_dim,
 45             'lstm_n_layer': lstm_n_layer,
 46             'lstm_dropout': lstm_dropout,
 47             'lstm_combine': lstm_combine,
 48             'n_linear': n_linear,
 49             'linear_dropout': linear_dropout,
 50             'n_classes': n_classes,
 51             'crit': crit,
 52         }
 53     def re_attention(self, lstm_output, final_h, input):
 54         '''
 55 
 56         :param lstm_output: [bs, seq_len, 2*lstm_dir_dim]  (here lstm_dir_dim = 100 = emb_dim)
 57         :param final_h: [lstm_n_layer*n_dirs, bs, lstm_dir_dim]
 58         :param input: [bs, seq_len]
 59         :return:
 60         '''
 61         batch_size, seq_len = input.shape
 62 
 63         final_h = final_h.view(self.lstm_n_layer, self.n_dirs, batch_size,
 64                                self.lstm_dir_dim)[-1]    # [1,2,bs,100]; take the last layer -> [2,bs,100]
 65         final_h = final_h.permute(1, 0, 2)      #[bs,2,100]
 66         final_h = final_h.sum(dim=1)  # sum the two directions -> (batch_size, 100)
 67 
 68         # final_h.size() = (batch_size, hidden_size)
 69         # output.size() = (batch_size, num_seq, hidden_size)
 70         if self.lstm_combine == 'add':
 71             lstm_output = lstm_output.view(batch_size, seq_len, 2,   # split the bi-LSTM output's 2*hidden into the two directions, then sum over dim 2
 72                                            self.lstm_dir_dim)
 73             lstm_output = lstm_output.sum(dim=2)
 74             # lstm_output(batch_size, seq_len, lstm_dir_dim=hidden)
 75         att = torch.bmm(torch.tanh(lstm_output),     #[bs,seq_len,hidden]*[bs,lstm_dir_dim,1]=[bs,seq_len,1]
 76                         self.att_w.repeat(batch_size, 1, 1))
 77         att = F.softmax(att, dim=1)  # att(batch_size, seq_len, 1)
 78         att = torch.bmm(lstm_output.transpose(1, 2), att).squeeze(2)  # [bs,lstm_dim,seq_len]*[bs,seq_len,1]=[bs,lstm_dim,1] -> [bs,lstm_dim]
 79         attn_output = torch.tanh(att)  # attn_output(batch_size, lstm_dir_dim)
 80         return attn_output
 81 
 82     def forward(self, input):
 83         batch_size, seq_len, *_ = input.shape
 84         inp = self.embedding_layer(input)    #[bs,seq_len,emb_vectors.shape[1]]
 85         inp = self.embedding_dropout(inp)   #[bs,seq_len,emb_vectors.shape[1]]
 86 
 87         lstm_output, (final_h, final_c) = self.lstm(inp)  # lstm_output = [bs, seq_len, 2*lstm_dir_dim = 200]
 88 
 89         # outputs = []
 90         # for i in range(seq_len):
 91         #     cur_emb = inp[i:i + 1]  # .view(1, inp.size(1), inp.size(2))
 92         #
 93         #     o, hidden = self.lstm(cur_emb) if i == 0 else self.lstm(cur_emb, hidden)
 94         #     import pdb;pdb.set_trace()
 95         #     outputs += [o.unsqueeze(0)]
 96         #
 97         # outputs = torch.cat(outputs, dim=0)
 98 
 99         lstm_output = self.lstm_dropout(lstm_output)  # [bs, seq_len, 2*lstm_dir_dim]
100 
101         attn_output = self.re_attention(lstm_output, final_h, input)  #[bs,lstm_dir_dim]
102         output = self.linear_dropout(attn_output)
103 
104         for layer in self.linear_layers:
105             output = layer(output)   #[bs,lstm_dir_dim]->[bs,lstm_dir_dim]
106             output = self.linear_dropout(output)
107             output = F.relu(output)
108 
109         logits = self.label(output)   #[bs,lstm_dim]->[bs,19]
110         return logits
111 
112     def loss_n_acc(self, input, target):
113         logits = self.forward(input)      #[bs,19]
114         logits_flat = logits.view(-1, logits.size(-1))
115         target_flat = target.view(-1)   # [bs]
116         loss = self.crit(logits_flat, target_flat)  # mean_score per batch
117 
118         pred_flat = logits_flat.max(dim=-1)[1]   # [bs]; index of the max logit for each example
119         acc = (pred_flat == target_flat).sum()
120         return loss, acc.item()

My understanding of this code is still not thorough; once I am comfortable with torchtext I will come back and re-read it.

A detailed TorchText tutorial: https://blog.csdn.net/qq_43283527/article/details/107369395
