First, the downloaded dataset gets an initial preprocessing pass: the '<e1>', '</e1>', '<e2>', '</e2>' tags in the raw data are replaced with 'ENT_1_START', 'ENT_1_END', 'ENT_2_START', 'ENT_2_END', and the processed examples are written out as JSON files:
```python
import json
import os
import random

# NLP (a spaCy-based tokenizer wrapper), fwrite and shell are small helper
# utilities from the repo, not standard-library functions.


def load_data(path):
    ENT_1_START = '<e1>'
    ENT_1_END = '</e1>'
    ENT_2_START = '<e2>'
    ENT_2_END = '</e2>'

    nlp = NLP()
    data = []
    with open(path) as f:
        lines = [line.strip() for line in f]
        # each example occupies 4 lines in the raw file
        for idx in range(0, len(lines), 4):
            id = int(lines[idx].split("\t")[0])
            relation = lines[idx + 1]

            # drop the surrounding quotes around the sentence
            sentence = lines[idx].split("\t")[1][1:-1]
            sentence = sentence.strip()

            sentence = sentence.replace(ENT_1_START, ' ENT_1_START ')
            sentence = sentence.replace(ENT_1_END, ' ENT_1_END ')
            sentence = sentence.replace(ENT_2_START, ' ENT_2_START ')
            sentence = sentence.replace(ENT_2_END, ' ENT_2_END ')

            sentence = nlp.word_tokenize(sentence)

            ent1 = sentence.split(' ENT_1_START ')[-1].split(' ENT_1_END ')[0]
            ent2 = sentence.split(' ENT_2_START ')[-1].split(' ENT_2_END ')[0]

            data.append({
                'label': relation,
                'sentence': sentence,
                'ent1': ent1,
                'ent2': ent2,
                'id': id,
            })

    return data


def split(data, dev_size=800):
    random.shuffle(data)
    dev = data[:dev_size]
    train = data[dev_size:]
    return train, dev


def save_to_json(data, file):
    writeout = json.dumps(data, indent=4)
    fwrite(writeout, file)
    print('[Info] Saving {} data to {}'.format(len(data), file))


# def download():
#     cmd = 'mkdir data/re_semeval/raw 2>/dev/null \n' \
#           'wget https://raw.githubusercontent.com/SeoSangwoo/Attention-Based-BiLSTM-relation-extraction/master/SemEval2010_task8_all_data/SemEval2010_task8_testing_keys/TEST_FILE_FULL.TXT -P data/re_semeval/raw \n' \
#           'wget https://raw.githubusercontent.com/SeoSangwoo/Attention-Based-BiLSTM-relation-extraction/master/SemEval2010_task8_all_data/SemEval2010_task8_training/TRAIN_FILE.TXT -P data/re_semeval/raw \n'
#     shell(cmd)


def main():
    # download()

    data = {}
    data_dir = 'D:\python\wgy_jupyter\pytorch_RelationExtraction_AttentionBiLSTM-master/data/re_semeval'

    raw_fname = os.path.join(data_dir, 'raw', 'TRAIN_FILE.TXT')
    nontest_data = load_data(raw_fname)
    data['train'], data['valid'] = split(nontest_data)

    raw_fname = os.path.join(data_dir, 'raw', 'TEST_FILE_FULL.TXT')
    data['test'] = load_data(raw_fname)

    for key, value in data.items():
        json_fname = os.path.join(data_dir, '{}.json'.format(key))
        save_to_json(value, json_fname)
```
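For context, each example in the raw SemEval-2010 Task 8 files (TRAIN_FILE.TXT / TEST_FILE_FULL.TXT) occupies four lines: the numbered, quoted sentence, the relation label, a Comment line, and a blank line. That is why load_data walks through the file in steps of four and strips the surrounding quotes with [1:-1]. Roughly:

```
1	"The system as described above has its greatest application in an arrayed <e1>configuration</e1> of antenna <e2>elements</e2>."
Component-Whole(e2,e1)
Comment:

```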
The resulting JSON records look like this:
```json
{
    "label": "Cause-Effect(e2,e1)",
    "sentence": "The clock ENT_1_START signal ENT_1_END was generated from an external cavity semiconductor ENT_2_START laser ENT_2_END .",
    "ent1": "signal",
    "ent2": "laser",
    "id": 6457
},
```
Next, the JSON files are converted to CSV format:
```python
import csv
import json
import os

import configargparse


def json2csv(json_fname, csv_fname, args):
    with open(json_fname) as f:
        data = json.load(f)
    csv_data = []
    for line in data:
        sentence = line['sentence']
        # truncate to at most sent_max_len tokens
        sentence = ' '.join(sentence.split()[:args.sent_max_len])
        if args.lower: sentence = sentence.lower()

        csv_line = {
            'tgt': line['label'],
            'input': sentence,
            'show_inp': sentence,
            'ent1': line['ent1'],
            'ent2': line['ent2'],
            'id': line['id'],
        }
        csv_data += [csv_line]
    with open(csv_fname, 'w') as f:
        writer = csv.DictWriter(f, fieldnames=csv_line.keys())
        writer.writeheader()
        writer.writerows(csv_data)
    print('[Info] Writing {} data to {}'.format(len(csv_data), csv_fname))


def get_args():
    parser = configargparse.ArgumentParser(
        description='Options for preprocessing')
    parser.add_argument('-lower', action='store_true', default=False,
                        help='whether to lowercase the text')
    parser.add_argument('-sent_max_len', default=100, type=int,
                        help='the maximum number of words allowed in a sentence')
    parser.add_argument('-tokenize', action='store_false', default=True,
                        help='whether to tokenize the sentences')
    parser.add_argument('-data_dir', default='data/re_semeval/', type=str,
                        help='path to load data from')
    args = parser.parse_args()
    return args


def main():
    args = get_args()
    data_dir = args.data_dir

    for typ in 'train valid test'.split():
        json_fname = os.path.join(data_dir, '{}.json'.format(typ))
        csv_fname = os.path.join(data_dir, '{}.csv'.format(typ))

        json2csv(json_fname, csv_fname, args)
```
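As a usage sketch (not part of the original script), a single split can also be converted by building an argparse Namespace that mirrors the defaults in get_args():

```python
from argparse import Namespace

# mirrors the defaults in get_args(); the paths are the ones produced by the JSON step
args = Namespace(lower=False, sent_max_len=100, tokenize=True,
                 data_dir='data/re_semeval/')
json2csv('data/re_semeval/train.json', 'data/re_semeval/train.csv', args)
```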
The concrete CSV data format:
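Based on the fields written by json2csv and the JSON record shown above, a row looks roughly like this (the exact quoting depends on csv.DictWriter's defaults):

```
tgt,input,show_inp,ent1,ent2,id
"Cause-Effect(e2,e1)",The clock ENT_1_START signal ENT_1_END was generated from an external cavity semiconductor ENT_2_START laser ENT_2_END .,The clock ENT_1_START signal ENT_1_END was generated from an external cavity semiconductor ENT_2_START laser ENT_2_END .,signal,laser,6457
```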
The downstream data handling (building the dataset, the dataloaders, etc.) is written with torchtext components that I am not yet familiar with, so I will first walk through the training/testing procedure and the model details, which follow the paper. A rough sketch of the torchtext loading step is given right below.
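For orientation, here is a minimal sketch (not the repo's actual Dataset class) of how a CSV like the one above is typically loaded with the legacy torchtext API; the column names follow the CSV written by json2csv, and the resulting batches expose batch.input and batch.tgt, which is what the training loop below consumes:

```python
# Minimal sketch, assuming the legacy torchtext API (torchtext.data in <=0.8,
# torchtext.legacy.data in newer releases). Illustration only.
from torchtext.data import Field, TabularDataset, BucketIterator

INPUT = Field(sequential=True, lower=True, batch_first=True)
TGT = Field(sequential=False, unk_token=None)

# map csv column name -> (attribute name on the batch, Field)
fields = {'input': ('input', INPUT), 'tgt': ('tgt', TGT)}

train_ds, valid_ds, test_ds = TabularDataset.splits(
    path='data/re_semeval/', format='csv',
    train='train.csv', validation='valid.csv', test='test.csv',
    fields=fields)

INPUT.build_vocab(train_ds, max_size=100000, vectors='glove.6B.100d')
TGT.build_vocab(train_ds)

train_dl, valid_dl, test_dl = BucketIterator.splits(
    (train_ds, valid_ds, test_ds), batch_size=64,
    sort_key=lambda ex: len(ex.input), sort_within_batch=True)
```

With that in mind, the overall training/testing driver run() is as follows: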
```python
def run(proc_id, n_gpus, devices, args):
    '''
    :param proc_id: 0
    :param n_gpus: 1
    :param devices: 1
    :param args:
    :return:
    '''
    set_seed(args.seed)
    dev_id = devices[proc_id]

    if n_gpus > 1:
        dist_init_method = 'tcp://{master_ip}:{master_port}'.format(
            master_ip='127.0.0.1', master_port=args.tcp_port)
        world_size = n_gpus
        torch.distributed.init_process_group(backend="nccl",
                                             init_method=dist_init_method,
                                             world_size=world_size,
                                             rank=dev_id)
    device = torch.device(dev_id)

    dataset = Dataset(proc_id=proc_id, data_dir=args.save_dir,            # args.save_dir = tmp/
                      train_fname=args.train_fname,                       # args.train_fname = train.csv
                      preprocessed=args.preprocessed, lower=args.lower,   # whether the input is already preprocessed by spacy, default = True
                      vocab_max_size=args.vocab_max_size, emb_dim=args.emb_dim,  # 100000, 100
                      save_vocab_fname=args.save_vocab_fname, verbose=True, )    # vocab.json
    train_dl, valid_dl, test_dl = \
        dataset.get_dataloader(proc_id=proc_id, n_gpus=n_gpus, device=device,
                               batch_size=args.batch_size)

    validator = Validator(dataloader=valid_dl, save_dir=args.save_dir,    # args.save_dir = tmp/
                          save_log_fname=args.save_log_fname,             # args.save_log_fname = run_log.txt
                          save_model_fname=args.save_model_fname,         # args.save_model_fname = model
                          valid_or_test='valid',
                          vocab_itos=dataset.INPUT.vocab.itos,
                          label_itos=dataset.TGT.vocab.itos)
    tester = Validator(dataloader=test_dl, save_log_fname=args.save_log_fname,
                       save_dir=args.save_dir, valid_or_test='test',      # args.save_dir = tmp/
                       vocab_itos=dataset.INPUT.vocab.itos,
                       label_itos=dataset.TGT.vocab.itos)
    predictor = Predictor(args.save_vocab_fname)                          # args.save_vocab_fname = vocab.json

    if args.load_model:  # path to a pretrained model; if given, load it instead of training
        predictor.use_pretrained_model(args.load_model, device=device)
        import pdb;
        pdb.set_trace()

        predictor.pred_sent(dataset.INPUT)
        tester.final_evaluate(predictor.model)

        return

    model = LSTMClassifier(emb_vectors=dataset.INPUT.vocab.vectors,
                           emb_dropout=args.emb_dropout,        # args.emb_dropout = 0.3
                           lstm_dim=args.lstm_dim,              # args.lstm_dim = 100
                           lstm_n_layer=args.lstm_n_layer,      # args.lstm_n_layer = 1
                           lstm_dropout=args.lstm_dropout,      # args.lstm_dropout = 0.3
                           lstm_combine=args.lstm_combine,      # args.lstm_combine = add
                           linear_dropout=args.linear_dropout,  # args.linear_dropout = 0.5
                           n_linear=args.n_linear,              # args.n_linear = 1
                           n_classes=len(dataset.TGT.vocab))
    if args.init_xavier: model.apply(init_weights)
    model = model.to(device)
    args = model_setup(proc_id, model, args)

    # train() also runs evaluation; the best model so far is saved under save_model_fname after each epoch
    train(proc_id, n_gpus, model=model, train_dl=train_dl,
          validator=validator, tester=tester, epochs=args.epochs, lr=args.lr,
          weight_decay=args.weight_decay)

    if proc_id == 0:
        predictor.use_pretrained_model(args.save_model_fname, device=device)  # reload the best (lowest validation loss) model
        bookkeep(predictor, validator, tester, args, dataset.INPUT)           # final evaluation, plus predictions on example sentences
```
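run() relies on several small helpers from the repo that are not shown here (set_seed, model_setup, bookkeep, init_weights). As one example, the Xavier initialization applied via model.apply(init_weights) typically looks like the following sketch (an assumption; the repo's own init_weights may differ):

```python
import torch.nn as nn

def init_weights(module):
    # Xavier-initialize every parameter matrix with more than one dimension;
    # 1-D parameters (biases) are left unchanged. model.apply() calls this
    # once per submodule, so recurse=False avoids touching a parameter twice.
    for param in module.parameters(recurse=False):
        if param.dim() > 1:
            nn.init.xavier_uniform_(param)
```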
The train() function called near the end of run(), in detail:
```python
def train(proc_id, n_gpus, model=None, train_dl=None, validator=None,
          tester=None, epochs=20, lr=0.001, log_every_n_examples=1,
          weight_decay=0):
    # opt = torch.optim.SGD(filter(lambda p: p.requires_grad, model.parameters()),
    #                       lr=lr, momentum=0.9)
    opt = torch.optim.Adadelta(
        filter(lambda p: p.requires_grad, model.parameters()), lr=1.0, rho=0.9,
        eps=1e-6, weight_decay=weight_decay)

    for epoch in range(epochs):  # args.epochs = 100
        # early stopping: quit if validation has not improved for 10 epochs
        if epoch - validator.best_epoch > 10:
            return

        model.train()
        pbar = tqdm(train_dl) if proc_id == 0 else train_dl
        total_loss = 0
        n_correct = 0
        cnt = 0
        for batch in pbar:
            batch_size = len(batch.tgt)

            if proc_id == 0 and cnt % log_every_n_examples < batch_size:
                pbar.set_description('E{:02d}, loss:{:.4f}, acc:{:.4f}, lr:{}'
                                     .format(epoch,
                                             total_loss / cnt if cnt else 0,
                                             n_correct / cnt if cnt else 0,
                                             opt.param_groups[0]['lr']))
                pbar.refresh()

            loss, acc = model.loss_n_acc(batch.input, batch.tgt)
            total_loss += loss.item() * batch_size
            cnt += batch_size
            n_correct += acc

            opt.zero_grad()
            loss.backward()
            clip_gradient(model, 1)
            opt.step()

        if n_gpus > 1: torch.distributed.barrier()

        model.eval()
        validator.evaluate(model, epoch)
        # tester.evaluate(model, epoch)
        if proc_id == 0:
            summ = {
                'Eval': '(e{:02d},train)'.format(epoch),
                'loss': total_loss / cnt,
                'acc': n_correct / cnt,
            }
            validator.write_summary(summ=summ)
            validator.write_summary(epoch=epoch)

            # tester.write_summary(epoch)
```
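clip_gradient is another small repo helper not shown above; a common implementation (an assumption, the repo's version may differ) clamps every gradient elementwise to the given range:

```python
def clip_gradient(model, clip_value):
    # clamp every parameter gradient elementwise to [-clip_value, clip_value]
    for p in filter(lambda p: p.grad is not None, model.parameters()):
        p.grad.data.clamp_(-clip_value, clip_value)
```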
The model behind the model.loss_n_acc() call inside train(), in detail:
```python
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size=50000, emb_dim=100, emb_vectors=None,
                 emb_dropout=0.3,
                 lstm_dim=256, lstm_n_layer=2, lstm_dropout=0.3,
                 bidirectional=True, lstm_combine='add',
                 n_linear=2, linear_dropout=0.5, n_classes=1,
                 crit=nn.CrossEntropyLoss()):
        super().__init__()
        vocab_size, emb_dim = emb_vectors.shape
        n_dirs = bidirectional + 1
        lstm_dir_dim = lstm_dim // n_dirs if lstm_combine == 'concat' else lstm_dim

        self.lstm_n_layer = lstm_n_layer
        self.n_dirs = n_dirs
        self.lstm_dir_dim = lstm_dir_dim
        self.lstm_combine = lstm_combine

        self.embedding_layer = nn.Embedding(*emb_vectors.shape)
        self.embedding_layer.from_pretrained(emb_vectors, padding_idx=1)
        # pad=1 in torchtext; embedding weights trainable
        self.embedding_dropout = nn.Dropout(p=emb_dropout)

        self.lstm = nn.LSTM(emb_dim, lstm_dir_dim,    # args.lstm_dim = 100
                            num_layers=lstm_n_layer,  # args.lstm_n_layer = 1
                            bidirectional=bidirectional,
                            batch_first=True)
        if lstm_n_layer > 1: self.lstm.dropout = lstm_dropout
        self.lstm_dropout = nn.Dropout(p=lstm_dropout)

        self.att_w = nn.Parameter(torch.randn(1, lstm_dim, 1))  # lstm_dim = 100 in this run (constructor default 256)
        self.linear_layers = [nn.Linear(lstm_dim, lstm_dim) for _ in
                              range(n_linear - 1)]
        self.linear_layers = nn.ModuleList(self.linear_layers)
        self.linear_dropout = nn.Dropout(p=linear_dropout)

        self.label = nn.Linear(lstm_dim, n_classes)
        self.crit = crit

        self.opts = {
            'vocab_size': vocab_size,
            'emb_dim': emb_dim,
            'emb_dropout': emb_dropout,
            'emb_vectors': emb_vectors,
            'lstm_dim': lstm_dim,
            'lstm_n_layer': lstm_n_layer,
            'lstm_dropout': lstm_dropout,
            'lstm_combine': lstm_combine,
            'n_linear': n_linear,
            'linear_dropout': linear_dropout,
            'n_classes': n_classes,
            'crit': crit,
        }

    def re_attention(self, lstm_output, final_h, input):
        '''
        :param lstm_output: [bs, seq_len, 2*emb_dim]
        :param final_h: [2, bs, emb_dim]
        :param input: [bs, seq_len]   assert: emb_vectors.shape[1] == emb_dim
        :return:
        '''
        batch_size, seq_len = input.shape

        final_h = final_h.view(self.lstm_n_layer, self.n_dirs, batch_size,
                               self.lstm_dir_dim)[-1]  # [1, 2, bs, 100]; take the last layer -> [2, bs, 100]
        final_h = final_h.permute(1, 0, 2)  # [bs, 2, 100]
        final_h = final_h.sum(dim=1)        # (batch_size, 100), summed over the two directions

        # final_h.size() = (batch_size, hidden_size)
        # output.size() = (batch_size, num_seq, hidden_size)
        if self.lstm_combine == 'add':
            # split the BiLSTM output's 2*hidden into the two directions, then sum them
            lstm_output = lstm_output.view(batch_size, seq_len, 2,
                                           self.lstm_dir_dim)
            lstm_output = lstm_output.sum(dim=2)
            # lstm_output: (batch_size, seq_len, lstm_dir_dim = hidden)
        att = torch.bmm(torch.tanh(lstm_output),             # [bs, seq_len, hidden] x [bs, lstm_dir_dim, 1] = [bs, seq_len, 1]
                        self.att_w.repeat(batch_size, 1, 1))
        att = F.softmax(att, dim=1)  # att: (batch_size, seq_len, 1)
        att = torch.bmm(lstm_output.transpose(1, 2), att).squeeze(2)  # [bs, lstm_dim, seq_len] x [bs, seq_len, 1] = [bs, lstm_dim, 1] -> [bs, lstm_dim]
        attn_output = torch.tanh(att)  # attn_output: (batch_size, lstm_dir_dim)
        return attn_output

    def forward(self, input):
        batch_size, seq_len, *_ = input.shape
        inp = self.embedding_layer(input)   # [bs, seq_len, emb_dim]
        inp = self.embedding_dropout(inp)   # [bs, seq_len, emb_dim]

        lstm_output, (final_h, final_c) = self.lstm(inp)  # lstm_output: [bs, seq_len, 2*emb_dim = 200]

        # outputs = []
        # for i in range(seq_len):
        #     cur_emb = inp[i:i + 1]  # .view(1, inp.size(1), inp.size(2))
        #
        #     o, hidden = self.lstm(cur_emb) if i == 0 else self.lstm(cur_emb, hidden)
        #     import pdb; pdb.set_trace()
        #     outputs += [o.unsqueeze(0)]
        #
        # outputs = torch.cat(outputs, dim=0)

        lstm_output = self.lstm_dropout(lstm_output)  # [bs, seq_len, 2*emb_dim]

        attn_output = self.re_attention(lstm_output, final_h, input)  # [bs, lstm_dir_dim]
        output = self.linear_dropout(attn_output)

        for layer in self.linear_layers:
            output = layer(output)  # [bs, lstm_dir_dim] -> [bs, lstm_dir_dim]
            output = self.linear_dropout(output)
            output = F.relu(output)

        logits = self.label(output)  # [bs, lstm_dim] -> [bs, 19]
        return logits

    def loss_n_acc(self, input, target):
        logits = self.forward(input)  # [bs, 19]
        logits_flat = logits.view(-1, logits.size(-1))
        target_flat = target.view(-1)  # [bs]
        loss = self.crit(logits_flat, target_flat)  # mean cross-entropy over the batch

        pred_flat = logits_flat.max(dim=-1)[1]  # [bs]; index of the max logit per example
        acc = (pred_flat == target_flat).sum()
        return loss, acc.item()
```
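re_attention implements the word-level attention of the Att-BLSTM paper. Writing $H \in \mathbb{R}^{T \times d}$ for one sentence's direction-summed BiLSTM outputs and $w$ for the learned vector att_w, the computation is

$$M = \tanh(H), \qquad \alpha = \operatorname{softmax}(M w), \qquad r = H^{\top}\alpha, \qquad h^{*} = \tanh(r),$$

and $h^{*}$ is what the dropout, linear and classification layers consume.

A quick shape check with dummy inputs (illustrative only; random vectors stand in for the pretrained embeddings, and 19 is the number of SemEval relation classes):

```python
import torch

# dummy "pretrained" vectors: vocab of 5000, embedding dim 100
emb_vectors = torch.randn(5000, 100)
model = LSTMClassifier(emb_vectors=emb_vectors, lstm_dim=100, lstm_n_layer=1,
                       lstm_combine='add', n_linear=1, n_classes=19)

x = torch.randint(0, 5000, (32, 70))   # batch of 32 sentences, 70 token ids each
y = torch.randint(0, 19, (32,))        # one relation label per sentence

logits = model(x)                      # -> torch.Size([32, 19])
loss, n_correct = model.loss_n_acc(x, y)
print(logits.shape, loss.item(), n_correct)
```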
I have not fully digested this code yet; once I am comfortable with the torchtext toolkit I will come back and re-read it.
A detailed walkthrough of TorchText: https://blog.csdn.net/qq_43283527/article/details/107369395