『NLP学习笔记』BERT命名实体识别(NER)实战

2023-10-17 22:30:46

BERT命名实体识别(NER)实战！

文章目录

一. 数据集介绍
二. 数据集读取&预处理
三. 数据分词tokenizer
四. 定义数据读取(继承Dataset)
五. 定义模型&优化器&学习率
六. 训练测试以及准确率
七. 模型预测
八. 整个代码
八. 参考

BERT技术详细介绍：https://zhangkaifang.blog.csdn.net/article/details/120507302

BERT命名实体识别模型如下：

一. 数据集介绍

实验使用的数据集是微软亚洲研究院提供的词性标注数据集，其目标是识别文本中具有特定意义的实体,包括人名、地名、机构名。链接：https://mirror.coggle.club/dataset/ner/msra.zip

百度云链接: https://pan.baidu.com/s/17MRMTrQKWJ6-HUI-rWL80A 提取码: oq7w

二. 数据集读取&预处理

import codecs

################## 1. 读取数据
# 训练数据和标签
train_lines = codecs.open('msra/train/sentences.txt').readlines()
train_lines = [x.replace(' ', '').strip() for x in train_lines]  # 用于移除字符串开头和结尾指定的字符（默认为空格或换行符）或字符序列。
train_tags = codecs.open('msra/train/tags.txt').readlines()
train_tags = [x.strip().split(' ') for x in train_tags]
train_tags = [[tag_type.index(x) for x in tag] for tag in train_tags]
train_lines, train_tags = train_lines[:20000], train_tags[:20000]  # 只取两万数据
print(train_lines[0], "\n", train_tags[0])
# 如何解决足球界长期存在的诸多矛盾，重振昔日津门足球的雄风，成为天津足坛上下内外到处议论的话题。 
# [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

# 验证数据和标签
val_lines = codecs.open('msra/val/sentences.txt').readlines()
val_lines = [x.replace(' ', '').strip() for x in val_lines]
val_tags = codecs.open('msra/val/tags.txt').readlines()
val_tags = [x.strip().split(' ') for x in val_tags]
val_tags = [[tag_type.index(x) for x in tag] for tag in val_tags]

三. 数据分词tokenizer

注意：中文注意加list(train_lines),因为不加因为单词作为整体了。

################## 2. 对数据进行分词
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
# 中文注意加list(train_lines),因为不加因为单词作为整体了。
max_length = 64
train_encoding = tokenizer.batch_encode_plus(list(train_lines), truncation=True, padding=True, max_length=max_length)
val_encoding = tokenizer.batch_encode_plus(list(val_lines), truncation=True, padding=True, max_length=max_length)

四. 定义数据读取(继承Dataset)

注意：下面labels需要填充开头cls，结尾部分不够maxlen也要填0。

################## 3. 定义Dataset类对象
class TextDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(value[idx][:maxlen]) for key, value in self.encodings.items()}
        # 字级别的标注，注意填充cls，这里[0]代表cls。后面不够长的这里也是补充0，样本tokenizer的时候已经填充了
        # item['labels'] = torch.tensor([0] + self.labels[idx] + [0] * (63-len(self.labels[idx])))[:64]
        item['labels'] = torch.tensor([0] + self.labels[idx] + [0] * (maxlen - 1 - len(self.labels[idx])))[:maxlen]
        return item

    def __len__(self):
        return len(self.labels)


train_dataset = TextDataset(train_encoding, train_tags)
test_dataset = TextDataset(val_encoding, val_tags)
print(train_dataset[0])

# Dataset转换成Dataloader
batchsz = 32
train_loader = DataLoader(train_dataset, batch_size=batchsz, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batchsz, shuffle=True)

五. 定义模型&优化器&学习率

from transformers import BertForTokenClassification, AdamW, get_linear_schedule_with_warmup

################## 4. 定义模型
model = BertForTokenClassification.from_pretrained('bert-base-chinese', num_labels=7)
device = torch.device("cuda:2" if torch.cuda.is_available() else "cpu")
model.to(device)

# 优化器和学习率
optimizer = AdamW(model.parameters(), lr=5e-5)
total_steps = len(train_loader) * 1
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0,
                                            num_training_steps=total_steps)  # Default value in run_glue.py

六. 训练测试以及准确率

注意：outputs输出结果中的logits，NER其实就是对每个token进行分类。

然后对dim=2维度上取argmax，找出每个位置所属的类别下标。

# 这里测试计算准确率中的：
a = torch.tensor([1, 2, 3, 4, 2])
b = torch.tensor([1, 2, 4, 3, 2])
print((a==b).float().mean())
print((a==b).float().mean().item())

代码如下：

from tqdm import tqdm

def train():
    model.train()
    total_train_loss = 0
    iter_num = 0
    total_iter = len(train_loader)
    for idx, batch in enumerate(train_loader):
        optim.zero_grad()
        
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            # loss = outputs[0]

        loss = outputs.loss
        
        if idx % 20 == 0:
            with torch.no_grad():
                # 64 * 7
                print((outputs[1].argmax(2).data == labels.data).float().mean().item(), loss.item())
        
        total_train_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optim.step()
        scheduler.step()

        iter_num += 1
        if(iter_num % 100==0):
            print("epoth: %d, iter_num: %d, loss: %.4f, %.2f%%" % (epoch, iter_num, loss.item(), iter_num/total_iter*100))
        
    print("Epoch: %d, Average training loss: %.4f"%(epoch, total_train_loss/len(train_loader)))
    
def validation():
    model.eval()
    total_eval_accuracy = 0
    total_eval_loss = 0
    for batch in test_dataloader:
        with torch.no_grad():
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs[1]

        total_eval_loss += loss.item()
        logits = logits.detach().cpu().numpy()
        label_ids = labels.to('cpu').numpy()
        total_eval_accuracy += (outputs[1].argmax(2).data == labels.data).float().mean().item()
        
    avg_val_accuracy = total_eval_accuracy / len(test_dataloader)
    print("Accuracy: %.4f" % (avg_val_accuracy))
    print("Average testing loss: %.4f"%(total_eval_loss/len(test_dataloader)))
    print("-------------------------------")
    

for epoch in range(4):
    print("------------Epoch: %d ----------------" % epoch)
    train()
    validation()

七. 模型预测

model = torch.load('bert-ner.pt')

tag_type = ['O', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER', 'B-LOC', 'I-LOC']

def predcit(s):
    item = tokenizer([s], truncation=True, padding='longest', max_length=64) # 加一个list
    with torch.no_grad():
        input_ids = torch.tensor(item['input_ids']).to(device).reshape(1, -1)
        attention_mask = torch.tensor(item['attention_mask']).to(device).reshape(1, -1)
        labels = torch.tensor([0] * attention_mask.shape[1]).to(device).reshape(1, -1)
        
        outputs = model(input_ids, attention_mask, labels)
        outputs = outputs[0].data.cpu().numpy()
        
    outputs = outputs[0].argmax(1)[1:-1]
    ner_result = ''
    ner_flag = ''
    
    for o, c in zip(outputs,s):
        # 0 就是 O，没有含义
        if o == 0 and ner_result == '':
            continue
        
        # 
        elif o == 0 and ner_result != '':
            if ner_flag == 'O':
                print('机构：', ner_result)
            if ner_flag == 'P':
                print('人名：', ner_result)
            if ner_flag == 'L':
                print('位置：', ner_result)
                
            ner_result = ''
        
        elif o != 0:
            ner_flag = tag_type[o][2]
            ner_result += c
    return outputs

s = '整个华盛顿已笼罩在一片夜色之中，一个电话从美国总统府白宫打到了菲律宾总统府马拉卡南宫。'
# 识别出句子里面的实体识别（NER）
data = predcit(s)
s = '人工智能是未来的希望，也是中国和美国的冲突点。'
data = predcit(s)
s = '明天我们一起在海淀吃个饭吧，把叫刘涛和王华也叫上。'
data = predcit(s)
s = '同煤集团同生安平煤业公司发生井下安全事故 19名矿工遇难'
data = predcit(s)
s = '山东省*办公厅就平邑县玉荣商贸有限公司石膏矿坍塌事故发出通报'
data = predcit(s)
s = '[新闻直播间]黑龙江:龙煤集团一煤矿发生火灾事故'
data = predcit(s)

位置： 华盛顿
位置： 美国总统府白宫
位置： 菲律宾总统府马拉卡南宫
位置： 华盛顿
位置： 美国总统府白宫
位置： 菲律宾总统府马拉卡南宫
位置： 中国
位置： 美国
位置： 海淀
人名： 刘涛
人名： 王华
机构： 同煤集团同生安平煤业公司
机构： 山东省*办公厅
机构： 平邑县玉荣商贸有限公司
位置： 黑龙江
机构： 龙煤集团

八. 整个代码

此外提供了notebook版本代码，百度云: https://pan.baidu.com/s/1tiLqvsdzuBgWFb6defNyWg 提取码: gu8t

# !/usr/bin/env python
# -*- encoding: utf-8 -*-
"""=====================================
@author : kaifang zhang
@time   : 2021/12/19 1:33 PM
@contact: kaifang.zkf@dtwave-inc.com
====================================="""
import codecs
import torch
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader, TensorDataset
from transformers import BertTokenizer
from transformers import BertForTokenClassification, AdamW, get_linear_schedule_with_warmup

tag_type = ['O', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER', 'B-LOC', 'I-LOC']
# B-ORG I-ORG 机构的开始位置和中间位置
# B-PER I-PER 人物名字的开始位置和中间位置
# B-LOC I-LOC 位置的开始位置和中间位置

################## 1. 读取数据
# 训练数据和标签
train_lines = codecs.open('msra/train/sentences.txt').readlines()
train_lines = [x.replace(' ', '').strip() for x in train_lines]  # 用于移除字符串开头和结尾指定的字符（默认为空格或换行符）或字符序列。
train_tags = codecs.open('msra/train/tags.txt').readlines()
train_tags = [x.strip().split(' ') for x in train_tags]
train_tags = [[tag_type.index(x) for x in tag] for tag in train_tags]
train_lines, train_tags = train_lines[:20000], train_tags[:20000]  # 只取两万数据
print(f"样例数据：{train_lines[0]} \n样例标签：{train_tags[0]}")

# 验证数据和标签
val_lines = codecs.open('msra/val/sentences.txt').readlines()
val_lines = [x.replace(' ', '').strip() for x in val_lines]
val_tags = codecs.open('msra/val/tags.txt').readlines()
val_tags = [x.strip().split(' ') for x in val_tags]
val_tags = [[tag_type.index(x) for x in tag] for tag in val_tags]  # 标签转换为数值

################## 2. 对数据进行分词
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
# 中文注意加list(train_lines),因为不加因为单词作为整体了。
maxlen = 64
train_encoding = tokenizer.batch_encode_plus(list(train_lines), truncation=True, padding=True, max_length=maxlen)
val_encoding = tokenizer.batch_encode_plus(list(val_lines), truncation=True, padding=True, max_length=maxlen)

################## 3. 定义Dataset类对象
class TextDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(value[idx][:maxlen]) for key, value in self.encodings.items()}
        # 字级别的标注，注意填充cls，这里[0]代表cls。后面不够长的这里也是补充0，样本tokenizer的时候已经填充了
        # item['labels'] = torch.tensor([0] + self.labels[idx] + [0] * (63-len(self.labels[idx])))[:64]
        item['labels'] = torch.tensor([0] + self.labels[idx] + [0] * (maxlen - 1 - len(self.labels[idx])))[:maxlen]
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = TextDataset(train_encoding, train_tags)
test_dataset = TextDataset(val_encoding, val_tags)
batchsz = 32
train_loader = DataLoader(train_dataset, batch_size=batchsz, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batchsz, shuffle=True)

# print(train_dataset[0])

# 测试样本是否满足最大长度
for idx in range(len(train_dataset)):
    item = train_dataset[idx]
    for key in item:
        if item[key].shape[0] != 64:
            print(key, item[key].shape)
for idx in range(len(test_dataset)):
    item = test_dataset[idx]
    for key in item:
        if item[key].shape[0] != 64:
            print(key, item[key].shape)

################## 4. 定义模型
model = BertForTokenClassification.from_pretrained('bert-base-chinese', num_labels=7)
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
model.to(device)

# 优化器和学习率
optimizer = AdamW(model.parameters(), lr=5e-5)
total_steps = len(train_loader) * 1
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0,
                                            num_training_steps=total_steps)  # Default value in run_glue.py

################## 4. 训练测试以及字符的分类准确率
def train():
    model.train()
    total_train_loss = 0
    iter_num = 0
    total_iter = len(train_loader)
    for idx, batch in enumerate(train_loader):
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)  # shape: [32, 64]
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        # loss = outputs.loss
        logits1 = outputs[1]  # shape: [32, 64, 7]
        out = logits1.argmax(dim=2)
        out1 = out.data
        # logits2 = outputs.logits

        if idx % 20 == 0:  # 看模型的准确率
            with torch.no_grad():
                # 假如输入的是64个字符，64 * 7
                print((outputs[1].argmax(2).data == labels.data).float().mean().item(), loss.item())

        total_train_loss += loss.item()

        # 反向梯度信息
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # 参数更新
        optimizer.step()
        scheduler.step()

        iter_num += 1
        if (iter_num % 100 == 0):
            print("epoth: %d, iter_num: %d, loss: %.4f, %.2f%%" % (
                epoch, iter_num, loss.item(), iter_num / total_iter * 100))

    print("Epoch: %d, Average training loss: %.4f" % (epoch, total_train_loss / len(train_loader)))

def validation():
    model.eval()
    total_eval_accuracy = 0
    total_eval_loss = 0
    for batch in test_loader:
        with torch.no_grad():
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs[1]

        total_eval_loss += loss.item()
        logits = logits.detach().cpu().numpy()
        label_ids = labels.to('cpu').numpy()
        total_eval_accuracy += (outputs[1].argmax(2).data == labels.data).float().mean().item()

    avg_val_accuracy = total_eval_accuracy / len(test_loader)
    print("Accuracy: %.4f" % (avg_val_accuracy))
    print("Average testing loss: %.4f" % (total_eval_loss / len(test_loader)))
    print("-------------------------------")

# tag_type = ['O', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER', 'B-LOC', 'I-LOC']
def predcit(s):
    item = tokenizer([s], truncation=True, padding='longest', max_length=64)  # 加一个list
    with torch.no_grad():
        input_ids = torch.tensor(item['input_ids']).to(device).reshape(1, -1)
        attention_mask = torch.tensor(item['attention_mask']).to(device).reshape(1, -1)
        labels = torch.tensor([0] * attention_mask.shape[1]).to(device).reshape(1, -1)

        outputs = model(input_ids, attention_mask, labels)
        outputs = outputs[0].data.cpu().numpy()

    outputs = outputs[0].argmax(1)[1:-1]
    ner_result = ''
    ner_flag = ''

    for o, c in zip(outputs, s):
        # 0 就是 O，没有含义
        if o == 0 and ner_result == '':
            continue
        #
        elif o == 0 and ner_result != '':
            if ner_flag == 'O':
                print('机构：', ner_result)
            if ner_flag == 'P':
                print('人名：', ner_result)
            if ner_flag == 'L':
                print('位置：', ner_result)

            ner_result = ''

        elif o != 0:
            ner_flag = tag_type[o][2]
            ner_result += c
    return outputs

# for epoch in range(4):
#     print("------------Epoch: %d ----------------" % epoch)
#     train()
#     validation()
# torch.save(model, 'bert-ner.pt')

model = torch.load('/data/aibox/kaifang/NLP学习资料/bert-ner.pt')
s = '整个华盛顿已笼罩在一片夜色之中，一个电话从美国总统府白宫打到了菲律宾总统府马拉卡南宫。'
# 识别出句子里面的实体识别（NER）
data = predcit(s)
s = '整个华盛顿已笼罩在一片夜色之中，一个电话从美国总统府白宫打到了菲律宾总统府马拉卡南宫。'
# 识别出句子里面的实体识别（NER）
data = predcit(s)
s = '人工智能是未来的希望，也是中国和美国的冲突点。'
data = predcit(s)
s = '明天我们一起在海淀吃个饭吧，把叫刘涛和王华也叫上。'
data = predcit(s)
s = '同煤集团同生安平煤业公司发生井下安全事故 19名矿工遇难'
data = predcit(s)
s = '山东省*办公厅就平邑县玉荣商贸有限公司石膏矿坍塌事故发出通报'
data = predcit(s)
s = '[新闻直播间]黑龙江:龙煤集团一煤矿发生火灾事故'
data = predcit(s)

八. 参考

主要参考dasou博主的视频：https://www.bilibili.com/video/BV1Ey4y1874y?p=6&spm_id_from=pageDriver

腾讯Bugly的专栏：图解BERT模型：从零开始构建BERT

Bert源代码解读-以BERT文本分类代码为例子：https://github.com/DA-southampton/Read_Bert_Code

BERT大火却不懂Transformer？读这一篇就够了：https://zhuanlan.zhihu.com/p/54356280

pytorch 中加载BERT模型, 获取词向量：https://blog.csdn.net/znsoft/article/details/107725285

Bert生成句向量(pytorch)

https://blog.csdn.net/weixin_41519463/article/details/100863313

学习率预热(transformers.get_linear_schedule_with_warmup)：https://blog.csdn.net/orangerfun/article/details/120400247

码农公寓

文章目录

一. 数据集介绍

二. 数据集读取&预处理

三. 数据分词tokenizer

四. 定义数据读取(继承Dataset)

五. 定义模型&优化器&学习率

六. 训练测试以及准确率

七. 模型预测

八. 整个代码

八. 参考

相关文章