HuggingFace - 简明教程

2023-09-30 20:12:40

本文学习自：
视频：https://www.bilibili.com/video/BV1a44y1H7Jc
源码：https://github.com/lansinuote/Huggingface_Toturials

文章目录

关于 HuggingFace

HuggingFace 官网：https://huggingface.co
HuggingFace 是一个开源社区，提供了先进的 NLP 模型、数据集、以及其他便利的工具。

提供的主要模型：

1、自回归
GPT2
Transformer-XL
XLNet

2、自编码
BERT
ALBERT
RoBERTa
ELECTRA

3、Seg2Seq
BART
Pegasus
T5

此处主要使用 bert-base-chinese，即bert中文模型。

安装

以下安装基于 python 3.6，pytorch 1.10

# 1、安装 transformers (HuggingFace 提供的模型)
## 方式一：pip 安装
$ pip install transformers
## 方式二：conda 安装
$ conda install -c huggingface transformers

# 2、安装 datasets
## 方式一：pip 安装
$ pip install datasets
## 方式二：conda 安装
$ conda install -c huggingface -c conda-forge datasets

字典和分词工具



from transformers import BertTokenizer

# 加载预训练字典和分词方法
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path='bert-base-chinese',
    cache_dir=None,
    force_download=False,
)

sents = [
    '选择珠江花园的原因就是方便。',
    '笔记本的键盘确实爽。',
    '房间太小。其他的都一般。',
    '今天才知道这书还有第6卷,真有点郁闷.',
    '机器背面似乎被撕了张什么标签，残胶还在。',
]

tokenizer, sents
'''
from transformers import BertTokenizer

# 加载预训练字典和分词方法
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path='bert-base-chinese',
    cache_dir=None,
    force_download=False,
)

sents = [
    '选择珠江花园的原因就是方便。',
    '笔记本的键盘确实爽。',
    '房间太小。其他的都一般。',
    '今天才知道这书还有第6卷,真有点郁闷.',
    '机器背面似乎被撕了张什么标签，残胶还在。',
]

tokenizer, sents
'''


# 编码两个句子
out = tokenizer.encode(
    text=sents[0],
    text_pair=sents[1],

    # 当句子长度大于max_length时,截断
    truncation=True,

    # 一律补pad到max_length长度
    padding='max_length', # 最大长度内补pad
    add_special_tokens=True, # pad 是添加特殊符号
    max_length=30, # 超过最大长度截断
    return_tensors=None, # 不指定类型，默认返回 list
)

print(out)
'''
[101, 6848, 2885, 4403, 3736, 5709, 1736, 4638, 1333, 1728, 2218, 3221, 3175, 912, 511, 102, 5011, 6381, 3315, 4638, 7241, 4669, 4802, 2141, 4272, 511, 102, 0, 0, 0]
'''

tokenizer.decode(out)
'''
'[CLS] 选 择 珠 江 花 园 的 原 因 就 是 方 便 。 [SEP] 笔 记 本 的 键 盘 确 实 爽 。 [SEP] [PAD] [PAD] [PAD]'
'''


# 增强的编码函数
out = tokenizer.encode_plus(
    text=sents[0],
    text_pair=sents[1],

    # 当句子长度大于max_length时,截断
    truncation=True,

    # 一律补零到max_length长度
    padding='max_length',
    max_length=30,
    add_special_tokens=True,

    # 可取值tf,pt,np,默认为返回list
    return_tensors=None,

    # 返回token_type_ids
    return_token_type_ids=True,

    # 返回attention_mask
    return_attention_mask=True,

    # 返回special_tokens_mask 特殊符号标识
    return_special_tokens_mask=True,

    # 返回offset_mapping 标识每个词的起止位置,这个参数只能BertTokenizerFast使用
    # return_offsets_mapping=True,

    # 返回length 标识长度
    return_length=True,
)

# input_ids 就是编码后的词
# token_type_ids 第一个句子和特殊符号的位置是0,第二个句子的位置是1
# special_tokens_mask 特殊符号的位置是1,其他位置是0
# attention_mask pad的位置是0,其他位置是1
# length 返回句子长度
for k, v in out.items():
    print(k, ':', v)

tokenizer.decode(out['input_ids'])
'''
input_ids : [101, 6848, 2885, 4403, 3736, 5709, 1736, 4638, 1333, 1728, 2218, 3221, 3175, 912, 511, 102, 5011, 6381, 3315, 4638, 7241, 4669, 4802, 2141, 4272, 511, 102, 0, 0, 0]
token_type_ids : [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]
special_tokens_mask : [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1]
attention_mask : [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]
length : 30
'[CLS] 选 择 珠 江 花 园 的 原 因 就 是 方 便 。 [SEP] 笔 记 本 的 键 盘 确 实 爽 。 [SEP] [PAD] [PAD] [PAD]'
'''


# 批量编码句子
out = tokenizer.batch_encode_plus(
    batch_text_or_text_pairs=[sents[0], sents[1]],
    add_special_tokens=True,

    # 当句子长度大于max_length时,截断
    truncation=True,

    # 一律补零到max_length长度
    padding='max_length',
    max_length=15,

    # 可取值tf,pt,np,默认为返回list
    return_tensors=None,

    # 返回token_type_ids
    return_token_type_ids=True,

    # 返回attention_mask
    return_attention_mask=True,

    # 返回special_tokens_mask 特殊符号标识
    return_special_tokens_mask=True,

    # 返回offset_mapping 标识每个词的起止位置,这个参数只能BertTokenizerFast使用
    # return_offsets_mapping=True,

    # 返回length 标识长度
    return_length=True,
)

# input_ids 就是编码后的词
# token_type_ids 第一个句子和特殊符号的位置是0,第二个句子的位置是1
# special_tokens_mask 特殊符号的位置是1,其他位置是0
# attention_mask pad的位置是0,其他位置是1
# length 返回句子长度
for k, v in out.items():
    print(k, ':', v)
    
tokenizer.decode(out['input_ids'][0]), tokenizer.decode(out['input_ids'][1])
'''
input_ids : [[101, 6848, 2885, 4403, 3736, 5709, 1736, 4638, 1333, 1728, 2218, 3221, 3175, 912, 102], [101, 5011, 6381, 3315, 4638, 7241, 4669, 4802, 2141, 4272, 511, 102, 0, 0, 0]]
token_type_ids : [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
special_tokens_mask : [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1]]
length : [15, 12]
attention_mask : [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]]
('[CLS] 选 择 珠 江 花 园 的 原 因 就 是 方 便 [SEP]',
 '[CLS] 笔 记 本 的 键 盘 确 实 爽 。 [SEP] [PAD] [PAD] [PAD]')
'''



# 批量编码成对的句子
out = tokenizer.batch_encode_plus(
    batch_text_or_text_pairs=[(sents[0], sents[1]), (sents[2], sents[3])],
    add_special_tokens=True,

    # 当句子长度大于max_length时,截断
    truncation=True,

    # 一律补零到max_length长度
    padding='max_length',
    max_length=30,

    # 可取值tf,pt,np,默认为返回list
    return_tensors=None,

    # 返回token_type_ids
    return_token_type_ids=True,

    # 返回attention_mask
    return_attention_mask=True,

    # 返回special_tokens_mask 特殊符号标识
    return_special_tokens_mask=True,

    # 返回offset_mapping 标识每个词的起止位置,这个参数只能BertTokenizerFast使用
    # return_offsets_mapping=True,

    # 返回length 标识长度
    return_length=True,
)

# input_ids 就是编码后的词
# token_type_ids 第一个句子和特殊符号的位置是0,第二个句子的位置是1
# special_tokens_mask 特殊符号的位置是1,其他位置是0
# attention_mask pad的位置是0,其他位置是1
# length 返回句子长度
for k, v in out.items():
    print(k, ':', v)

tokenizer.decode(out['input_ids'][0])
'''
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
input_ids : [[101, 6848, 2885, 4403, 3736, 5709, 1736, 4638, 1333, 1728, 2218, 3221, 3175, 912, 511, 102, 5011, 6381, 3315, 4638, 7241, 4669, 4802, 2141, 4272, 511, 102, 0, 0, 0], [101, 2791, 7313, 1922, 2207, 511, 1071, 800, 4638, 6963, 671, 5663, 511, 102, 791, 1921, 2798, 4761, 6887, 6821, 741, 6820, 3300, 5018, 127, 1318, 117, 4696, 3300, 102]]
token_type_ids : [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]
special_tokens_mask : [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]]
length : [27, 30]
attention_mask : [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]
'[CLS] 选 择 珠 江 花 园 的 原 因 就 是 方 便 。 [SEP] 笔 记 本 的 键 盘 确 实 爽 。 [SEP] [PAD] [PAD] [PAD]'
'''


# 获取字典
zidian = tokenizer.get_vocab()

type(zidian), len(zidian), '月光' in zidian, # (dict, 21128, False)


# 添加新词
tokenizer.add_tokens(new_tokens=['月光', '希望'])

# 添加新符号
tokenizer.add_special_tokens({'eos_token': '[EOS]'})

zidian = tokenizer.get_vocab()

type(zidian), len(zidian), zidian['月光'], zidian['[EOS]'] # (dict, 21131, 21128, 21130)


# 编码新添加的词
out = tokenizer.encode(
    text='月光的新希望[EOS]',
    text_pair=None,

    # 当句子长度大于max_length时,截断
    truncation=True,

    # 一律补pad到max_length长度
    padding='max_length',
    add_special_tokens=True,
    max_length=8,
    return_tensors=None,
)

print(out) # [101, 21128, 4638, 3173, 21129, 21130, 102, 0]
 

tokenizer.decode(out) # '[CLS] 月光 的 新 希望 [EOS] [SEP] [PAD]'

数据集的操作



 
from datasets import load_dataset

# 加载数据
dataset = load_dataset(path='seamew/ChnSentiCorp', split='train')
dataset
'''
Using custom data configuration default
Reusing dataset chn_senti_corp (/Users/lee/.cache/huggingface/datasets/seamew___chn_senti_corp/default/0.0.0/1f242195a37831906957a11a2985a4329167e60657c07dc95ebe266c03fdfb85)
Dataset({
    features: ['text', 'label'],
    num_rows: 9600
})
'''

 
# 查看一个数据
dataset[0]
'''
{'text': '选择珠江花园的原因就是方便，有电动扶梯直接到达海边，周围餐馆、食廊、商场、超市、摊位一应俱全。酒店装修一般，但还算整洁。 泳池在大堂的屋顶，因此很小，不过女儿倒是喜欢。 包的早餐是西式的，还算丰富。 服务吗，一般',
 'label': 1}
'''

 
# sort

# 未排序的label是乱序的
print(dataset['label'][:10])

# 排序之后label有序了
sorted_dataset = dataset.sort('label')
print(sorted_dataset['label'][:10])
print(sorted_dataset['label'][-10:])

 
# shuffle

# 打乱顺序
shuffled_dataset = sorted_dataset.shuffle(seed=42)
shuffled_dataset['label'][:10]

 
# select
dataset.select([0, 10, 20, 30, 40, 50])

# filter
def f(data):
    return data['text'].startswith('选择')

start_with_ar = dataset.filter(f)

len(start_with_ar), start_with_ar['text']

# train_test_split, 切分训练集和测试集
dataset.train_test_split(test_size=0.1)

# shard 把数据切分到4个桶中,均匀分配
dataset.shard(num_shards=4, index=0)

# 列操作和类型转换
# rename_column 列重命名
dataset.rename_column('text', 'textA')

# remove_columns 列移除
dataset.remove_columns(['text'])

# map 把每一个数据输入到 lamda 函数，操作返回
def f(data):
    data['text'] = 'My sentence: ' + data['text']
    return data

datatset_map = dataset.map(f)
datatset_map['text'][:5]
 
# set_format
dataset.set_format(type='torch', columns=['label'])
dataset[0]#  {'label': tensor(1)}

# 保存和加载
from datasets import load_from_disk

dataset = load_dataset(path='seamew/ChnSentiCorp', split='train')

# 保存和加载
dataset.save_to_disk("./")
dataset = load_from_disk("./")
dataset

# 导出为其他格式
# dataset.to_csv('./datasets.csv')
# dataset.to_json('./datasets.json')

评价函数的使用

from datasets import list_metrics

# 列出评价指标
metrics_list = list_metrics()
len(metrics_list), metrics_list
'''
(34,
 ['accuracy',
  'bertscore',
  'bleu',
  'bleurt',
  'cer',
  'chrf',
  'code_eval',
  'comet',
  'competition_math',
  'coval',
  'cuad',
  'f1',
  'gleu',
  'glue',
  'google_bleu',
'''
 
#加载一个评价指标
metric = load_metric('glue', 'mrpc')

print(metric.inputs_description)  # 打印使用文档 


#计算一个评价指标
predictions = [0, 1, 0]
references = [0, 1, 1]

final_score = metric.compute(predictions=predictions, references=references)
final_score  # {'accuracy': 0.6666666666666666, 'f1': 0.6666666666666666}

管道方法的使用 pipline

实用价值不高

from transformers import pipeline

# 文本分类
classifier = pipeline("sentiment-analysis")

result = classifier("I hate you")[0] # {'label': 'NEGATIVE', 'score': 0.9991129040718079} 

result = classifier("I love you")[0] # {'label': 'POSITIVE', 'score': 0.9998656511306763} 
 
# 阅读理解
question_answerer = pipeline("question-answering")

context = r"""
Extractive Question Answering is the task of extracting an answer from a text given a question. An example of a 
question answering dataset is the SQuAD dataset, which is entirely based on that task. If you would like to fine-tune 
a model on a SQuAD task, you may leverage the examples/pytorch/question-answering/run_squad.py script.
"""

result = question_answerer(question="What is extractive question answering?", context=context) # {'score': 0.6177279353141785, 'start': 34, 'end': 95, 'answer': 'the task of extracting an answer from a text given a question'} 

# 答案需要在文本当中
result = question_answerer( question="What is a good example of a question answering dataset?", context=context) # {'score': 0.5152313113212585, 'start': 148, 'end': 161, 'answer': 'SQuAD dataset'}
  
# 完形填空
unmasker = pipeline("fill-mask")
from pprint import pprint
sentence = 'HuggingFace is creating a <mask> that the community uses to solve NLP tasks.'
unmasker(sentence)

 
# 文本生成
text_generator = pipeline("text-generation")
text_generator("As far as I am concerned, I will", max_length=50, do_sample=False)
from transformers import pipeline

# 命名实体识别
ner_pipe = pipeline("ner")

sequence = """Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO,
therefore very close to the Manhattan Bridge which is visible from the window."""

for entity in ner_pipe(sequence):
    print(entity)

 
# 文本总结
summarizer = pipeline("summarization")

ARTICLE = """ New York (CNN)When Liana Barrientos was 23 years old, she got married in Westchester County, New York.
A year later, she got married again in Westchester County, but to a different man and without divorcing her first husband.
Only 18 days after that marriage, she got hitched yet again. Then, Barrientos declared "I do" five more times, sometimes only within two weeks of each other.
In 2010, she married once more, this time in the Bronx. In an application for a marriage license, she stated it was her "first and only" marriage.
Barrientos, now 39, is facing two criminal counts of "offering a false instrument for filing in the first degree," referring to her false statements on the
2010 marriage license application, according to court documents.
Prosecutors said the marriages were part of an immigration scam.
On Friday, she pleaded not guilty at State Supreme Court in the Bronx, according to her attorney, Christopher Wright, who declined to comment further.
After leaving court, Barrientos was arrested and charged with theft of service and criminal trespass for allegedly sneaking into the New York subway through an emergency exit, said Detective
Annette Markowski, a police spokeswoman. In total, Barrientos has been married 10 times, with nine of her marriages occurring between 1999 and 2002.
All occurred either in Westchester County, Long Island, New Jersey or the Bronx. She is believed to still be married to four men, and at one time, she was married to eight men at once, prosecutors say.
Prosecutors said the immigration scam involved some of her husbands, who filed for permanent residence status shortly after the marriages.
Any divorces happened only after such filings were approved. It was unclear whether any of the men will be prosecuted.
The case was referred to the Bronx District Attorney\'s Office by Immigration and Customs Enforcement and the Department of Homeland Security\'s
Investigation Division. Seven of the men are from so-called "red-flagged" countries, including Egypt, Turkey, Georgia, Pakistan and Mali.
Her eighth husband, Rashid Rajput, was deported in 2006 to his native Pakistan after an investigation by the Joint Terrorism Task Force.
If convicted, Barrientos faces up to four years in *.  Her next court appearance is scheduled for May 18.
"""

summarizer(ARTICLE, max_length=130, min_length=30, do_sample=False)


# 翻译
translator = pipeline("translation_en_to_de")
sentence = "Hugging Face is a technology company based in New York and Paris"
translator(sentence, max_length=40)

实战任务1：中文分类

import torch
from datasets import load_dataset

# 定义数据集
class Dataset(torch.utils.data.Dataset):
    def __init__(self, split):
        self.dataset = load_dataset(path='seamew/ChnSentiCorp', split=split)

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, i):
        text = self.dataset[i]['text']
        label = self.dataset[i]['label']

        return text, label


dataset = Dataset('train')

len(dataset), dataset[0]


from transformers import BertTokenizer # 加载分词工具（需要和与预训练模型相匹配）

# 加载字典和分词工具
token = BertTokenizer.from_pretrained('bert-base-chinese')
# PreTrainedTokenizer(name_or_path='bert-base-chinese', vocab_size=21128, model_max_len=512, is_fast=False, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})


def collate_fn(data):
    sents = [i[0] for i in data]
    labels = [i[1] for i in data]

    # 编码
    data = token.batch_encode_plus(batch_text_or_text_pairs=sents,
                                   truncation=True,
                                   padding='max_length',
                                   max_length=500,
                                   return_tensors='pt',
                                   return_length=True)

    # input_ids:编码之后的数字
    # attention_mask:是补零的位置是0,其他位置是1
    input_ids = data['input_ids']
    attention_mask = data['attention_mask']
    token_type_ids = data['token_type_ids']
    labels = torch.LongTensor(labels)

    # print(data['length'], data['length'].max())

    return input_ids, attention_mask, token_type_ids, labels


# 数据加载器
loader = torch.utils.data.DataLoader(dataset=dataset,
                                     batch_size=16,
                                     collate_fn=collate_fn,
                                     shuffle=True,
                                     drop_last=True)

for i, (input_ids, attention_mask, token_type_ids,
        labels) in enumerate(loader):
    break

print(len(loader)) # 600
input_ids.shape, attention_mask.shape, token_type_ids.shape, labels
'''
(torch.Size([16, 500]),
 torch.Size([16, 500]),
 torch.Size([16, 500]),
 tensor([1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1]))
'''


from transformers import BertModel

# 加载预训练模型
pretrained = BertModel.from_pretrained('bert-base-chinese')

# 不训练,不需要计算梯度
for param in pretrained.parameters():
    param.requires_grad_(False)

# 模型试算
out = pretrained(input_ids=input_ids,
           attention_mask=attention_mask,
           token_type_ids=token_type_ids)

out.last_hidden_state.shape # torch.Size([16, 500, 768])  16是一个 batch-size，对应数据中的16句话；指定每一句话编码成 500个词的长度， 768 是词编码的维度；


# 定义下游任务模型
class Model(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = torch.nn.Linear(768, 2) # 此处做的是2分类，想做n分类，改成n即可。

    def forward(self, input_ids, attention_mask, token_type_ids):
        with torch.no_grad():
            out = pretrained(input_ids=input_ids,
                       attention_mask=attention_mask,
                       token_type_ids=token_type_ids)

        out = self.fc(out.last_hidden_state[:, 0]) # 取第0个特征（这个和bert 的设计思路有关）
        out = out.softmax(dim=1)
        return out

model = Model()

model(input_ids=input_ids,
      attention_mask=attention_mask,
      token_type_ids=token_type_ids).shape


from transformers import AdamW

# 训练
optimizer = AdamW(model.parameters(), lr=5e-4)
criterion = torch.nn.CrossEntropyLoss()

model.train()
for i, (input_ids, attention_mask, token_type_ids,
        labels) in enumerate(loader):
    out = model(input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids)

    loss = criterion(out, labels)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    if i % 5 == 0:
        out = out.argmax(dim=1)
        accuracy = (out == labels).sum().item() / len(labels)

        print(i, loss.item(), accuracy)

    if i == 300:
        break


# 测试
def test():
    model.eval()
    correct = 0
    total = 0

    loader_test = torch.utils.data.DataLoader(dataset=Dataset('validation'),
                                              batch_size=32,
                                              collate_fn=collate_fn,
                                              shuffle=True,
                                              drop_last=True)

    for i, (input_ids, attention_mask, token_type_ids,
            labels) in enumerate(loader_test):

        if i == 5:
            break

        print(i)

        with torch.no_grad():
            out = model(input_ids=input_ids,
                        attention_mask=attention_mask,
                        token_type_ids=token_type_ids)

        out = out.argmax(dim=1)
        correct += (out == labels).sum().item()
        total += len(labels)

    print(correct / total)


test()

实战任务2：中文填空

import torch
from datasets import load_dataset

# 定义数据集
class Dataset(torch.utils.data.Dataset):
    def __init__(self, split):
        dataset = load_dataset(path='seamew/ChnSentiCorp', split=split)

        def f(data):
            return len(data['text']) > 30

        self.dataset = dataset.filter(f)

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, i):
        text = self.dataset[i]['text']

        return text


dataset = Dataset('train')

len(dataset), dataset[0]


from transformers import BertTokenizer

# 加载字典和分词工具
token = BertTokenizer.from_pretrained('bert-base-chinese')


def collate_fn(data):
    # 编码
    data = token.batch_encode_plus(batch_text_or_text_pairs=data,
                                   truncation=True,
                                   padding='max_length',
                                   max_length=30,
                                   return_tensors='pt',
                                   return_length=True)

    # input_ids:编码之后的数字
    # attention_mask:是补零的位置是0,其他位置是1
    input_ids = data['input_ids']
    attention_mask = data['attention_mask']
    token_type_ids = data['token_type_ids']

    # 把第15个词固定替换为mask
    labels = input_ids[:, 15].reshape(-1).clone()
    input_ids[:, 15] = token.get_vocab()[token.mask_token]

    # print(data['length'], data['length'].max())

    return input_ids, attention_mask, token_type_ids, labels


# 数据加载器
loader = torch.utils.data.DataLoader(dataset=dataset,
                                     batch_size=16,
                                     collate_fn=collate_fn,
                                     shuffle=True,
                                     drop_last=True)

for i, (input_ids, attention_mask, token_type_ids,
        labels) in enumerate(loader):
    break

print(len(loader))
print(token.decode(input_ids[0]))
print(token.decode(labels[0]))
input_ids.shape, attention_mask.shape, token_type_ids.shape, labels.shape


from transformers import BertModel

# 加载预训练模型
pretrained = BertModel.from_pretrained('bert-base-chinese')

# 不训练,不需要计算梯度
for param in pretrained.parameters():
    param.requires_grad_(False)

# 模型试算
out = pretrained(input_ids=input_ids,
           attention_mask=attention_mask,
           token_type_ids=token_type_ids)

out.last_hidden_state.shape


# 定义下游任务模型
class Model(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.decoder = torch.nn.Linear(768, token.vocab_size, bias=False)
        self.bias = torch.nn.Parameter(torch.zeros(token.vocab_size))
        self.decoder.bias = self.bias

    def forward(self, input_ids, attention_mask, token_type_ids):
        with torch.no_grad():
            out = pretrained(input_ids=input_ids,
                             attention_mask=attention_mask,
                             token_type_ids=token_type_ids)

        out = self.decoder(out.last_hidden_state[:, 15])

        return out


model = Model()

model(input_ids=input_ids,
      attention_mask=attention_mask,
      token_type_ids=token_type_ids).shape


from transformers import AdamW

# 训练
optimizer = AdamW(model.parameters(), lr=5e-4)
criterion = torch.nn.CrossEntropyLoss()

model.train()
for epoch in range(5):
    for i, (input_ids, attention_mask, token_type_ids,
            labels) in enumerate(loader):
        out = model(input_ids=input_ids,
                    attention_mask=attention_mask,
                    token_type_ids=token_type_ids)

        loss = criterion(out, labels)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if i % 50 == 0:
            out = out.argmax(dim=1)
            accuracy = (out == labels).sum().item() / len(labels)

            print(epoch, i, loss.item(), accuracy)


# 测试
def test():
    model.eval()
    correct = 0
    total = 0

    loader_test = torch.utils.data.DataLoader(dataset=Dataset('test'),
                                              batch_size=32,
                                              collate_fn=collate_fn,
                                              shuffle=True,
                                              drop_last=True)

    for i, (input_ids, attention_mask, token_type_ids,
            labels) in enumerate(loader_test):

        if i == 15:
            break

        print(i)

        with torch.no_grad():
            out = model(input_ids=input_ids,
                        attention_mask=attention_mask,
                        token_type_ids=token_type_ids)

        out = out.argmax(dim=1)
        correct += (out == labels).sum().item()
        total += len(labels)

        print(token.decode(input_ids[0]))
        print(token.decode(labels[0]), token.decode(labels[0]))

    print(correct / total)


test()

实战任务3：中文句子关系推断


import torch
from datasets import load_dataset
import random

# 定义数据集
class Dataset(torch.utils.data.Dataset):
    def __init__(self, split):
        dataset = load_dataset(path='seamew/ChnSentiCorp', split=split)

        def f(data):
            return len(data['text']) > 40

        self.dataset = dataset.filter(f)

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, i):
        text = self.dataset[i]['text']

        # 切分一句话为前半句和后半句
        sentence1 = text[:20]
        sentence2 = text[20:40]
        label = 0

        # 有一半的概率把后半句替换为一句无关的话
        if random.randint(0, 1) == 0:
            j = random.randint(0, len(self.dataset) - 1)
            sentence2 = self.dataset[j]['text'][20:40]
            label = 1

        return sentence1, sentence2, label


dataset = Dataset('train')

sentence1, sentence2, label = dataset[0]

len(dataset), sentence1, sentence2, label


from transformers import BertTokenizer
# 加载字典和分词工具
token = BertTokenizer.from_pretrained('bert-base-chinese')


def collate_fn(data):
    sents = [i[:2] for i in data]
    labels = [i[2] for i in data]

    # 编码
    data = token.batch_encode_plus(batch_text_or_text_pairs=sents,
                                   truncation=True,
                                   padding='max_length',
                                   max_length=45,
                                   return_tensors='pt',
                                   return_length=True,
                                   add_special_tokens=True)

    # input_ids:编码之后的数字
    # attention_mask:是补零的位置是0,其他位置是1
    # token_type_ids:第一个句子和特殊符号的位置是0,第二个句子的位置是1
    input_ids = data['input_ids']
    attention_mask = data['attention_mask']
    token_type_ids = data['token_type_ids']
    labels = torch.LongTensor(labels)

    # print(data['length'], data['length'].max())

    return input_ids, attention_mask, token_type_ids, labels


# 数据加载器
loader = torch.utils.data.DataLoader(dataset=dataset,
                                     batch_size=8,
                                     collate_fn=collate_fn,
                                     shuffle=True,
                                     drop_last=True)

for i, (input_ids, attention_mask, token_type_ids,
        labels) in enumerate(loader):
    break

print(len(loader))
print(token.decode(input_ids[0]))
input_ids.shape, attention_mask.shape, token_type_ids.shape, labels


from transformers import BertModel

# 加载预训练模型
pretrained = BertModel.from_pretrained('bert-base-chinese')

# 不训练,不需要计算梯度
for param in pretrained.parameters():
    param.requires_grad_(False)

# 模型试算
out = pretrained(input_ids=input_ids,
           attention_mask=attention_mask,
           token_type_ids=token_type_ids)

out.last_hidden_state.shape


# 定义下游任务模型
class Model(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = torch.nn.Linear(768, 2)

    def forward(self, input_ids, attention_mask, token_type_ids):
        with torch.no_grad():
            out = pretrained(input_ids=input_ids,
                             attention_mask=attention_mask,
                             token_type_ids=token_type_ids)

        out = self.fc(out.last_hidden_state[:, 0])
        out = out.softmax(dim=1)
        return out


model = Model()
model(input_ids=input_ids,
      attention_mask=attention_mask,
      token_type_ids=token_type_ids).shape


from transformers import AdamW

# 训练
optimizer = AdamW(model.parameters(), lr=5e-4)
criterion = torch.nn.CrossEntropyLoss()

model.train()
for i, (input_ids, attention_mask, token_type_ids,
        labels) in enumerate(loader):
    out = model(input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids)

    loss = criterion(out, labels)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    if i % 5 == 0:
        out = out.argmax(dim=1)
        accuracy = (out == labels).sum().item() / len(labels)
        print(i, loss.item(), accuracy)

    if i == 300:
        break


# 测试
def test():
    model.eval()
    correct = 0
    total = 0

    loader_test = torch.utils.data.DataLoader(dataset=Dataset('test'),
                                              batch_size=32,
                                              collate_fn=collate_fn,
                                              shuffle=True,
                                              drop_last=True)

    for i, (input_ids, attention_mask, token_type_ids,
            labels) in enumerate(loader_test):

        if i == 5:
            break

        print(i)

        with torch.no_grad():
            out = model(input_ids=input_ids,
                        attention_mask=attention_mask,
                        token_type_ids=token_type_ids)

        pred = out.argmax(dim=1)

        correct += (pred == labels).sum().item()
        total += len(labels)

    print(correct / total)


test()

2022-02-20 深圳低温

码农公寓