BERT
from transformers import (
BertTokenizer,
BertConfig,
BertModel,
)
# Google's original Chinese BERT-base checkpoint
bertTokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
bertModel = BertModel.from_pretrained('bert-base-chinese')
sen = 'Transformers提供了NLP领域大量state-of-art的 预训练语言模型结构的模型和调用框架。'
inputs = bertTokenizer(sen, return_tensors='pt')
tokens = bertTokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
print(inputs)
print(tokens)
outputs = bertModel(**inputs)
# print(len(outputs))
print(outputs[0].shape, outputs[1].shape)
{'input_ids': tensor([[ 101, 100, 2990, 897, 749, 100, 7566, 1818, 1920, 7030,
10223, 118, 8205, 118, 9143, 4638, 7564, 6378, 5298, 6427,
6241, 3563, 1798, 5310, 3354, 4638, 3563, 1798, 1469, 6444,
4500, 3427, 3373, 511, 102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
['[CLS]', '[UNK]', '提', '供', '了', '[UNK]', '领', '域', '大', '量', 'state', '-', 'of', '-', 'art', '的', '预', '训', '练', '语', '言', '模', '型', '结', '构', '的', '模', '型', '和', '调', '用', '框', '架', '。', '[SEP]']
torch.Size([1, 35, 768]) torch.Size([1, 768])
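Note that bert-base-chinese does not lowercase, so the cased words "Transformers" and "NLP" cannot be assembled from its WordPiece vocabulary and come out as [UNK]; the hfl checkpoints below lowercase first and split them into subwords. With a recent transformers version the two returned tensors can also be read by name instead of position; a minimal sketch, assuming the outputs object from the code above:
# outputs is a BaseModelOutputWithPoolingAndCrossAttentions; positional indexing still works,
# but the named attributes make the meaning explicit
print(outputs.last_hidden_state.shape)  # (1, 35, 768): one 768-dim vector per token
print(outputs.pooler_output.shape)      # (1, 768): the [CLS] state passed through a dense + tanh layer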
RoBERTa
from transformers import (
BertTokenizer,
BertConfig,
BertModel,
)
# hfl/chinese-roberta-wwm-ext keeps BERT's input format, so it is loaded with BertTokenizer/BertModel
robertTokenizer = BertTokenizer.from_pretrained('hfl/chinese-roberta-wwm-ext')
robertModel = BertModel.from_pretrained('hfl/chinese-roberta-wwm-ext')
sen = 'Transformers提供了NLP领域大量state-of-art的 预训练语言模型结构的模型和调用框架。'
inputs = robertTokenizer(sen, return_tensors='pt')
tokens = robertTokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
print(inputs)
print(tokens)
outputs = robertModel(**inputs)
print(outputs)
print(outputs[0].shape, outputs[1].shape)
{'input_ids': tensor([[ 101, 162, 10477, 8118, 12725, 8755, 2990, 897, 749, 156,
10986, 7566, 1818, 1920, 7030, 10223, 118, 8205, 118, 9143,
4638, 7564, 6378, 5298, 6427, 6241, 3563, 1798, 5310, 3354,
4638, 3563, 1798, 1469, 6444, 4500, 3427, 3373, 511, 102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
['[CLS]', 't', '##ran', '##s', '##form', '##ers', '提', '供', '了', 'n', '##lp', '领', '域', '大', '量', 'state', '-', 'of', '-', 'art', '的', '预', '训', '练', '语', '言', '模', '型', '结', '构', '的', '模', '型', '和', '调', '用', '框', '架', '。', '[SEP]']
torch.Size([1, 40, 768]) torch.Size([1, 768])
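Because this tokenizer lowercases and subword-splits the English, the sequence is 40 tokens long instead of 35 and contains no [UNK]. The per-token hidden states are often collapsed into a single sentence vector for downstream use; a minimal masked mean-pooling sketch, assuming the inputs and outputs from the RoBERTa code above:
# average the token vectors, using the attention mask so padding positions
# (none in this single-sentence batch) are ignored
mask = inputs['attention_mask'].unsqueeze(-1).float()            # (1, 40, 1)
sentence_vec = (outputs[0] * mask).sum(dim=1) / mask.sum(dim=1)  # (1, 768)
print(sentence_vec.shape)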
ALBERT
from transformers import (
BertTokenizer,
AlbertModel,
)
# clue/albert_chinese_tiny ships a BERT-style vocab, so BertTokenizer (not AlbertTokenizer) is used
albertTokenizer = BertTokenizer.from_pretrained('clue/albert_chinese_tiny')
albertModel = AlbertModel.from_pretrained('clue/albert_chinese_tiny')
sen = 'Transformers提供了NLP领域大量state-of-art的 预训练语言模型结构的模型和调用框架。'
inputs = albertTokenizer(sen, return_tensors='pt')
tokens = albertTokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
print(inputs)
print(tokens)
outputs = albertModel(**inputs)
# print(len(outputs))
print(outputs[0].shape, outputs[1].shape)
{'input_ids': tensor([[ 101, 162, 10477, 8118, 12725, 8755, 2990, 897, 749, 156,
10986, 7566, 1818, 1920, 7030, 10223, 118, 8205, 118, 9143,
4638, 7564, 6378, 5298, 6427, 6241, 3563, 1798, 5310, 3354,
4638, 3563, 1798, 1469, 6444, 4500, 3427, 3373, 511, 102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
['[CLS]', 't', '##ran', '##s', '##form', '##ers', '提', '供', '了', 'n', '##lp', '领', '域', '大', '量', 'state', '-', 'of', '-', 'art', '的', '预', '训', '练', '语', '言', '模', '型', '结', '构', '的', '模', '型', '和', '调', '用', '框', '架', '。', '[SEP]']
torch.Size([1, 40, 312]) torch.Size([1, 312])
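The last dimension is 312 rather than 768 because albert_chinese_tiny uses a much smaller hidden size; instead of hard-coding it, the value can be read from the loaded config, for example:
print(albertModel.config.hidden_size)        # 312 for the tiny checkpoint
print(albertModel.config.num_hidden_layers)  # ALBERT shares parameters across these layers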
XLNet
from transformers import AutoTokenizer, AutoModel
xlnettokenizer = AutoTokenizer.from_pretrained("hfl/chinese-xlnet-base")
xlnetModel = AutoModel.from_pretrained('hfl/chinese-xlnet-base')
sen = 'Transformers提供了NLP领域大量state-of-art的 预训练语言模型结构的模型和调用框架。'
inputs = xlnettokenizer(sen, return_tensors='pt')
tokens = xlnettokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
print(inputs)
print(tokens)
outputs = xlnetModel(**inputs)
# print(outputs)
print(outputs[0].shape, len(outputs[1]))
{'input_ids': tensor([[ 19, 13932, 9560, 4127, 3810, 603, 602, 412, 3336, 1144,
3025, 4402, 13, 16636, 13, 7717, 20, 19, 3712, 3620,
1723, 2280, 1301, 20, 2280, 24, 16338, 7921, 18, 4,
3]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1]])}
['▁', 'Trans', 'form', 'ers', '提供了', 'N', 'L', 'P', '领域', '大量', 'st', 'ate', '-', 'of', '-', 'art', '的', '▁', '预', '训练', '语言', '模型', '结构', '的', '模型', '和', '调用', '框架', '。', '<sep>', '<cls>']
torch.Size([1, 31, 768]) 12
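Two differences from the BERT-style models above: the SentencePiece tokenizer appends <sep> and <cls> at the end instead of wrapping the text in [CLS] ... [SEP], and the second returned item is not a pooled vector but mems, XLNet's cached per-layer hidden states, hence the length 12 (one entry per layer of the base model). By attribute name:
print(outputs.last_hidden_state.shape)  # torch.Size([1, 31, 768])
print(len(outputs.mems))                # 12, one memory tensor per transformer layer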
ELECTRA
from transformers import AutoTokenizer, AutoModel
electratokenizer = AutoTokenizer.from_pretrained("hfl/chinese-electra-180g-base-discriminator")
electraModel = AutoModel.from_pretrained("hfl/chinese-electra-180g-base-discriminator")
sen = 'Transformers提供了NLP领域大量state-of-art的 预训练语言模型结构的模型和调用框架。'
inputs = electratokenizer(sen, return_tensors='pt')
tokens = electratokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
print(inputs)
print(tokens)
outputs = electraModel(**inputs)
# print(outputs)
print(outputs[0].shape)
{'input_ids': tensor([[ 101, 162, 10477, 8118, 12725, 8755, 2990, 897, 749, 156,
10986, 7566, 1818, 1920, 7030, 10223, 118, 8205, 118, 9143,
4638, 7564, 6378, 5298, 6427, 6241, 3563, 1798, 5310, 3354,
4638, 3563, 1798, 1469, 6444, 4500, 3427, 3373, 511, 102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
['[CLS]', 't', '##ran', '##s', '##form', '##ers', '提', '供', '了', 'n', '##lp', '领', '域', '大', '量', 'state', '-', 'of', '-', 'art', '的', '预', '训', '练', '语', '言', '模', '型', '结', '构', '的', '模', '型', '和', '调', '用', '框', '架', '。', '[SEP]']
torch.Size([1, 40, 768])
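ElectraModel is the discriminator and returns only per-token hidden states; there is no pooler output, which is why a single shape is printed. By attribute name:
print(outputs.last_hidden_state.shape)  # torch.Size([1, 40, 768])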
MacBERT
from transformers import AutoTokenizer, AutoModel
mactokenizer = AutoTokenizer.from_pretrained("hfl/chinese-macbert-base")
macModel = AutoModel.from_pretrained("hfl/chinese-macbert-base")
sen = 'Transformers提供了NLP领域大量state-of-art的 预训练语言模型结构的模型和调用框架。'
inputs = mactokenizer(sen, return_tensors='pt')
tokens = mactokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
print(inputs)
print(tokens)
outputs = macModel(**inputs)
# print(outputs)
print(outputs[0].shape)
{'input_ids': tensor([[ 101, 162, 10477, 8118, 12725, 8755, 2990, 897, 749, 156,
10986, 7566, 1818, 1920, 7030, 10223, 118, 8205, 118, 9143,
4638, 7564, 6378, 5298, 6427, 6241, 3563, 1798, 5310, 3354,
4638, 3563, 1798, 1469, 6444, 4500, 3427, 3373, 511, 102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
['[CLS]', 't', '##ran', '##s', '##form', '##ers', '提', '供', '了', 'n', '##lp', '领', '域', '大', '量', 'state', '-', 'of', '-', 'art', '的', '预', '训', '练', '语', '言', '模', '型', '结', '构', '的', '模', '型', '和', '调', '用', '框', '架', '。', '[SEP]']
torch.Size([1, 40, 768])
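MacBERT keeps BERT's architecture and changes only the pre-training objective (masking with similar words instead of [MASK]), so AutoModel loads hfl/chinese-macbert-base as a BertModel and the output carries the same two tensors as in the BERT section; a minimal sketch (the pooler shape is the expected value for a base-size model, it was not printed in the run above):
print(outputs.last_hidden_state.shape)  # torch.Size([1, 40, 768])
print(outputs.pooler_output.shape)      # expected torch.Size([1, 768]) for a base-size BertModel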
Inputs and outputs of the various Chinese pretrained models in NLP.
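To tie the sections together, a short recap sketch that prints each model's hidden size, i.e. the last dimension of the shapes above (assuming the model objects from all previous sections are still in memory; on the XLNet config, hidden_size is an alias of d_model):
models = {'bert': bertModel, 'roberta': robertModel, 'albert': albertModel,
          'xlnet': xlnetModel, 'electra': electraModel, 'macbert': macModel}
for name, model in models.items():
    print(name, model.config.hidden_size)  # 768 for the base models, 312 for albert_chinese_tiny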