Fine-tuning a pretrained BERT model for text classification
Environment
- python==3.7
- torch==1.7.1
- transformers==4.9.2
- scikit-learn==0.21.3
- tensorboard==2.5.0
- pandas
- numpy
Building the dataset
Put the data into a DataFrame with a text column and a label column, where each label is the integer index of its class.
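A minimal sketch of such a DataFrame, with purely hypothetical texts and label ids (replace this with your own corpus):
import pandas as pd

# hypothetical example data; in practice df is built from your own dataset
texts = ["great movie, would watch again", "terrible plot and acting", "it was fine overall"]
labels = [2, 0, 1]  # integer index of each class, e.g. 0=negative, 1=neutral, 2=positive
df = pd.DataFrame({"text": texts, "label": labels})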
Random seed setup
import torch
import numpy as np
random_seed = 2018
np.random.seed(random_seed)
torch.manual_seed(random_seed)
torch.cuda.manual_seed_all(random_seed)
torch.backends.cudnn.deterministic = True
Dataset splitting
from sklearn.model_selection import train_test_split
# split into training, validation and test sets
# stratify performs stratified sampling so each split keeps the label distribution
train_text, temp_text, train_labels, temp_labels = train_test_split(df['text'], df['label'], random_state=random_seed, test_size=0.3, stratify=df['label'])
val_text, test_text, val_labels, test_labels = train_test_split(temp_text, temp_labels, random_state=random_seed, test_size=0.5, stratify=temp_labels)
Loading the pretrained model
import transformers
from transformers import AutoModel, BertTokenizerFast
# load the pretrained model and tokenizer
pretrained_model_path = "bert-base-uncased"
bert = AutoModel.from_pretrained(pretrained_model_path)
tokenizer = BertTokenizerFast.from_pretrained(pretrained_model_path)
Sentence length distribution
import pandas as pd
# check the sentence lengths in the training set to decide the padding length
seq_len = [len(i.split()) for i in train_text]
seq_df = pd.DataFrame(seq_len)
seq_df.describe()
As the describe() output above shows, 75% of the sentences are no longer than 46 tokens, so the padding length can be set to around 50.
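If you prefer to read higher percentiles programmatically rather than from the describe() table, a quick check (the percentile choices here are arbitrary) is:
# optional: inspect higher percentiles of the length distribution directly
print(np.percentile(seq_len, [75, 90, 95, 99]))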
tokenizer
padding_len = 50
# the tokenizer turns the text into input ids and also returns the attention mask
tokens_train = tokenizer.batch_encode_plus(train_text.tolist(), max_length=padding_len, padding='max_length', truncation=True)
tokens_test = tokenizer.batch_encode_plus(test_text.tolist(), max_length=padding_len, padding='max_length', truncation=True)
tokens_val = tokenizer.batch_encode_plus(val_text.tolist(), max_length=padding_len, padding='max_length', truncation=True)
# to tensors
train_seq = torch.tensor(tokens_train['input_ids'])
train_mask = torch.tensor(tokens_train['attention_mask'])
train_y = torch.tensor(train_labels.tolist())
val_seq = torch.tensor(tokens_val['input_ids'])
val_mask = torch.tensor(tokens_val['attention_mask'])
val_y = torch.tensor(val_labels.tolist())
test_seq = torch.tensor(tokens_test['input_ids'])
test_mask = torch.tensor(tokens_test['attention_mask'])
test_y = torch.tensor(test_labels.tolist())
Dataset & DataLoader
# create dataloader
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
batch_size = 16
train_data = TensorDataset(train_seq, train_mask, train_y)
# sampler for sampling the data during training
train_sampler = RandomSampler(train_data)
# dataLoader for train set
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
val_data = TensorDataset(val_seq, val_mask, val_y)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler = val_sampler, batch_size=batch_size)
Freezing the pretrained model parameters
# freeze all BERT parameters so only the classification head is trained
for param in bert.parameters():
    param.requires_grad = False
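As an optional variation, if you also want to fine-tune part of BERT itself rather than only the classification head, you can re-enable gradients for just the top encoder layer and the pooler (the layer names below assume the bert-base architecture):
# optional variation, not used below: unfreeze only the last encoder layer and the pooler
for name, param in bert.named_parameters():
    if name.startswith("encoder.layer.11") or name.startswith("pooler"):
        param.requires_grad = True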
Model definition
from torch import nn

class BertClassifier(nn.Module):
    def __init__(self, bert_pretrained_model, class_num):
        super(BertClassifier, self).__init__()
        self.bert = bert_pretrained_model
        self.dropout = nn.Dropout(0.1)
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(768, 512)
        self.fc2 = nn.Linear(512, class_num)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, sent_id, mask):
        bert_res = self.bert(sent_id, attention_mask=mask)
        # use the pooled [CLS] representation as the sentence embedding
        cls_hs = bert_res.pooler_output
        x = self.fc1(cls_hs)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.softmax(x)
        return x
Initialization
device = torch.device("cuda:1") if torch.cuda.is_available() else torch.device("cpu")
class_num = df['label'].nunique()  # number of label classes
model = BertClassifier(bert, class_num)
model.to(device)
optimizer
from transformers import AdamW
# the BERT paper recommends fine-tuning learning rates of 2e-5, 3e-5 or 5e-5
optimizer = AdamW(model.parameters(), lr=2e-5)
Per-class weights
The labels are imbalanced, so class weights are computed here and later passed to the loss function.
from sklearn.utils.class_weight import compute_class_weight
group_size = df.groupby(["label"]).size()
print(group_size)
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(train_labels), y=train_labels)
Loss function
The model outputs LogSoftmax, so NLLLoss is used here (together they are equivalent to CrossEntropyLoss on raw logits); the class weights computed above compensate for the label imbalance.
weights = torch.tensor(class_weights, dtype=torch.float)
weights = weights.to(device)
cross_entropy = nn.NLLLoss(weight=weights)
Training
from datetime import datetime
from sklearn.metrics import f1_score, classification_report
from torch.utils.tensorboard import SummaryWriter

saved_path = '/data/yuhengshi/saved_model/9_aug_other_roberta_val185.plk'
writer = SummaryWriter("/data/yuhengshi/saved_model")

def evaluate():
    print("evaluate start")
    model.eval()
    total_loss, total_accuracy = 0, 0
    total_preds = []
    total_labels = []
    for step, batch in enumerate(val_dataloader):
        batch = [t.to(device) for t in batch]
        sent_id, mask, labels = batch
        with torch.no_grad():
            preds = model(sent_id, mask)
            loss = cross_entropy(preds, labels)
            total_loss = total_loss + loss.item()
            preds = preds.detach().cpu().numpy()
            total_preds.append(preds)
            labels = labels.detach().cpu().numpy()
            total_labels.append(labels)
    avg_loss = total_loss / len(val_dataloader)
    total_preds = np.concatenate(total_preds, axis=0)
    total_labels = np.concatenate(total_labels, axis=0)
    val_f1 = f1_score(total_labels, np.argmax(total_preds, axis=1), average='micro')
    return avg_loss, val_f1, total_preds, total_labels
def train(epochs=10):
    best_valid_loss = float('inf')
    start_time = datetime.now()
    for epoch in range(epochs):
        print(f"Epoch {epoch}/{epochs} start")
        model.train()
        total_loss, total_accuracy, total_batch = 0, 0, 0
        # collect predictions and labels for the epoch-level metrics
        total_preds = []
        total_labels = []
        for step, batch in enumerate(train_dataloader):
            batch = [x.to(device) for x in batch]
            sent_id, mask, labels = batch
            model.zero_grad()
            preds = model(sent_id, mask)
            loss = cross_entropy(preds, labels)
            total_loss = total_loss + loss.item()
            loss.backward()
            # clip gradients to avoid exploding gradients
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            # model predictions are stored on the GPU, so push them to the CPU
            preds = preds.detach().cpu().numpy()
            total_preds.append(preds)
            total_labels.append(labels.detach().cpu().numpy())
            # log every 50 steps
            if step != 0 and step % 50 == 0:
                current_time = datetime.now()
                print(f"Epoch {epoch}/{epochs}, step:{step}, train_loss:{loss}, cost_time:{current_time-start_time}")
        # at the end of each epoch, compute the average loss and metrics
        total_preds = np.concatenate(total_preds, axis=0)
        total_labels = np.concatenate(total_labels, axis=0)
        avg_loss = total_loss / len(train_dataloader)
        preds = np.argmax(total_preds, axis=1)
        train_f1 = f1_score(total_labels, preds, average='micro')
        valid_loss, valid_f1, valid_preds, valid_labels = evaluate()
        valid_preds = np.argmax(valid_preds, axis=1)
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), saved_path)
        current_time = datetime.now()
        metrics_report = classification_report(valid_labels, valid_preds)
        # tensorboard logging
        writer.add_scalar("loss/train", avg_loss, epoch)
        writer.add_scalar("loss/valid", valid_loss, epoch)
        writer.add_scalar("f1/train", train_f1, epoch)
        writer.add_scalar("f1/valid", valid_f1, epoch)
        print(f"Epoch {epoch}/{epochs}, train_loss: {avg_loss}, train_f1:{train_f1}\n"
              f"valid_loss: {valid_loss}, valid_f1: {valid_f1}\n"
              f"best_valid_loss: {best_valid_loss}, cost_time: {current_time-start_time}\n"
              f"{metrics_report}")
        print("=====================================")
        total_batch += 1
    writer.close()

train()
Prediction
def predict(model_path, test_seq, test_mask):
    model.load_state_dict(torch.load(model_path))
    model.eval()
    with torch.no_grad():
        preds = model(test_seq.to(device), test_mask.to(device))
    preds = preds.detach().cpu().numpy()
    preds = np.argmax(preds, axis=1)
    return preds
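A usage sketch under the setup above: load the best checkpoint saved during training, run the predict helper on the test tensors built earlier, and print the per-class report and micro-averaged F1.
test_preds = predict(saved_path, test_seq, test_mask)
print(classification_report(test_y, test_preds))
print(f1_score(test_y, test_preds, average='micro'))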
Using TensorBoard
Point --logdir at the log directory (the one passed to SummaryWriter) and open the reported port in a browser.
tensorboard --logdir=./log2 --port 8088