1. Class label ids must start at 0
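BertForSequenceClassification computes its loss with PyTorch's CrossEntropyLoss, which expects label ids to be contiguous integers in [0, num_labels). Feeding it a raw category code outside that range (e.g. code 194 when num_labels is 194) fails with an index or device-side assert error, so arbitrary codes have to be remapped before training. A minimal sketch of such a remapping (the raw_codes values and the code_to_id name are illustrative, not part of the script below, which instead reads its mapping from code_to_label.json):

# Minimal sketch: remap arbitrary category codes to contiguous 0-based ids.
# `raw_codes` is a hypothetical stand-in for the raw "语义编号" (semantic code) column.
raw_codes = ["D13", "A01", "C07", "A01"]

code_to_id = {code: i for i, code in enumerate(sorted(set(raw_codes)))}
labels = [code_to_id[c] for c in raw_codes]  # -> [2, 0, 1, 0]

The training script below relies on the same idea: code_to_label.json maps each semantic code to an entry whose index 2 holds the 0-based class id.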
import argparse
import json
import os

import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from transformers import (AdamW, BertForSequenceClassification, BertTokenizer,
                          get_linear_schedule_with_warmup)

from root_path import root


# Dataset wrapper: turns the tokenizer output plus labels into tensor samples.
class NewsDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    # Fetch a single sample.
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(int(self.labels[idx]))
        return item

    def __len__(self):
        return len(self.labels)


data_path = os.path.join(root, "data", "raw_data")
code_to_label_file = os.path.join(data_path, "code_to_label.json")


def get_dataset():
    train_path = os.path.join(data_path, "all_0727.xlsx")
    test_path = os.path.join(data_path, "更正的测试集.xlsx")  # corrected test set; not used in this script
    train_table = pd.read_excel(train_path, sheet_name="data")
    train_sentence_list = train_table["句子"].tolist()       # sentence column
    train_code_list = train_table["语义编号"].tolist()        # semantic-code column
    with open(code_to_label_file, "r", encoding="utf8") as f:
        code_label = json.load(f)
    # Index 2 of each entry is the 0-based class id (labels must start at 0).
    train_num_list = [code_label[train_code][2] for train_code in train_code_list]
    return train_sentence_list, train_num_list, len(code_label)


def flat_accuracy(logits, label_ids):
    # Number of correct predictions in this batch.
    pred = np.argmax(logits, axis=1)
    return np.equal(pred, label_ids).sum()


# Training loop for one epoch.
def train(model, train_loader, optim, device, scheduler, epoch):
    model.train()
    total_train_loss = 0
    iter_num = 0
    total_iter = len(train_loader)
    for batch in train_loader:
        # Forward pass.
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        label = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=label)
        loss = outputs[0]
        total_train_loss += loss.item()
        # Backward pass with gradient clipping.
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        # Parameter and learning-rate updates.
        optim.step()
        scheduler.step()
        iter_num += 1
        if iter_num % 100 == 0:
            print("epoch: %d, iter_num: %d, loss: %.4f, %.2f%%" % (
                epoch, iter_num, loss.item(), iter_num / total_iter * 100))
    print("Epoch: %d, Average training loss: %.4f" % (epoch, total_train_loss / len(train_loader)))


def validation(model, test_dataloader, device):
    model.eval()
    total_eval_accuracy = 0
    total_eval_loss = 0
    total_samples = 0
    for batch in test_dataloader:
        with torch.no_grad():
            # Forward pass only.
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        logits = outputs[1]
        total_eval_loss += loss.item()
        logits = logits.detach().cpu().numpy()
        label_ids = labels.to('cpu').numpy()
        total_eval_accuracy += flat_accuracy(logits, label_ids)
        total_samples += label_ids.shape[0]
    # Divide by the number of samples, not the number of batches,
    # so the metric stays correct for any batch size.
    avg_val_accuracy = total_eval_accuracy / total_samples
    print("Accuracy: %.4f" % avg_val_accuracy)
    print("Average testing loss: %.4f" % (total_eval_loss / len(test_dataloader)))
    print("-------------------------------")


def main(model_name, epoch, learning_rate, batch_size, device, save_dir):
    # Fall back to CPU if CUDA is unavailable.
    device = torch.device(device if torch.cuda.is_available() else "cpu")

    # Load the training data.
    sentence, label, num_cls = get_dataset()

    # Split into training and validation sets. `stratify` samples by label so
    # both splits share the same label distribution; `random_state` pins the
    # seed so the split is reproducible (with None it changes on every run).
    x_train, x_test, train_label, test_label = \
        train_test_split(sentence, label, test_size=0.5, stratify=label, random_state=5)

    tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
    train_encoding = tokenizer(x_train, truncation=True, padding=True, max_length=64)
    test_encoding = tokenizer(x_test, truncation=True, padding=True, max_length=64)
    train_dataset = NewsDataset(train_encoding, train_label)
    test_dataset = NewsDataset(test_encoding, test_label)

    # num_labels comes from code_to_label.json instead of a hard-coded 194.
    model = BertForSequenceClassification.from_pretrained('bert-base-chinese', num_labels=num_cls)
    model.to(device)

    # Batch the single-sample reads.
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

    # Optimizer and a linear learning-rate schedule over all training steps.
    optim = AdamW(model.parameters(), lr=learning_rate)
    total_steps = len(train_loader) * epoch
    scheduler = get_linear_schedule_with_warmup(optim,
                                                num_warmup_steps=0,  # default value in run_glue.py
                                                num_training_steps=total_steps)

    for ep in range(epoch):
        print("------------Epoch: %d ----------------" % ep)
        train(model, train_loader, optim, device, scheduler, ep)
        validation(model, test_dataloader, device)

    # Persist the fine-tuned weights.
    os.makedirs(save_dir, exist_ok=True)
    model.save_pretrained(save_dir)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name', default='afi')  # unused; kept for CLI compatibility
    parser.add_argument('--epoch', type=int, default=4)
    parser.add_argument('--learning_rate', type=float, default=2e-5)
    parser.add_argument('--batch_size', type=int, default=1)
    parser.add_argument('--device', default='cuda:0')
    parser.add_argument('--save_dir', default='chkpt')
    args = parser.parse_args()
    main(args.model_name, args.epoch, args.learning_rate, args.batch_size,
         args.device, args.save_dir)
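With the code saved as a script (say train_bert.py, a name assumed here), a typical single-GPU run would look like:

python train_bert.py --epoch 4 --learning_rate 2e-5 --batch_size 16 --device cuda:0 --save_dir chkpt

The learning rate of 2e-5 is the value the script fine-tunes with; the default batch size of 1 only reproduces single-sample loading, and raising it to 16 or 32 is the usual range for fine-tuning a BERT-base model.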