import os
import pandas
import torch
from torch import nn
# Dataset / DataLoader live in torch.utils.data
from torch.utils.data import Dataset, DataLoader
# Loads the tokenizer that belongs to the BERT checkpoint
from transformers import AutoTokenizer
# Loads the BERT model itself
from transformers import AutoModel
from pathlib import Path
from tqdm.notebook import tqdm

batch_size = 16
# Maximum sequence length handed to the tokenizer (longer texts are truncated,
# shorter ones are padded)
text_max_length = 128
epochs = 100
# Fraction of the training data that is held out as the validation set
validation_ratio = 0.1
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Print the accumulated loss once every this many steps
log_per_step = 50

# Location of the dataset; created if it does not exist yet
dataset_dir = Path("/kaggle/input/nlp-getting-started/")
dataset_dir.mkdir(parents=True, exist_ok=True)

# Directory where model checkpoints are stored; created if it does not exist yet
model_dir = Path("/kaggle/working/")
model_dir.mkdir(parents=True, exist_ok=True)

print("Device:", device)

Device: cuda



# Load the complete training table (all columns) for a first look at the data
pd_data = pandas.read_csv(dataset_dir.joinpath('train.csv'))
id keyword location text target
0 1 NaN NaN Our Deeds are the Reason of this #earthquake M... 1
1 4 NaN NaN Forest fire near La Ronge Sask. Canada 1
2 5 NaN NaN All residents asked to 'shelter in place' are ... 1
3 6 NaN NaN 13,000 people receive #wildfires evacuation or... 1
4 7 NaN NaN Just got sent this photo from Ruby #Alaska as ... 1
... ... ... ... ... ...
7608 10869 NaN NaN Two giant cranes holding a bridge collapse int... 1
7609 10870 NaN NaN @aria_ahrary @TheTawniest The out of control w... 1
7610 10871 NaN NaN M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt... 1
7611 10872 NaN NaN Police investigating after an e-bike collided ... 1
7612 10873 NaN NaN The Latest: More Homes Razed by Northern Calif... 1

7613 rows × 5 columns

# Reload the training table, keeping only the tweet text and its label
pd_data = pandas.read_csv(dataset_dir.joinpath('train.csv')).loc[:, ['text', 'target']]
text target
0 Our Deeds are the Reason of this #earthquake M... 1
1 Forest fire near La Ronge Sask. Canada 1
2 All residents asked to 'shelter in place' are ... 1
3 13,000 people receive #wildfires evacuation or... 1
4 Just got sent this photo from Ruby #Alaska as ... 1
... ... ...
7608 Two giant cranes holding a bridge collapse int... 1
7609 @aria_ahrary @TheTawniest The out of control w... 1
7610 M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt... 1
7611 Police investigating after an e-bike collided ... 1
7612 The Latest: More Homes Razed by Northern Calif... 1

7613 rows × 2 columns





# Length, in characters, of the longest tweet in the data
max_length = pd_data['text'].str.len().max()
# Randomly draw `validation_ratio` of the rows as the validation set ...
pd_validation_data = pd_data.sample(frac=validation_ratio)
# ... and keep every row whose index was not drawn as the training set
pd_train_data = pd_data.loc[~pd_data.index.isin(pd_validation_data.index)]
text target
0 Our Deeds are the Reason of this #earthquake M... 1
1 Forest fire near La Ronge Sask. Canada 1
2 All residents asked to 'shelter in place' are ... 1
4 Just got sent this photo from Ruby #Alaska as ... 1
5 #RockyFire Update => California Hwy. 20 closed... 1
... ... ...
7607 #stormchase Violent Record Breaking EF-5 El Re... 1
7608 Two giant cranes holding a bridge collapse int... 1
7610 M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt... 1
7611 Police investigating after an e-bike collided ... 1
7612 The Latest: More Homes Razed by Northern Calif... 1

6852 rows × 2 columns

class MyDataset(Dataset):
    """Tweets of one split ('train', 'validation' or 'test') as (text, target) pairs.

    For the 'train' and 'validation' splits the target is the row's ``target``
    column. For the 'test' split the row's ``id`` column is returned in its
    place, which makes it easy to write the predictions out afterwards.
    """

    def __init__(self, mode='train'):
        """Pick the DataFrame that backs this dataset.

        :param mode: 'train', 'validation' or 'test'
        :raises ValueError: if ``mode`` is none of the supported values
        """
        self.mode = mode
        if mode == 'train':
            self.dataset = pd_train_data
        elif mode == 'validation':
            self.dataset = pd_validation_data
        elif mode == 'test':
            # In test mode the tweet and its id are read; the id stands in for
            # the target so results can be written out later.
            self.dataset = pandas.read_csv(dataset_dir / 'test.csv')[['text', 'id']]
        else:
            # Only an unsupported mode is an error (ValueError is still an Exception).
            raise ValueError("Unknown mode {}".format(mode))

    def __getitem__(self, index):
        """Return ``(text, target)`` for the row at position ``index``."""
        # Fetch the index-th row
        data = self.dataset.iloc[index]
        # Take the tweet and apply a very simple clean-up: drop '#' and '@'
        source = data['text'].replace("#", "").replace("@", "")
        if self.mode == 'test':
            # The test split has no label; use the id as the target instead
            target = data['id']
        else:
            target = data['target']
        return source, target

    def __len__(self):
        """Return the number of rows in the selected split."""
        return len(self.dataset)
# Build the training and validation datasets from the split DataFrames
train_dataset, validation_dataset = MyDataset(mode='train'), MyDataset(mode='validation')
('Our Deeds are the Reason of this earthquake May ALLAH Forgive us all', 1)
# Load the tokenizer of the "bert-base-uncased" checkpoint
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path="bert-base-uncased")
# Tokenize a sample sentence, requesting PyTorch tensors, to inspect the output
tokenizer(text="I'm learning deep learning", return_tensors='pt')

{'input_ids': tensor([[ 101, 1045, 1005, 1049, 4083, 2784, 4083,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}


输入参数 batch:一个batch的句子,每个句子是一个元组,包含文本和目标标签,例如:[(‘推文1’, 目标1), (‘推文2’, 目标2), …]


input_ids:经过分词和映射后的输入文本的token id序列。
这个函数首先将输入的batch分成两个列表,一个是文本列表 text,一个是目标标签列表 target。然后使用 tokenizer 对文本进行分词、映射、padding和裁剪等预处理操作,得到模型的输入 src。最后将处理后的输入 src 和目标标签 target 组合成输出。


def collate_fn(batch):
    """
    Turn a list of dataset items into tokenized inputs and a target tensor.

    :param batch: one batch of samples, e.g. [('tweet', target), ('tweet', target), ...]
    :return: the processed result, e.g.
             src: {'input_ids': tensor([[ 101, ..., 102, 0, 0, ...], ...]), 'attention_mask': tensor([[1, ..., 1, 0, ...], ...])}
             target: tensor([1, 1, 0, ...])
    """
    # Split the (text, target) pairs into two parallel lists
    text, target = zip(*batch)
    text, target = list(text), list(target)

    # src goes straight to BERT, so the tokenizer output is used as-is.
    # padding='max_length' pads texts that are too short,
    # truncation=True cuts texts that are too long.
    src = tokenizer(text, padding='max_length', max_length=text_max_length, return_tensors='pt', truncation=True)

    return src, torch.LongTensor(target)
# Training batches are reshuffled; validation batches keep their order
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_fn,
)
validation_loader = DataLoader(
    dataset=validation_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_fn,
)
# Pull a single batch from the training loader and show what it looks like
inputs, targets = next(iter(train_loader))
print("inputs:", inputs)
print("targets:", targets)
#batch_size = 16

inputs: {'input_ids': tensor([[  101, 10482,  6591,  ...,     0,     0,     0],
        [  101,  4911,  2474,  ...,     0,     0,     0],
        [  101,  5916,  6340,  ...,     0,     0,     0],
        [  101, 21318,  2571,  ...,     0,     0,     0],
        [  101, 20010, 21149,  ...,     0,     0,     0],
        [  101, 26934,  5315,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}
torch.Size([16, 128])
targets: tensor([0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1])



nn.Linear(768, 256):将输入的维度从768降到256,这是一个线性变换(全连接层),将BERT模型输出的768维隐藏表示转换为更低维度的表示。


nn.Linear(256, 1):将256维的表示进一步映射到一个单一的值,用于二分类问题中的概率预测。



class TextClassificationModel(nn.Module):
    """BERT encoder followed by a small prediction head for binary classification."""

    def __init__(self):
        super(TextClassificationModel, self).__init__()

        # Load the BERT model
        self.bert = AutoModel.from_pretrained("bert-base-uncased")

        # Final prediction head: 768 -> 256 -> 1, squashed into (0, 1).
        # The Sigmoid is required because the output is fed to nn.BCELoss and
        # thresholded at 0.5, both of which expect probabilities.
        # NOTE(review): there is no non-linearity between the two Linear layers,
        # so together they are equivalent to a single linear map - consider
        # inserting an activation if that is not intended.
        self.predictor = nn.Sequential(
            nn.Linear(768, 256),
            nn.Linear(256, 1),
            nn.Sigmoid(),
        )

    def forward(self, src):
        """
        :param src: the tokenized tweets (the dict produced by the tokenizer)
        :return: one probability per sample, shape (batch_size, 1)
        """

        # src is unpacked straight into BERT; this works because the model and
        # the tokenizer belong together.
        # Take the encoder output at the first position ([CLS]) as the input of
        # the prediction head.
        outputs = self.bert(**src).last_hidden_state[:, 0, :]

        # Use the linear head for the final prediction
        return self.predictor(outputs)


last_hidden_state 的形状是 (batch_size, sequence_length, hidden_size),其中:

batch_size 是当前批次中样本的数量。
sequence_length 是输入序列的长度。
hidden_size 是隐藏状态的维度,通常等于BERT模型的隐藏层大小,例如在BERT-base中是768。
# Build the classifier and move its parameters to the selected device
model = TextClassificationModel()
model = model.to(device)
# Binary cross-entropy on probabilities
criteria = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=3e-5)
# `inputs` is a dict of tensors, so a helper is needed to call .to(device) on it
def to_device(dict_tensors, target_device=None):
    """Return a new dict with every tensor moved to a device.

    :param dict_tensors: mapping of name -> tensor (e.g. the tokenizer output)
    :param target_device: device to move the tensors to; when omitted, the
                          module-level ``device`` is used
    :return: a new dict with the same keys and the moved tensors
    """
    if target_device is None:
        target_device = device
    return {key: value.to(target_device) for key, value in dict_tensors.items()}

def validate():
    """Evaluate the model on the validation set.

    :return: ``(accuracy, loss)`` where accuracy is the fraction of correctly
             classified validation samples and loss is the sum of the per-batch
             losses divided by the number of validation samples
    """
    # Switch to evaluation mode for the duration of the pass and remember the
    # previous mode so the caller's training state is restored afterwards.
    was_training = model.training
    model.eval()

    total_loss = 0.
    total_correct = 0
    # No gradients are needed for evaluation
    with torch.no_grad():
        for inputs, targets in validation_loader:
            inputs, targets = to_device(inputs), targets.to(device)
            outputs = model(inputs)
            loss = criteria(outputs.view(-1), targets.float())
            total_loss += float(loss)

            # A probability of at least 0.5 counts as class 1
            predictions = (outputs >= 0.5).long().flatten()
            total_correct += int((predictions == targets).sum())

    if was_training:
        model.train()

    # NOTE(review): the loss is a sum of per-batch values divided by the number
    # of samples, not by the number of batches - kept as-is so reported values
    # stay comparable.
    return total_correct / len(validation_dataset), total_loss / len(validation_dataset)
# Put the model into training mode first
model.train()

# Clear the CUDA cache
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Loss accumulated since the last log line
total_loss = 0.
# Number of training steps taken so far
step = 0

# Best accuracy reached on the validation set so far
best_accuracy = 0

# Start training
for epoch in range(epochs):
    # Make sure the model is in training mode at the start of every epoch
    model.train()
    for i, (inputs, targets) in enumerate(train_loader):
        # Move the training data of this batch to the device
        inputs, targets = to_device(inputs), targets.to(device)
        # Forward pass
        outputs = model(inputs)
        # Compute the loss
        loss = criteria(outputs.view(-1), targets.float())
        # Backward pass and parameter update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += float(loss)
        step += 1

        if step % log_per_step == 0:
            print("Epoch {}/{}, Step: {}/{}, total loss:{:.4f}".format(epoch+1, epochs, i, len(train_loader), total_loss))
            total_loss = 0

        del inputs, targets

    # After each epoch, evaluate on the validation set
    accuracy, validation_loss = validate()
    print("Epoch {}, accuracy: {:.4f}, validation loss: {:.4f}".format(epoch+1, accuracy, validation_loss))
    # Keep a checkpoint of every epoch
    torch.save(model, model_dir / f"model_{epoch}.pt")

    # Save the best model seen so far
    if accuracy > best_accuracy:
        torch.save(model, model_dir / "model_best.pt")
        best_accuracy = accuracy

Epoch 1/100, Step: 49/429, total loss:27.0852
Epoch 1/100, Step: 99/429, total loss:21.9039
Epoch 1/100, Step: 149/429, total loss:22.6578
Epoch 1/100, Step: 199/429, total loss:21.1815
Epoch 1/100, Step: 249/429, total loss:20.3617
Epoch 1/100, Step: 299/429, total loss:18.9497
Epoch 1/100, Step: 349/429, total loss:20.8270
Epoch 1/100, Step: 399/429, total loss:20.0272
Epoch 1, accuracy: 0.8279, validation loss: 0.0247
Epoch 2/100, Step: 20/429, total loss:18.0542
Epoch 2/100, Step: 70/429, total loss:14.7096
Epoch 2/100, Step: 120/429, total loss:15.0193
Epoch 2/100, Step: 170/429, total loss:14.2937
Epoch 2/100, Step: 220/429, total loss:14.1752
Epoch 2/100, Step: 270/429, total loss:14.2685
Epoch 2/100, Step: 320/429, total loss:14.0682
Epoch 2/100, Step: 370/429, total loss:16.1425
Epoch 2/100, Step: 420/429, total loss:17.1818
Epoch 2, accuracy: 0.8397, validation loss: 0.0279
Epoch 3/100, Step: 41/429, total loss:8.0204
Epoch 3/100, Step: 91/429, total loss:9.5614
Epoch 3/100, Step: 141/429, total loss:9.2036
Epoch 3/100, Step: 191/429, total loss:8.9964
Epoch 3/100, Step: 241/429, total loss:10.7305
Epoch 3/100, Step: 291/429, total loss:10.5000
Epoch 3/100, Step: 341/429, total loss:11.3632
Epoch 3/100, Step: 391/429, total loss:10.3103
Epoch 3, accuracy: 0.8252, validation loss: 0.0339
Epoch 4/100, Step: 12/429, total loss:8.1302
Epoch 4/100, Step: 62/429, total loss:5.9590
Epoch 4/100, Step: 112/429, total loss:6.9333
Epoch 4/100, Step: 162/429, total loss:6.4659
Epoch 4/100, Step: 212/429, total loss:6.3636
Epoch 4/100, Step: 262/429, total loss:6.6609
Epoch 4/100, Step: 312/429, total loss:6.3064
Epoch 4/100, Step: 362/429, total loss:5.7218
Epoch 4/100, Step: 412/429, total loss:6.8676
Epoch 4, accuracy: 0.8042, validation loss: 0.0370
Epoch 5/100, Step: 33/429, total loss:4.4049
Epoch 5/100, Step: 83/429, total loss:3.0673
Epoch 5/100, Step: 133/429, total loss:4.1351
Epoch 5/100, Step: 183/429, total loss:3.8803
Epoch 5/100, Step: 233/429, total loss:3.2633
Epoch 5/100, Step: 283/429, total loss:4.6513
Epoch 5/100, Step: 333/429, total loss:4.3888
Epoch 5/100, Step: 383/429, total loss:5.1710
Epoch 5, accuracy: 0.8055, validation loss: 0.0484
# Load the best checkpoint; the file name must match the one the training loop
# writes for its best model.
# NOTE(review): this unpickles a whole model object, so only load files you
# created yourself. On PyTorch >= 2.6 torch.load defaults to weights_only=True
# and needs weights_only=False to load a fully pickled model.
model = torch.load(model_dir / "model_best.pt", map_location=device)
model = model.eval()
test_dataset = MyDataset('test')
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
results = []
# Inference only: no gradients needed
with torch.no_grad():
    for inputs, ids in tqdm(test_loader):
        outputs = model(to_device(inputs))
        # A probability of at least 0.5 counts as class 1
        outputs = (outputs >= 0.5).int().flatten().tolist()
        ids = ids.tolist()
        # Collect (id, prediction) pairs; extend avoids re-copying the list
        results.extend(zip(ids, outputs))
with open('/kaggle/working/results.csv', 'w', encoding='utf-8') as f:
    # Header uses the id/target column names of train.csv - TODO confirm
    # against the expected submission format
    f.write('id,target\n')
    for tweet_id, result in results:
        f.write(f"{tweet_id},{result}\n")

