NLP: Building a Simple Dataset by Hand

The corpus file to be processed, corpus_english.txt, looks roughly like this:

The Rock is destined to be the 21st Century Segal . ||| 3
The gorgeously elaborate continuation of `` The Lord of the Rings '' trilogy Middle-earth . ||| 4
Singer/composer Bryan Adams contributes a slew of songs -- a few potential hits , er , spirit of the piece . ||| 3
You 'd think by now America would have had enough of plucky British eccentrics with hearts of gold . ||| 2
Yet the act is still charming here . ||| 3

Goals:

1. Split each line (a string) into one sentence plus one label.
2. Number every word that appears in the corpus, so that each word maps to a unique integer index.
3. Replace every word in a sentence with its index; the sentence then becomes a vector of numbers.
4. The data now has the shape vector : label.
5. Write the Dataset class.
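
A minimal sketch of step 1: each line follows the pattern sentence ||| label, so a single split on "|||" separates the two parts (the same separator and strip logic appears in the full code below):

line = "Yet the act is still charming here . ||| 3"
sentence, label = [part.strip() for part in line.split("|||")]
print(sentence)  # Yet the act is still charming here .
print(label)     # 3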

Data can be understood as a table in which each row is one sentence_vector. (The original table image is not reproduced here; an illustrative mapping follows.)
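
To make steps 2-4 concrete, here is a small hypothetical illustration (the real indices depend on the order in which words first appear in the corpus). With PAD fixed at index 0, processing the sentence "Yet the act is still charming here ." first would assign:

word2index = {'PAD': 0, 'Yet': 1, 'the': 2, 'act': 3, 'is': 4, 'still': 5, 'charming': 6, 'here': 7}
# '.' is dropped as a stop character; with max_sentence_length = 10 the row becomes
# sentence_vector: [1, 2, 3, 4, 5, 6, 7, 0, 0, 0]    label: 3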

The code is as follows:

import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np

word2index = {}  # word -> index mapping
index2word = {}  # index -> word mapping
word2index.update({'PAD': 0})  # index 0 is reserved for the padding token
index2word.update({0: 'PAD'})
def sentence_embedding(corpus_path, max_sentence_length):
    '''
    Parameters
    ----------
    corpus_path: path to the corpus file
    max_sentence_length: maximum sentence length in words

    Returns: sentence vectors (as index sequences), labels
    -------
    '''
    stop_word = [',', '.', '(', ')', ';', '-', '!', '@', '#', '$', '%', '^', '&', '*', '[', ']', '{', '}', ':', '/', '|', "\\"]
    sentence_vector = []  # one index list per sentence
    label_vector = []     # one integer label per sentence
    index = 1             # 0 is reserved for PAD, so word indices start at 1
    with open(corpus_path, 'r', encoding='utf-8') as file:
        for line in file:
            if not line.strip():
                continue  # skip empty lines
            sentence_list = []  # indices of the current sentence; reset on every iteration
            line_list = line.split("|||")  # separate text and label
            sentence, label = line_list[0].strip(), line_list[1].strip()
            sentence_words = [word.strip() for word in sentence.split(" ") if
                              word not in stop_word]  # tokenize and drop stop characters (the list is customizable)
            for word in sentence_words:
                if word not in word2index:  # unseen word: assign the next free index
                    word2index[word] = index
                    index2word[index] = word
                    index += 1
                sentence_list.append(word2index[word])
            # every sentence vector is forced to a fixed length:
            sentence_list.extend([word2index['PAD']] * max_sentence_length)  # pad with PAD...
            sentence_list = sentence_list[:max_sentence_length]              # ...then truncate
            sentence_vector.append(sentence_list)
            label_vector.append(int(label))
    return np.array(sentence_vector), np.array(label_vector)


class CorpusDataset(Dataset):
    def __init__(self, corpus_path, sentence_max_length):
        super(CorpusDataset, self).__init__()
        self.Data, self.Label = sentence_embedding(corpus_path, sentence_max_length)

    def __getitem__(self, index):
        Data, Label = torch.tensor(self.Data[index]), torch.tensor(self.Label[index])
        return Data, Label

    def __len__(self):
        return len(self.Label)


if __name__ == '__main__':
    path = r'F:\python\MarkDown_code_notebook\learn\NLP_Implement_dataset\dataset_english\corpus_english.txt'
    sentence_length = 10
    dataset = CorpusDataset(corpus_path=path, sentence_max_length=sentence_length)
    dataloader = DataLoader(dataset=dataset, batch_size=5, shuffle=True)
    for data, label in dataloader:
        print("data", data)
        print("label", label)
        break

The output is:

data tensor([[2298,   10,  697, 2299,  118,  123,    0,    0,    0,    0],
        [ 170,  786, 1962,  157, 1963,   15, 1670, 1964,   15,   20],
        [ 170,  623,   92,  110,  890,   62, 1021,    3,    5, 1468],
        [ 813, 1609, 1610, 1461, 2131,  956,  248,    7, 1465,    5],
        [   1, 4140,   36,    1, 4141, 4142,  248,   63, 4143,  123]],
       dtype=torch.int32)
label tensor([4, 3, 3, 3, 2], dtype=torch.int32)
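
Note that the batches come out as int32; embedding layers generally expect torch.long, so cast with data.long() before feeding them into a model. To sanity-check the mapping, a row can also be decoded back to words through index2word (a quick sketch, assuming it runs after the dictionaries above have been populated):

for data, label in dataloader:
    words = [index2word[int(i)] for i in data[0] if int(i) != 0]  # index 0 is PAD
    print(" ".join(words), "->", int(label[0]))
    break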