Building a Simple Dataset for an NLP Application
The corpus file corpus_english.txt to be processed looks roughly like this:
The Rock is destined to be the 21st Century Segal . ||| 3
The gorgeously elaborate continuation of `` The Lord of the Rings '' trilogy Middle-earth . ||| 4
Singer/composer Bryan Adams contributes a slew of songs -- a few potential hits , er , spirit of the piece . ||| 3
You 'd think by now America would have had enough of plucky British eccentrics with hearts of gold . ||| 2
Yet the act is still charming here . ||| 3
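Each line pairs a tokenized sentence with an integer sentiment label, separated by the delimiter |||, so a single line can be parsed with a plain split. A minimal sketch using one of the sample lines above:
line = "Yet the act is still charming here . ||| 3"
text, label = line.split("|||")
print(text.strip())  # Yet the act is still charming here .
print(int(label))    # 3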
Goals:
1. Split each line into a sentence plus a label.
2. Assign a numeric index to every word that appears in the corpus, so each word maps to one integer.
3. Replace every word in a sentence with its index; the sentence is then represented by a sequence of integers, i.e. a vector.
4. The data now has the shape vector : label.
5. Write the Dataset class.
Here, Data can be pictured as a table whose rows are the sentence vectors: each row is one sentence_vector. A hand-traced example is sketched below.
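To make steps 2 through 4 concrete, here is an illustrative trace for one sample line (the index values are hypothetical; the actual numbers depend on the order in which words first appear in the corpus):
# Illustrative only: "Yet the act is still charming here ." with max_sentence_length = 10
word2index = {'PAD': 0, 'Yet': 1, 'the': 2, 'act': 3, 'is': 4,
              'still': 5, 'charming': 6, 'here': 7}
sentence_vector = [1, 2, 3, 4, 5, 6, 7, 0, 0, 0]  # '.' is filtered out; padded with PAD (0) to length 10
label = 3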
The code is as follows:
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np

word2index = {}  # word -> index mapping
index2word = {}  # index -> word mapping
word2index.update({'PAD': 0})
index2word.update({0: 'PAD'})
def sentence_embeding(corpus_path, max_sentence_length):
    '''
    Parameters
    ----------
    corpus_path: path to the corpus file
    max_sentence_length: maximum sentence length in tokens

    Returns: sentence vectors, labels
    -------
    '''
    stop_word = [',', '.', '(', ')', ';', '-', '!', '@', '#', '$', '%', '^', '&', '*',
                 '[', ']', '{', '}', ':', '/', '|', '\\']
    sentence_vector = []  # sentence vectors
    label_vector = []     # label vector
    index = 1             # 0 is reserved for 'PAD'
    with open(corpus_path, 'r', encoding='utf-8') as file:
        for line in file:
            if '|||' not in line:  # skip empty or malformed lines
                continue
            sentence_list = []  # vector of the current sentence; reset on every iteration
            line_list = line.split("|||")  # separate text and label
            sentence, label = line_list[0].strip(), line_list[1].strip()
            # tokenize and drop stop characters (the stop-word list can be customized)
            sentence_words = [word.strip() for word in sentence.split(" ")
                              if word not in stop_word]
            for word in sentence_words:
                if word not in word2index:  # unseen word: assign it the next index
                    word2index[word] = index
                    index2word[index] = word
                    index += 1
                sentence_list.append(word2index[word])
            # sentence length is capped, so the sentence vector is too
            sentence_list.extend([word2index['PAD']] * max_sentence_length)  # pad
            sentence_list = sentence_list[:max_sentence_length]  # truncate
            sentence_vector.append(sentence_list)
            label_vector.append(int(label))
    return np.array(sentence_vector), np.array(label_vector)
class dataset(Dataset):
    def __init__(self, corpus_path, sentence_max_length):
        super(dataset, self).__init__()
        self.Data, self.Label = sentence_embeding(corpus_path, sentence_max_length)

    def __getitem__(self, index):
        data, label = torch.tensor(self.Data[index]), torch.tensor(self.Label[index])
        return data, label

    def __len__(self):
        return len(self.Label)
if __name__ == '__main__':
    path = r'F:\python\MarkDown_code_notebook\learn\NLP_Implement_dataset\dataset_english\corpus_english.txt'
    sentence_length = 10
    dataset = dataset(corpus_path=path, sentence_max_length=sentence_length)
    dataloader = DataLoader(dataset=dataset, batch_size=5, shuffle=True)
    for data, label in dataloader:
        print("data", data)
        print("label", label)
        break
The output is:
data tensor([[2298, 10, 697, 2299, 118, 123, 0, 0, 0, 0],
[ 170, 786, 1962, 157, 1963, 15, 1670, 1964, 15, 20],
[ 170, 623, 92, 110, 890, 62, 1021, 3, 5, 1468],
[ 813, 1609, 1610, 1461, 2131, 956, 248, 7, 1465, 5],
[ 1, 4140, 36, 1, 4141, 4142, 248, 63, 4143, 123]],
dtype=torch.int32)
label tensor([4, 3, 3, 3, 2], dtype=torch.int32)
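Since each sample is a vector of word indices, a batch can be fed straight into an embedding layer. A minimal sketch of downstream usage (the embedding dimension of 50 is an arbitrary assumption, not part of the original code):
import torch.nn as nn

vocab_size = len(word2index)  # vocabulary built by sentence_embeding above
embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=50, padding_idx=0)

for data, label in dataloader:
    embedded = embedding(data.long())  # (batch_size, sentence_length, 50) -> here (5, 10, 50)
    print(embedded.shape)
    break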