深度学习 PyTorch 学习笔记
关于语言模型数据集预处理章节的代码解释
模块
# 导入模块
import torch
import random
import zipfile
读取数据
# Open the zip archive and the lyrics file inside it, then read the whole text.
# 'jaychou_lyrics.txt' is the name of the member stored in the archive.
with zipfile.ZipFile('D:/dataset/data_jaychou_lyrics.txt.zip') as zin, \
        zin.open('jaychou_lyrics.txt') as f:
    # corpus_chars holds all of the lyrics; the raw bytes are decoded as
    # UTF-8 so the Chinese text is not garbled.
    corpus_chars = f.read().decode('utf-8')
准备工作
# Replace newline and carriage-return characters with spaces.
corpus_chars = corpus_chars.replace('\n', ' ').replace('\r', ' ')
# Only the first 10000 characters are used to train the model.
corpus_chars = corpus_chars[:10000]
# Collect the distinct characters (a set removes duplicates). The set is
# sorted because the iteration order of a set of strings can differ between
# interpreter runs (str hash randomization); sorting keeps the
# character <-> index mapping reproducible.
idx_to_char = sorted(set(corpus_chars))
# char_to_idx maps every character to its position in idx_to_char.
char_to_idx = {char: i for i, char in enumerate(idx_to_char)}
# vocab_size is the number of distinct characters.
vocab_size = len(char_to_idx)
# Translate the corpus into the list of indices of its characters.
corpus_indices = [char_to_idx[char] for char in corpus_chars]
时序数据采样
# Random sampling
def data_iter_random(corpus_indices, batch_size, num_steps, device=None):
    """Yield minibatches of randomly ordered, non-overlapping subsequences.

    Args:
        corpus_indices: Sequence of character indices (sliceable, e.g. a list).
        batch_size: Number of examples in each minibatch.
        num_steps: Number of time steps (elements) in each example.
        device: Device on which the yielded tensors are created. When None,
            CUDA is used if available, otherwise the CPU.

    Yields:
        Tuples ``(X, Y)`` of float32 tensors, each with ``batch_size`` rows
        and ``num_steps`` columns. Every row of ``Y`` is the slice of
        ``corpus_indices`` that starts one position after the matching row
        of ``X``.
    """
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Subtract 1 because each label sequence Y is the input sequence X
    # shifted forward by one position, so the final element can only ever
    # serve as a label.
    num_examples = (len(corpus_indices) - 1) // num_steps  # floor division
    # Number of full minibatches in one epoch.
    epoch_size = num_examples // batch_size
    # Example numbers 0 .. num_examples - 1, visited in shuffled order.
    example_indices = list(range(num_examples))
    random.shuffle(example_indices)

    def _data(pos):
        """Return the slice of length num_steps that starts at pos."""
        return corpus_indices[pos: pos + num_steps]

    for start in range(0, epoch_size * batch_size, batch_size):
        # Take batch_size shuffled example numbers for this minibatch.
        batch_indices = example_indices[start: start + batch_size]
        # Example j begins at offset j * num_steps; one row per example.
        X = [_data(j * num_steps) for j in batch_indices]
        Y = [_data(j * num_steps + 1) for j in batch_indices]
        yield (torch.tensor(X, dtype=torch.float32, device=device),
               torch.tensor(Y, dtype=torch.float32, device=device))
# Consecutive sampling
def data_iter_consecutive(corpus_indices, batch_size, num_steps, device=None):
    """Yield minibatches whose rows continue from one batch to the next.

    The corpus is cut into ``batch_size`` equally long rows; successive
    minibatches are adjacent column windows of width ``num_steps`` over
    those rows.

    Args:
        corpus_indices: Sequence of character indices.
        batch_size: Number of rows (examples) in each minibatch.
        num_steps: Number of time steps (columns) in each minibatch.
        device: Device on which the tensors are created. When None, CUDA is
            used if available, otherwise the CPU.

    Yields:
        Tuples ``(X, Y)`` of float32 tensors with ``batch_size`` rows and
        ``num_steps`` columns; ``Y`` is the window one column to the right
        of ``X``.
    """
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Convert the index sequence to a tensor.
    corpus_indices = torch.tensor(corpus_indices, dtype=torch.float32, device=device)
    data_len = len(corpus_indices)
    # Length of each row; elements that do not fill a whole row are dropped.
    batch_len = data_len // batch_size
    indices = corpus_indices[0: batch_size * batch_len].reshape(batch_size, batch_len)
    # Subtract 1 because Y needs one more column to the right of X.
    epoch_size = (batch_len - 1) // num_steps
    for step in range(epoch_size):
        start = step * num_steps
        X = indices[:, start: start + num_steps]
        Y = indices[:, start + 1: start + num_steps + 1]
        yield X, Y