From: https://www.bilibili.com/video/BV14z4y19777
Original article: https://wmathor.com/index.php/archives/1443/
Code Implementation
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as Data
# def test1():
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
dtype = torch.FloatTensor
sentences = ["jack like dog", "jack like cat", "jack like animal",
"dog cat animal", "banana apple cat dog like", "dog fish milk like",
"dog cat animal like", "jack like apple", "apple like", "jack like banana",
"apple banana jack movie book music like", "cat dog hate", "cat dog like"]
word_seq = " ".join(sentences).split()
# print('-- word_seq : ', word_seq)
# Build the word-to-index mapping
vocab = list(set(word_seq))
word2idx = {w: i for i, w in enumerate(vocab)}
print('\n-- word2idx : ', word2idx)
# -- word2idx : {'movie': 0, 'like': 1, 'banana': 2, 'book': 3, 'animal': 4, 'milk': 5, 'fish': 6, 'jack': 7, 'cat': 8, 'music': 9, 'hate': 10, 'dog': 11, 'apple': 12}
# Model hyperparameters
C = 2  # context window size: number of context words on each side of the center word
batch_size = 8
embedding_size = 2  # the toy corpus is tiny, so a 2-dimensional embedding is enough (and easy to plot)
voc_size = len(vocab)
skip_grams = []
for idx in range(C, len(word_seq) - C):  # start at the C-th word so there are C context words on the left; stop C words early so there are C on the right
    center_idx = word2idx[word_seq[idx]]  # index of the center word
    context_idx = list(range(idx - C, idx)) + list(range(idx + 1, idx + C + 1))  # positions of the left and right context words
    context = [word2idx[word_seq[i]] for i in context_idx]  # indices of the context words
    # pair the center word with each of its context words
    for w in context:
        skip_grams.append([center_idx, w])  # [center word index, context word index]
    # print('\n', center_idx, context, skip_grams)
    # print('\n', word_seq[idx])
    # print([word_seq[i] for i in context_idx])
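To sanity-check the pairs, a few of them can be printed back as words (a minimal sketch, not in the original code):
# Quick check: show the first few (center -> context) pairs as words
for c, o in skip_grams[:4]:
    print(vocab[c], '->', vocab[o])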
def make_data(skip_grams):
    # The model predicts a context word from the center word.
    # The input (center word) is one-hot encoded; the target is the context word's class index
    # (nn.CrossEntropyLoss expects class indices, so the target does not need one-hot encoding).
    input_data = []
    output_data = []
    for center_idx, context_idx in skip_grams:
        one_hot = np.eye(voc_size)[center_idx]  # the center word's row of the identity matrix, i.e. its one-hot vector
        input_data.append(one_hot)
        output_data.append(context_idx)
    return input_data, output_data
input_data, output_data = make_data(skip_grams)
input_data = torch.Tensor(np.array(input_data))  # [num_pairs, voc_size], one-hot center words
output_data = torch.LongTensor(output_data)      # [num_pairs], context word indices (class labels)
dataset = Data.TensorDataset(input_data, output_data)
loader = Data.DataLoader(dataset, batch_size, shuffle=True)
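Each batch from the loader is a [batch_size, voc_size] block of one-hot center words plus a [batch_size] vector of context-word class indices. A quick way to confirm this (a minimal sketch; with this corpus voc_size comes out to 13):
# Peek at one batch to confirm the tensor shapes
batch_x, batch_y = next(iter(loader))
print(batch_x.shape, batch_y.shape)  # expected: torch.Size([8, 13]) torch.Size([8])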
Define the Model
# Define the model
class Word2Vec(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        # W : [voc_size, embedding_size], the input-to-hidden weights; its rows are the word embeddings
        self.W = nn.Parameter(torch.randn(voc_size, embedding_size).type(dtype))
        # V : [embedding_size, voc_size], the hidden-to-output weights
        self.V = nn.Parameter(torch.randn(embedding_size, voc_size).type(dtype))

    def forward(self, X):
        # X : [batch_size, voc_size], one-hot center words
        # torch.mm only handles 2-D matrices, while torch.matmul also works on higher-dimensional inputs;
        # see: https://blog.csdn.net/qsmx666/article/details/105783610/
        hidden_layer = torch.matmul(X, self.W)             # hidden_layer : [batch_size, embedding_size]
        output_layer = torch.matmul(hidden_layer, self.V)  # output_layer : [batch_size, voc_size], one score per word in the vocabulary
        return output_layer
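To make the torch.mm vs torch.matmul remark concrete, a minimal sketch (not part of the model): the two agree on plain 2-D inputs, but only torch.matmul accepts an extra batch dimension.
a = torch.randn(8, 13)             # 2-D: [batch, voc_size]
w = torch.randn(13, 2)             # 2-D: [voc_size, embedding_size]
print(torch.mm(a, w).shape)        # torch.Size([8, 2])
print(torch.matmul(a, w).shape)    # torch.Size([8, 2]), same as torch.mm for 2-D inputs
b = torch.randn(4, 8, 13)          # 3-D: an extra leading dimension
print(torch.matmul(b, w).shape)    # torch.Size([4, 8, 2]); torch.mm(b, w) would raise an error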
Training
model = Word2Vec().to(device)
criterion = nn.CrossEntropyLoss().to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
# Train for 2000 epochs
for epoch in range(2000):
    for i, (batch_x, batch_y) in enumerate(loader):
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)
        pred = model(batch_x)
        loss = criterion(pred, batch_y)
        if (epoch + 1) % 1000 == 0:  # report the loss every 1000 epochs
            print(epoch + 1, i, loss.item())
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
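Once trained, feeding a single one-hot center word through the model and applying softmax gives a distribution over likely context words (a minimal sketch, not in the original code; 'jack' is just an example word from the corpus):
# Predict the most likely context words for one center word
with torch.no_grad():
    x = torch.Tensor(np.eye(voc_size)[word2idx['jack']]).unsqueeze(0).to(device)  # [1, voc_size]
    probs = torch.softmax(model(x), dim=-1).squeeze(0)                            # [voc_size]
    top = torch.topk(probs, 3)
    print([(vocab[i.item()], round(p.item(), 3)) for p, i in zip(top.values, top.indices)])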
# Plot the learned 2-D embeddings
import matplotlib.pyplot as plt
W, V = model.parameters()  # W holds the embeddings (one row per word); V is the output matrix
for i, label in enumerate(vocab):
    x, y = float(W[i][0]), float(W[i][1])
    plt.scatter(x, y)
    plt.annotate(label, xy=(x, y), xytext=(5, 2), textcoords='offset points', ha='right', va='bottom')
plt.show()
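After training, each row of W is a word vector, so a single word's embedding can also be read off directly (a minimal sketch; 'jack' is again just an example word):
# Look up the learned 2-D vector for one word
jack_vec = model.W[word2idx['jack']].detach().cpu()
print(jack_vec)  # a tensor of shape [embedding_size], i.e. two numbers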
Other Notes
np.eye
For np.eye, see: https://blog.csdn.net/hellocsz/article/details/101014131
In short: with 6 classes, indexing np.eye(6) with labels = [1, 2, 3, 0, 1, 1] turns the label array into [[0,1,0,0,0,0], [0,0,1,0,0,0], [0,0,0,1,0,0], [1,0,0,0,0,0], [0,1,0,0,0,0], [0,1,0,0,0,0]], i.e. the so-called one-hot form.
voc_size = 5
center_idx = 3
one_hot = np.eye(voc_size)[center_idx]  # row 3 of the 5x5 identity matrix
print('-- one_hot : ', one_hot)  # [0. 0. 0. 1. 0.]
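The same indexing also works on a whole array of labels at once, which is the case described above (a minimal sketch):
labels = [1, 2, 3, 0, 1, 1]
one_hot_matrix = np.eye(6)[labels]  # one one-hot row of length 6 per label
print(one_hot_matrix.shape)  # (6, 6)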