NLP - Implementing word2vec in PyTorch (simple version)


Source video: https://www.bilibili.com/video/BV14z4y19777
Original article: https://wmathor.com/index.php/archives/1443/


Code implementation

#!/usr/bin/env python
# -*- encoding: utf-8 -*-

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim 

import torch.utils.data as Data

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
dtype = torch.FloatTensor  # used below to cast the randomly initialised parameters


sentences = ["jack like dog", "jack like cat", "jack like animal",
"dog cat animal", "banana apple cat dog like", "dog fish milk like",
"dog cat animal like", "jack like apple", "apple like", "jack like banana",
"apple banana jack movie book music like", "cat dog hate", "cat dog like"]
word_seq = " ".join(sentences).split()  # flatten the corpus into a single token sequence

# Build the word-to-index mapping (note: set() ordering is nondeterministic,
# so the exact indices may differ between runs)
vocab = list(set(word_seq))
word2idx = {w: i for i, w in enumerate(vocab)}

print('\n-- word2idx : ', word2idx)

# -- word2idx :  {'movie': 0, 'like': 1, 'banana': 2, 'book': 3, 'animal': 4, 'milk': 5, 'fish': 6, 'jack': 7, 'cat': 8, 'music': 9, 'hate': 10, 'dog': 11, 'apple': 12}


# Hyperparameters
C = 2                  # context window size: C words on each side of the center word
batch_size = 8
embedding_size = 2     # tiny corpus, so 2-dimensional embeddings suffice (and are easy to plot)
voc_size = len(vocab)

skip_grams = []
# Start at index C and stop at len(word_seq) - C, so that every center word
# has C context words on its left and C on its right.
for idx in range(C, len(word_seq) - C):
    center_idx = word2idx[word_seq[idx]]  # index of the center word
    context_pos = list(range(idx - C, idx)) + list(range(idx + 1, idx + C + 1))  # positions of left + right context words
    context = [word2idx[word_seq[i]] for i in context_pos]  # indices of the context words

    # Pair the center word with each of its context words.
    for w in context:
        skip_grams.append([center_idx, w])  # [center word index, context word index]

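To see what the loop produces, the first few pairs can be decoded back into words. A quick sanity check (idx2word is a helper added here, not part of the original script):

idx2word = {i: w for w, i in word2idx.items()}
for center_i, context_i in skip_grams[:4]:
    print(idx2word[center_i], '->', idx2word[context_i])
# The first center word is word_seq[2] ('dog'), so this prints:
# dog -> jack, dog -> like, dog -> jack, dog -> like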

def make_data(skip_grams):
    # The model predicts a context word from a center word.
    # The input (center word) is one-hot encoded; the output (context word)
    # stays an integer class label, since nn.CrossEntropyLoss expects class
    # indices rather than one-hot targets.
    input_data = []
    output_data = []

    for center_idx, context_idx in skip_grams:
        one_hot = np.eye(voc_size)[center_idx]  # row center_idx of the identity matrix = one-hot vector
        input_data.append(one_hot)
        output_data.append(context_idx)

    return input_data, output_data
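For a single pair, make_data turns the center-word index into a one-hot row and keeps the context-word index as a plain integer label. A minimal illustration, assuming the 13-word vocabulary built above:

inp, out = make_data(skip_grams[:1])
print(inp[0].shape)  # (13,) -- one-hot vector over the vocabulary
print(out[0])        # integer index of the context word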


Build the dataset


input_data, output_data = make_data(skip_grams)

# Converting the list of numpy arrays via np.array first avoids the slow
# element-by-element conversion in recent PyTorch versions.
input_data = torch.Tensor(np.array(input_data))
output_data = torch.LongTensor(output_data)
dataset = Data.TensorDataset(input_data, output_data)
loader = Data.DataLoader(dataset, batch_size, shuffle=True)
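One batch from the loader should contain [batch_size, voc_size] one-hot inputs and [batch_size] integer targets; a quick shape check (a sketch, not part of the original):

batch_x, batch_y = next(iter(loader))
print(batch_x.shape)  # torch.Size([8, 13])
print(batch_y.shape)  # torch.Size([8])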




Define the model



# Skip-gram as a two-layer linear network: one-hot input -> embedding -> logits.
class Word2Vec(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        # W : [voc_size, embedding_size] -- its rows are the word embeddings
        self.W = nn.Parameter(torch.randn(voc_size, embedding_size).type(dtype))
        # V : [embedding_size, voc_size] -- projects embeddings back to vocabulary logits
        self.V = nn.Parameter(torch.randn(embedding_size, voc_size).type(dtype))


    def forward(self, X):
        # X : [batch_size, voc_size], one-hot rows
        # torch.mm only handles 2-D matrices; torch.matmul also supports
        # higher-dimensional inputs, see: https://blog.csdn.net/qsmx666/article/details/105783610/
        hidden_layer = torch.matmul(X, self.W)             # [batch_size, embedding_size]
        output_layer = torch.matmul(hidden_layer, self.V)  # [batch_size, voc_size]: one logit per vocabulary word
        return output_layer
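
Since multiplying a one-hot vector by W just selects row center_idx of W, the matrix W is effectively an embedding table. The same model can be expressed with two bias-free nn.Linear layers (an equivalent sketch, not the original code; note that nn.Linear stores its weight transposed, so in_proj.weight.T corresponds to W above):

class Word2VecLinear(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.in_proj = nn.Linear(voc_size, embedding_size, bias=False)   # weight: [embedding_size, voc_size]
        self.out_proj = nn.Linear(embedding_size, voc_size, bias=False)  # weight: [voc_size, embedding_size]

    def forward(self, X):
        return self.out_proj(self.in_proj(X))  # one-hot -> embedding -> logits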
 
 


Training


model = Word2Vec().to(device)
criterion = nn.CrossEntropyLoss().to(device)  # takes raw logits and integer targets, so forward() applies no softmax
optimizer = optim.Adam(model.parameters(), lr=1e-3)


# Train for 2000 epochs, logging the loss every 1000 epochs.
for epoch in range(2000):
    for i, (batch_x, batch_y) in enumerate(loader):
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)
        pred = model(batch_x)

        loss = criterion(pred, batch_y)
        if (epoch + 1) % 1000 == 0:
            print(epoch + 1, i, loss.item())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
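
After training, the rows of W are the word vectors, and related words should sit close together. A minimal nearest-neighbour query (a sketch assuming the variables above; cosine similarity is one common choice of distance):

import torch.nn.functional as F

emb = model.W.detach().cpu()               # [voc_size, embedding_size]
query = emb[word2idx['cat']].unsqueeze(0)  # [1, embedding_size]
sims = F.cosine_similarity(query, emb)     # [voc_size] similarities to every word
for i in sims.argsort(descending=True)[:4]:  # top match is 'cat' itself
    print(vocab[int(i)], sims[int(i)].item())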




# Plot the learned 2-D embeddings.
import matplotlib.pyplot as plt

W, V = model.parameters()  # W holds the word embeddings
W = W.detach().cpu()       # move the weights back to the CPU for plotting
for i, label in enumerate(vocab):
    x, y = float(W[i][0]), float(W[i][1])
    plt.scatter(x, y)
    plt.annotate(label, xy=(x, y), xytext=(5, 2), textcoords='offset points', ha='right', va='bottom')
plt.show()

 


Notes

np.eye

For np.eye, see: https://blog.csdn.net/hellocsz/article/details/101014131
In short: with 6 classes, the label array labels=[1,2,3,0,1,1] can be converted into [[0,1,0,0,0,0],[0,0,1,0,0,0],[0,0,0,1,0,0],[1,0,0,0,0,0],[0,1,0,0,0,0],[0,1,0,0,0,0]] -- this is the so-called one-hot form.

voc_size = 5
center_idx = 3
one_hot = np.eye(voc_size)[center_idx]  # select row 3 of the 5x5 identity matrix

print('-- one_hot : ', one_hot)  # [0. 0. 0. 1. 0.]
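Indexing np.eye with a whole list of labels one-hot-encodes them all at once, which is exactly the example quoted above:

labels = [1, 2, 3, 0, 1, 1]
one_hot = np.eye(6)[labels]  # shape (6, 6): one one-hot row per label
print(one_hot)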