gensim训练word2vec词向量(百度飞桨上有免费算力)

1、安装gensim之前要先安装numpy

!pip install -U numpy==1.17.2
import numpy
numpy.__version__

2、安装gensim,使用镜像源

!pip install  -U gensim==3.4.0  -i https://pypi.tuna.tsinghua.edu.cn/simple 

3、训练

import jieba
import logging
import pandas as pd
import os
from tqdm import tqdm
from gensim.models import word2vec

# Directory that holds the raw corpus and receives the extracted text.
file_path = 'data/data81388/'

# Count the input lines up front so tqdm can display a total.
# The handle is closed deterministically instead of being leaked.
with open(file_path+'news_data.txt', 'r', encoding='utf-8') as counting:
    num_lines = sum(1 for _ in counting)

# Copy the article part of each line ("<article> </d> <topics>") into
# news_data_seg.txt, one article per line. Both files are opened explicitly
# as UTF-8 so the result does not depend on the platform default encoding,
# and both are closed even if a malformed line raises inside the loop.
with open(file_path+'news_data.txt', encoding='utf-8') as f, \
        open(file_path+'news_data_seg.txt', 'w', encoding='utf-8') as output:
    for idx, line in tqdm(enumerate(f), total=num_lines):
        # Lines with index 0..305000 are kept; the first index past that
        # limit stops the extraction.
        if idx > 305000:
            print('\nextract %d articles' % idx)
            break
        article = line.strip('\n')
        # Unpacking into exactly two names raises ValueError when a line does
        # not contain the ' </d> ' separator exactly once; topics is unused.
        article, topics = article.split(' </d> ')
        output.write(article + ' \n')

# Send gensim's progress messages (INFO and above) to the console.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Keyword arguments forwarded as-is to Word2Vec. The argument names `size`
# and `iter` are the gensim 3.x spellings, matching the version installed above.
word2vec_params = dict(
    sg=1,               # skip-gram rather than CBOW
    size=100,           # dimensionality of the word vectors
    alpha=0.01,         # initial learning rate
    min_alpha=0.0005,   # learning rate the schedule decays to
    window=10,          # maximum distance between current and predicted word
    min_count=1,        # keep every word, even those seen once
    seed=1,
    workers=24,         # number of training threads
    negative=0,         # negative sampling disabled ...
    hs=1,               # ... hierarchical softmax is used instead
    compute_loss=True,  # have gensim track the training loss
    iter=50,            # number of passes over the corpus
    cbow_mean=0,        # per the gensim docs this only applies to CBOW
)

# Stream the extracted corpus from disk: one sentence per line,
# tokens separated by whitespace.
sentences = word2vec.LineSentence(file_path+"news_data_seg.txt")

# Build the vocabulary and train in a single call.
model = word2vec.Word2Vec(sentences=sentences, **word2vec_params)

4、保存

# Persist the full model (written to the current working directory) so it can
# be reloaded later, then export the vectors as text next to the corpus.
model.save("news_data_mincount_1_305000_vec_original.model")
out = file_path+'news_data_mincount_1_305000_vec_original.txt'
model.wv.save_word2vec_format(out, binary=False)  
# According to the original note, saving produces three files:
#   news_data_mincount_1_305000_vec_original.model
#   news_data_mincount_1_305000_vec_original.model.wv.vectors.npy
#   (the note repeats the .wv.vectors.npy name for the third file; presumably
#    it is a separate weights array stored alongside the model - TODO confirm
#    the exact file name)
# The .model file can be loaded directly the next time the model is needed.
# Reload code:
from gensim.models import word2vec
model_1=word2vec.Word2Vec.load('news_data_mincount_1_305000_vec_original.model')

# Save the word vectors as a .vector file (written to the current working directory).
model.wv.save_word2vec_format('news_data.vector')

[1] https://github.com/jwang0306/mta-lstm-pytorch
[2] https://blog.csdn.net/u012744245/article/details/106012163/

上一篇:Python学习笔记之Break和Continue用法分析


下一篇:mysql导出数据到csv文件