1、安装gensim之前要先安装numpy
!pip install -U numpy==1.17.2
import numpy
numpy.__version__
2、安装gensim,使用清华镜像源
!pip install -U gensim==3.4.0 -i https://pypi.tuna.tsinghua.edu.cn/simple
3、训练
import jieba
import logging
import pandas as pd
import os
from tqdm import tqdm
from gensim.models import word2vec
# Copy the article part of the leading corpus lines into a one-article-per-line
# file, which is later streamed by word2vec.LineSentence for training.
file_path = 'data/data81388/'
src_path = file_path + 'news_data.txt'
seg_path = file_path + 'news_data_seg.txt'

# Count the lines up front only so tqdm can display a total. The context
# manager closes the handle; the previous one-liner left it open.
# NOTE(review): the source is assumed to be UTF-8, matching the encoding the
# output was already written with -- confirm against the dataset.
with open(src_path, 'r', encoding='utf-8') as f:
    num_lines = sum(1 for _ in f)

# Both files are managed by one `with`, so they are closed even if a malformed
# line raises part-way through; no explicit close() calls are needed.
with open(src_path, 'r', encoding='utf-8') as f, \
        open(seg_path, 'w', encoding='utf-8') as output:
    for idx, line in tqdm(enumerate(f), total=num_lines):
        # idx is zero-based: lines 0..305000 inclusive are written before the
        # loop stops, and the message reports how many were written.
        if idx > 305000:
            print('\nextract %d articles' % idx)
            break
        # Each line is expected to look like "<article> </d> <topics>".
        # A line without exactly one separator raises ValueError (unchanged
        # behaviour); only the article part is kept.
        article, _topics = line.strip('\n').split(' </d> ')
        output.write(article)
        output.write(' \n')
# Show INFO-level log records (e.g. gensim's training progress) on the console.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

# Keyword arguments forwarded verbatim to word2vec.Word2Vec below.
# The names `size` and `iter` are the gensim 3.x spellings (gensim is pinned
# to 3.4.0 in the install step). Meanings follow the gensim Word2Vec API docs.
word2vec_params = dict(
    # Architecture: skip-gram (sg=1) trained with hierarchical softmax (hs=1);
    # negative sampling is switched off (negative=0).
    sg=1,
    hs=1,
    negative=0,
    cbow_mean=0,        # CBOW-only option per the gensim docs; inert with sg=1
    # Vector / context geometry.
    size=100,           # dimensionality of each word vector
    window=10,          # context window radius
    min_count=1,        # keep every token, even ones seen only once
    # Optimisation schedule: learning rate decays from alpha to min_alpha.
    alpha=0.01,
    min_alpha=0.0005,
    iter=50,            # number of passes over the corpus
    # Runtime behaviour.
    seed=1,
    workers=24,         # training threads
    compute_loss=True,  # ask gensim to track the training loss
)

# LineSentence streams the preprocessed file one line (= one article) at a time.
sentences = word2vec.LineSentence(file_path + "news_data_seg.txt")
model = word2vec.Word2Vec(sentences=sentences, **word2vec_params)
4、保存
# Persist the complete model in the current working directory.
model_file = "news_data_mincount_1_305000_vec_original.model"
model.save(model_file)

# Also export only the word vectors, in word2vec text format (binary=False),
# next to the corpus files.
out = file_path + 'news_data_mincount_1_305000_vec_original.txt'
model.wv.save_word2vec_format(out, binary=False)

# According to the original note, saving produced three files: the .model file
# itself plus companion .npy array files stored beside it, e.g.
#   news_data_mincount_1_305000_vec_original.model.wv.vectors.npy
# NOTE(review): the original note listed that same .npy name twice; the third
# file is presumably the trainable-weights array -- check the directory.
# Only the .model path is passed to load() the next time the model is needed.
# Reload example:
from gensim.models import word2vec
model_1 = word2vec.Word2Vec.load(model_file)

# Save the word-vector file (.vector) with the default format options.
model.wv.save_word2vec_format('news_data.vector')
[1] https://github.com/jwang0306/mta-lstm-pytorch
[2] https://blog.csdn.net/u012744245/article/details/106012163/