# 随笔

# coding: utf-8

import re

import gensim
import jieba
from gensim import corpora, models


def get_chinese(text):
    """Return only the Chinese characters of ``text``, in their original order.

    Every character outside the range U+4E00..U+9FA5 (digits, Latin letters,
    punctuation, whitespace, ...) is dropped.

    :param text: input string.
    :return: the characters of ``text`` that fall in that range, joined
        together; an empty string when there are none.
    """
    return "".join(re.findall(r'[\u4e00-\u9fa5]', text))


def seg_depart(sentence):
    """Segment ``sentence`` with jieba and return the kept words as one string.

    Words shorter than two characters are discarded. Every kept word is
    followed by a single space, so a non-empty result ends with a trailing
    space.

    :param sentence: text to segment; surrounding whitespace is stripped
        before segmentation.
    :return: the kept words, each followed by one space; ``''`` when no word
        is kept.
    """
    # Build the result in one join instead of repeated string concatenation.
    return "".join(
        word + " "
        for word in jieba.cut(sentence.strip())
        if len(word) >= 2
    )


def extract_topic(news):
    """Extract topic keywords from a collection of news texts using LSI.

    Each text is reduced to its Chinese characters, segmented with
    ``seg_depart``, converted to a TF-IDF weighted bag-of-words corpus and fed
    to an LSI model. The words appearing in the model's printed topics are
    collected into one flat list.

    :param news: iterable of news strings.
    :return: list of keyword strings gathered topic by topic; the same word
        may occur more than once. ``[]`` when ``news`` is empty or no text
        yields any token.
    """
    train = []
    # Bring every document into the token-list format gensim expects.
    for line in news:
        line = re.sub(r'[^\u4e00-\u9fa5]+', '', line)
        # split() without an argument ignores the trailing space produced by
        # seg_depart, so no empty-string token ends up in the dictionary.
        train.append(seg_depart(line).split())

    # Nothing to model: avoid handing an empty vocabulary to gensim.
    if not any(train):
        return []

    # Map tokens to ids and build the bag-of-words corpus.
    dictionary = corpora.Dictionary(train)
    corpus = [dictionary.doc2bow(text) for text in train]
    tfidf = models.TfidfModel(corpus, dictionary=dictionary)
    corpus = tfidf[corpus]

    # Fit LSI on the TF-IDF corpus and request 10 topics with 10 words each.
    lsi_model = models.LsiModel(corpus=corpus, id2word=dictionary)
    topics = lsi_model.print_topics(10, 10)

    kw = []
    for _topic_id, formatted in topics:
        # The second element of each entry is a '+'-separated string of
        # weighted terms; keep only the Chinese characters of each term.
        kw += [get_chinese(term) for term in formatted.split('+')]
    return kw


if __name__ == '__main__':
    # Small demo corpus; several entries are intentionally repeated.
    news = [
        "这是选举的一天!数以百万计的美国人投了希拉里的票。加入他们吧,确定你投给谁",
        "希望今天每个人都能度过一个安乐的感恩节,和家人朋友共度美好时光——来自希拉里的问候",
        "这是选举的一天!数以百万计的美国人投了希拉里的票。",
        "这是选举的一天!数以百万计的美国人投了希拉里的票。",
        "这是选举的一天!数以百万计的美国人投了希拉里的票。",
        "我爱北京*",
    ]

    # Print the extracted keywords wrapped in a dict under the "result" key.
    print({"result": extract_topic(news)})
# print(get_chinese(res[1]))
# print(get_chinese("这是选举的一天!数以百万计1234的美国人投了希拉里的票。"))
# loaded_lsi_model = models.LsiModel.load("test.lsi")
# 将主题向量的稀疏转换成正常
# pred_mats = gensim.matutils.corpus2dense(list(loaded_lsi_model[corpus[:1]]), len(dictionary)).T
# mats = gensim.matutils.corpus2dense(loaded_lsi_model[corpus], len(dictionary)).T
# print(pred_mats)
# 添加行
# print(np.row_stack((mats, pred_mats)))
# print(np.r_[mats, pred_mats])

# results = [i[0] for i in heapq.nsmallest(1000, dict(enumerate(
# distance.cdist(np.array(mats), np.array(pred_mats), 'cosine'))).items(), key=lambda x: x[1])]

# from scipy import sparse
# print(list(lsi_model[corpus[:1]]))
# mtx = sparse.csr_matrix(list(lsi_model[corpus[:1]]))
# print(mtx.todense())
# print(corpus2csc(lda[corpus[0]]))

# topics_test = lda.get_document_topics(corpus)
# print(list(topics_test))
# labels = ['体育','娱乐','科技']
# for i in range(3):
# print('这条'+labels[i]+'新闻的主题分布为:\n')
# print(topics_test[i],'\n')
# 上一篇:JavaScript数据结构与算法 - 字典


# 下一篇:JAVA 数据结构(13):数据结构主要种接口和类