1.从网上下载一份《天龙八部》的 txt 文档(保存为 天龙八部.txt,GB18030 编码),以及一份通用的 jieba 停用词表(保存为 停用词表.txt,UTF-8 编码)
2.下载一张背景图片,保存为 图片.jpg(用作词云的形状蒙版)
3.确认字体文件 C:/Windows/Fonts/simsun.ttc 存在(词云用它来显示中文)
# -*- coding:utf-8 -*-
# Build a word cloud from a novel:
#   1. segment ./天龙八部.txt with jieba, dropping stop words, and write the
#      space-separated tokens to 天龙八部分词.txt;
#   2. extract the top 20 keywords (with TF-IDF weights) from the segmented text;
#   3. render the weights as a word cloud shaped by ./图片.jpg, display it and
#      save it to 词云.jpg.
import jieba
import jieba.analyse
from PIL import Image
import numpy as np
from wordcloud import WordCloud, ImageColorGenerator
import matplotlib.pyplot as plt

# Stop words, one per line. Kept in a set so the per-token membership test in
# seg_sentence() is O(1) instead of a scan over the whole list.
with open("./停用词表.txt", encoding="utf-8") as stopword_file:
    stopwords = {line.strip() for line in stopword_file}


def seg_sentence(sentence):
    """Segment one line of text and drop stop words.

    Args:
        sentence: A line of raw text; surrounding whitespace is stripped
            before segmentation.

    Returns:
        The tokens produced by ``jieba.cut`` that are neither in
        ``stopwords`` nor a tab character, joined by single spaces.
    """
    sentence_seged = [
        word
        for word in jieba.cut(sentence.strip())
        if word not in stopwords and word != '\t'
    ]
    return ' '.join(sentence_seged)


# Segment the novel line by line. The source text is decoded as GB18030 and
# the result is written as UTF-8; both files are closed even if segmentation
# raises part-way through.
with open("天龙八部分词.txt", "w", encoding='utf-8') as outputs:
    with open("./天龙八部.txt", 'r', encoding='GB18030') as novel:
        for line in novel:
            outputs.write(seg_sentence(line) + '\n')

# Keyword extraction with TF-IDF; returns (keyword, weight) pairs.
# allowPOS=('nr',) keeps only words whose part-of-speech tag is 'nr'.
with open("./天龙八部分词.txt", encoding="utf-8") as segmented:
    text = segmented.read()
result = jieba.analyse.extract_tags(
    text, topK=20, withWeight=True, allowPOS=('nr',)
)
print(result)

# Convert [('段誉', 0.588...), ('萧峰', 0.463...), ...] into the
# {word: weight} mapping that generate_from_frequencies() takes.
keywords = {word: weight for word, weight in result}

# Background picture, used as the mask for the cloud.
with Image.open('./图片.jpg') as image:
    graph = np.array(image)

# max_words=15 is lower than topK=20 above, so not every extracted keyword
# is drawn.
wc = WordCloud(
    font_path='C:/Windows/Fonts/simsun.ttc',
    background_color="White",
    max_words=15,
    mask=graph,
)

# Generate the cloud from the keyword weights and display it.
wc.generate_from_frequencies(keywords)
plt.imshow(wc)
# NOTE(review): image_color is created but never applied to the cloud; if
# image-based colouring was intended, it presumably should be passed to
# wc.recolor(color_func=image_color) before imshow — confirm.
image_color = ImageColorGenerator(graph)
plt.axis("off")
plt.show()
wc.to_file('词云.jpg')