'''
Process roughly two hundred records per batch.
'''
import jieba                        # Chinese word segmentation
import re                           # filtering special characters
import numpy as np                  # numeric helpers
import pandas as pd                 # Excel / DataFrame handling
import emoji                        # emoji filtering
import jieba.analyse                # keyword extraction / term weighting
import imageio                      # image loading (optional word-cloud mask)
import jieba.posseg as pseg         # part-of-speech tagging
from wordcloud import WordCloud     # word cloud
import os
import matplotlib.pyplot as plt     # plotting
import difflib                      # string similarity
import configparser                 # configuration file loading
# Load the file-path configuration
def cfg():
    # Create the config object
    conf = configparser.ConfigParser()
    # Read the configuration file
    conf.read("config.ini", encoding='utf-8')
    # Return the [filePaths] section as a dict
    items = dict(conf.items('filePaths'))
    return items
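# For reference, an illustrative config.ini layout covering the keys this script
# reads (the paths are placeholders, not the author's actual files). Note that
# configparser lowercases option names, so they are looked up in lowercase here:
#
#   [filePaths]
#   wts_dict_path     = ./dict/wts_keywords.txt
#   pos_dict_path     = ./dict/pos.txt
#   neg_dict_path     = ./dict/neg.txt
#   no_dict_path      = ./dict/no.txt
#   excel_path        = ./data/notes.xlsx
#   juzi_path         = ./out/sentences.txt
#   path              = ./out/sentences_dedup.txt
#   out               = ./out/sentiment.txt
#   cn_stopwords      = ./dict/cn_stopwords.txt
#   simsun_path       = ./fonts/simsun.ttc
#   wordcloud_iopath  = ./out/wordcloud.png
#   bg_path           = ./img/mask.png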
# Load the keyword dictionary used to narrow the notes down to relevant snippets
def wts_dict():
    wts_lst = []
    with open(cfg()['wts_dict_path'], encoding='utf-8') as f:
        for line in f:
            line = line.replace("\n", "").replace("\r", "")
            if line:  # skip blank lines
                wts_lst.append(line)
    return wts_lst
# Load a dictionary file into a list, one entry per line
def dict_load(path):
    print("Loading dictionary:", path)
    dt = []
    with open(path, encoding='utf-8-sig') as f:
        for line in f:
            if line.strip() != '':  # skip blank lines
                dt.append(line.strip())
    return dt
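# The dictionary files are expected to be plain UTF-8 text with one entry per
# line. A hypothetical excerpt of the positive-word dictionary, for illustration:
#
#   美丽
#   壮观
#   喜欢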
# Sentiment scoring against the dictionaries
def sents(sent, negdict, posdict, nodict):
    """Score a tokenized sentence. A single negation word among the tokens
    just before a sentiment word flips its polarity, two negations cancel out,
    and with no negation the word scores 1 (1.5 from the third token onward,
    the author's "degree" case). The token at index 0 is never scored."""
    pos = 0  # positive score
    neg = 0  # negative score
    for i in range(len(sent)):
        if sent[i] in negdict:
            if i == 1 and sent[i - 1] in nodict:
                pos = pos + 1        # negation + negative word
            elif i == 1 and sent[i - 1] not in nodict:
                neg = neg + 1        # other + negative word
            elif i > 1 and sent[i - 1] in nodict:
                if sent[i - 2] in nodict:
                    neg = neg + 1    # negation + negation + negative word
                else:
                    pos = pos + 1    # other + negation + negative word
            elif i > 1 and sent[i - 1] not in nodict:
                if sent[i - 2] in nodict:
                    pos = pos + 1    # negation + other + negative word
                else:
                    neg = neg + 1.5  # degree + negative word
        elif sent[i] in posdict:
            if i == 1 and sent[i - 1] in nodict:
                neg = neg + 1        # negation + positive word
            elif i == 1 and sent[i - 1] not in nodict:
                pos = pos + 1        # other + positive word
            elif i > 1 and sent[i - 1] in nodict:
                if sent[i - 2] in nodict:
                    pos = pos + 1    # negation + negation + positive word
                else:
                    neg = neg + 1    # other + negation + positive word
            elif i > 1 and sent[i - 1] not in nodict:
                if sent[i - 2] in nodict:
                    neg = neg + 1    # negation + other + positive word
                else:
                    pos = pos + 1.5  # degree + positive word
    return pos, neg
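# A quick sanity check of the scheme (hypothetical dictionary contents: '好' is
# assumed to be in the positive dictionary, '不' in the negation dictionary, and
# the remaining tokens in none of the three):
#
#   sents(['不', '好'], neg_dict, pos_dict, no_dict)          # -> (0, 1): "not good" is negative
#   sents(['风景', '很', '好'], neg_dict, pos_dict, no_dict)  # -> (1.5, 0): the "degree" case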
# Strip emoji from a string
def filter_emoji(test_str):
    # demojize() turns each emoji into a ':alias:' placeholder; removing those
    # placeholders drops the emoji from the text
    result = emoji.demojize(test_str)
    return re.sub(r':[\w\-]+?:', '', result)
# Load the data and preprocess it into a list of strings
def pretreatment():
    # Load the Excel workbook
    excel = pd.read_excel(cfg()['excel_path'])
    # Time fields are set aside for now and not processed
    data = excel[['Title', 'Notes']]
    # Build a DataFrame, dropping rows with missing values and duplicate notes
    dataframe = pd.DataFrame(data).dropna(how='any').drop_duplicates(subset='Notes')
    # Drop any remaining NaN rows
    dataToTwo = dataframe.dropna(axis=0)
    # Collect the Notes column as a list of strings
    dataToTwoStr = []
    for i in dataToTwo['Notes']:
        dataToTwoStr.append(filter_emoji(str(i)))  # strip emoji
    return dataToTwoStr
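# The workbook referenced by excel_path is expected to contain at least a
# 'Title' and a 'Notes' column, one travel note per row; only 'Notes' is
# analysed further. A minimal illustrative layout (contents are made up):
#
#   Title         | Notes
#   --------------+---------------------------------
#   五台山两日游   | 第一天到达五台山,风景很好……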
# Load the three sentiment dictionaries
def first_Load():
    pos_dict = dict_load(cfg()['pos_dict_path'])  # positive sentiment words
    neg_dict = dict_load(cfg()['neg_dict_path'])  # negative sentiment words
    no_dict = dict_load(cfg()['no_dict_path'])    # negation words
    return pos_dict, neg_dict, no_dict
# Split a travel note into snippets centred on the keyword occurrences
def comment_base_split(wts_lst, comment_base):
    # Collect the span of every keyword occurrence in the note
    spans = []
    for keyword in wts_lst:
        if keyword in comment_base:
            spans.extend(r.span() for r in re.finditer(re.escape(keyword), comment_base))
    # Widen every span so the snippet keeps some surrounding context
    snippets = []
    for start, end in spans:
        start = 0 if start < 10 else start - 20
        end = 20 if end < 10 else end + 20
        snippets.append(comment_base[start:end])
    # Drop near-duplicate snippets (similarity above 0.85), keeping the first of each group
    comment_base_split_dict = []
    for snippet in snippets:
        if all(get_equal_rate_1(snippet, kept) <= 0.85 for kept in comment_base_split_dict):
            comment_base_split_dict.append(snippet)
    return comment_base_split_dict
# Similarity ratio between two strings (0.0 to 1.0)
def get_equal_rate_1(str1, str2):
    return difflib.SequenceMatcher(None, str1, str2).quick_ratio()
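# quick_ratio() is an upper-bound estimate of SequenceMatcher.ratio(): identical
# strings score 1.0 and unrelated strings score close to 0.0, e.g.
#
#   get_equal_rate_1("五台山的风景", "五台山的风景")  # 1.0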
# Main processing pipeline
def batchProcessing():
    # Load the Wutaishan (五台山) keyword dictionary
    wts_lst = wts_dict()
    # Load the three sentiment dictionaries
    pos_dict, neg_dict, no_dict = first_Load()
    # Load the notes as a list of strings
    inputs = pretreatment()
    lst = []
    for i in inputs:
        # Strip line breaks and spaces from each note
        comment_base = i.replace("\n", "").replace("\r", "").replace(" ", "")
        lst.append(comment_base_split(wts_lst, comment_base))
    # Remove duplicate snippet lists
    new_list = []
    for i in lst:
        if i not in new_list:
            new_list.append(i)
    num_lst = []
    pos_lst = []
    neg_lst = []
    for j in new_list:
        for k in j:
            k = "".join(k.split())
            # Keep only Chinese characters, digits and Latin letters
            sub_str = re.sub(u"([^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a])", "", k)
            # Segment the snippet into words
            sent = jieba.lcut(sub_str)
            # Compute the sentiment scores (argument order matches the sents() signature)
            pos, neg = sents(sent, neg_dict, pos_dict, no_dict)
            pos_lst.append(pos)
            neg_lst.append(neg)
            num = pos - neg
            num_lst.append(num)
            with open(cfg()['juzi_path'], 'a', encoding='utf-8') as f:
                f.write(sub_str + '\n')
    # Save the analysis result (comment_base is the last note processed above)
    strs_lst = []
    strs_lst.append(comment_base)
    # Arguments: note text, per-snippet positive scores, per-snippet negative scores
    outputFile(strs_lst, pos_lst, neg_lst)
# Save the analysis result
def outputFile(commentStr, pos, neg):
    with open(cfg().get('out'), 'a', encoding='utf-8') as f:
        for i in commentStr:
            f.write(i)
        f.write("\nPositive score: {}".format(sum(pos)) + "\n")
        f.write("Negative score: {}".format(sum(neg)) + "\n")
        # Overall tendency is the difference between the two scores
        num = sum(pos) - sum(neg)
        if num > 0:
            f.write("Sentiment: positive" + "\n")
        elif num < 0:
            f.write("Sentiment: negative" + "\n")
        else:
            f.write("Sentiment: neutral" + "\n")
        f.write('-' * 100 + "\n")
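# Each call therefore appends a block like the following to the 'out' file
# (the numbers are illustrative only):
#
#   <note text>
#   Positive score: 4.5
#   Negative score: 1.0
#   Sentiment: positive
#   ----------------------------------------------------------------------------------------------------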
# Generate the word cloud
def toWordCloud():
    # Load the stop-word list
    new_stop_word_list = []
    with open(cfg()['cn_stopwords'], 'r', encoding='utf-8') as fr:
        for stop_word in fr.readlines():
            stop_word = stop_word.replace('\ufeff', '').strip()  # strip BOM and whitespace
            new_stop_word_list.append(stop_word)
    with open(cfg()['path'], 'r', encoding='utf-8') as f:
        words = f.read()
    word_dict = {}
    word_list = ''
    words_arr = words.split('\n')
    words_jiebas = []
    for i in words_arr:
        words_jiebas.append(jieba.lcut(i))
    # Count word frequencies, skipping single characters and stop words
    for words_jieba in words_jiebas:
        for word in words_jieba:
            if len(word) > 1 and word not in new_stop_word_list:
                word_list = word_list + ' ' + word
                if word_dict.get(word):
                    word_dict[word] = word_dict[word] + 1
                else:
                    word_dict[word] = 1
    # Sort by frequency and print the top-ranked words (indices 0-100)
    sort_words = sorted(word_dict.items(), key=lambda x: x[1], reverse=True)
    print(sort_words[0:101])
    # color_mask = imageio.imread(cfg()['bg_path'])
    wc = WordCloud(
        background_color="white",        # background colour
        max_words=1000,                  # maximum number of words shown
        font_path=cfg()['simsun_path'],  # font to use
        min_font_size=20,
        max_font_size=500,
        random_state=42,                 # random seed
        collocations=False,              # avoid repeated bigrams
        width=1600,
        height=1200,
        margin=10,
        # mask=color_mask,               # optional image mask
    )
    wc.generate(word_list)
    # configparser lowercases option names, so the key must be looked up in lowercase
    wc.to_file(cfg()['wordcloud_iopath'])
    plt.figure(dpi=100)
    # Display the word cloud as an image
    plt.imshow(wc, interpolation='catrom')
    # Hide the axes
    plt.axis("off")
    plt.show()
if __name__ == '__main__':
    batchProcessing()
    # De-duplicate the sentences collected during batch processing ...
    lst = []
    with open(cfg()['juzi_path'], encoding="utf-8-sig") as f:
        for i in f:
            lst.append(i)
    lst = list(set(lst))
    # ... and append them to the file the word cloud is built from
    with open(cfg()['path'], 'a', encoding='utf-8-sig') as f2:
        for i in lst:
            f2.write(i)
    toWordCloud()