Python - Online Text Sentiment Analysis Experiment

'''
Processes two hundred entries per batch
'''
import jieba  # Chinese word segmentation
import re  # filtering special characters
import numpy as np  # numeric helpers
import pandas as pd  # data handling
import emoji  # emoji filtering
import jieba.analyse  # keyword weighting
import imageio  # image loading (for the optional word-cloud mask)
import jieba.posseg as pseg  # part-of-speech tagging
from wordcloud import WordCloud  # word cloud generation
import os
import matplotlib.pyplot as plt  # plotting
import difflib  # string similarity
import configparser  # configuration file loading

# Load the path configuration
def cfg():
    # Create a ConfigParser object
    conf = configparser.ConfigParser()
    # Preserve option-name case (the script looks up mixed-case keys such as wordcloud_ioPath)
    conf.optionxform = str
    # Read the configuration file
    conf.read("config.ini", encoding='utf-8')
    # Return the [filePaths] section as a dict of named paths
    items = dict(conf.items('filePaths'))
    return items
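
# All paths come from the [filePaths] section of config.ini. A minimal file
# matching the option names used in this script might look like this
# (every path below is a placeholder, adjust to your own layout):
#
#   [filePaths]
#   wts_dict_path    = ./dict/wts_keywords.txt
#   pos_dict_path    = ./dict/positive_words.txt
#   neg_dict_path    = ./dict/negative_words.txt
#   no_dict_path     = ./dict/negation_words.txt
#   excel_path       = ./data/comments.xlsx
#   juzi_path        = ./output/sentences.txt
#   path             = ./output/sentences_dedup.txt
#   out              = ./output/sentiment_result.txt
#   cn_stopwords     = ./dict/cn_stopwords.txt
#   simsun_path      = ./fonts/simsun.ttc
#   wordcloud_ioPath = ./output/wordcloud.png
#   bg_path          = ./img/mask.png   ; only needed if the mask is enabled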

# Use the keyword dictionary to narrow the data down
def wts_dict():
    wts_lst = []
    with open(cfg()['wts_dict_path'], encoding='utf-8') as f:
        for line in f:
            line = line.replace("\n", "").replace("\r", "")
            wts_lst.append(line)
    return wts_lst
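
# The keyword file is expected to hold one keyword per line; for the Wutai
# Mountain data set it could contain entries such as (hypothetical examples):
#
#   五台山
#   显通寺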

# Load a dictionary file into a list of lines
def dict_load(path):
    print("文件加载!")
    dt = []
    with open(path, encoding='utf-8-sig') as f:
        for line in f:
            if line.strip() != '':  # skip blank lines
                dt.append(line.strip())
    return dt

# Sentiment score calculation. Note: the parameter order matches the call in
# batchProcessing (positive dict, negative dict, negation dict).
def sents(sent, posdict, negdict, nodict):
    pos = 0  # positive score
    neg = 0  # negative score
    for i in range(len(sent)):
        if sent[i] in negdict:
            if i == 0:
                neg = neg + 1  # negative word at the start of the sentence
            elif i == 1 and sent[i - 1] in nodict:
                pos = pos + 1  # negation + negative
            elif i == 1:
                neg = neg + 1  # plain negative
            elif sent[i - 1] in nodict:
                if sent[i - 2] in nodict:
                    neg = neg + 1  # double negation + negative
                else:
                    pos = pos + 1  # single negation + negative
            else:
                if sent[i - 2] in nodict:
                    pos = pos + 1  # negation two tokens back + negative
                else:
                    neg = neg + 1.5  # no nearby negation: weighted negative
        elif sent[i] in posdict:
            if i == 0:
                pos = pos + 1  # positive word at the start of the sentence
            elif i == 1 and sent[i - 1] in nodict:
                neg = neg + 1  # negation + positive
            elif i == 1:
                pos = pos + 1  # plain positive
            elif sent[i - 1] in nodict:
                if sent[i - 2] in nodict:
                    pos = pos + 1  # double negation + positive
                else:
                    neg = neg + 1  # single negation + positive
            else:
                if sent[i - 2] in nodict:
                    neg = neg + 1  # negation two tokens back + positive
                else:
                    pos = pos + 1.5  # no nearby negation: weighted positive
    return pos, neg
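
# A quick sanity check of the scoring rules (toy word lists, purely illustrative;
# the real lexicons are loaded from the dictionary files):
#
#   sents(['这里', '风景', '喜欢'], ['喜欢'], ['讨厌'], ['不'])  # -> (1.5, 0) weighted positive
#   sents(['我', '不', '喜欢'], ['喜欢'], ['讨厌'], ['不'])      # -> (0, 1)   negated positive
#   sents(['不', '讨厌'], ['喜欢'], ['讨厌'], ['不'])            # -> (1, 0)   negated negative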

# Filter out emoji
def filter_emoji(test_str):
    # Turn each emoji into an ':alias:' placeholder, then strip the placeholders
    result = emoji.demojize(test_str)
    return re.sub(r':[^:\s]+:', '', result)
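
# Example: filter_emoji('风景很美😀') returns '风景很美'; demojize() first rewrites
# the emoji as an ':alias:' placeholder, which the regex above then removes.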

# Load the data and turn it into a list of strings
def pretreatment():
    # Load the Excel file
    excel = pd.read_excel(cfg()['excel_path'])
    # Keep the Title and Notes columns (timestamps are kept aside for now, unprocessed)
    data = excel[['Title', 'Notes']]

    # Build a DataFrame, drop rows with missing values, and drop duplicate Notes
    dataframe = pd.DataFrame(data).dropna(how='any').drop_duplicates(subset='Notes')
    # Drop any remaining NaN rows
    dataToTwo = dataframe.dropna(axis=0)

    # Collect the Notes column as a list of strings, with emoji filtered out
    dataToTwoStr = []
    for i in dataToTwo['Notes']:
        dataToTwoStr.append(filter_emoji(str(i)))
    return dataToTwoStr
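
# The Excel file behind excel_path is assumed to contain at least the columns
# 'Title' and 'Notes'. A stand-in input for testing could be created with:
#
#   pd.DataFrame({'Title': ['示例游记'], 'Notes': ['五台山风景很美,值得一去']}
#                ).to_excel('comments.xlsx', index=False)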

# Load the sentiment dictionary lists
def first_Load():
    pos_dict = dict_load(cfg()['pos_dict_path'])  # positive sentiment words
    neg_dict = dict_load(cfg()['neg_dict_path'])  # negative sentiment words
    no_dict = dict_load(cfg()['no_dict_path'])    # negation words
    return pos_dict, neg_dict, no_dict

# Split a comment into context windows around each keyword occurrence
def comment_base_split(wts_lst, comment_base):
    # Collect the (start, end) span of every keyword match
    spans = []
    for kw in wts_lst:
        if kw in comment_base:
            spans.extend(m.span() for m in re.finditer(re.escape(kw), comment_base))

    # Widen each span to include surrounding context
    windows = []
    for start, end in spans:
        start = 0 if start < 10 else max(0, start - 20)
        end = 20 if end < 10 else end + 20
        windows.append(comment_base[start:end])

    # Drop near-duplicate windows (similarity above 0.85)
    deduped = []
    for w in windows:
        if all(get_equal_rate_1(w, kept) <= 0.85 for kept in deduped):
            deduped.append(w)
    return deduped
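
# Example: with wts_lst = ['五台山'], a long comment yields a window of roughly
# 40 characters of context around every occurrence of '五台山'; windows whose
# similarity to an already-kept window exceeds 0.85 are dropped as duplicates.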

# Similarity ratio between two strings
def get_equal_rate_1(str1, str2):
    return difflib.SequenceMatcher(None, str1, str2).quick_ratio()
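
# quick_ratio() is a fast upper bound on ratio(), based on character counts only; e.g.
#   difflib.SequenceMatcher(None, '风景很美', '风景真美').quick_ratio()  # -> 0.75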

#  Main processing
def batchProcessing():
    # Load the Wutai Mountain keyword dictionary
    wts_lst = wts_dict()
    # Load the three sentiment dictionaries
    pos_dict, neg_dict, no_dict = first_Load()
    # Load the comments as a list of strings
    inputs = pretreatment()

    for comment in inputs:
        # Strip line breaks and spaces from a single comment
        comment_base = comment.replace("\n", "").replace("\r", "").replace(" ", "")
        # Cut the comment into keyword-centred snippets
        snippets = comment_base_split(wts_lst, comment_base)

        pos_lst = []  # positive score per snippet
        neg_lst = []  # negative score per snippet
        for snippet in snippets:
            # Keep only Chinese characters, digits and ASCII letters
            sub_str = re.sub(u"([^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a])", "", snippet)
            # Tokenize the snippet
            sent = jieba.lcut(sub_str)
            # Compute the sentiment scores
            pos, neg = sents(sent, pos_dict, neg_dict, no_dict)
            pos_lst.append(pos)
            neg_lst.append(neg)

            # Keep every analysed snippet for the word cloud step
            with open(cfg()['juzi_path'], 'a', encoding='utf-8') as f:
                f.write(sub_str + '\n')

        # Save the analysis result: the comment plus its per-snippet score lists
        outputFile([comment_base], pos_lst, neg_lst)

# Save the analysis result
def outputFile(commentStr, pos, neg):
    with open(cfg().get('out'), 'a', encoding='utf-8') as f:
        for i in commentStr:
            f.write(i)
        f.write("\n积极倾向值:{}".format(sum(pos)) + "\n")
        f.write("消极倾向值:{}".format(sum(neg)) + "\n")
        # Overall polarity: positive total minus negative total
        num = sum(pos) - sum(neg)
        if num > 0:
            f.write("情感倾向:积极" + "\n")
        elif num < 0:
            f.write("情感倾向:消极" + "\n")
        else:
            f.write("情感倾向:中性" + "\n")
        f.write('-' * 100 + "\n")
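
# Each comment appends a block like this to the output file (scores illustrative):
#
#   <cleaned comment text>
#   积极倾向值:2.5
#   消极倾向值:1
#   情感倾向:积极
#   --------------------------------------------------  (a 100-character rule)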

# Generate the word cloud
def toWordCloud():
    # Load stopwords, stripping BOM characters and whitespace
    with open(cfg()['cn_stopwords'], 'r', encoding='utf-8') as fr:
        stop_word_list = fr.readlines()
    new_stop_word_list = []
    for stop_word in stop_word_list:
        stop_word = stop_word.replace('\ufeff', '').strip()
        new_stop_word_list.append(stop_word)

    with open(cfg()['path'], 'r', encoding='utf-8') as f:
        words = f.read()
        word_dict = {}
        word_list = ''
        words_arr = words.split('\n')

        # Tokenize every line with jieba
        words_jiebas = []
        for i in words_arr:
            words_jiebas.append(jieba.lcut(i))

        # Count multi-character words that are not stopwords
        for words_jieba in words_jiebas:
            for word in words_jieba:
                if len(word) > 1 and word not in new_stop_word_list:
                    word_list = word_list + ' ' + word
                    word_dict[word] = word_dict.get(word, 0) + 1

        # Sort by frequency, descending
        sort_words = sorted(word_dict.items(), key=lambda x: x[1], reverse=True)
        print(sort_words[:100])  # print the 100 most frequent words

        # color_mask = imageio.imread(cfg()['bg_path'])  # optional shape mask
        wc = WordCloud(
            background_color="white",        # background colour
            max_words=1000,                  # maximum number of words shown
            font_path=cfg()['simsun_path'],  # a font that supports Chinese
            min_font_size=20,
            max_font_size=500,
            random_state=42,                 # fixed seed for a reproducible layout
            collocations=False,              # avoid duplicated bigrams
            width=1600,
            height=1200,
            margin=10,
            # mask=color_mask,               # enable together with the imread above
        )
        wc.generate(word_list)
        wc.to_file(cfg()['wordcloud_ioPath'])

        plt.figure(dpi=100)
        # Display the word cloud as an image
        plt.imshow(wc, interpolation='catrom')
        # Hide the axes
        plt.axis("off")
        plt.show()

if __name__ == '__main__':
    batchProcessing()
    # Deduplicate the collected snippets before building the word cloud
    lst = []
    with open(cfg()['juzi_path'], encoding="utf-8-sig") as f:
        for i in f:
            lst.append(i)
    lst = list(set(lst))
    with open(cfg()['path'], 'a', encoding='utf-8-sig') as f2:
        for i in lst:
            f2.write(i)
    toWordCloud()