基于朴素贝叶斯分类的用户情感分类系统

 

对用户的电影评论进行情感分类

首先输入豆瓣网址,对电影评论进行爬取

然后将爬取到的评论存入Excel表格

对Excel表格中的数据进行清洗

将数据存入mysql数据库

对数据进行分词,词频统计

调用贝叶斯算法进行情感分类

打印好评和差评

存入数据库

完毕!

 

 

# -*-coding:utf-8-*-

import urllib.request
from bs4 import BeautifulSoup
from wordcloud import WordCloud
import matplotlib.pyplot as plt  #绘制图像的模块
import pandas as pd
from numpy import *
import jieba
import xlwt
import codecs
from pylab import mpl
import os
import pymysql
pymysql.install_as_MySQLdb()
from sqlalchemy import create_engine

def getHtml(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    A browser-like User-Agent header is sent because douban.com rejects
    the default urllib agent.
    """
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'}
    req = urllib.request.Request(url, headers=headers)
    # Close the HTTP response deterministically; the original leaked it.
    with urllib.request.urlopen(req) as resp:
        return resp.read().decode('utf-8')

def getComment(url):
    """Parse the page at *url* and return its short comments as a list.

    Each douban short comment lives in a ``<span class="short">`` element;
    a trailing newline is appended to every comment so callers can write
    them to a text file directly.
    """
    soup = BeautifulSoup(getHtml(url), 'html.parser')
    # find_all/get_text are the modern bs4 spellings of the deprecated
    # camelCase findAll/getText.
    return [span.get_text() + '\n' for span in soup.find_all('span', 'short')]

#爬取豆瓣电影评论
def function1():
    """Scrape douban comments for a movie: positive reviews to 好评.txt,
    negative reviews to 差评.txt."""
    print("请输入要爬取评论的电影的网址(豆瓣)")
    url1 = str(input())
    # The subject id sits at a fixed offset in a canonical douban movie
    # URL (https://movie.douban.com/subject/NNNNNNNN/...). Fragile, but
    # kept for compatibility with the original input format.
    subject_id = url1[33:41]
    base = 'https://movie.douban.com/subject/' + str(subject_id) + '/comments?start='
    with open('G:/好评.txt', 'w', encoding='utf-8') as f:
        for page in range(200):  # 豆瓣爬取多页评论需要验证。
            # limit=20 per page, so the offset must advance by 20; the
            # original advanced by 1 and re-fetched overlapping comments.
            url = base + str(20 * page) + '&limit=20&sort=new_score&status=P&percent_type=h'
            print('第%s页的评论:' % (page + 1))
            print(url + '\n')
            for comment in getComment(url):
                f.write(comment)
                print(comment)
            print('\n')
    with open('G:/差评.txt', 'w', encoding='utf-8') as f2:
        for page2 in range(200):
            # Bug fix: the original hard-coded subject 26752088 here
            # instead of reusing the id extracted from the user's URL.
            url = base + str(20 * page2) + '&limit=20&sort=new_score&status=P&percent_type=l'
            print('第%s页的评论:' % (page2 + 1))
            print(url + '\n')
            for comment in getComment(url):
                f2.write(comment)
                print(comment)
            print('\n')

#文本转Excel
def _txt_to_xls(txt_path, xls_path):
    """Copy each line of *txt_path* into column 0 of a new sheet saved at *xls_path*."""
    # `with` closes the file; the original leaked both handles.
    with open(txt_path, "r", encoding="UTF-8") as file:
        lines = file.readlines()
    workbook = xlwt.Workbook(encoding="UTF-8")
    worksheet = workbook.add_sheet('ltq')
    for row, line in enumerate(lines):
        worksheet.write(row, 0, line)
        print(line)
    workbook.save(xls_path)

def function2():
    """Convert the scraped comment text files into Excel workbooks."""
    # The original duplicated this logic verbatim for both files.
    _txt_to_xls("G:/comments/好评.txt", "G:/comments/好评.xls")
    _txt_to_xls("G:/comments/差评.txt", "G:/comments/差评.xls")

    # 贝叶斯对评论进行分类

def function3():
    """Train a multinomial naive-Bayes sentiment model on comments.csv.

    The first 4000 rows (column 'Fraction': 1 = positive, 0 = negative)
    train the model; the remaining rows are predicted, optionally printed
    grouped by predicted sentiment, and scored with a classification
    report against their true labels.
    """
    # ---------- step 1: load data and segment the text ----------
    data = pd.read_csv("G:/comments/comments.csv")

    print("获取第一列内容")
    # Second column holds the raw comment text.
    arrs = data.iloc[:, 1].values

    # Load the stop-word list (one word per line); a set makes the
    # per-token membership test O(1).
    with open("G:/comments/中文停用词表.txt", encoding="utf-8") as fh:
        stopwords = set(line.strip() for line in fh)

    corpus = []
    for a in arrs:
        # Bug fix: the original encoded each token to bytes before testing
        # membership in the (str) stop-word list, so nothing was ever
        # filtered; it then re-segmented the re-joined text a second time
        # for no benefit. Compare str against str and join once.
        tokens = [t for t in jieba.cut(a, cut_all=False) if t not in stopwords]
        corpus.append(' '.join(tokens))

    # ---------- step 2: word-frequency matrix ----------
    from sklearn.feature_extraction.text import CountVectorizer

    vectorizer = CountVectorizer()  # term-frequency matrix over the corpus
    X = vectorizer.fit_transform(corpus)
    for w in vectorizer.get_feature_names():  # vocabulary of the bag of words
        print(w)
    print('')
    print(X.toarray())

    # ---------- step 3: train / predict / report ----------
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.metrics import classification_report

    print(u"\n\n数据分析:")
    X = X.toarray()
    x_train = X[:4000]
    x_test = X[4000:]
    # 1 = positive (好评), 0 = negative (差评)
    labels = data['Fraction'].tolist()
    y_train = array(labels[:4000])
    # Bug fix: the original scored against a hard-coded y_test = [1, 0],
    # whose length cannot match `pre`; use the real held-out labels.
    y_test = array(labels[4000:])
    print(y_train)

    clf = MultinomialNB().fit(x_train, y_train)
    pre = clf.predict(x_test)

    print("1表示好评,0表示差评")
    print("评论预测结果为:_____________________________________________________________________________________________________________________________________________")
    com_list2 = data['comment'].tolist()[4000:]
    for text, label in zip(com_list2, pre):
        print(text, "   :", label)

    print("是否输出全部好评? 1:是  0:否")
    if int(input()) == 1:
        print("查看所有好评_________________________________________________________________________________________________________________________________________________")
        for text, label in zip(com_list2, pre):
            if int(label) == 1:
                print(text)
    print("是否输出所有差评? 1:是  0:否")
    if int(input()) == 1:
        print("查看所有差评—————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————")
        for text, label in zip(com_list2, pre):
            if int(label) == 0:
                print(text)

    print(classification_report(y_test, pre))

# 生成直方图

def function4():
    """Plot a bar chart of the 10 most frequent words in 差评.txt and
    store the rendered PNG as a BLOB in MySQL."""
    mpl.rcParams['font.sans-serif'] = ['FangSong']  # CJK-capable default font
    mpl.rcParams['axes.unicode_minus'] = False  # render '-' correctly when saving
    plt.rcParams['font.sans-serif'] = ['SimHei']

    # `with` closes the files; the original leaked both handles.
    with open("G:/差评.txt", encoding="utf-8") as fh:
        txt = fh.read()
    with open("G:/comments/中文停用词表.txt", encoding="utf-8") as fh:
        stopwords = set(line.strip() for line in fh)

    counts = {}
    for word in jieba.lcut(txt):
        # Skip stop words and single-character tokens.
        if word not in stopwords and len(word) > 1:
            counts[word] = counts.get(word, 0) + 1
    items = sorted(counts.items(), key=lambda kv: kv[1], reverse=True)
    # Bug fix: the original indexed items[0..49] unconditionally and
    # raised IndexError when fewer than 50 distinct words were counted.
    for word, count in items[:50]:
        print("{:<10}{:>7}".format(word, count))
    label = [word for word, _ in items[:10]]
    value = [count for _, count in items[:10]]

    plt.bar(range(len(value)), value, tick_label=label)
    plt.savefig("G:/filename.png")
    # plt.show()

    # Read the rendered chart back as binary for DB storage.
    with open("G:/filename.png", 'rb') as fp:
        img = fp.read()
    # Keyword arguments: positional connect() args were removed in PyMySQL 1.0.
    db = pymysql.connect(host="localhost", user="root", password="liutaiqing",
                         database="testdb", charset='utf8')
    cursor = db.cursor()
    # Parameterized insert: the driver escapes the binary payload itself.
    sql = "INSERT INTO demo_pic_repo (touxiang_data_blob) VALUES  (%s)"
    cursor.execute(sql, img)
    db.commit()  # required to persist the insert
    cursor.close()
    db.close()


#生成词云
def function5():
    """Build a word cloud from the negative reviews (差评.txt) and store
    the rendered image as a BLOB in MySQL."""
    path_txt2 = 'G:/comments/差评.txt'
    # `with` closes the file; the original leaked the handle.
    with open(path_txt2, 'r', encoding='UTF-8') as fh:
        f2 = fh.read()
    # jieba pre-segmentation: WordCloud cannot tokenize Chinese itself.
    cut_text2 = " ".join(jieba.cut(f2))
    wordcloud2 = WordCloud(
        # A CJK-capable font is required; otherwise glyphs render as boxes.
        font_path="C:/Windows/Fonts/simfang.ttf",
        background_color="white", width=1100, height=1000).generate(cut_text2)
    plt.imshow(wordcloud2, interpolation="bilinear")
    plt.axis("off")
    # plt.show()
    # NOTE(review): function6 writes this same path, so the two word
    # clouds overwrite each other on disk — consider distinct file names.
    wordcloud2.to_file("G:/词云图片.jpg")

    # Read the image back as binary for DB storage.
    with open("G:/词云图片.jpg", 'rb') as fp:
        img = fp.read()
    # Keyword arguments: positional connect() args were removed in PyMySQL 1.0.
    db = pymysql.connect(host="localhost", user="root", password="liutaiqing",
                         database="testdb", charset='utf8')
    cursor = db.cursor()
    # Parameterized insert: the driver escapes the binary payload itself.
    sql = "INSERT INTO demo_pic_repo (touxiang_data_blob) VALUES  (%s)"
    cursor.execute(sql, img)
    db.commit()  # required to persist the insert
    cursor.close()
    db.close()



def function6():
    """Build a word cloud from the positive reviews (好评.txt) and store
    the rendered image as a BLOB in MySQL."""
    path_txt1 = 'G:/comments/好评.txt'
    # `with` closes the file; the original leaked the handle.
    with open(path_txt1, 'r', encoding='UTF-8') as fh:
        f1 = fh.read()
    # jieba pre-segmentation: WordCloud cannot tokenize Chinese itself.
    cut_text1 = " ".join(jieba.cut(f1))

    wordcloud1 = WordCloud(
        # A CJK-capable font is required; otherwise glyphs render as boxes.
        font_path="C:/Windows/Fonts/simfang.ttf",
        background_color="white", width=1100, height=1000).generate(cut_text1)

    plt.imshow(wordcloud1, interpolation="bilinear")
    plt.axis("off")
    # plt.show()
    # NOTE(review): function5 writes this same path, so the two word
    # clouds overwrite each other on disk — consider distinct file names.
    wordcloud1.to_file("G:/词云图片.jpg")

    # Read the image back as binary for DB storage.
    with open("G:/词云图片.jpg", 'rb') as fp:
        img = fp.read()
    # Keyword arguments: positional connect() args were removed in PyMySQL 1.0.
    db = pymysql.connect(host="localhost", user="root", password="liutaiqing",
                         database="testdb", charset='utf8')
    cursor = db.cursor()
    # Parameterized insert: the driver escapes the binary payload itself.
    sql = "INSERT INTO demo_pic_repo (touxiang_data_blob) VALUES  (%s)"
    cursor.execute(sql, img)
    db.commit()  # required to persist the insert
    cursor.close()
    db.close()


# Interactive menu: dispatch to the feature the user selects; 7 exits.
count = 1
while count > 0:
    print("请选择功能:")
    print("1:爬取评论")
    print("2:文本转Excel")
    print("3:贝叶斯对评论进行分类")
    print("4:生成直方图")
    print("5:生成词云(好评)")
    print("6:生成词云(差评)")
    print("7:结束")
    celect = int(input())
    if celect == 1:
        function1()
    elif celect == 2:
        function2()
    elif celect == 3:
        function3()
    elif celect == 4:
        # Bug fix: the original mapped both 5 and 6 to function5, so
        # function6 was unreachable; it also wired the word-cloud options
        # backwards relative to the printed labels (function6 builds the
        # 好评 cloud, function5 the 差评 cloud).
        function4()
    elif celect == 5:
        function6()
    elif celect == 6:
        function5()
    elif celect == 7:
        break

该系统目前缺少pythonWeb,估计研究生之前没时间学习和补充了。。。

有发现问题的小伙伴欢迎评论下方补充哦  ~  ~  ~

另外。记得三连啊(感谢感谢~)

上一篇:java中的xss


下一篇:零基础入门数据挖掘实践(学术前沿趋势分析)之三