Python 爬取网页数据生成词云图

2022-09-11 12:46:22

1. 需要的三个包：

from wordcloud import WordCloud        #词云库

import matplotlib.pyplot as plt        #数学绘图库

import jieba;

2. 定义变量（将对应的变量放到一个全局的文件中）：

import re;

# Shared crawl settings: start URL, request headers, pager regex and cookies.

# First page of the comment listing for Douban subject 26363254.
pdurl_first = 'https://movie.douban.com/subject/26363254/comments?start=0'

# HTTP headers dictionary; only a desktop-browser User-Agent is set.
head = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/59.0.3071.109 Chrome/59.0.3071.109 Safari/537.36'}

# Matches the "next page" anchor. Group 1 is the href up to, but not
# including, its first "&amp;".
reg = re.compile(r'<a href="(.*?)&amp;.*?class="next">')

# Cookie jar copied from a browser session.
# NOTE(review): "dbcl2", "ck" and "bid" look like values from a logged-in
# session; hard-coding them in source is a credential leak and they will
# expire. Consider loading them from the environment or a local config file.
# NOTE(review): "_utmb" has a single leading underscore while its siblings
# use "__utm*"; left untouched here, confirm against the browser's cookies.
# The original literal listed "push_doumail_num" twice; the duplicate entry
# (which silently overwrote the first) has been removed.
cookies = {
    "__utma": "30149280.503249607.1504402391.1504402391.1504402391.1",
    "_utmb": "30149280.2.9.1504402391",
    "__utmc": "",
    "__utmt": "",
    "__utmz": "30149280.1504402391.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)",
    "ap": "",
    "as": '"https://movie.douban.com/subject/26363254/comments?start=225&limit=20&sort=new_score&status=P"',
    "bid": "g7k4BGd2sRk",
    "ck": "76vs",
    "dbcl2": '"166279730:fohmXhoM9uU"',
    "ps": "y",
    "push_doumail_num": "",
}

3. 抓取数据

import requests;

import re;

from GrabData import Param;

import pandas as pd;

from bs4 import BeautifulSoup;

class GrabComent:
    """Crawl the paginated comment listing and append the matches to a CSV.

    Instantiating the class runs the whole crawl: it starts at
    ``Param.pdurl_first``, extracts every ``ren`` match from each page,
    appends the rows to the CSV file and follows the "next page" link
    (``Param.reg``) until a page has no such link or a request returns a
    non-200 status.

    Raises:
        requests.exceptions.RequestException: on network failure, including
            a timeout (each request is limited to 30 seconds).
    """

    # One match per comment; six capture groups, the first being the text of
    # the <span class="votes"> element. re.S lets ".*?" span line breaks.
    # NOTE(review): the meaning of the remaining groups depends on the page
    # markup, which is not visible here.
    ren = re.compile(r'<span class="votes">(.*?)</span>.*?comment">.*?</span>.*?<span.*?class="">(.*?)</a>.*?<span>(.*?)</span>.*?title="(.*?)"></span>.*?title="(.*?)"><p .*? > (.*?)</p>',re.S)

    def __init__(self):
        print('开始抓取数据')

        base_url = 'https://movie.douban.com/subject/26363254/comments'
        csv_path = 'H:\\python_projects\\ticket\\zhanlangpinglun.csv'

        # A timeout keeps a stalled connection from hanging the crawl forever.
        html = requests.get(Param.pdurl_first, headers=Param.head,
                            cookies=Param.cookies, timeout=30)

        while html.status_code == 200:
            zhanlang = re.findall(self.ren, html.text)
            print(zhanlang)

            # Save the current page BEFORE looking for the next link, so the
            # last page (which has no "next" anchor) is not lost.
            if zhanlang:
                pd.DataFrame(zhanlang).to_csv(
                    csv_path, header=False, index=False,
                    mode='a+',  # append: every page adds rows to the same file
                    encoding='utf-8')

            # The original indexed findall(...)[0], which raised IndexError
            # on the final page; stop cleanly instead.
            next_link = re.search(Param.reg, html.text)
            if next_link is None:
                break

            url_next = base_url + next_link.group(1)
            print("下一页地址：" + url_next)

            html = requests.get(url_next, cookies=Param.cookies,
                                headers=Param.head, timeout=30)


if __name__ == '__main__':
    GrabComent()

4. 生成云图

from wordcloud import WordCloud        #词云库

import matplotlib.pyplot as plt        #数学绘图库

import jieba;

class WordYun:
    """Render a word cloud from the scraped comment CSV.

    Instantiating the class immediately reads the CSV file, segments its
    text with jieba and displays the resulting cloud with matplotlib.
    """

    def __init__(self):
        print("开始读取文件!")
        self.main()

    def main(self):
        """Read and segment the comment file, then display the word cloud."""
        text = self.readFile()
        self.showTitle(text)

    def showTitle(self, text1):
        """Generate a word cloud from *text1* and show it in a plot window.

        Args:
            text1: The text passed to ``WordCloud.generate`` (a ``str``).
        """
        wc1 = WordCloud(
            background_color="white",
            width=1000,
            height=860,
            # Without an explicit font, the glyphs render as empty boxes.
            font_path="D:\\Windows\\Fonts\\STFANGSO.ttf",
            margin=2,
        )
        # generate() expects a unicode string, so the input must already be
        # decoded text.
        wc2 = wc1.generate(text1)

        plt.imshow(wc2)
        plt.axis("off")  # hide the axes around the image
        plt.show()

    def readFile(self):
        """Return the CSV's words of two or more characters, space-joined.

        The file is read as UTF-8 (the encoding pandas' ``to_csv`` writes by
        default) instead of the platform locale encoding, and the handle is
        closed as soon as the content is read.

        Returns:
            A single string of the kept words separated by one space.
        """
        # Plain (non-raw) literal: the original raw string with doubled
        # backslashes contained literal "\\\\" separators, unlike the path
        # the scraper writes to.
        csv_path = 'H:\\python_projects\\ticket\\zhanlangpinglun.csv'
        with open(csv_path, 'r', encoding='utf-8') as f:
            content = f.read()

        # Drop single-character tokens (punctuation, particles, digits).
        a = [word for word in jieba.cut(content) if len(word) > 1]

        txt = ' '.join(a)
        print("readFile返回的结果：" + txt)
        return txt


if __name__ == '__main__':
    WordYun()

码农公寓

相关文章