Scraping the Bilibili popular ranking and generating a word cloud

import requests
import wordcloud
import jieba
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup

if __name__ == "__main__":
    n = 0  # rank counter
    target = 'https://www.bilibili.com/v/popular/rank/all'  # Bilibili popular ranking page
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
    headers = {'User-Agent': user_agent}

    req = requests.get(url=target, headers=headers)  # pass the User-Agent header, otherwise it is never used
    html = req.text
    html = html.replace('<br>', ' ').replace('<br/>', ' ').replace('/>', '>')  # normalize line breaks before parsing
    bf = BeautifulSoup(html, "html.parser")

    texts = bf.find('ul', class_='rank-list')          # the ranking list container
    texts_div = texts.find_all('div', class_='info')   # one info block per video
    # print(texts_div)
    yun = ""
    for item in texts_div:
        n = n + 1
        item_name = item.find('a').text                 # video title
        yun += str(item_name)                           # collect the titles for the word cloud
        item_href = item.find('a')['href']              # video link
        h = item_href.rfind('/')
        item_href = item_href[h + 1:]                   # keep only the id after the last '/'
        item_refer = item.find_all('span', class_='data-box')  # the stats shown under the title
        item_refer1 = item_refer[0].text
        item_refer2 = item_refer[1].text
        mid = [n, item_name, item_href, item_refer1, item_refer2]
        print(mid)

    # Segment the Chinese text with jieba into a space-separated string;
    # wordcloud cannot split Chinese words on its own, so raw text would give a wrong word cloud.
    cut_text = " ".join(jieba.cut(yun))
    wc = wordcloud.WordCloud(
        # Set a Chinese font, otherwise the characters render as empty boxes;
        # this is the standard Windows font directory and any other Chinese font works too.
        font_path="C:/Windows/Fonts/simfang.ttf",
        # Background color and canvas size.
        background_color="white",
        width=1000,
        height=880).generate(cut_text)

    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.show()
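
If you also want to keep the image on disk rather than only display it, WordCloud objects provide a to_file method. A minimal sketch that could be appended at the end of the block above; the output file name bilibili_rank.png is just an example:

    # Save the rendered cloud as a PNG (file name is arbitrary).
    wc.to_file("bilibili_rank.png")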
