# b站评论区爬取

import requests

import time

from bs4 import BeautifulSoup

import json

# 必要的库



def get_html(url):
    """Fetch *url* with an HTTP GET and return the body decoded as UTF-8.

    Args:
        url: Address to request.

    Returns:
        The response body as a string.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``).
        requests.RequestException: On connection failures or when the
            30-second timeout expires.
    """
    # Browser-like headers so the request resembles an ordinary page visit.
    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) appleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    }

    r = requests.get(url, timeout=30, headers=headers)
    r.raise_for_status()

    # Force UTF-8 when decoding r.text. The attribute name was previously
    # misspelled ("endcodding"), so the override silently had no effect.
    r.encoding = 'utf-8'

    return r.text



def get_content(url):
    """Download one page of replies and extract the fields of each comment.

    Args:
        url: Reply-API address. The body is parsed as JSON and the comments
            are read from ``data -> replies``.

    Returns:
        A list with one dict per comment, keyed by '用户名' (user name),
        'uid号' (user id), '评论内容' (comment text) and '性别' (sex).

    Raises:
        ValueError: If the body is not valid JSON ("jsonload error" is
            printed first).
        KeyError, TypeError: If the payload does not have the expected
            structure, e.g. ``replies`` is missing or null.
    """
    html = get_html(url)

    try:
        s = json.loads(html)
    except ValueError:
        # json.JSONDecodeError is a ValueError. Re-raise it instead of
        # falling through to a NameError on the unbound result.
        print("jsonload error")
        raise

    # One record per entry of the page's reply list.
    return [
        {
            '用户名': comment['member']['uname'],
            'uid号': comment['member']['mid'],
            '评论内容': comment['content']['message'],
            '性别': comment['member']['sex'],
        }
        for comment in s['data']['replies']
    ]

def Out2File(dict):
    """Append every comment record in *dict* to '评论区爬取.txt'.

    Args:
        dict: Iterable of comment records, each a mapping with the keys
            '用户名', 'uid号', '性别' and '评论内容'. The name shadows the
            builtin but is kept because it is part of the signature.

    A record that cannot be formatted or written is reported with
    "out2File error" and skipped; the remaining records are still written.
    A completion message is printed once all records have been processed.
    """
    with open('评论区爬取.txt', 'a+', encoding='utf-8') as f:
        for user in dict:
            try:
                f.write('姓名:{}\t uid:{}\t 性别:{}\t \n 评论内容:{}\t \n'.format(
                    user['用户名'], user['uid号'], user['性别'], user['评论内容']))
            except Exception:
                # Best-effort per record, but no longer a bare except, so
                # Ctrl-C / SystemExit are not swallowed here.
                print("out2File error")

        print('当前页面保存完成')





# Driver: fetch the reply pages one after another and append each page to the
# output file, until a page cannot be fetched/parsed or comes back empty.
page = 1

while True:
    url = "https://api.bilibili.com/x/v2/reply/main?&jsonp=jsonp&next=" + str(page) + "&type=1&oid=677870443&mode=3&plat=1&_=1641278727643"

    try:
        print()
        content = get_content(url)

        # An empty page means there is nothing further to save; without this
        # check an empty reply list would keep the loop running forever.
        if not content:
            print("no comments on page", page)
            break

        print("page:", page)
        Out2File(content)
    except Exception as exc:
        # Any failure ends the crawl; report the reason instead of hiding it.
        # Ctrl-C is deliberately not caught here.
        print("stopped at page", page, "-", repr(exc))
        break

    page = page + 1

    # To lower the risk of an IP ban, pause 5 seconds whenever the page
    # counter reaches a multiple of 10.
    if page % 10 == 0:
        time.sleep(5)

# b站评论区爬取
#
# 参考视频:https://www.bilibili.com/video/BV1fu411d7Hy?from=search&seid=3483579157564497530&spm_id_from=333.337.0.0
#
# 上一篇:异常处理2:异常处理方式、问区别的面试题汇总
#
# 下一篇:飞凌FCU1104嵌入式控制单元JAVA读取串口