2021-10-02

Python爬虫获取腾讯(tx)收集到的关于疫情的数据

提前说明:在B站看到关于这个的视频之后跟着打的代码:
https://www.bilibili.com/video/BV177411j7qJ?spm_id_from=333.999.0.0

import json
import time
import urllib.request as rq
# from bs4 import BeautifulSoup


# Endpoints passed to get_history() at the bottom of the file:
# url_today is used for the current-day detail data, url_last for the history data.
url_today = "https://view.inews.qq.com/g2/getOnsInfo?name=disease_h5"    # address the data is fetched from (Tencent)
url_last = "https://view.inews.qq.com/g2/getOnsInfo?name=disease_other"

def gethtml(url, timeout=10):
    """Fetch *url* and return the response body decoded as UTF-8 text.

    A browser-like ``User-Agent`` header is sent with the request; the
    original author added it to get past anti-crawler checks.

    :param url: address to request.
    :param timeout: seconds passed to ``urlopen`` as its blocking-operation
        timeout, so a stalled server cannot hang the caller forever.
    :return: the response body as a ``str``.
    :raises urllib.error.URLError: propagated from ``urlopen`` when the
        request fails.
    :raises UnicodeDecodeError: if the body is not valid UTF-8.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"
    }
    req = rq.Request(url, headers=headers)
    # The response object is a context manager; leaving the ``with`` block
    # closes it even when read() or decode() raises.
    with rq.urlopen(req, timeout=timeout) as res:
        return res.read().decode("utf-8")

def _load_payload(url):
    """Download *url* and return the decoded content of its ``data`` field."""
    envelope = json.loads(gethtml(url))    # original notes: keys are 'ret' and 'data'
    # envelope["data"] is itself a JSON string, hence the second parse.
    return json.loads(envelope["data"])


def _iso_date(month_day, year):
    """Convert a ``"MM.DD"`` string plus *year* into ``"YYYY-MM-DD"``."""
    parsed = time.strptime("{}.{}".format(year, month_day), "%Y.%m.%d")
    return time.strftime("%Y-%m-%d", parsed)


def _collect_history(data_last, year):
    """Merge ``chinaDayList`` and ``chinaDayAddList`` into one dict keyed by date."""
    history_data = {}
    for record in data_last["chinaDayList"]:    # cumulative totals per day
        history_data[_iso_date(record["date"], year)] = {
            "confirm": record["confirm"],    # total confirmed cases
            "suspect": record["suspect"],    # total suspected cases
            "dead": record["dead"],          # total deaths
            "heal": record["heal"],          # total recoveries
        }

    for record in data_last["chinaDayAddList"]:    # daily increments
        # setdefault instead of plain indexing: a date present only in the
        # increment list gets its own entry rather than raising KeyError.
        history_data.setdefault(_iso_date(record["date"], year), {}).update({
            "confirm_add": record["confirm"],    # newly confirmed cases
            "suspect_add": record["suspect"],    # newly suspected cases
            "dead_add": record["dead"],          # new deaths
            "heal_add": record["heal"],          # new recoveries
        })
    return history_data


def _collect_details(data_today):
    """Flatten the first ``areaTree`` entry into one row per city."""
    # Layout read below (from the original author's notes):
    #   areaTree[i]: name / today / total / children   (country level)
    #     children[j]: name / today / total / children (province level)
    #       children[k]: name / today / total          (city level)
    details = []
    update_time = data_today["lastUpdateTime"]
    # Index 0 is China according to the original author's notes — TODO confirm.
    provinces = data_today["areaTree"][0]["children"]
    for province in provinces:
        province_name = province["name"]
        for city in province["children"]:
            details.append([
                update_time,
                province_name,
                city["name"],
                city["total"]["confirm"],    # total confirmed
                city["today"]["confirm"],    # newly confirmed
                city["total"]["heal"],       # total recovered
                city["total"]["dead"],       # total deaths
            ])
    return details


def get_history(url_last, url_today, year="2021"):
    """Fetch and reshape the history feed and the current-day detail feed.

    :param url_last: URL of the history feed; its decoded ``data`` must
        contain ``chinaDayList`` and ``chinaDayAddList``.
    :param url_today: URL of the current-day feed; its decoded ``data`` must
        contain ``lastUpdateTime`` and ``areaTree``.
    :param year: year prefixed to every ``"MM.DD"`` date in the history feed
        (the feed dates carry only month and day). Defaults to the value the
        original code hard-coded.
        NOTE(review): every record gets the same year, so a feed spanning
        more than one year would collapse onto the same keys — confirm the
        feed's range.
    :return: ``(history_data, details)``. ``history_data`` maps
        ``"YYYY-MM-DD"`` to a dict with ``confirm``/``suspect``/``dead``/
        ``heal`` and the matching ``*_add`` keys. ``details`` is a list of
        ``[update_time, province, city, confirm, confirm_add, heal, dead]``.
    :raises KeyError: if an expected key is missing from either feed.
    :raises ValueError: if a date does not parse as ``"%Y.%m.%d"``.
    """
    data_last = _load_payload(url_last)      # history data
    data_today = _load_payload(url_today)    # current-day data
    return _collect_history(data_last, year), _collect_details(data_today)


# Run the scrape once at module level (this also happens on import);
# the returned (history_data, details) tuple is discarded here.
get_history(url_last, url_today)
上一篇:Java 7: 全面教程-第一章节: Java初体验


下一篇:C# 11编程练习