Fetching the COVID-19 data collected by Tencent (tx) with a Python crawler
A note up front: this is code I typed along while following a video on the topic on Bilibili (b站):
https://www.bilibili.com/video/BV177411j7qJ?spm_id_from=333.999.0.0
import urllib.request as rq
# from bs4 import BeautifulSoup
import json
import time  # used below to reformat the date strings

url_today = "https://view.inews.qq.com/g2/getOnsInfo?name=disease_h5"  # Tencent endpoint for the current-day snapshot
url_last = "https://view.inews.qq.com/g2/getOnsInfo?name=disease_other"  # Tencent endpoint for the historical series
def gethtml(url):
"""
:return: 输入url,返回对应的html数据
"""
headers = { #设置请求头,防止反爬
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"
}
req = rq.Request(url, headers=headers) #这两行也可以使用requests,requests可以直接requests.get(url, headers)
res = rq.urlopen(req)
html = res.read().decode("utf-8") # 设置字符格式是utf-8
return html
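
# As the comment in gethtml notes, the third-party requests package can do the
# same fetch in one call. A minimal sketch (assumption: requests is installed;
# this helper is illustrative and is not used below):
def gethtml_requests(url):
    import requests  # imported locally so the script still runs without requests
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"
    }
    return requests.get(url, headers=headers).text
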
def get_history(url_last, url_today):
"""
:return: 输入url,历史数据和当日的详细数据
"""
# 历史数据
html_last = gethtml(url_last)
# dict_keys(['ret', 'data'])
data_last = json.loads(html_last) # json字符串转字典
# print("-------------")
# print(data_last.keys())
# print("-------------")
# dict_keys(['cityStatis', 'chinaDayList', 'chinaDayAddList', 'provinceCompare', 'nowConfirmStatis', 'statisGradeCityDetail', 'dailyNewAddHistory', 'dailyHistory', 'wuhanDayList', 'articleList'])
data_last = json.loads(data_last["data"])
# print("-------------")
# print(data_last.keys())
# print("-------------")
# 当天数据
html_today = gethtml(url_today) #
data_today = json.loads(html_today)
# dict_keys(['lastUpdateTime', 'chinaTotal', 'chinaAdd', 'isShowAdd', 'showAddSwitch', 'areaTree'])
data_today = json.loads(data_today["data"])
# 数据爬取完成
# ---------------------------------------------------------- #
    # process the crawled historical data
    history_data = {}
    for every_data in data_last["chinaDayList"]:  # list of daily cumulative figures
        date = "2021." + every_data["date"]  # "date" holds only month and day; prepend a year before storing it (NB: this hardcodes 2021 for every row)
        tup = time.strptime(date, "%Y.%m.%d")
        date = time.strftime("%Y-%m-%d", tup)  # reformat, otherwise inserting into the database fails (the column is a datetime type)
        confirm = every_data["confirm"]  # cumulative confirmed cases
        suspect = every_data["suspect"]  # cumulative suspected cases
        dead = every_data["dead"]  # cumulative deaths
        heal = every_data["heal"]  # cumulative recoveries
        # new dict keyed by date
        history_data[date] = {"confirm": confirm, "suspect": suspect, "dead": dead, "heal": heal}
    for every_data_add in data_last["chinaDayAddList"]:  # list of daily new cases
        date_add = "2021." + every_data_add["date"]
        tup_add = time.strptime(date_add, "%Y.%m.%d")
        date_add = time.strftime("%Y-%m-%d", tup_add)  # reformat the date the same way
        confirm_add = every_data_add["confirm"]  # newly confirmed cases
        suspect_add = every_data_add["suspect"]  # new suspected cases
        dead_add = every_data_add["dead"]  # new deaths
        heal_add = every_data_add["heal"]  # new recoveries
        # update() merges the new key-value pairs into the existing entry for that date
        history_data[date_add].update({"confirm_add": confirm_add, "suspect_add": suspect_add, "dead_add": dead_add, "heal_add": heal_add})
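    # After both loops, each entry has this shape (values illustrative, shown as ...):
    # history_data["2021-01-31"] == {"confirm": ..., "suspect": ..., "dead": ..., "heal": ...,
    #                                "confirm_add": ..., "suspect_add": ..., "dead_add": ..., "heal_add": ...}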
    '''
    areaTree: name        data for China (country level)
              today
              total
              children: - name      province-level data
                        - today
                        - total
                        - children: - name    city-level data
                                    - today
                                    - total
    '''
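    # e.g. data_today["areaTree"][0]["children"][0]["children"][0]["total"]["confirm"]
    # drills down country -> province -> city, exactly as the loops below do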
    # process the crawled real-time details
    details = []
    update_time = data_today["lastUpdateTime"]
    data_country = data_today["areaTree"]  # a list of countries (25 at the time of crawling)
    # print(data_country[0]["children"])
    data_province = data_country[0]["children"]  # index 0 is China; "children" holds its provinces
    # print(data_province)
    for pro_infos in data_province:
        province = pro_infos["name"]  # province name
        for city_infos in pro_infos["children"]:  # each city in the province
            city = city_infos["name"]
            confirm = city_infos["total"]["confirm"]  # cumulative confirmed cases
            confirm_add = city_infos["today"]["confirm"]  # newly confirmed cases
            heal = city_infos["total"]["heal"]  # cumulative recoveries
            dead = city_infos["total"]["dead"]  # cumulative deaths
            details.append([update_time, province, city, confirm, confirm_add, heal, dead])
    return history_data, details
history_data, details = get_history(url_last, url_today)
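The comments in get_history assume the results end up in MySQL (the date column is a datetime type). A minimal sketch of that final step, assuming pymysql is installed and a database cov with matching history and details tables already exists; every name here is illustrative rather than taken from the video:

import pymysql

def save_to_db(history_data, details):
    # connection parameters are placeholders; adjust to your own MySQL setup
    conn = pymysql.connect(host="localhost", user="root", password="your_password",
                           database="cov", charset="utf8")
    cursor = conn.cursor()
    # history: one row per date; .get() tolerates dates missing from chinaDayAddList
    for date, d in history_data.items():
        cursor.execute(
            "INSERT INTO history VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s)",
            (date, d["confirm"], d.get("confirm_add"), d["suspect"], d.get("suspect_add"),
             d["dead"], d.get("dead_add"), d["heal"], d.get("heal_add")))
    # details: one row per (update_time, province, city) snapshot
    cursor.executemany(
        "INSERT INTO details VALUES (%s,%s,%s,%s,%s,%s,%s)", details)
    conn.commit()
    cursor.close()
    conn.close()

# save_to_db(history_data, details)  # uncomment once the tables exist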