爬取背景:福建省发布了选考要求数据,想要获取数据进行分析,无奈数据量太大,手动复制不现实,因此编写爬虫自动获取。
需求分析:要爬取数据的网站为 http://fj.101.com/gaokao/#/,需要将数据存储为csv格式。
爬取代码如下
# coding=gbk
import requests # 引入爬虫所需的requests模块
from bs4 import BeautifulSoup # 引入BS模块
import json
import csv
# Scrape all 876 pages of Fujian subject-requirement data from the paged JSON API
# and append every record to fujian_data.csv.
#
# Fixes vs. the original:
#   * A non-200 response used to leave school_items = None, and json.loads(None)
#     then raised TypeError — failed pages are now reported and skipped.
#   * The CSV file was re-opened for every single row; it is now opened once.
#   * requests.get() had no timeout, so a stalled server would hang forever.
#   * Loop-invariant base_url/headers are built once, outside the loop.

base_url = ('https://wjt-subject-tool-api.sdp.101.com/v1/actions/manage'
            '?_=1567736178037&page={}&page_size=30&school_name=&subject_name=')
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36"
}

# Note: encoding="utf-8-sig" writes a BOM so Excel opens the Chinese text
# correctly; newline='' prevents blank lines between csv rows on Windows.
with open('fujian_data.csv', "a+", newline='', encoding="utf-8-sig") as f:
    writer = csv.writer(f)
    for page in range(1, 877):          # API pages are 1-based: 1..876
        target_url = base_url.format(page)
        response = requests.get(url=target_url, headers=headers, timeout=30)
        if response.status_code != 200:
            # Skip a failed page instead of crashing on json.loads(None).
            print('第' + str(page) + '页抓取失败,状态码:' + str(response.status_code))
            continue
        jsons = json.loads(response.content.decode("utf-8"))  # dict with an 'items' list
        for s in jsons['items']:
            # One school record per row: id, school, subject, detail,
            # first-choice subject, second-choice subject.
            writer.writerow([s['id'], s['school_name'], s['subject_name'],
                             s['subject_detail'], s['fsubject'], s['ssubject']])
        print('第' + str(page) + '页抓取完成')