Target site to scrape: https://sc.chinaz.com/jianli/free.html
Approach
1. Request each list page: the first page is free.html, while pages 2 through 5 follow the free_%d.html pattern, so the two cases have to be handled separately.
2. Parse each list page with lxml's etree and use the XPath //div[@id="container"]/div/a to collect every template's detail-page link plus its name (taken from the img tag's alt attribute).
3. Request each detail page and pull the first download link from //div[@id="down"]/div[2]/ul/li[1].
4. Request that download link and write the returned bytes to a local .zip file.
With the approach sorted out, straight to the code:
# -*- coding: utf-8 -*-
# @Time : 2021/7/20 10:13
# @Author : ArthurHuang
# @File : 10_xpath解析案例_站长素材中免费简历模板爬取.py
# @Software : PyCharm
import requests
from lxml import html
etree = html.etree  # in newer lxml versions, import etree this way
import os

if __name__ == "__main__":
    url = 'http://sc.chinaz.com/jianli/free_%d.html'
    # UA spoofing: wrap the matching User-Agent in a dict
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36"
    }
    for page in range(1, 6):  # loop over the first 5 pages, 20 templates per page
        if page == 1:  # the first page's URL differs from the rest, so handle it separately
            new_url = 'http://sc.chinaz.com/jianli/free.html'
        else:
            new_url = url % page
        page_text = requests.get(url=new_url, headers=headers).text
        # instantiate an etree object
        tree = etree.HTML(page_text)
        # create a folder to save the templates into
        if not os.path.exists('./jianliLibs'):
            os.mkdir('./jianliLibs')
        a_list = tree.xpath('//div[@id="container"]/div/a')
        for a in a_list:
            # get the template's name and use it as the file name
            all_titles = a.xpath('./img/@alt')[0] + '.zip'
            all_titles = all_titles.encode('iso-8859-1').decode('utf-8')  # general fix for garbled Chinese text
            # print(all_titles)
            # get the detail-page URL for each template
            all_href = 'https:' + a.xpath('./@href')[0]
            response = requests.get(url=all_href, headers=headers)
            resume_data = response.text
            resumetree = etree.HTML(resume_data)
            resume_download_list = resumetree.xpath('//div[@id="down"]/div[2]/ul/li[1]')
            # the download link on each template's detail page
            for download in resume_download_list:
                all_downloads = download.xpath('./a/@href')[0]
                # request the download URL and save the archive locally
                resume_rar_page = requests.get(url=all_downloads, headers=headers).content
                resume_path = 'jianliLibs/' + all_titles
                with open(resume_path, 'wb') as fp:
                    fp.write(resume_rar_page)
                print(all_titles, "downloaded successfully!")
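A note on the encode('iso-8859-1').decode('utf-8') line: it works because requests falls back to ISO-8859-1 when the response headers declare no charset, even though the page itself is UTF-8. An arguably cleaner fix, as a sketch assuming the site really does serve UTF-8, is to set the encoding on the response once, before reading .text:

    response = requests.get(url=new_url, headers=headers)
    response.encoding = 'utf-8'  # assumption: the page is UTF-8; overrides requests' ISO-8859-1 fallback
    page_text = response.text    # .text now decodes correctly, so no per-title re-decoding is needed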
Fetched successfully.
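One optional refinement: when looping over all five pages and a hundred download links, a single slow or failed request will crash the whole run. Below is a minimal hardening sketch; the fetch helper, retry count, delay, and timeout are all my own arbitrary choices for illustration, not anything the site or the original script specifies:

    import time
    import requests

    def fetch(url, headers, retries=3, delay=2):
        # retry a few times with a pause between attempts; both numbers are arbitrary
        for attempt in range(retries):
            try:
                resp = requests.get(url=url, headers=headers, timeout=10)
                resp.raise_for_status()  # surface HTTP errors (403/404/...) as exceptions
                return resp
            except requests.RequestException as e:
                print(f"attempt {attempt + 1} failed for {url}: {e}")
                time.sleep(delay)
        return None  # caller should skip this entry

Swapping requests.get for fetch in the loops above (and skipping any entry where it returns None) keeps one bad link from killing the run; adding a short time.sleep between downloads is also polite to the server.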