# Scrape images from Qiushibaike (糗事百科) using regular expressions.
import requests
import re
import os
# Directory that downloaded images are written to (relative to the current
# working directory).
dir_name = './糗图'
# exist_ok=True makes this a no-op when the directory is already there and
# avoids the race between a separate "exists" check and the creation call.
os.makedirs(dir_name, exist_ok=True)
# Browser-like User-Agent sent with every request.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'
}
# URL template for the listing pages; %d is replaced by the page number.
PAGE_URL_TEMPLATE = 'https://www.qiushibaike.com/imgrank/page/%d/'
# Captures the src attribute of the <img> inside each <div class="thumb">.
# re.S lets '.' match the newlines between the tags. Compiled once here
# instead of being re-specified for every page.
IMG_SRC_PATTERN = re.compile(
    r'<div class="thumb">.*?<img src="(.*?)" alt=".*?</div>', re.S
)
# Seconds to wait on a request before giving up; without a timeout
# requests.get() can block indefinitely on an unresponsive server.
REQUEST_TIMEOUT = 10


def extract_image_urls(page_text):
    """Return the image URLs found in the HTML of one listing page.

    Args:
        page_text: HTML source of a listing page.

    Returns:
        A list of URLs in document order. Every captured ``src`` value is
        prefixed with ``https:``, so the code assumes the page uses
        protocol-relative (``//host/path``) image links.
    """
    return ['https:' + src for src in IMG_SRC_PATTERN.findall(page_text)]


def download_images(target_dir, first_page=1, last_page=2):
    """Download all images from a range of listing pages into *target_dir*.

    Args:
        target_dir: Existing directory the image files are written to.
        first_page: Number of the first listing page to fetch.
        last_page: Number of the last listing page to fetch (inclusive).

    Raises:
        requests.RequestException: If a listing page cannot be fetched.
            A failing individual image is reported and skipped instead.
    """
    for page_num in range(first_page, last_page + 1):
        # URL of the listing page for this page number.
        page_url = PAGE_URL_TEMPLATE % page_num
        # Fetch the whole listing page.
        page = requests.get(
            url=page_url, headers=HEADERS, timeout=REQUEST_TIMEOUT
        )
        page.raise_for_status()
        # Parse out every image URL on the page and download each one.
        for src in extract_image_urls(page.text):
            # The file name is the last path segment of the image URL.
            img_name = src.split('/')[-1]
            try:
                response = requests.get(
                    url=src, headers=HEADERS, timeout=REQUEST_TIMEOUT
                )
                # Do not save an HTTP error page as if it were an image.
                response.raise_for_status()
            except requests.RequestException as err:
                print(img_name + ' failed: ' + str(err))
                continue
            img_path = os.path.join(target_dir, img_name)
            with open(img_path, 'wb') as fp:
                fp.write(response.content)
            # Report success only once the file has actually been written.
            print(img_name + 'success')


if __name__ == '__main__':
    download_images(dir_name)