爬虫Day04

正则爬取糗事百科图片

import requests
import re
import os
# Directory the downloaded images are written to; created at start-up if missing.
dir_name = './糗图'
os.makedirs(dir_name, exist_ok=True)

# Browser-like User-Agent sent with every request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'
}

# URL template for the paginated image ranking; %d is replaced by the page number.
url = 'https://www.qiushibaike.com/imgrank/page/%d/'

# Captures the src attribute of the <img> found inside each <div class="thumb">.
# re.S lets '.' span the newlines between the opening and closing tags.
IMG_SRC_PATTERN = re.compile(
    r'<div class="thumb">.*?<img src="(.*?)" alt=".*?</div>', re.S
)

# Seconds to wait for a response before giving up, so a stalled connection
# cannot hang the whole script.
REQUEST_TIMEOUT = 10


def extract_image_urls(page_text):
    """Return the absolute image URLs found in the thumb blocks of *page_text*.

    A src that does not already start with an http(s) scheme is prefixed with
    'https:' (the script expects protocol-relative '//host/path' values).
    An already absolute URL is returned unchanged.
    """
    return [
        src if src.startswith(('http://', 'https://')) else 'https:' + src
        for src in IMG_SRC_PATTERN.findall(page_text)
    ]


def download_image(src):
    """Download the image at *src* into ``dir_name`` and return its file name.

    Raises requests.RequestException on a network error or non-2xx status,
    and OSError if the file cannot be written.
    """
    response = requests.get(url=src, headers=headers, timeout=REQUEST_TIMEOUT)
    # Fail loudly instead of saving an HTML error page as an image file.
    response.raise_for_status()
    # The file name is the last path segment of the image URL.
    img_name = src.split('/')[-1]
    img_path = os.path.join(dir_name, img_name)
    with open(img_path, 'wb') as fp:
        fp.write(response.content)
    return img_name


def main():
    """Fetch ranking pages 1 and 2 and save every thumb image they contain."""
    for page_num in range(1, 3):
        # URL of the current page.
        new_url = url % page_num
        try:
            response = requests.get(
                url=new_url, headers=headers, timeout=REQUEST_TIMEOUT
            )
            response.raise_for_status()
        except requests.RequestException as err:
            print('failed to fetch page %s: %s' % (new_url, err))
            continue

        # Parse the page and download each image; one bad image must not
        # abort the rest of the run.
        for src in extract_image_urls(response.text):
            try:
                img_name = download_image(src)
            except (requests.RequestException, OSError) as err:
                print('failed to download %s: %s' % (src, err))
                continue
            # Report success only after the file has actually been written.
            print(img_name + 'success')


if __name__ == '__main__':
    main()
    上一篇:cgb2008-京淘day04


    下一篇:Day04内容总结稿子