1.爬取站长图片源码
# Crawl every "classic beauty" image from
# http://sc.chinaz.com/tupian/gudianmeinvtupian.html using a thread pool
# (network-bound work, so threads overlap the I/O waits).
import os
import time
import random
import requests
from lxml import etree
from multiprocessing.dummy import Pool

# Listing-page URLs: page 1 has no numeric suffix, pages 2-6 do.
url = 'http://sc.chinaz.com/tupian/gudianmeinvtupian.html'
page_url_list = [f'http://sc.chinaz.com/tupian/gudianmeinvtupian_{i}.html' for i in range(2, 7)]
page_url_list.insert(0, url)

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.20 Safari/537.36',
    # 'Content-Encoding':'gzip',
    # 'Content-Type': 'text/html',
}

# Image URLs collected from all listing pages.
# list.append is atomic under the GIL, so concurrent workers may share it.
pig_url_list = []


def get_pig_url(url):
    """Parse one listing page and append every image URL found to pig_url_list."""
    response = requests.get(url=url, headers=headers)
    tree = etree.HTML(response.content.decode())
    div_list = tree.xpath('//div[@id="container"]/div')
    for div in div_list:
        # The site lazy-loads images: the real URL is in @src2, not @src.
        pig_url = div.xpath('.//img/@src2')[0]
        pig_url_list.append(pig_url)


def fetch_and_save(url):
    """Download one image and save it under its original filename.

    Naming the file after the URL basename (instead of a random number)
    removes the silent-overwrite risk of random-name collisions, and
    downloading-then-saving per URL keeps only one image in memory at a
    time instead of the whole corpus.
    """
    data = requests.get(url=url, headers=headers).content
    # Fallback name in the unlikely case the URL ends with '/'.
    name = url.split('/')[-1] or f'{abs(hash(url))}.jpg'
    path = os.path.join('zhanzhangpig', name)
    with open(path, 'wb') as f:
        f.write(data)


if not os.path.exists('zhanzhangpig'):
    os.makedirs('zhanzhangpig')

print('多线程爬取开始')
start_time = time.time()
pool = Pool(8)
pool.map(get_pig_url, page_url_list)      # phase 1: collect image URLs
pool.map(fetch_and_save, pig_url_list)    # phase 2: download and persist
# Shut the pool down before reporting timing, so the elapsed figure
# covers all worker activity.
pool.close()
pool.join()
end_time = time.time()
print('多线程爬取结束')
print('耗时:', end_time - start_time)
2.爬取妹子网图片源码(https://www.mzitu.com/tag/ugirls/)
# Crawl images from https://www.mzitu.com/tag/ugirls/ with a thread pool.
import os
import time
import random
import requests
from lxml import etree
from multiprocessing.dummy import Pool

# One session reuses the TCP connection and keeps cookies across requests.
session = requests.session()

if not os.path.exists('meizitu'):
    os.makedirs('meizitu')

# Listing-page URLs: page 1 has no /page/N/ suffix, pages 2-16 do.
url = 'https://www.mzitu.com/tag/ugirls/'
page_url_list = [f'https://www.mzitu.com/tag/ugirls/page/{i}/' for i in range(2, 17)]
page_url_list.insert(0, url)

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36',
    'Upgrade-Insecure-Requests': '1',
    'Referer': 'https://www.mzitu.com/tag/ugirls/'  # anti-scraping: the site checks the request's origin page
}

# Image URLs collected from all listing pages.
# list.append is atomic under the GIL, so concurrent workers may share it.
pig_url_list = []


def get_pig_url(url):
    """Parse one listing page and append every image URL found to pig_url_list."""
    response = session.get(url=url, headers=headers)
    tree = etree.HTML(response.content.decode())
    li_list = tree.xpath('//ul[@id="pins"]/li')
    for li in li_list:
        # The site lazy-loads images: the real URL is in @data-original.
        pig_url = li.xpath('.//img/@data-original')[0]
        pig_url_list.append(pig_url)


def fetch_and_save(url):
    """Download one image and save it under its original filename.

    Naming the file after the URL basename (instead of a random number)
    removes the silent-overwrite risk of random-name collisions, and
    downloading-then-saving per URL keeps only one image in memory at a
    time instead of the whole corpus.
    """
    data = session.get(url=url, headers=headers).content
    # Fallback name in the unlikely case the URL ends with '/'.
    name = url.split('/')[-1] or f'{abs(hash(url))}.jpg'
    path = os.path.join('meizitu', name)
    with open(path, 'wb') as f:
        f.write(data)


print('多线程爬取开始')
start_time = time.time()
pool = Pool(10)
pool.map(get_pig_url, page_url_list)      # phase 1: collect image URLs
pool.map(fetch_and_save, pig_url_list)    # phase 2: download and persist
# Shut the pool down before reporting timing, so the elapsed figure
# covers all worker activity.
pool.close()
pool.join()
end_time = time.time()
print('多线程爬取结束')
print('耗时:', end_time - start_time)

# -------------------- count the files actually saved -----------------
print(len(os.listdir('./meizitu')))
!!!384张美图等你拿