"""Download the videos listed on a Pearvideo category page.

The category page is fetched and parsed for each video's detail-page link
and title, each detail page is searched for the direct video address, and
the video files are then downloaded concurrently with a thread pool.
"""
import os
import re
from multiprocessing.dummy import Pool

import requests
from lxml import etree

# Goal: scrape the video data from Pearvideo.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
}
# Principle: the thread pool is meant for blocking, relatively slow operations.

# (connect, read) timeouts in seconds; without them a stalled server would
# block the script (or a worker thread) indefinitely.
REQUEST_TIMEOUT = (10, 60)
# Number of bytes written to disk per iteration while streaming a video.
CHUNK_SIZE = 64 * 1024
# Locates the direct video address inside a detail page.
SRC_URL_PATTERN = re.compile(r'srcUrl="(.*?)",vdoUrl')
# Characters that cannot be used in a file name on common file systems.
_INVALID_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]')


def _safe_filename(title):
    """Return *title* with characters that are invalid in file names replaced by '_'."""
    return _INVALID_FILENAME_CHARS.sub('_', title)


def get_video_data(dic):
    """Download one video and store it in the current working directory.

    The response body is streamed into a temporary ``.part`` file that is
    renamed to its final name only after the download completed, so an
    interrupted transfer never leaves a truncated ``.mp4`` behind.

    Args:
        dic: Mapping with the keys ``'name'`` (target file name) and
            ``'url'`` (direct address of the video file).

    Returns:
        None. Network and file-system failures are reported on stdout
        instead of being raised, so that one broken download does not
        abort the remaining downloads in the pool.
    """
    url = dic['url']
    print(dic['name'], '正在下载......')
    # The name originates from scraped page text, so strip path separators
    # and other characters that would make open() fail or leave the cwd.
    file_name = _safe_filename(dic['name'])
    part_name = file_name + '.part'
    try:
        with requests.get(url=url, headers=headers,
                          timeout=REQUEST_TIMEOUT, stream=True) as response:
            response.raise_for_status()
            # Persist the data chunk by chunk rather than buffering the
            # whole video in memory.
            with open(part_name, 'wb') as fp:
                for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
                    fp.write(chunk)
        os.replace(part_name, file_name)
    except (requests.RequestException, OSError) as err:
        # Best-effort cleanup of the incomplete file; it may not exist yet.
        try:
            os.remove(part_name)
        except OSError:
            pass
        # User-facing message: "download failed".
        print(dic['name'], '下载失败:', err)
        return
    print(dic['name'], '下载成功!')


# Request the URL below and parse out each video's detail-page URL and name.
url = 'https://www.pearvideo.com/category_5'
list_response = requests.get(url=url, headers=headers, timeout=REQUEST_TIMEOUT)
list_response.raise_for_status()
page_text = list_response.text
tree = etree.HTML(page_text)
# Guard against a parser result of None so an unusable page yields no items
# instead of an AttributeError.
li_list = tree.xpath('//ul[@id="listvideoListUl"]/li') if tree is not None else []
urls = []  # Stores the link and the name of every video.
for li in li_list:
    hrefs = li.xpath('./div/a/@href')
    titles = li.xpath('./div/a/div[2]/text()')
    if not hrefs or not titles:
        # List item without a link or a title: nothing to download.
        continue
    detail_url = 'https://www.pearvideo.com/' + hrefs[0]
    name = titles[0] + '.mp4'
    # Request the detail page.
    try:
        detail_response = requests.get(url=detail_url, headers=headers,
                                       timeout=REQUEST_TIMEOUT)
        detail_response.raise_for_status()
    except requests.RequestException as err:
        # User-facing message: "detail page request failed, skipped".
        print(name, '详情页请求失败,已跳过:', err)
        continue
    # Parse the video address (URL) out of the detail page.
    match = SRC_URL_PATTERN.search(detail_response.text)
    if match is None:
        # User-facing message: "video address not found, skipped".
        print(name, '未找到视频地址,已跳过')
        continue
    urls.append({
        'name': name,
        'url': match.group(1),
    })

# Use the thread pool for the video requests (slow, blocking operations).
pool = Pool(4)
try:
    pool.map(get_video_data, urls)
finally:
    # Always release the worker threads, even if map() raised.
    pool.close()
    pool.join()