线程池练习

import os
import re
from multiprocessing.dummy import Pool
from urllib.parse import urljoin

import requests
from lxml import etree
 5 #需求:爬取梨视频的视频数据
 6 headers = {
 7     'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
 8 }
 9 #原则:线程池处理的是阻塞且较为耗时的操作
10 
11 #对下述url发起请求解析出视频详情页的url和视频的名称
12 url = 'https://www.pearvideo.com/category_5'
13 page_text = requests.get(url=url,headers=headers).text
14 tree = etree.HTML(page_text)
15 li_list = tree.xpath('//ul[@id="listvideoListUl"]/li')
16 urls = [] #存储所有视频的链接and名字
17 for li in li_list:
18     detail_url = 'https://www.pearvideo.com/'+li.xpath('./div/a/@href')[0]
19     name = li.xpath('./div/a/div[2]/text()')[0]+'.mp4'
20     #对详情页的url发起请求
21     detail_page_text = requests.get(url=detail_url,headers=headers).text
22     #从详情页中解析出视频的地址(url)
23     ex = 'srcUrl="(.*?)",vdoUrl'
24     video_url = re.findall(ex,detail_page_text)[0]
25     dic = {
26         'name':name,
27         'url':video_url
28     }
29     urls.append(dic)
# Request one video link and persist the binary video data to disk.
def get_video_data(dic):
    """Download the video described by *dic* into the current directory.

    Args:
        dic: Mapping with the keys 'url' (address of the video file) and
            'name' (file name the video is saved under).

    Returns:
        None. Progress, success and failure are reported on stdout. A failed
        download does not raise, so one bad video cannot cancel the other
        downloads that share the same pool.
    """
    url = dic['url']
    # Data is written to a side file first and renamed only once complete, so
    # a file called dic['name'] is never a truncated download.
    part_path = dic['name'] + '.part'
    print(dic['name'], '正在下载......')
    try:
        # stream=True avoids holding the whole video in memory; the timeout
        # keeps a stalled server from blocking the worker forever.
        with requests.get(url=url, headers=headers, stream=True, timeout=30) as response:
            # Refuse to save an HTTP error page as if it were a video.
            response.raise_for_status()
            # Persist the data chunk by chunk.
            with open(part_path, 'wb') as fp:
                for chunk in response.iter_content(chunk_size=256 * 1024):
                    fp.write(chunk)
        os.replace(part_path, dic['name'])
    except (requests.RequestException, OSError) as exc:
        print(dic['name'], '下载失败:', exc)
        return
    # Reported only after the file has been closed and renamed.
    print(dic['name'], '下载成功!')
# Request the video data through a thread pool, because every download is a
# slow, blocking operation.
pool = Pool(4)
try:
    pool.map(get_video_data, urls)
finally:
    # Always shut the pool down and wait for its workers, even if map() raised.
    pool.close()
    pool.join()

 

上一篇:python常用数据转换


下一篇:华为OBS上传,与modelart添加标签--python