"""Download the videos listed on one Pear Video category page.

The script scrapes the category listing, asks the site's AJAX endpoint for
each video's source address, rewrites that address into the real download
URL, and then downloads the files concurrently with a small thread pool.
"""
import os
import random
import re
from multiprocessing.dummy import Pool

import requests
from lxml import etree

# Listing page that is scraped for video entries.
CATEGORY_URL = "https://www.pearvideo.com/category_59"
# AJAX endpoint returning per-video metadata (found with a packet-capture tool).
VIDEO_STATUS_URL = "https://www.pearvideo.com/videoStatus.jsp?"
# Directory the downloaded files are written to.
VIDEO_DIR = "./video"
# Seconds to wait for a connection / between received bytes before giving up.
REQUEST_TIMEOUT = 30
# Size of the pieces a video is streamed to disk in.
CHUNK_SIZE = 64 * 1024
# Number of concurrent download threads.
POOL_SIZE = 4

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4455.2 Safari/537.36"
}

# Characters that are path separators or are rejected in file names on
# common file systems; video titles are free text and may contain them.
_UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]')


def _safe_filename(title):
    """Return *title* with characters that are unsafe in a file name replaced."""
    cleaned = _UNSAFE_FILENAME_CHARS.sub("_", title).strip()
    return cleaned or "video"


def _real_video_url(src_url, video_num):
    """Turn the address returned by the AJAX endpoint into the real one.

    The address from the endpoint is obfuscated: the first dash-separated
    field of the file name has to be replaced by ``cont-<video id>``.

    Real:   https://video.pearvideo.com/mp4/third/20201120/cont-1708144-10305425-222728-hd.mp4
    Fake:   https://video.pearvideo.com/mp4/third/20201120/1606132035863-10305425-222728-hd.mp4

    Only the file name (the last path segment) is rewritten, so the result
    does not depend on how many directories precede it.

    Raises:
        ValueError: if *src_url* has no ``/`` or its file name has no ``-``.
    """
    directory, slash, filename = src_url.rpartition("/")
    _, dash, remainder = filename.partition("-")
    if not slash or not dash:
        raise ValueError("unexpected video address: " + src_url)
    return directory + "/cont-" + video_num + "-" + remainder


def _collect_videos():
    """Scrape the category page and return ``[{"name": ..., "url": ...}, ...]``.

    Entries whose markup or metadata is not in the expected shape are
    reported and skipped instead of aborting the whole run.
    """
    response = requests.get(url=CATEGORY_URL, headers=headers, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    tree = etree.HTML(response.text)
    if tree is None:
        return []

    videos = []  # name and link of every video found
    for li in tree.xpath('//*[@id="listvideoList"]/ul/li'):
        hrefs = li.xpath('./div/a/@href')
        titles = li.xpath("./div/a/div[2]/text()")
        if not hrefs or not titles:
            continue

        video_id = hrefs[0]  # e.g. video_1727785
        id_parts = video_id.split('_')
        if len(id_parts) < 2:
            continue
        video_num = id_parts[1]  # the numeric part of the id

        params = {
            'contId': video_num,
            'mrd': str(random.random()),  # random number
        }
        # With a Referer pointing at the video page the endpoint returns the
        # dictionary holding the video address instead of a "removed" notice.
        video_headers = {
            "User-Agent": headers["User-Agent"],
            'Referer': 'https://www.pearvideo.com/' + video_id,
        }
        try:
            status = requests.get(
                url=VIDEO_STATUS_URL,
                headers=video_headers,
                params=params,
                timeout=REQUEST_TIMEOUT,
            )
            status.raise_for_status()
            src_url = status.json()["videoInfo"]["videos"]["srcUrl"]
            video_true_url = _real_video_url(src_url, video_num)
        except (requests.RequestException, KeyError, TypeError, ValueError) as exc:
            print(video_id + " skipped: " + repr(exc))
            continue

        videos.append({
            "name": _safe_filename(titles[0]) + '.mp4',
            "url": video_true_url,
        })
    return videos


def get_video_data(dic):
    """Download one video described by *dic* into ``VIDEO_DIR``.

    Args:
        dic: mapping with the keys ``"url"`` (download address) and
            ``"name"`` (target file name).

    The body is streamed to a ``.part`` file and renamed on success, so an
    interrupted download never leaves a truncated ``.mp4`` behind. Network
    and file errors are reported and swallowed so that one failure does not
    cancel the other downloads running in the pool.
    """
    url = dic["url"]
    name = dic["name"]
    target = os.path.join(VIDEO_DIR, name)
    partial = target + ".part"

    print(name + "正在下载。。。。。。")
    try:
        with requests.get(url=url, headers=headers, stream=True,
                          timeout=REQUEST_TIMEOUT) as response:
            response.raise_for_status()
            with open(partial, "wb") as fp:
                for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
                    fp.write(chunk)
        os.replace(partial, target)
    except (requests.RequestException, OSError) as exc:
        try:
            os.remove(partial)
        except OSError:
            pass  # nothing was written, or it is already gone
        print(name + "下载失败: " + str(exc))
        return
    print(name + "下载成功!!!")


def main():
    """Create the output directory, collect the video list and download it."""
    # Create the directory the videos are stored in.
    os.makedirs(VIDEO_DIR, exist_ok=True)

    urls = _collect_videos()

    # Request the video data through a thread pool.
    pool = Pool(POOL_SIZE)
    try:
        pool.map(get_video_data, urls)
    finally:
        pool.close()
        pool.join()


if __name__ == '__main__':
    main()