Crawling with a thread pool and callback functions
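
The idea: the main thread fetches the pearvideo.com home page and extracts the video IDs, then submits each detail-page request to a ThreadPoolExecutor. add_done_callback chains the follow-up work onto each finished request, so parsing the detail page, requesting the real video URL, and saving the file all run in the pool's worker threads and overlap instead of executing one after another.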

from concurrent.futures import ThreadPoolExecutor
import requests
import re
import uuid

# Thread pool shared by every request; 200 worker threads
pool = ThreadPoolExecutor(200)


# 1. Send a GET request and return the response
def get_page(url):
    response = requests.get(url)
    return response


# 2. Parse the home page and extract the video IDs
def parse_index(response):
    id_list = re.findall(
        '<a href="video_(.*?)".*?>',
        response.text,
        re.S
    )
    return id_list


# 3. Parse the video detail page to get the real video URL
def parse_detail(res):
    response = res.result()
    movie_detail_url = re.findall('srcUrl="(.*?)"', response.text, re.S)[0]
    print(f'Sending request to video URL: {movie_detail_url} ...')

    # Asynchronously request the real video URL and hand the result to save_movie (the callback)
    pool.submit(get_page, movie_detail_url).add_done_callback(save_movie)
    return movie_detail_url


# 4. Take the finished response for the real video URL and save the data to a local file
def save_movie(res):
    movie_response = res.result()
    name = str(uuid.uuid4())
    print(f'Saving video {name}.mp4 ...')
    with open(f'{name}.mp4', 'wb') as f:
        f.write(movie_response.content)
    print('Video download finished!')
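

# A hedged alternative (not in the original script): response.content keeps the
# whole video in memory before writing it out. For large files, requests can
# stream the body instead, provided get_page is also changed to call
# requests.get(url, stream=True).
def save_movie_streamed(res):
    movie_response = res.result()
    name = str(uuid.uuid4())
    with open(f'{name}.mp4', 'wb') as f:
        # Write the body in 64 KB chunks instead of one large bytes object
        for chunk in movie_response.iter_content(chunk_size=64 * 1024):
            f.write(chunk)
    print(f'{name}.mp4 saved.')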


if __name__ == '__main__':

    # 1. Fetch the home page
    index_response = get_page('https://www.pearvideo.com/')

    # 2. Parse the home page to get all the video IDs
    id_list = parse_index(index_response)
    print(id_list)
    # 3. Build the detail-page URL for each video ID
    for video_id in id_list:
        print(video_id)
        detail_url = 'https://www.pearvideo.com/video_' + video_id

        # Submit the detail-page fetch asynchronously; the response is handed to parse_detail (the callback)
        pool.submit(get_page, detail_url).add_done_callback(parse_detail)
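
One caveat about the callback pattern used above: inside a callback, res.result() re-raises any exception raised in the worker (for example a failed request), and the regex lookup on the detail page may return an empty list, which would make the [0] index fail. A minimal defensive sketch, assuming the same pool, get_page and save_movie defined above (parse_detail_safe is a hypothetical name, not part of the original script):

def parse_detail_safe(res):
    try:
        # result() returns the response, or re-raises whatever get_page raised
        response = res.result()
        matches = re.findall('srcUrl="(.*?)"', response.text, re.S)
        if not matches:
            print('No srcUrl found on the detail page, skipping.')
            return
        # Chain the actual video download onto the pool, as the original does
        pool.submit(get_page, matches[0]).add_done_callback(save_movie)
    except Exception as exc:
        print(f'Detail page request failed: {exc}')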