Scraping the campus site

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# requires Python 3.6+ (uses f-strings)
import re
import uuid

import requests


def get_page(url):
    # send a GET request and return the Response object
    response = requests.get(url)
    return response

def parse_detail(html):
    # extract the direct video url from a detail page
    movie_url = re.findall('<source src="(.*?)">', html, re.S)
    if movie_url:
        return movie_url[0]


def save_video(content):
    # write the video bytes to a randomly named .mp4 file
    with open(f'{uuid.uuid4()}.mp4', 'wb') as f:
        f.write(content)
        print('saved..')
def parse_index(html):
    # findall returns every match
    # re.findall(pattern, text, flags)
    # re.S: '.' also matches newlines, so the whole text is searched
    detail_urls = re.findall(
        '<div class="items"><a class="imglink" href="(.*?)"',
        html, re.S)
    return detail_urls

if __name__ == '__main__':
    # these list page urls are built but not used; only the video index /v/ is crawled
    # for line in range(6):
    #     url = f'http://www.xiaohuar.com/list-3-{line}'
    url = 'http://www.xiaohuar.com/v/'
    response = get_page(url)
    # print(response)
    # print(response.status_code)
    # print(response.text)
    detail_urls = parse_index(response.text)

    # loop over the detail page urls
    for detail_url in detail_urls:
        # print(detail_url)
        # response = get_page(detail_url)
        # print(response.text)
        detail_res = get_page(detail_url)
        movie_url = parse_detail(detail_res.text)

        # if a video url was found, print it
        if movie_url:
            print(movie_url)
            # request the video url to get the raw video bytes
            movie_res = get_page(movie_url)
            # hand the binary stream to save_video to write it to disk
            save_video(movie_res.content)
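
As a quick offline check of the two regex helpers, they can be run against small hand-written HTML fragments. The fragments below are made up for illustration and only mirror the tag/attribute structure the regexes expect (they are not real page content); this assumes the parse_index and parse_detail functions defined above are in scope:

# hypothetical sample fragments, only the structure matters
sample_index_html = '<div class="items"><a class="imglink" href="http://www.xiaohuar.com/v/demo-123.html">demo</a></div>'
sample_detail_html = '<video controls><source src="http://www.xiaohuar.com/demo/demo.mp4"></video>'

print(parse_index(sample_index_html))    # ['http://www.xiaohuar.com/v/demo-123.html']
print(parse_detail(sample_detail_html))  # http://www.xiaohuar.com/demo/demo.mp4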

