day01

import re
import uuid

import requests
"""
爬取校花网
    1、请求url
        www.xiaohuar.com/v
    2、请求方式
        get
    3、User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36
        
"""
# The three steps of a crawler
# 1. Send the request
def get_page(url):
    # Send the User-Agent from the notes above so the site sees a normal browser
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"}
    response = requests.get(url, headers=headers)
    return response
# 2. Parse the data
def parse_index(html):
    # Collect every detail-page link on an index page
    detail_urls = re.findall(' <div class="items"><a class="imglink" href="(.*?)"', html, re.S)
    return detail_urls

def parse_detail(html):
    # Pull the video URL out of a detail page, if the page embeds one
    movie_url = re.findall('<source src="(.*?)">', html, re.S)
    if movie_url:
        return movie_url[0]
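
# A minimal sketch of how the two parsers behave, run against made-up HTML
# fragments (the real page markup is only assumed from the regexes above).
# This helper is illustrative and never called by the crawler:
def _demo_parsers():
    index_html = ' <div class="items"><a class="imglink" href="http://example.com/detail-1.html"'
    assert parse_index(index_html) == ["http://example.com/detail-1.html"]
    detail_html = '<source src="http://example.com/video-1.mp4">'
    assert parse_detail(detail_html) == "http://example.com/video-1.mp4"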
# 3. Save the data
def save_movies(content):
    # Write the video bytes under a random filename so downloads never collide
    with open(f"{uuid.uuid4()}.mp4", "wb") as f:
        f.write(content)
        print("Download finished.....")

# main
if __name__ == '__main__':
    # Example index page: http://www.xiaohuar.com/list-3-1.html
    count = 0  # running count of downloaded videos
    for i in range(6):
        # Send the request; the page number is substituted into the URL to
        # step through the index pages one by one
        url = f"http://www.xiaohuar.com/list-3-{i}.html"
        response = get_page(url)
        # Response status code
        # print(response.status_code)
        # Response text
        # print(response.text)
        # Parse the index page
        detail_urls = parse_index(response.text)
        for detail_url in detail_urls:
            print(detail_url)
            # Fetch the detail page and pull the video URL out of it
            detail_1 = get_page(detail_url)
            movie_1 = parse_detail(detail_1.text)
            if movie_1:
                # Download the video itself and write it to disk
                movie_res = get_page(movie_1)
                count += 1
                print(f"Crawling video #{count}..")
                save_movies(movie_res.content)
        print(f"Finished crawling page {i}!")

 
