"""Scraper for www.xiaohuar.com video pages.

Steps (from the original notes):
1. GET each list page  http://www.xiaohuar.com/list-3-<page>.html
2. Parse the list page for detail-page links.
3. Parse each detail page for its <source src="..."> video URL.
4. Download the video bytes and save them under a random filename.
"""
import re
import uuid

import requests

# Browser User-Agent required by the site (stated in the spec above, but the
# original code never actually sent it).
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
)


# Step 1: send the request
def get_page(url):
    """GET *url* with a browser User-Agent header and return the Response."""
    return requests.get(url, headers={"User-Agent": USER_AGENT})


# Step 2: parse the data
def parse_index(html):
    """Return every detail-page URL found on a list page.

    The leading space in the pattern is part of the site's markup — keep it.
    """
    return re.findall(' <div class="items"><a class="imglink" href="(.*?)"', html, re.S)


def parse_detail(html):
    """Return the first ``<source src="...">`` URL on a detail page, or None."""
    movie_url = re.findall('<source src="(.*?)">', html, re.S)
    if movie_url:
        return movie_url[0]
    return None  # explicit: no video found on this page


# Step 3: save the data
def save_movies(content):
    """Write raw video bytes to a uniquely named ``.mp4`` file in the CWD."""
    with open(f"{uuid.uuid4()}.mp4", "wb") as f:
        f.write(content)
    print("下载完成.....")


def _crawl(page_count=6):
    """Crawl *page_count* list pages and download every video found.

    page_count: number of list pages to fetch (default 6, as in the
    original ``range(6)`` loop).
    """
    count = 0  # videos downloaded so far
    for i in range(page_count):
        # NOTE(review): the sample URL in the notes is list-3-1.html, but this
        # starts at 0 — confirm whether list-3-0.html is a valid page.
        url = f"http://www.xiaohuar.com/list-3-{i}.html"
        response = get_page(url)
        for detail_url in parse_index(response.text):
            print(detail_url)
            try:
                movie_url = parse_detail(get_page(detail_url).text)
                if movie_url:
                    movie_res = get_page(movie_url)
                    count += 1
                    print(f"正在爬取第{count}个视频..")
                    save_movies(movie_res.content)
            except requests.RequestException as exc:
                # One broken detail page or video must not abort the crawl.
                print(exc)
    print("第一页爬取完成!")


if __name__ == '__main__':
    _crawl()