"""Scrape the Douban Top 250 movie list and append it to a text file.

The list is paginated 25 movies per page through the ``start`` query
parameter, e.g.:

    https://movie.douban.com/top250?start=0&filter=
    https://movie.douban.com/top250?start=25&filter=
    https://movie.douban.com/top250?start=50&filter=

Workflow:
    1. Send the request.
    2. Parse the data.
    3. Save the data.
"""
import re
import sys

import requests

PAGE_SIZE = 25
TOTAL_MOVIES = 250
OUTPUT_PATH = 'douban.txt'

# NOTE(review): the site is reported to reject the default python-requests
# User-Agent; confirm that a generic browser-like value is still sufficient.
DEFAULT_HEADERS = {'User-Agent': 'Mozilla/5.0'}

# Every movie entry on a page starts with this marker.
_ITEM_START = '<div class="item">'

# Mandatory fields of one entry, in order: rank, detail URL, title,
# director/cast/genre text, rating, number of ratings.
_ITEM_RE = re.compile(
    r'<em class="">(.*?)</em>'
    r'.*?<a href="(.*?)">'
    r'.*?<span class="title">(.*?)</span>'
    r'.*?导演:(.*?)</p>'
    r'.*?<span class="rating_num".*?>(.*?)</span>'
    r'.*?<span>(.*?)人评价</span>',
    re.S,
)

# One-line quote of an entry; not every entry has one.
_QUOTE_RE = re.compile(r'<span class="inq">(.*?)</span>', re.S)


# 1. Send the request
def get_page(base_url, headers=None, timeout=10):
    """Fetch ``base_url`` with an HTTP GET and return the response.

    Args:
        base_url: URL of the page to download.
        headers: Optional request headers; ``DEFAULT_HEADERS`` is used when
            omitted.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot block the script forever.

    Returns:
        The ``requests.Response`` object. The status code is not checked
        here; callers decide how to treat error responses.

    Raises:
        requests.RequestException: On connection errors or timeouts.
    """
    if headers is None:
        headers = DEFAULT_HEADERS
    return requests.get(base_url, headers=headers, timeout=timeout)


# 2. Parse the text
def parse_index(text):
    """Extract the movie entries from the HTML of one list page.

    Each entry is parsed independently. Matching the whole page with a
    single pattern that requires the quote would let an entry without a
    quote swallow the following entry (the lazy ``.*?`` runs across entry
    boundaries under ``re.S``), mixing the fields of two movies and
    dropping one of them.

    Args:
        text: HTML source of a list page.

    Returns:
        A list of 7-tuples of strings:
        ``(rank, url, title, director_info, rating, rating_count, quote)``.
        ``quote`` is ``''`` when the entry has none. Entries missing any of
        the other fields are skipped.
    """
    movies = []
    # The first piece is everything before the first entry, so skip it.
    for chunk in text.split(_ITEM_START)[1:]:
        fields = _ITEM_RE.search(chunk)
        if fields is None:
            continue
        # Look for the quote only after the mandatory fields, inside this
        # entry's own chunk.
        quote = _QUOTE_RE.search(chunk, fields.end())
        movies.append(fields.groups() + (quote.group(1) if quote else '',))
    return movies


# 3. Save the data
def save_data(data, path=OUTPUT_PATH):
    """Append ``data`` to the UTF-8 text file at ``path``.

    Args:
        data: Text to append.
        path: Destination file; defaults to ``douban.txt`` in the current
            working directory.
    """
    with open(path, 'a', encoding='utf-8') as f:
        f.write(data)


def main():
    """Download every page of the list, print each movie and save it."""
    for start in range(0, TOTAL_MOVIES, PAGE_SIZE):
        base_url = f'https://movie.douban.com/top250?start={start}&filter='
        print(base_url)

        # 1. Send the request
        try:
            response = get_page(base_url)
        except requests.RequestException as exc:
            print(f'request failed for {base_url}: {exc}', file=sys.stderr)
            continue
        if not response.ok:
            # An error page contains no entries; report it instead of
            # silently writing nothing.
            print(f'skipped {base_url}: HTTP {response.status_code}',
                  file=sys.stderr)
            continue

        # 2. Parse the text
        movie_list = parse_index(response.text)

        # 3. Format and save the data
        page_records = []
        for movie in movie_list:
            # Unpack: rank, url, title, director - cast - genre, rating,
            # number of ratings, one-line quote.
            v_top, v_url, v_name, v_daoyan, v_point, v_num, v_desc = movie
            movie_content = f'''
            电影排名:{v_top}
            电影url:{v_url}
            电影名称:{v_name}
            电影主演:{v_daoyan}
            电影评分:{v_point}
            评价人数:{v_num}
            电影简介:{v_desc}
            \n
            '''
            print(movie_content)
            page_records.append(movie_content)

        # One append per page instead of reopening the file per movie.
        if page_records:
            save_data(''.join(page_records))


if __name__ == '__main__':
    main()