"""Scrape the Douban Top 250 movie list.

For every movie the script collects: rank, detail-page URL, title,
director, leading actors, year/genre line, rating, number of ratings and
the one-line tagline.  The listing is paginated, 25 movies per page, via
the ``start`` query parameter (0, 25, ..., 225).

The crawler works in three steps:
    1. send the request
    2. parse the data
    3. save the data
"""
import re
from html import unescape

_INDEX_URL = 'https://movie.douban.com/top250?start={}&filter='
_PAGE_SIZE = 25
_TOTAL_MOVIES = 250

# NOTE(review): Douban is reported to reject the default python-requests
# User-Agent; a browser-like value is sent by default -- confirm it is still
# accepted.
_DEFAULT_HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/120.0 Safari/537.36'
    ),
}

# Every movie entry on an index page starts with this marker.
_ITEM_MARKER = '<div class="item">'

_RANK_RE = re.compile(r'<em class="">(.*?)</em>', re.S)
_URL_RE = re.compile(r'<a href="(.*?)">', re.S)
_TITLE_RE = re.compile(r'<span class="title">(.*?)</span>', re.S)
# group 1: "director [主演: actors]" line, group 2: year / country / genre line
_CREDITS_RE = re.compile(r'导演:(.*?)<br\s*/?>(.*?)</p>', re.S)
_RATING_RE = re.compile(r'<span class="rating_num".*?>(.*?)</span>', re.S)
_VOTES_RE = re.compile(r'<span>(.*?)人评价</span>', re.S)
_QUOTE_RE = re.compile(r'<span class="inq">(.*?)</span>', re.S)

# NOTE(review): the original line breaks/indentation of this template were
# lost in the collapsed source; one field per line is assumed.
_RECORD_TEMPLATE = (
    '\n'
    '==========欢迎观赏==========\n'
    '电影排名:{}\n'
    '电影url:{}\n'
    '电影名称:{}\n'
    '电影导演:{}\n'
    '电影主演:{}\n'
    '电影类型:{}\n'
    '电影评分:{}\n'
    '电影评论:{}\n'
    '电影简介:{}\n'
    '==========下次再来==========\n'
    '\n'
    '\n'
)


def _first_group(pattern, text, default=''):
    """Return group 1 of the first match of *pattern* in *text*, else *default*."""
    match = pattern.search(text)
    return match.group(1) if match else default


def _clean(value):
    """Decode HTML entities (e.g. ``&nbsp;``) and collapse runs of whitespace."""
    return ' '.join(unescape(str(value)).split())


# 1. Send the request
def get_page(url, timeout=10, headers=None):
    """Fetch *url* and return the ``requests`` response object.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up; without
            it a stalled connection would block the crawler forever.
        headers: Optional request headers; a browser-like User-Agent is
            used when omitted.

    Returns:
        The response object (its status is NOT checked here, so callers
        can inspect ``status_code`` themselves).
    """
    # Imported here rather than at module level so the parsing and saving
    # helpers stay importable when the HTTP client is not installed.
    import requests

    request_headers = _DEFAULT_HEADERS if headers is None else headers
    response = requests.get(url, headers=request_headers, timeout=timeout)
    return response


# 2. Parse the data
def parse_index(html):
    """Extract the movies listed on one index page.

    The page is first cut into per-movie chunks so that a movie lacking an
    optional field (actors, tagline, ...) can never borrow that field from
    the next movie, which a single page-wide lazy regex would do.

    Args:
        html: HTML text of an index page.

    Returns:
        A list of 9-tuples of raw (uncleaned) strings:
        ``(rank, url, title, director, actors, year_type, rating, votes,
        tagline)``.  Fields other than rank, url and title are ``''`` when
        the page does not provide them.
    """
    movie_list = []
    # Everything before the first marker is page chrome, hence the [1:].
    for chunk in html.split(_ITEM_MARKER)[1:]:
        rank = _first_group(_RANK_RE, chunk, None)
        m_url = _first_group(_URL_RE, chunk, None)
        title = _first_group(_TITLE_RE, chunk, None)
        if None in (rank, m_url, title):
            # Not a movie entry we can identify; skip instead of guessing.
            continue

        credit_match = _CREDITS_RE.search(chunk)
        if credit_match:
            # The actors part is missing or truncated for some entries.
            director, _, actors = credit_match.group(1).partition('主演:')
            year_type = credit_match.group(2)
        else:
            director = actors = year_type = ''

        movie_list.append((
            rank,
            m_url,
            title,
            director,
            actors,
            year_type,
            _first_group(_RATING_RE, chunk),
            _first_group(_VOTES_RE, chunk),
            _first_group(_QUOTE_RE, chunk),
        ))
    return movie_list


# 3. Save the data
def save_data(movie, path='douban_top250.txt'):
    """Print one movie record and append it to a UTF-8 text file.

    Args:
        movie: 9-item sequence as produced by ``parse_index``.
        path: File the record is appended to.

    Raises:
        ValueError: If *movie* does not contain exactly nine fields.
    """
    # Every field is cleaned: the raw captures still carry the page's
    # indentation, newlines and ``&nbsp;`` entities.
    top, m_url, name, director, actor, year_type, point, commit, desc = (
        _clean(field) for field in movie
    )
    data = _RECORD_TEMPLATE.format(
        top, m_url, name, director, actor, year_type, point, commit, desc
    )
    print(data)

    with open(path, 'a', encoding='utf-8') as f:
        f.write(data)

    print('电影:{}写入成功...'.format(name))


def main():
    """Crawl all index pages and store every movie found."""
    for start in range(0, _TOTAL_MOVIES, _PAGE_SIZE):
        url = _INDEX_URL.format(start)
        print(url)

        # 1. Request each index page
        index_res = get_page(url)
        if index_res.status_code != 200:
            # An error page contains no movies; report it and move on.
            print('请求失败,状态码:{}'.format(index_res.status_code))
            continue

        # 2. Parse the index page into movie records
        for movie in parse_index(index_res.text):
            # 3. Save the data
            save_data(movie)


if __name__ == '__main__':
    main()