Here is the code:
# Changjin Lake film data from douban
# @Time: 20211006
# @Author: heheyang
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd


def singlePage_crawl(url, headers, comments_info):
    """
    Crawl the short comments from a single Douban page.
    :param url: page URL to crawl
    :param headers: request headers (cookie and User-Agent)
    :param comments_info: dict accumulating the comment data
    :return: the updated comments_info dict
    """
    # Douban rejects bare requests, so the request headers must be sent along
    html = requests.get(url, headers=headers).text
    soup = BeautifulSoup(html, 'html.parser')
    # Locate the comment text and the comment-info nodes with BeautifulSoup
    contents_find = soup.find_all(attrs={'class': 'short'})
    contents_info_find = soup.find_all(attrs={'class': 'comment-info'})
    # Extract the short comment text with a regular expression
    for content in contents_find:
        comment = re.findall('<span class="short">(.*?)</span>', str(content))
        if comment:
            comments_info["comments"].append(comment[0])
        else:
            comments_info["comments"].append(None)
    # Extract the username, rating and comment time
    for info in contents_info_find:
        # Match the username
        name = re.findall(">(.*?)</a>", str(info))
        comments_info["name"].extend(name)
        # Match the rating title and the comment date
        lst_tmp = re.findall('title="(.*?)"', str(info))
        if len(lst_tmp) == 2:
            ratetitle, date = lst_tmp[0], lst_tmp[1]
        elif len(lst_tmp) == 1:
            # No rating given: only the comment time carries a title attribute
            ratetitle = None
            date = lst_tmp[0]
        else:
            ratetitle = None
            date = None
        comments_info["rate"].append(ratetitle)
        comments_info["date"].append(date)
    # Progress indicator: number of comments collected so far
    print(len(comments_info["date"]))
    return comments_info


def main():
    """
    Program flow: crawl 25 comment pages and export them to Excel.
    :return: None (writes douban_comments.xlsx)
    """
    headers = {
        "Cookie": 'add your own cookie here',
        "User-Agent": 'add your own User-Agent here'
    }
    comments_info = {
        "name": [],
        "date": [],
        "rate": [],
        "comments": []
    }
    for i in range(25):
        url = "https://movie.douban.com/subject/25845392/comments?start=%d&limit=20&status=P&sort=new_score" % (20 * i)
        comments_info = singlePage_crawl(url, headers, comments_info)
    df = pd.DataFrame(comments_info)
    df.to_excel("douban_comments.xlsx")


if __name__ == '__main__':
    main()
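One fragility worth noting: pd.DataFrame requires every list in comments_info to have the same length, and the username regex can occasionally miss a match on a page where the comment loop still appends an entry. A minimal safety net, assuming you would rather keep partial rows than crash (the pad_lists helper below is not part of the original script, only a sketch):

def pad_lists(comments_info):
    # Pad the shorter lists with None so pd.DataFrame accepts the dict
    longest = max(len(values) for values in comments_info.values())
    for values in comments_info.values():
        values.extend([None] * (longest - len(values)))
    return comments_info

Calling pad_lists(comments_info) right before pd.DataFrame(comments_info) in main() keeps the export from failing on a page that parses unevenly.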
Add the request headers (cookie and User-Agent) yourself; the results are saved to an Excel file, douban_comments.xlsx.
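To check the export without opening Excel, the file can be read back with pandas (writing and reading .xlsx both need openpyxl installed). A quick sanity check, assuming the crawl finished and douban_comments.xlsx sits in the working directory:

import pandas as pd

df = pd.read_excel("douban_comments.xlsx", index_col=0)
print(df.shape)   # at most 25 pages x 20 comments = 500 rows
print(df.head())  # columns: name, date, rate, comments

If the row count comes back much smaller than expected, a common cause is an expired cookie, in which case Douban serves a login page and the parser finds nothing to extract.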