豆瓣网是一家基于用户对图书、电影和音乐的兴趣而搭建的社交网站。爬取豆瓣网上的电影评论数据很有价值:电影评论是NLP(自然语言处理)的重要语料,基于电影评论数据集可以进一步开展中文分词、命名实体识别、关键词提取、句法分析、文本向量化、情感分析、舆情分析等数据处理和应用。和天启IP一起看看怎么爬取豆瓣影评吧~
from urllib import request
import time
import re
import os
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

# Directory that receives one text file per review. The "*" is a redacted
# placeholder for the Windows user name and must be replaced before running.
OUTPUT_DIR = r"C:\Users\*\Desktop\PYhomework\c800"

# Upper bound (exclusive) of the "start" offsets requested from the listing.
search_counts = 800

# The listing is requested in steps of this many reviews per page.
PAGE_SIZE = 20

# Pause between two full-review downloads, in seconds. The original used 0
# and noted that the value can be chosen freely.
REQUEST_DELAY_SECONDS = 0

# Review listing for one specific movie.
url = "https://movie.douban.com/subject/2353023/reviews"

# NOTE(review): the original header dict was redacted (`{***}`), so its real
# contents are unknown. Fill in the browser headers you want to send
# (presumably at least a User-Agent) before running.
headers = {}
headers["Referer"] = "https://movie.douban.com/subject/***/"

# Captures the review id from `<div data-cid="...">`. `[^"]*` stops at the
# closing quote, so further tags on the same line are not swallowed.
REVIEW_ID_PATTERN = re.compile(r'<div data-cid="([^"]*)">')


def build_page_url(base_url, start):
    """Return the listing URL for the page beginning at offset ``start``.

    The URL is always derived from the unmodified ``base_url`` so that the
    query string does not accumulate across pages.
    """
    return base_url + "?start=" + str(start)


def extract_review_ids(html):
    """Return every ``data-cid`` value found in ``html``, in document order."""
    return REVIEW_ID_PATTERN.findall(html)


def fetch_text(target_url, request_headers):
    """Download ``target_url`` and return the body decoded as UTF-8.

    Raises whatever ``urllib.request.urlopen`` raises on network or HTTP
    errors, and ``UnicodeDecodeError`` if the body is not valid UTF-8.
    """
    req = request.Request(target_url, headers=request_headers)
    # The context manager closes the connection even if read/decode fails.
    with request.urlopen(req) as response:
        return response.read().decode("utf-8")


def collect_review_ids(base_url, total, request_headers, fetch=fetch_text):
    """Walk the paginated review listing and return all review ids found.

    Args:
        base_url: Listing URL without a query string.
        total: Exclusive upper bound for the ``start`` offset.
        request_headers: Headers sent with every listing request.
        fetch: Callable ``(url, headers) -> str`` used to download a page.

    Returns:
        A list of review id strings. Each page's ids are placed in front of
        those collected so far, matching the original script's ordering
        (ids from the last page come first).
    """
    review_ids = []
    for start in range(0, total, PAGE_SIZE):
        page_html = fetch(build_page_url(base_url, start), request_headers)
        review_ids = extract_review_ids(page_html) + review_ids
    return review_ids


def save_reviews(review_ids, request_headers, directory, fetch=fetch_text):
    """Download each review's full text and write it to ``directory``.

    Files are named ``comment1.txt``, ``comment2.txt``, ... following the
    order of ``review_ids``. The raw response body is written unchanged.
    """
    for index, review_id in enumerate(review_ids, start=1):
        review_url = "https://movie.douban.com/j/review/" + review_id + "/full"
        comment = fetch(review_url, request_headers)
        file_path = os.path.join(directory, "comment%d.txt" % index)
        with open(file_path, mode="w", encoding="utf-8") as c:
            c.write(comment)
        print("comment%d保存成功!" % index)
        time.sleep(REQUEST_DELAY_SECONDS)


def main():
    """Collect the review ids for the configured movie and save each review."""
    # makedirs(exist_ok=True) tolerates re-runs and missing parent folders.
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    review_ids = collect_review_ids(url, search_counts, headers)

    # Fetch the actual review bodies. As in the original, these extra headers
    # are only sent with the per-review requests, not the listing requests.
    detail_headers = dict(headers)
    detail_headers["Cookie"] = "***"
    detail_headers["Host"] = "movie.douban.com"
    detail_headers["Sec-Fetch-Dest"] = "document"
    detail_headers["Sec-Fetch-Mode"] = "navigate"
    detail_headers["Sec-Fetch-Site"] = "none"
    detail_headers["Sec-Fetch-User"] = "?1"
    detail_headers["Upgrade-Insecure-Requests"] = "1"
    print("爬取成功!")

    save_reviews(review_ids, detail_headers, OUTPUT_DIR)
    print("抓取完成!")


if __name__ == "__main__":
    main()