# Scrape movie information from the Douban Top 250 pages using the urllib and re modules.
import urllib.request
import bs4
import re
from openpyxl import Workbook
# Regular expressions applied to the HTML text of a single movie entry.
# Every pattern has exactly one capture group, so findall() yields plain strings.

# Movie title.
pattern_find_Title = re.compile(r'<span class="title">(.*?)</span>')
# Director: the text after the "导演: " label, up to the next space.
pattern_find_Editor = re.compile(r'导演: (.*?) ')
# Lead actor: the text after the "主演: " label, up to the next space.
pattern_find_actor = re.compile(r'主演: (.*?) ')
# Average rating.
pattern_find_Score = re.compile(r'<span class="rating_num" property="v:average">(.*?)</span>')
# Number of people who rated the movie.
pattern_find_Evaluator = re.compile(r'<span>(\d*)人评价</span>')
# One-line brief comment.
pattern_find_BriefComment = re.compile(r'<span class="inq">(.*?)</span>')
def main(savepath='C:/Users/slli/Desktop/豆瓣Top250.xlsx',
         base_url='https://movie.douban.com/top250?start='):
    """Scrape the list pages and write the collected rows to an Excel file.

    Args:
        savepath: Destination path of the .xlsx workbook. The default is the
            original author's desktop path; pass another path to run the
            script on a different machine.
        base_url: List URL ending in the ``start=`` query parameter; the page
            offset is appended to it by ``data_cross``.
    """
    data_xlsx = data_cross(base_url)
    save_data(data_xlsx, savepath)
def askurl(url, timeout=30):
    """Fetch one URL and return its body decoded as UTF-8.

    The fetch is best-effort: any failure is printed and an empty string is
    returned instead of raising.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait on the connection before giving up, so a
            stalled server cannot hang the whole run.

    Returns:
        The decoded page text, or ``""`` if the request or decoding failed.
    """
    head = {
        "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Mobile Safari/537.36"
    }
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        # The context manager closes the response even if read/decode fails.
        with urllib.request.urlopen(request, timeout=timeout) as response:
            html = response.read().decode('utf-8')
    except Exception as e:  # deliberate best-effort: report and return ""
        reported = False
        if hasattr(e, 'code'):
            print(e.code)
            reported = True
        if hasattr(e, 'reason'):
            print(e.reason)
            reported = True
        if not reported:
            # Errors such as socket timeouts carry neither attribute; do not
            # swallow them silently.
            print(e)
    return html
def _first_match(pattern, text, default=''):
    """Return the first match of ``pattern`` in ``text``, or ``default`` if none."""
    found = pattern.findall(text)
    return found[0] if found else default


def _parse_item(item_html, rank):
    """Build one spreadsheet row from the HTML text of a single movie entry."""
    return [
        '第{}名'.format(rank),
        _first_match(pattern_find_Title, item_html),
        _first_match(pattern_find_Editor, item_html, '名字不详'),
        _first_match(pattern_find_actor, item_html, '名字不详'),
        _first_match(pattern_find_Score, item_html),
        _first_match(pattern_find_Evaluator, item_html),
        _first_match(pattern_find_BriefComment, item_html, '暂未简评'),
    ]


def data_cross(base_url, pages=10, page_size=25):
    """Download every list page and collect one row per movie.

    Args:
        base_url: URL prefix to which the page offset is appended.
        pages: Number of list pages to fetch.
        page_size: Number of entries per page; used both for the offset in
            the URL and for computing each movie's rank.

    Returns:
        A list of rows. The first row is the header; each following row holds
        rank, title, director, lead actor, score, number of ratings and brief
        comment. A field that cannot be found falls back to a placeholder
        (or an empty string) instead of raising IndexError.
    """
    data_xlsx = [['排名', '电影名称', '导演', '主演', '评分', '评价人数', '简评']]
    for page in range(pages):
        first_rank = page * page_size
        html = askurl(base_url + str(first_rank))
        soup = bs4.BeautifulSoup(html, 'html.parser')
        # Each movie lives in its own <div class="item">.
        items = soup.find_all('div', class_="item")
        for offset, item in enumerate(items, start=1):
            data_xlsx.append(_parse_item(str(item), first_rank + offset))
    return data_xlsx
def save_data(data_xlsx, savepath):
    """Write the collected rows to an Excel workbook.

    Args:
        data_xlsx: Iterable of rows; each row is a list of cell values.
        savepath: Path of the .xlsx file to create.
    """
    wb = Workbook()
    # A new Workbook already contains one worksheet. Reuse it rather than
    # calling create_sheet(), which would leave an empty default "Sheet"
    # as the first tab of the saved file.
    ws = wb.active
    ws.title = '豆瓣Top250'
    for row in data_xlsx:
        ws.append(row)  # one spreadsheet row per list
    wb.save(savepath)
# Script entry point: run the scraper, then report completion.
if __name__ == "__main__":
    main()
    print("Done!")
# Result after running the script: