# 最近花了三天左右的时间做了一个爬虫项目,记录如下:
# (Notes: a small web-scraping project built over about three days.)
import requests
from bs4 import BeautifulSoup
url='https://movie.douban.com/top250'  # base URL of the Douban Top250 chart; paging suffix is appended per request
movie_names=[]     # movie titles; shared accumulator, filled then cleared per page by movie_nameget
movie_messages=[]  # director / cast / year description text for each movie
movie_scores=[]    # rating strings (prefixed with "评分" when stored)
movie_rank=[]      # rank strings (prefixed with "排名:" when stored)
def movie_nameget(url):
    """Scrape one page of the Douban Top250 chart and append it to movietop250.txt.

    Parameters:
        url: full page URL (base chart URL plus the '?start=...&filter=' paging suffix).

    Side effects:
        Fills and then clears the module-level lists movie_names / movie_messages /
        movie_scores / movie_rank, and appends one formatted line per movie to
        movietop250.txt.

    Raises:
        requests.HTTPError: if the server returns an error status.
    """
    headers = {  # browser-like User-Agent so Douban does not reject the request
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36 Edg/89.0.774.76'
    }
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()  # fail loudly instead of silently parsing an error page
    page = response.content.decode()
    soup = BeautifulSoup(page, 'lxml')
    wd = soup.find_all('div', class_='hd')           # containers holding the movie titles
    qt = soup.find_all('p', class_='')               # director / cast / year paragraphs
    pf = soup.find_all('span', class_="rating_num")  # rating numbers
    pm = soup.find_all('em', class_="")              # rank numbers
    for each in qt:  # collect director and other descriptive info
        message = each.text.replace(" ", "").replace("\n ", "")
        movie_messages.append(message)
    for each in wd:  # collect movie titles
        movie_names.append(each.find('span', class_='title').text)
    for each in pf:  # collect ratings
        movie_scores.append("评分" + each.text)
    for each in pm:  # collect ranks
        movie_rank.append("排名:" + each.text + "\t")
    # Each page normally holds 25 movies; guard against a short parse so the
    # index loop cannot raise IndexError on an unexpected page layout.
    count = min(25, len(movie_rank), len(movie_names),
                len(movie_messages), len(movie_scores))
    # Open the output file ONCE per page (the original reopened it for every
    # movie) and use a context manager so it is closed even on error.
    with open("movietop250.txt", "a", encoding="utf-8") as file:
        for i in range(count):
            file.write(movie_rank[i] + movie_names[i] + movie_messages[i]
                       + "\t\t\t" + movie_scores[i] + "\n")
    # Reset the shared accumulators so the next page starts fresh.
    movie_names.clear()
    movie_messages.clear()
    movie_scores.clear()
    movie_rank.clear()
# The chart spans 10 pages of 25 movies each; fetch every page in turn by
# appending the appropriate '?start=...' paging suffix to the base URL.
for page_index in range(10):
    suffix = '?start=' + str(page_index * 25) + '&filter='
    movie_nameget(url + suffix)