Crawling approach:
1. Use Selenium to fetch the page source and handle pagination (see the API note right after this list).
2. From each list page's source, extract the URL of every movie's detail page.
3. Request each movie's detail page and save the information we need.
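A note on step 1: the full code below drives the browser with the legacy Selenium 3 calls (webdriver.Chrome(executable_path=...) and find_element_by_css_selector), which were removed in Selenium 4. If you are on a recent Selenium, the page-turning step would look roughly like this minimal sketch (pagination only, same selector and driver path as the full code; not a drop-in replacement):

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import time

driver = webdriver.Chrome(service=Service("D:\\chromedriver.exe"))  # Selenium 4 style driver setup
driver.get("https://movie.douban.com/top250?")
pages = []
while True:
    time.sleep(2)                     # crude wait; the full code scrolls the page instead
    pages.append(driver.page_source)  # keep the HTML of the current list page
    soup = BeautifulSoup(driver.page_source, "html.parser")
    if soup.find("span", attrs={"class": "next"}).find("a") is None:
        break                         # the "next" span has no <a> on the last page
    driver.find_element(By.CSS_SELECTOR,
                        '#content > div > div.article > div.paginator > span.next > a').click()
driver.quit()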
code:
import requests
from selenium import webdriver
from bs4 import BeautifulSoup
import os
import time

class Spide_douban():
    def __init__(self):
        self.headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36"
        }
        self.url = "https://movie.douban.com/top250?"
        self.driver = None
        self.film_url = []
        self.film_name = []
        self.film_seq = []

    def get_film_url(self, html):
        # Parse one list page and collect every film's detail-page URL and title.
        soup = BeautifulSoup(html, "html.parser")
        film_url_list = soup.find("ol", attrs={"class": "grid_view"}).find_all("div", attrs={"class": "pic"})
        for div in film_url_list:
            self.film_url.append(div.find("a").attrs["href"])
            self.film_name.append(div.find("img").attrs["alt"])

    def deal_actor_img(self, soup_img):
        # Extract an actor's photo URL, name and role from one <li> of the cast list.
        img_url = soup_img.find("div").attrs["style"]
        img_name = soup_img.find("span").text
        img_name_role = soup_img.find("span", attrs={"class": "role"}).text
        # The style attribute looks like background-image: url(...); slice out the bare URL.
        return img_url[22:-1], img_name, img_name_role

    def deal_actor_role(self, img_name, img_name_role, num):
        # Append an actor's name and role to the film's introduction file.
        file = open("D:\\项目案例\\{}\\{}的简介.txt".format(self.film_name[num], self.film_name[num]), "a", encoding="utf-8")
        file.write("\n")
        file.write(img_name)
        file.write(" ")
        file.write(img_name_role)
        file.close()

    def deal_url(self, url, num):
        try:
            os.mkdir("D:\\项目案例\\{}".format(self.film_name[num]))
            print("正在处理电影 {}".format(self.film_name[num]))
        except Exception as e:
            # The directory already exists; carry on anyway.
            print("正在处理电影 {}".format(self.film_name[num]))
        res = requests.get(url, headers=self.headers)
        time.sleep(2)
        soup = BeautifulSoup(res.text, "html.parser")
        file = open("D:\\项目案例\\{}\\{}的简介.txt".format(self.film_name[num], self.film_name[num]), "w", encoding="utf-8")
        # Write the Douban rating and vote count
        soup_comment = soup.find("div", attrs={"id": "interest_sectl"})
        grad = soup_comment.find("strong", attrs={"class": "ll rating_num"}).string
        vote = soup_comment.find("span", attrs={"property": "v:votes"}).string
        file.write("电影豆瓣评分:{}".format(grad))
        file.write(" {}人评价\n".format(vote))
        F = soup.find("div", attrs={"id": "info"})
        # Write the runtime
        time1 = F.find("span", attrs={"property": "v:runtime"})
        file.write("电影时长:{}\n\n".format(time1.string))
        # Write the genres, separated by "、"
        label_list = F.find_all("span", attrs={"property": "v:genre"})
        l = len(label_list)
        file.write("电影类型:")
        for i in range(l):
            file.write(label_list[i].string)
            if (i + 1 != l):
                file.write("、")
        file.write("\n")
        # Write the release dates
        data_list = F.find_all("span", attrs={"property": "v:initialReleaseDate"})
        l = len(data_list)
        file.write("电影上映日期:")
        for i in range(l):
            file.write(data_list[i].string)
            if (i + 1 != l):
                file.write("、")
        file.write("\n")
        # Write the synopsis, wrapped to roughly 30 characters per line
        file.write("电影简介:\n")
        soup_text = soup.find("div", attrs={"id": "link-report"}).find("span", attrs={"property": "v:summary"}).text.strip()
        soup_text_list = soup_text.split()
        text = ""
        for str_i in soup_text_list:
            number = 0
            for j in str_i:
                number += 1
                if (number % 30 == 0):
                    text += '\n'
                text += j
        file.write(text)
        file.write("\n")
        file.close()
        # Save the movie poster
        file = open("D:\\项目案例\\{}\\{}.jpg".format(self.film_name[num], self.film_name[num]), "wb")
        img_url = soup.find("div", attrs={"id": "mainpic"}).find("img").attrs["src"]
        res_img = requests.get(img_url, headers=self.headers)
        file.write(res_img.content)
        file.close()
        # Save photos of the main cast
        soup_actor_list = soup.find("div", attrs={"id": "celebrities"}).find_all("li")
        for actor in soup_actor_list:
            img_url, img_name, img_name_role = self.deal_actor_img(actor)
            file = open("D:\\项目案例\\{}\\{}.jpg".format(self.film_name[num], img_name), "wb")
            res = requests.get(img_url, headers=self.headers)
            file.write(res.content)
            file.close()
            self.deal_actor_role(img_name, img_name_role, num)

    def move(self):
        # Scroll down gradually so that lazily loaded content is rendered.
        for i in range(30):
            js = "var q=document.documentElement.scrollTop={}".format(i * 100)  # JavaScript scroll statement
            self.driver.execute_script(js)
            time.sleep(0.25)

    def run(self):
        self.driver = webdriver.Chrome(executable_path="D:\\chromedriver.exe")
        self.driver.get(self.url)
        # Scroll the current page so that it loads fully
        time.sleep(2)
        self.move()
        # Turn the pages one by one and collect the HTML of every list page:
        # if the "next" span still contains an <a> tag, click it; if it does not, this is the last page.
        flag = True
        while (flag):
            html = self.driver.page_source
            self.get_film_url(html)
            soup = BeautifulSoup(html, "html.parser")
            next_soup = soup.find("span", attrs={"class": "next"})
            if (next_soup.find("a") == None):
                break
            self.driver.find_element_by_css_selector('#content > div > div.article > div.paginator > span.next > a').click()
            self.move()
        # With every film's detail URL collected, process each film in turn.
        for num, url in enumerate(self.film_url):
            try:
                self.deal_url(url, num)
            except Exception as E:
                print("{} 电影信息获取出错,等待进行二次获取".format(self.film_name[num]))
                self.film_seq.append(num)
        # Retry the films that failed the first time.
        for i in self.film_seq:
            try:
                print("尝试二次获取电影:{}的信息".format(self.film_name[i]))
                self.deal_url(self.film_url[i], i)
            except Exception as e:
                print("{} 电影二次获取信息仍然失败".format(self.film_name[i]))

Sd = Spide_douban()
Sd.run()
Douban's anti-scraping mechanism: if you access the site too frequently it bans your IP, and the next time you visit you have to log in before you can browse.
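One common way to lower the risk of that ban is to slow the crawler down, randomize the delay between requests, and reuse a single HTTP session for the detail pages instead of calling requests.get each time. A minimal sketch (polite_get is a hypothetical helper; the 2-5 second delay range is an illustrative guess, not a value Douban publishes):

import random
import time
import requests

session = requests.Session()
session.headers.update({
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36"
})

def polite_get(url, min_delay=2.0, max_delay=5.0):
    # Hypothetical helper: sleep a random interval before every request so the
    # access pattern looks less mechanical, then fail loudly on HTTP errors.
    time.sleep(random.uniform(min_delay, max_delay))
    res = session.get(url, timeout=10)
    res.raise_for_status()
    return res

In the crawler above, this kind of helper would stand in for the bare requests.get(url, headers=self.headers) calls inside deal_url.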