# Running this script with Google Chrome requires chromedriver.exe to be placed in the same directory as this file.
import re
import urllib
import requests
import os
from bs4 import BeautifulSoup
import time
from selenium import webdriver ##驱动浏览器
from selenium.webdriver.support.wait import WebDriverWait
def id(path):
    """Look up a Douban id for a search term by driving a Chrome browser.

    Opens https://movie.douban.com/, types ``path`` into the search box,
    submits the search, clicks the first element with class ``cover`` and
    extracts the id from the URL the browser ends up on.

    Note: the function name shadows the builtin ``id``; it is kept because
    callers use it.

    Args:
        path: Search text to type into the Douban search box.

    Returns:
        The second-to-last ``/``-separated segment of the resulting URL
        (a string), or ``None`` when the lookup fails.
    """
    # Imported locally so the function does not depend on extra
    # module-level imports.
    from selenium.common.exceptions import WebDriverException
    from selenium.webdriver.common.by import By

    browser = webdriver.Chrome()  # start a Chrome session through chromedriver
    wait = WebDriverWait(browser, 10)  # poll for at most 10 seconds
    try:
        browser.get("https://movie.douban.com/")
        browser.find_element(By.ID, "inp-query").send_keys(path)
        browser.find_element(
            By.CSS_SELECTOR, ".nav-search .inp-btn input"
        ).click()
        # Wait for the first result cover instead of assuming the result
        # page has already rendered when the click above returns.
        cover = wait.until(
            lambda drv: drv.find_element(By.CLASS_NAME, "cover")
        )
        cover.click()
        # NOTE(review): assumes the landing URL ends with ".../<id>/" so the
        # id is the second-to-last segment - confirm against the site.
        return browser.current_url.split("/")[-2]
    except (WebDriverException, IndexError) as exc:
        # Best-effort lookup: report the reason and signal failure with None
        # rather than swallowing the error silently.
        print("查找失败:", exc)
        return None
    finally:
        # quit() ends the whole session (browser and driver process) on
        # both the success and the failure path.
        browser.quit()
def getContent(q, w, timeout=10):
    """Fetch one page of a Douban celebrity photo listing and parse it.

    Args:
        q: Celebrity id inserted into the URL path.
        w: Value of the ``start`` query parameter (paging offset).
        timeout: Seconds ``requests`` waits for the server before giving up.
            Without a timeout a stalled connection would block forever.

    Returns:
        A ``BeautifulSoup`` tree of the response body, parsed with
        ``html.parser``.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36",
        "Connection": "keep-alive",
    }
    url = "https://movie.douban.com/celebrity/%s/photos/?type=C&start=%s&sortby=like&size=a&subtype=a" % (q, w)
    # Send the request with browser-like headers.
    response = requests.get(url, headers=headers, timeout=timeout)
    # Parse the returned HTML.
    return BeautifulSoup(response.text, "html.parser")
def getItem(soup):
    """Collect the image URLs from a parsed photo-listing page.

    Looks for the ``<ul class="poster-col3 clearfix">`` element and returns
    the ``src`` attribute of every ``<img>`` inside it.

    Args:
        soup: Parsed page exposing BeautifulSoup's ``find`` / ``find_all``.

    Returns:
        A list of ``src`` strings (possibly empty), or ``None`` when the
        page has no photo list. In that case the module-level ``path``
        directory is removed if it is empty and a message is printed.
    """
    photo_list = soup.find("ul", attrs={"class": "poster-col3 clearfix"})
    if photo_list is None:
        # No photo grid on the page: clean up the download directory.
        # rmdir (not removedirs) so parent directories are never touched;
        # a missing or non-empty directory is simply left as it is.
        try:
            os.rmdir(path)
        except OSError:
            pass
        print("您输入的不是明星")
        return None
    # Read the attribute directly instead of regex-matching serialised
    # HTML, which also matches "data-src" and yields escaped "&amp;".
    return [
        img.get("src")
        for img in photo_list.find_all("img")
        if img.get("src")
    ]
def downloadFile(url, dir, index):
    """Download ``url`` into directory ``dir`` as ``<index><extension>``.

    Args:
        url: Address of the file to download.
        dir: Existing directory that receives the file.
        index: Value used as the file's base name.

    Raises:
        OSError: If the download or the write fails
            (``urllib.error.URLError`` is a subclass).
        ValueError: If ``url`` has no usable scheme.
    """
    # ``import urllib`` alone does not load these submodules, so import
    # them explicitly instead of relying on another module doing it.
    import urllib.parse
    import urllib.request

    # Take the extension from the URL path only, so a query string or the
    # host name can never leak into the file name.
    extension = os.path.splitext(urllib.parse.urlsplit(url).path)[1]
    target = os.path.join(dir, str(index) + extension)
    urllib.request.urlretrieve(url, target)
def creadDir(path):
    """Ensure that directory ``path`` exists.

    Missing parent directories are created as well, and an already
    existing directory is left untouched.

    Args:
        path: Directory to create.

    Raises:
        OSError: If the directory cannot be created, e.g. ``path`` exists
            but is not a directory.
    """
    os.makedirs(path, exist_ok=True)
def ye(q):
    """Download every photo of celebrity ``q`` into the ``path`` directory.

    Walks the photo listing in steps of 30 (offsets 0, 30, ... below 10000),
    extracts the image URLs of each page with ``getItem`` and saves them
    with ``downloadFile`` under consecutive numbers. The target directory
    is read from the module-level ``path``.

    Args:
        q: Celebrity id forwarded to ``getContent``.

    Returns:
        ``"完成"`` when the listing was processed to its end, ``None`` when
        a page had no photo list or a network/file error interrupted the
        download.
    """
    # One counter for all pages so later pages never overwrite the files
    # written for earlier ones.
    file_number = 1
    try:
        for offset in range(0, 10000, 30):
            urls = getItem(getContent(q, offset))
            if urls is None:
                # getItem found no photo list and has already reported it.
                return None
            if not urls:
                # Empty page: past the last photo, no need to keep requesting.
                break
            for url in urls:
                downloadFile(url, path, file_number)
                file_number += 1
    except (OSError, ValueError) as exc:
        # OSError covers requests/urllib network errors and file-system
        # failures; ValueError covers malformed image URLs. Report instead
        # of silently swallowing, but keep the best-effort None result.
        print("下载中断:", exc)
        return None
    return "完成"
# Script entry point. Guarded so that importing this file does not prompt
# for input or launch a browser.
if __name__ == "__main__":
    # ``path`` stays a module-level name: getItem() and ye() read it as the
    # download directory.
    path = input("请输入需要查找的明星")
    q = id(path)
    if q is None:
        # The lookup failed, so there is nothing to download and no
        # directory needs to be created.
        print("未能获取豆瓣编号,未下载任何图片")
    else:
        creadDir(path)
        ye(q)