全栈式爬取图片

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

url1 = 'http://www.netbian.com'  # Site to crawl; getFenglei() opens this page first
path = 'chromedriver.exe'  # Path to the ChromeDriver executable

# Create the Chrome options object
chrome_options = Options()
# Do not load images in the browser (only the image URLs are needed while crawling)
prefs = {"profile.managed_default_content_settings.images":2}
chrome_options.add_experimental_option("prefs", prefs)



# Start the browser. This runs at import time and the instance is shared by
# every crawl function below.
# NOTE(review): the driver path is passed positionally; confirm the installed
# Selenium version still accepts this call form.
browser = webdriver.Chrome(path, options=chrome_options)

# Category page URLs, appended by getFenglei() and iterated by run()
fengleis = []
# Image records ({'img_url': ..., 'title': ...}) appended by getData() and
# consumed by downImg()
data = []

def getFenglei():
    """Collect category page URLs from the site's header menu.

    Opens ``url1`` in the shared browser, reads the ``href`` of the 2nd and
    3rd links in the first header menu entry, prints each one and appends it
    to the module-level ``fengleis`` list.
    """
    browser.get(url1)
    link_xpath = '//*[@id="header"]/div[1]/ul/li[1]/div/a[%d]'
    for position in (2, 3):
        anchor = browser.find_element_by_xpath(link_xpath % position)
        category_url = anchor.get_attribute('href')
        print(category_url)
        fengleis.append(category_url)

# Collect image data from one listing page
def getData(url):
    """Record the full-size image URL and title for entries on a listing page.

    For the first two thumbnail entries of the listing at *url*, follows the
    thumbnail link to its detail page, then the download link, and finally
    reads the ``src`` and ``title`` of the full-size image. Each result is
    printed and appended to the module-level ``data`` list as
    ``{'img_url': ..., 'title': ...}``.

    Args:
        url: Address of a category listing page.
    """
    thumb_xpath = '//*[@id="main"]/div[3]/ul/li[%d]//a'
    # Number of entries taken per page: list positions 1 and 2.
    for position in (1, 2):
        # The browser navigates away below, so reload the listing every time.
        browser.get(url)

        # Link from the thumbnail to the image's detail page.
        thumb_link = browser.find_element_by_xpath(thumb_xpath % position)
        detail_url = thumb_link.get_attribute('href')
        # Entries pointing at this address are skipped
        # (presumably a promotional tile rather than a wallpaper - TODO confirm).
        if detail_url == 'http://pic.netbian.com/':
            continue
        print(detail_url)
        browser.get(detail_url)  # open the detail page

        download_link = browser.find_element_by_xpath('//*[@id="main"]/div[3]/div/div/a')
        download_url = download_link.get_attribute('href')
        print(download_url)  # address of the full-size image page
        browser.get(download_url)

        image = browser.find_element_by_xpath('//*[@id="endimg"]/tbody/tr/td/a/img')
        record = {
            'img_url': image.get_attribute('src'),
            'title': image.get_attribute('title'),
        }
        print(record['img_url'], record['title'])
        data.append(record)


# Pagination
def page(url):
    """Run getData() on listing pages 1 through 9 of one category.

    Page 1 is the category URL itself; later pages are built by appending
    ``index_<n>.htm`` to it (e.g. http://www.netbian.com/feizhuliu/index_2.htm).

    Args:
        url: Category base URL, as collected by getFenglei().
    """
    # Number of pages to crawl: 1..9
    for page_no in range(1, 10):
        page_url = url if page_no == 1 else url + 'index_%d.htm' % page_no
        getData(page_url)


def downImg(out_dir='./all', timeout=10):
    """Download every image recorded in ``data`` into *out_dir*.

    Each record's ``img_url`` is fetched with a browser-like User-Agent and
    written to ``<out_dir>/<title>.jpg``. The download is best-effort: a
    failure for one image is printed and the loop moves on to the next.

    Args:
        out_dir: Directory that receives the files; created if missing.
        timeout: Seconds to wait for the server before giving up on an image.
    """
    import os
    import re

    import requests

    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
    }

    # Without this, every open() below fails when the directory is absent
    # and the error is silently swallowed per image.
    os.makedirs(out_dir, exist_ok=True)

    for index, item in enumerate(data):
        # The title comes from an HTML attribute: it may be missing or contain
        # characters that are illegal in file names, so sanitise it first.
        raw_title = item.get('title') or ''
        title = re.sub(r'[\\/:*?"<>|]', '_', raw_title).strip()
        if not title:
            title = 'image_%d' % index
        target = os.path.join(out_dir, '%s.jpg' % title)

        try:
            # A timeout keeps one stalled connection from hanging the whole run.
            response = requests.get(item.get('img_url'), headers=headers, timeout=timeout)
            # Do not save an HTTP error page as if it were an image.
            response.raise_for_status()
            with open(target, 'wb') as f:
                f.write(response.content)
        except (requests.RequestException, OSError) as e:
            print(e)
            continue
        print('正在下载' + title)


def run():
    """Crawl every category, then download the collected images.

    Steps: getFenglei() fills ``fengleis``; page() is run on each category
    URL, which fills ``data``; the shared browser is then closed; finally
    downImg() fetches the images over plain HTTP.
    """
    try:
        getFenglei()  # collect the category URLs
        for url in fengleis:  # crawl one category at a time
            page(url)
    finally:
        # Always shut down Chrome/chromedriver, even if a page lookup raised;
        # otherwise the browser process is left running after the script ends.
        browser.quit()

    # Download the images; the browser is no longer needed for this step.
    downImg()

run()

 

上一篇:Zerodium公开Tor浏览器0day代码执行漏洞 被喷“不负责任”


下一篇:使用Selenium爬取豆瓣电影前100的爱情片相关信息