Scraping Images with Python (Notes)
Writing the process down so I don't forget it. Copy it and it should run.
# coding=utf-8
import logging
import os
import platform
import time

import requests
from apscheduler.schedulers.blocking import BlockingScheduler
from lxml import etree
# Target listing: https://wallhaven.cc/toplist (toplist); also latest (newest), hot (popular), random
url = "https://wallhaven.cc/latest"
# Download directory
path = "D:\\Download\\wallhaven\\latest"
# Browser User-Agent string
headers = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'
# Request headers that mimic a browser
Hostreferer = {
    'User-Agent': headers,
    # 'Connection': 'keep-alive',
    'Referer': 'https://wallhaven.cc/toplist',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9'
}

# Create the folder (if needed) and set up logging inside it
def createFile(file_path):
    if not os.path.exists(file_path):
        os.makedirs(file_path)
    # Switch the working directory to the folder created above
    os.chdir(file_path)
    times = time.strftime("%Y-%m-%d_%H:%M:%S")
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S',
                        filename="log.txt",
                        filemode='a')

def imgs(lv):
    # Loop state shared with the retry handler
    global s, t, k, page
    pich = ""
    pagenumber = {}
    # Image sets that failed
    pictures = {}
    # Single images that failed
    piececode = {}
    urls = []
    try:
        html = requests.get(url, headers=Hostreferer, timeout=5)
        html.encoding = "utf-8"
        createFile(path)
        # Parse the page content
        text = etree.HTML(html.text)
        # Get the maximum page count
        kom = requests.get("https://wallhaven.cc/toplist?page=2", headers=Hostreferer, timeout=5)
        ls = etree.HTML(kom.text)
        pagenum = ls.xpath("//div[@id='thumbs']/section/header/h2/text()")[1]
        pagenum = pagenum[3:]
        print("Total pages: " + pagenum)
        # Number of pages to loop over (only the first 3 here)
        for i in range(1, 3 + 1):
            page = url + "?page=" + str(i)
            print("Page URL: " + page)
            file = path + "\\" + str(i)
            createFile(file)
            try:
                two = requests.get(page, headers=Hostreferer, timeout=5)
                two.encoding = "utf-8"
                two_j = etree.HTML(two.text)
                pic = two_j.xpath("//div[@id='thumbs']/section/ul/li/figure/a/@href")
                for j in range(len(pic)):
                    go = []
                    # print("Page " + str(i) + ", image " + str(j + 1))
                    echo("success", "Page " + str(i) + ", image " + str(j + 1))
                    # print("Image page: " + pic[j])
                    pich = pic[j]
                    t = j
                    try:
                        imgurl = requests.get(pic[j], headers=Hostreferer, timeout=5)
                        imgurl.encoding = "utf-8"
                        img = etree.HTML(imgurl.text)
                        image = img.xpath("//section[@id='showcase']/div/img/@src")
                        echo("success", "Image download URL: " + image[0])
                        # print("Image download URL: " + image[0])
                        arry = image[0].split('/')
                        file_name = arry[len(arry) - 1]
                        fileimg = file + "\\" + str(file_name)
                        k = fileimg
                        # imageurl = requests.get(image[0], headers=Hostreferer, timeout=5, stream=True)
                        # print(fileimg)
                        # Remember the local path and the image URL
                        go.append(fileimg)
                        go.append(image[0])
                        urls.append(go)
                        # Download method 1 (direct write)
                        # f = open(fileimg, 'ab')
                        # f.write(imageurl.content)
                        # imageurl.close()
                        imgurl.close()
                    except Exception as e:
                        print("Problem while downloading")
                        print(e)
                        # time.sleep(1)
                two.close()
            except Exception as e:
                # pictures["page " + str(s) + " set " + str(t)] = pich
                print(e)
                # time.sleep(1)
            s = i
        kom.close()
        print(urls)
        # Save the image URLs to a text file
        imgurl = "imgurl.txt"
        ts = []
        n = 0
        print("Starting download...")
        print("Number of images: " + str(len(urls)))
        # Download method 2 (streaming with a progress bar)
        for x in urls:
            n = n + 1
            with open(imgurl, "a+") as g:
                g.write(str(x[1]) + "\n")
            url_response(x[1], x[0], n)
        print("Image scraping finished")
        if pagenumber:
            print("Pages that could not be scraped:")
            print(pagenumber)
        else:
            print("Pages that could not be scraped: 0")
        if pictures:
            print("Image sets that could not be scraped:")
            print(pictures)
        else:
            print("Image sets that could not be scraped: 0")
        if piececode:
            print("Single images that could not be scraped:")
            print(piececode)
        else:
            print("Single images that could not be scraped: 0")
    except Exception as e:
        timetyr = 3  # number of retries
        if lv < timetyr:
            lv += 1
            imgs(lv)
        print(e)
        pagenumber["page " + str(s)] = page

# Download helper: streams the file to disk and prints a progress bar
def url_response(url, imglen, n):
    r = requests.get(url, headers=Hostreferer, timeout=5, stream=True)
    # Download in 1024-byte chunks
    chunk_size = 1024
    size = 0
    content_size = int(r.headers['content-length'])
    with open(imglen, 'ab') as f:
        # Write to disk while downloading; chunk_size is in bytes and can be adjusted
        for chunl in r.iter_content(chunk_size=chunk_size):
            # Write the chunk to the file
            f.write(chunl)
            size += len(chunl)  # bytes downloaded so far
            # \r returns to the start of the line; combined with end='' it redraws the progress bar in place
            print('\r' + '[Download progress]: %s%.2f%%' % ('>' * int(size * 50 / content_size), float(size / content_size * 100)),
                  end='')
    print('[Size of image ' + str(n) + ']: %0.2f MB' % (content_size / chunk_size / 1024))

def echo(color, *args):
    colors = {'error': '\033[91m', 'success': '\033[94m', 'info': '\033[93m'}
    # Plain output on Windows or for unknown colors; otherwise wrap the message in ANSI color codes
    if color not in colors or platform.system() == 'Windows':
        print(' '.join(args))
    else:
        print(colors[color], ' '.join(args), '\033[0m')

if __name__ == '__main__':
    imgurl = "imgurl.txt"
    open(imgurl, "a+").close()
    start = time.time()
    imgs(1)
    end = time.time()
    print('\n' + "All downloads finished! Took %s seconds" % (end - start))
    scheduler = BlockingScheduler()
    scheduler._logger = logging
    scheduler.start()
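
As written, the BlockingScheduler at the end is started without any job registered, so scheduler.start() simply blocks. If the intent is to re-run the scrape on a schedule, a minimal sketch would look like the following (the 24-hour interval and the add_job wiring are my assumption, not part of the original):

    # Hypothetical scheduling sketch: re-run imgs(1) once every 24 hours
    scheduler = BlockingScheduler()
    scheduler._logger = logging
    scheduler.add_job(imgs, 'interval', hours=24, args=[1])  # interval chosen as an example
    scheduler.start()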