# 案例 (Example)
import logging
import os
from hashlib import md5

import requests
from lxml import etree
from selenium import webdriver

logger = logging.getLogger(__name__)

# Seconds to wait for an HTTP response. Without a timeout, requests.get()
# can block forever on a stalled connection.
_REQUEST_TIMEOUT = 10

_CHROMEDRIVER_PATH = r"D:\python\com\zxsoft\python\chromedriver.exe"
_OUTPUT_DIR = r"E:/ntmssFile/nv/"
_PAGE_COUNT = 20
_SEARCH_URL = 'https://www.toutiao.com/api/search/content/?aid=24&app_name=web_search&offset={}&format=json&keyword=%E8%A1%97%E6%8B%8D&autoload=true&count=20&en_qc=1&cur_tab=1&from=search_tab'


def get_response(url, timeout=_REQUEST_TIMEOUT):
    """Request the search page at *url* and return the response object.

    A desktop Chrome User-Agent header is sent with the request.

    Raises:
        requests.RequestException: on connection failure or timeout.
    """
    headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36"}
    return requests.get(url, headers=headers, timeout=timeout)


def get_article_title_url(text, i):
    """Return ``(article_url, title)`` for entry *i* of a search response.

    *text* is a response object whose JSON body holds a ``"data"`` list.

    Raises:
        KeyError: when the entry lacks ``article_url`` or ``title``.
    """
    # Parse the JSON body once instead of once per field.
    entry = text.json()["data"][i]
    return entry['article_url'], entry['title']


def parse_article_url(article_url):
    """Return the image URLs found in one gallery article, top to bottom.

    The article is opened in Chrome through Selenium, and the page source
    reported by the browser is queried with XPath.
    """
    driver = webdriver.Chrome(_CHROMEDRIVER_PATH)
    try:
        driver.get(article_url)
        html = etree.HTML(driver.page_source)
    finally:
        # quit() ends the chromedriver process as well as the window, and
        # the finally clause runs even when loading the page fails.
        driver.quit()
    return html.xpath('//div[@class="article-content"]//div[@class="pgc-img"]//img[@class="syl-page-img"]//@src')


def save_jpg(title, href, timeout=_REQUEST_TIMEOUT):
    """Download the image at *href* into the existing directory *title*.

    The file is named after the MD5 hex digest of its content, so the same
    image downloaded twice lands on the same file name.

    Raises:
        requests.RequestException: on connection failure, timeout, or an
            HTTP error status.
        OSError: when the file cannot be written.
    """
    res = requests.get(href, timeout=timeout)
    # Do not store an error page under a .jpg name.
    res.raise_for_status()
    file_path = '{}/{}.{}'.format(title, md5(res.content).hexdigest(), 'jpg')
    with open(file_path, 'wb') as f:
        f.write(res.content)


def _download_gallery(article_url, title):
    """Save every image of one article into a directory named *title*."""
    os.makedirs(title, exist_ok=True)
    for href in parse_article_url(article_url):
        try:
            save_jpg(title, href)
        except (requests.RequestException, OSError) as exc:
            # One bad image must not cost the rest of the gallery.
            logger.warning("could not save %s: %s", href, exc)


def _crawl(output_dir=_OUTPUT_DIR, page_count=_PAGE_COUNT):
    """Walk the search result pages and download every gallery found."""
    os.chdir(output_dir)
    for page in range(page_count):
        response = get_response(_SEARCH_URL.format(page * 20))
        # "data" may be missing or null; treat both as an empty page.
        entries = response.json().get("data") or []
        for index in range(len(entries)):
            try:
                article_url, title_text = get_article_title_url(response, index)
                _download_gallery(article_url, title_text)
            except KeyError:
                # Not every search result carries a gallery title and link.
                continue
            except Exception as exc:
                # Best effort: report the failure and move on. Unlike a bare
                # except, this lets Ctrl-C and SystemExit through.
                logger.warning("skipping entry %d of page %d: %s", index, page, exc)


# Runs on load, exactly like the original flat script.
_crawl()