###
做爬虫的时候,有时候遇到需要的数据在加载资源当中,通常做法是先分析请求、拼接url,然后获取数据。但如果拼接参数经过了加密,又无法模拟算法生成正确的参数,那就很头疼。而访问performance日志,可以获得加载网站时的资源请求信息,利用这一特点即可直接拿到url和数据。
####
import json
import time

from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities


class Mychrome:
    """Wraps a Chrome WebDriver configured to record DevTools performance logs.

    The performance log exposes every network request/response the page makes,
    so resource URLs — even ones whose query parameters are generated by an
    encryption/signing algorithm — can be harvested without reverse-engineering
    that algorithm.
    """

    def __init__(self):
        self.options = webdriver.ChromeOptions()
        self.flash_urls = []  # URLs for which the Flash plugin is allowed (none by default)
        self.set_browser()

    def set_browser(self):
        """Build the Chrome capabilities (with performance logging) and start the driver."""
        prefs = {
            "profile.managed_default_content_settings.images": 1,
        }
        # Truthiness covers both None and the empty list.
        if self.flash_urls:
            prefs['profile.managed_plugins_allowed_for_urls'] = self.flash_urls
        self.options.add_experimental_option('prefs', prefs)
        # Legacy (non-W3C) mode is required for the plain 'loggingPrefs' key;
        # newer W3C-mode ChromeDriver expects 'goog:loggingPrefs' instead.
        self.options.add_experimental_option('w3c', False)
        desired_capabilities = self.options.to_capabilities()
        desired_capabilities['loggingPrefs'] = {
            "performance": "ALL",  # capture all DevTools performance-log entries
        }
        self.driver = webdriver.Chrome(
            desired_capabilities=desired_capabilities
        )

    def gethtml(self):
        """Load a page and print the network request/response data found in the performance log."""
        url = 'http://www.baidu.com'
        self.driver.get(url)
        # BUG FIX: get_log() drains the log buffer. The original called it three
        # times (two debug prints before the loop), so the loop iterated over an
        # almost-empty list. Read the log exactly once and reuse it.
        entries = self.driver.get_log('performance')
        for entry in entries:
            message = json.loads(entry.get('message')).get('message', {})
            # BUG FIX: 'request'/'response' are nested under the message's
            # 'params' dict, not directly on the message — the original lookups
            # always printed None.
            params = message.get('params', {})
            print(params.get('request'))   # request data (includes failed requests)
            print(params.get('response'))  # response data (successful requests only)


if __name__ == '__main__':
    browser = Mychrome().gethtml()
####
####