selenium 记录 performance日志

###

做爬虫时,有时需要的数据藏在页面加载的资源请求当中。通常的做法是先分析接口,再拼接 url 去获取数据;但如果拼接的参数经过了加密,而又无法模拟算法生成正确的参数,那就很头疼。而通过 selenium 读取浏览器的 performance 日志,可以获得加载网站时的全部资源请求信息,利用这一特点即可直接获取请求的 url 和数据。

 

####

import time

from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import json


class Mychrome:
    """Chrome WebDriver wrapper that records the browser ``performance`` log.

    The performance log contains the network events emitted while a page
    loads, so request/response details can be read from it instead of
    reverse-engineering how the page builds its resource URLs.
    """

    def __init__(self):
        self.options = webdriver.ChromeOptions()
        # URLs for which plugins are allowed; empty means "set nothing".
        self.flash_urls = []
        self.set_browser()

    def set_browser(self):
        """Configure Chrome options/capabilities and start ``self.driver``."""
        prefs = {
            # NOTE(review): 1 presumably means "allow images" (2 = block);
            # confirm against the Chrome preference documentation.
            "profile.managed_default_content_settings.images": 1,
        }
        if self.flash_urls:
            prefs['profile.managed_plugins_allowed_for_urls'] = self.flash_urls
        self.options.add_experimental_option('prefs', prefs)
        self.options.add_experimental_option('w3c', False)

        # Alternative (method 1): start from the stock capabilities instead
        # of the options object:
        #     capabilities = DesiredCapabilities.CHROME
        #     capabilities['loggingPrefs'] = {"performance": "ALL"}
        #     self.driver = webdriver.Chrome(desired_capabilities=capabilities)

        # Method 2 (used here): derive the capabilities from the options so
        # the prefs above are kept, then switch on performance logging.
        # Optionally, excluding the 'enable-automation' switch hides
        # window.navigator.webdriver from anti-bot checks (the original
        # author notes it has no effect in headless mode):
        #     self.options.add_experimental_option(
        #         "excludeSwitches", ['enable-automation'])
        desired_capabilities = self.options.to_capabilities()
        # NOTE(review): newer ChromeDriver releases are reported to expect
        # the vendor-prefixed key 'goog:loggingPrefs' - confirm for the
        # ChromeDriver version in use.
        desired_capabilities['loggingPrefs'] = {
            "performance": "ALL"  # record every performance log entry
        }
        self.driver = webdriver.Chrome(
            desired_capabilities=desired_capabilities
        )

    def gethtml(self, url='http://www.baidu.com'):
        """Load *url* and print the request/response data from the log.

        Args:
            url: Page to open. Defaults to the URL the script originally
                hard-coded.

        Returns:
            A list with the ``params`` dict of every performance log entry,
            in log order.
        """
        self.driver.get(url)
        # get_log() drains the driver's buffer: a second call only returns
        # entries produced since the first one. Read the log exactly once
        # and reuse the result.
        entries = self.driver.get_log('performance')
        print(entries)
        print('-' * 60)

        collected = []
        for entry in entries:
            # Each entry's 'message' is a JSON string shaped like
            # {"message": {"method": ..., "params": {...}}}; the request and
            # response objects live under "params".
            event = json.loads(entry.get('message')).get('message') or {}
            params = event.get('params') or {}
            print(params.get('request'))  # request info (includes failed requests)
            print(params.get('response'))  # response info (only when a response arrived)
            collected.append(params)
        return collected

    def close(self):
        """Quit the browser and end the WebDriver session."""
        self.driver.quit()


if __name__ == '__main__':
    # Keep a handle on the wrapper (gethtml() itself returns no browser) so
    # the Chrome process can be shut down even when the page load or the
    # log parsing raises.
    browser = Mychrome()
    try:
        browser.gethtml()
    finally:
        browser.driver.quit()

####

 

 

 

 

 

####

上一篇:透过performance探究js操作dom样式时浏览器会做什么?


下一篇:Multiple SSH Keys settings for different github account