爬虫进阶(五)——selenium

selenium基本操作(需要提前下载与本机Chrome浏览器版本匹配的chromedriver.exe,并放在脚本所在目录)

from selenium import webdriver
from time import sleep
# Basic Selenium walkthrough: open jd.com, search for a product, scroll to the
# bottom of the result page and dump the rendered HTML.
bro = webdriver.Chrome(executable_path='chromedriver.exe')
try:
    bro.get('https://www.jd.com/')
    sleep(1)

    # Locate the input with id "key" and type the query into it.
    search_input = bro.find_element_by_id('key')
    search_input.send_keys('mac pro')

    # Locate the button next to the input and click it to submit.
    btn = bro.find_element_by_xpath('//*[@id="search"]/div/div[2]/button')
    btn.click()
    sleep(2)

    # Execute JavaScript in the page: scroll to the bottom of the document.
    bro.execute_script('window.scrollTo(0,document.body.scrollHeight)')
    sleep(2)

    page_text = bro.page_source
    print(page_text)

    sleep(2)
finally:
    # Always end the session: without this, any exception above (for example
    # a failed element lookup) would leave Chrome and chromedriver running.
    bro.quit()

动态加载数据爬取(个人认为极难爬取的网站)

目标网站:https://www.aqistudy.cn/
from selenium import webdriver
from time import sleep
from lxml import etree
# Collect the rendered HTML of the first page plus three "next" pages, then
# parse every captured page with lxml and print "title:num" for each entry.
bro = webdriver.Chrome(executable_path='chromedriver.exe')
try:
    bro.get('https://www.aqistudy.cn/')
    sleep(1)
    # page_source is the DOM as rendered by the browser.
    page_text_list = [bro.page_source]

    for _ in range(3):
        bro.find_element_by_id('pageIto_next').click()  # click "next page"
        sleep(1)
        page_text_list.append(bro.page_source)

    for page_text in page_text_list:
        tree = etree.HTML(page_text)
        li_list = tree.xpath('//ul[@id="gzlist"]/li')
        for li in li_list:
            titles = li.xpath('./dl/@title')
            nums = li.xpath('./ol/@title')
            # Skip entries missing either attribute instead of aborting the
            # whole run with an IndexError on one malformed <li>.
            if not titles or not nums:
                continue
            print(titles[0]+':'+nums[0])

    sleep(2)
finally:
    # Always end the session so a failed lookup or parse error does not leave
    # Chrome and chromedriver running.
    bro.quit()

selenium动作链+iframe标签获取

from selenium import webdriver
from time import sleep
from selenium.webdriver import ActionChains
# Drag the "draggable" element on the runoob demo page using an ActionChains
# press-and-hold followed by several small mouse moves.
bro = webdriver.Chrome(executable_path='chromedriver.exe')
try:
    bro.get('https://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
    # The target element is looked up after switching into the iframe named
    # "iframeResult"; lookups are scoped to the currently selected frame.
    bro.switch_to.frame('iframeResult')
    div_tag = bro.find_element_by_id('draggable')
    # Drag = press and hold + move.
    action = ActionChains(bro)
    action.click_and_hold(div_tag)
    for _ in range(5):
        # perform() makes the queued actions run immediately.
        action.move_by_offset(17,2).perform()
        sleep(0.5)
    # release() only queues the action; it must be performed as well,
    # otherwise the mouse button is never released and the drag never ends.
    action.release().perform()
    sleep(3)
finally:
    # Always end the session so an exception does not leave Chrome and
    # chromedriver running.
    bro.quit()

12306模拟登录,很早之前做的,现在不知道行不行,可以借鉴一下

from selenium import webdriver
from time import sleep
from PIL import Image
from selenium.webdriver import ActionChains
from Cjy import Chaojiying_Client
from selenium.webdriver import ActionChains
# Simulated 12306 login: open the login page, fill in the account fields and
# click the login button. The commented-out section is the older captcha flow
# (screenshot -> crop captcha -> recognition service -> click coordinates),
# kept for reference only.
bro = webdriver.Chrome(executable_path='chromedriver.exe')
try:
    # bro.get('https://kyfw.12306.cn/otn/login/init')
    bro.get('https://kyfw.12306.cn/otn/resources/login.html')
    sleep(5)
    # bro.save_screenshot('main.png')
    #
    # code_img_tag = bro.find_element_by_xpath('//*[@id="loginForm"]/div/ul[2]/li[4]/div/div/div[3]/img')
    # location = code_img_tag.location
    # size = code_img_tag.size
    # # Bounding box (left, top, right, bottom) of the region to crop.
    # rangle = (int(location['x']),int(location['y']),int(location['x']+size['width']),int(location['y']+size['height']))
    #
    # i = Image.open('./main.png')
    # frame = i.crop(rangle)
    # frame.save('code.png')
    #
    # def get_text(imgPath,imgType):
    #     chaojiying = Chaojiying_Client('666', '666', '899370')
    #     im = open(imgPath, 'rb').read()
    #     return chaojiying.PostPic(im, imgType)['pic_str']
    #
    # # Result format example: '55,70|267,133' -> [[55, 70], [267, 133]]
    # result = get_text('./code.png',9004)
    # all_list = []
    # if '|' in result:
    #     list_1 = result.split('|')
    #     count_1 = len(list_1)
    #     for i in range(count_1):
    #         xy_list = []
    #         x = int(list_1[i].split(',')[0])
    #         y = int(list_1[i].split(',')[1])
    #         xy_list.append(x)
    #         xy_list.append(y)
    #         all_list.append(xy_list)
    # else:
    #     x = int(result.split(',')[0])
    #     y = int(result.split(',')[1])
    #     xy_list = []
    #     xy_list.append(x)
    #     xy_list.append(y)
    #     all_list.append(xy_list)
    # print(all_list)
    # # action = ActionChains(bro)
    # for a in all_list:
    #     x = a[0]
    #     y = a[1]
    #     ActionChains(bro).move_to_element_with_offset(code_img_tag,x,y).click().perform()
    #     sleep(1)

    # Placeholder credentials; replace with a real account before use.
    bro.find_element_by_id('username').send_keys('123456')
    sleep(1)
    bro.find_element_by_id('password').send_keys('67890000000')
    sleep(1)
    bro.find_element_by_id('loginSub').click()

    sleep(5)
finally:
    # Always end the session so a failed element lookup does not leave Chrome
    # and chromedriver running.
    bro.quit()

selenium规避检测,无头浏览器

#使用谷歌无头浏览器
# from selenium import webdriver
# from time import sleep
# from selenium.webdriver.chrome.options import Options
#
# chrome_options = Options()
# chrome_options.add_argument('--headless')
# chrome_options.add_argument('--disable-gpu')
#
# driver = webdriver.Chrome(r'chromedriver.exe',chrome_options=chrome_options)
# driver.get('https://www.cnblogs.com/')
# print(driver.page_source)

#如何规避selenium被检测
from selenium import webdriver
from selenium.webdriver import ChromeOptions
from time import sleep

# Start Chrome with options intended to make it look less like an automated
# browser, load the target page and print the rendered HTML.
option = ChromeOptions()
# Exclude the "enable-automation" switch from the ones passed to Chrome.
option.add_experimental_option('excludeSwitches', ['enable-automation'])
option.add_argument('user-agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36"')
driver = webdriver.Chrome(r'chromedriver.exe',options=option)
try:
    # driver.get('https://www.taobao.com/')
    url = 'https://www.aqistudy.cn/'
    # The user agent is configured through the Chrome option above;
    # driver.get() is given only the URL.
    driver.get(url)
    sleep(4)
    page_text = driver.page_source
    print(page_text)
    sleep(2)
finally:
    # Always end the session so an exception does not leave Chrome and
    # chromedriver running.
    driver.quit()

示例:梨视频爬取

import requests
from lxml import etree
import re
# Download every video listed on a pearvideo category page: read the list
# page, follow each entry to its detail page, extract the <video> src and
# save the bytes to "<title>.mp4" in the current directory.
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'
}
# Without a timeout, requests can wait forever on an unresponsive server.
request_timeout = 30
url = 'https://www.pearvideo.com/category_1'
page_text = requests.get(url,headers=headers,timeout=request_timeout).text
tree = etree.HTML(page_text)
li_list = tree.xpath('//*[@id="listvideoListUl"]/li')
for li in li_list:
    hrefs = li.xpath('./div/a/@href')
    titles = li.xpath('./div/a/div[2]/text()')
    # Skip list items that do not have both a link and a title instead of
    # aborting the whole run with an IndexError.
    if not hrefs or not titles:
        continue
    detail_url = 'https://www.pearvideo.com/'+hrefs[0]
    # The title becomes a file name, so replace characters that are not
    # allowed in file names (path separators, Windows-reserved characters,
    # line breaks and tabs).
    title = re.sub(r'[\\/:*?"<>|\r\n\t]', '_', titles[0].strip())+'.mp4'
    print(detail_url)
    detail_page_text = requests.get(detail_url,headers=headers,timeout=request_timeout).text
    print(detail_page_text)
    # Alternative: pull the URL out of the inline script with a regex.
    # ex = 'srcUrl="(.*?)",vdoUrl'
    # video_url = re.findall(ex,detail_page_text,re.S)[0]
    detail_tree = etree.HTML(detail_page_text)
    video_srcs = detail_tree.xpath('//video/@src')
    if not video_srcs:
        # The static HTML may not contain a <video src> at all; report it
        # and continue with the next entry.
        print('no video src found, skipped: '+detail_url)
        continue
    video_url = video_srcs[0]
    video_data = requests.get(video_url,headers=headers,timeout=request_timeout).content
    with open(title,'wb') as fp:
        fp.write(video_data)

 

上一篇:爬虫项目


下一篇:爬虫 - 不弹出浏览器