爬虫模块之selenium模块

一 模块的介绍

 selenium模块最开始是一个自动化测试的工具,驱动浏览器完全模拟浏览器自动测试。

# Each assignment below rebinds `browser`; they are presumably alternatives,
# one per supported browser -- keep only the one you need.
from selenium import webdriver  # drives the browser
browser=webdriver.Chrome() # Google Chrome
browser=webdriver.Firefox() # Firefox
browser=webdriver.PhantomJS() # PhantomJS ("virtual", i.e. window-less, browser)
browser=webdriver.Safari()
browser=webdriver.Edge()

二 下载安装

#安装:selenium+chromedriver
pip3 install selenium
下载chromedriver.exe放到python安装路径的Scripts目录中即可,注意最新版本是2.29,并非2.9
国内镜像网站地址:http://npm.taobao.org/mirrors/chromedriver/2.29/
最新的版本去官网找:https://sites.google.com/a/chromium.org/chromedriver/downloads
#验证安装
C:\Users\Administrator>python3
Python 3.6.1 (v3.6.1:69c0db5, Mar 21 2017, 18:41:36) [MSC v.1900 64 bit (AMD64)] on win32
Type "help", "copyright", "credits" or "license" for more information.
>>> from selenium import webdriver
>>> driver=webdriver.Chrome() #弹出浏览器
>>> driver.get('https://www.baidu.com')
>>> driver.page_source
#注意:
selenium3默认支持的webdriver是Firefox,而Firefox需要安装geckodriver
下载链接:https://github.com/mozilla/geckodriver/releases
#安装:selenium+phantomjs
pip3 install selenium
下载phantomjs,解压后把phantomjs.exe所在的bin目录放到环境变量
下载链接:http://phantomjs.org/download.html
#验证安装
C:\Users\Administrator>phantomjs
phantomjs> console.log('egon gaga')
egon gaga
undefined
phantomjs> ^C
C:\Users\Administrator>python3
Python 3.6.1 (v3.6.1:69c0db5, Mar 21 2017, 18:41:36) [MSC v.1900 64 bit (AMD64)] on win32
Type "help", "copyright", "credits" or "license" for more information.
>>> from selenium import webdriver
>>> driver=webdriver.PhantomJS() #无界面浏览器
>>> driver.get('https://www.baidu.com')
>>> driver.page_source

三 基本使用

 ActionChains:拖动的一些事。

 expected_conditions:等待条件(与WebDriverWait配合,用于显式等待某个元素加载完毕)

 find_element_by_id:id查找的方式。

 send_keys:发送查找的关键字

 click:点击事件

 current_url:获取正在驱动的url

 get_cookies:获取cookies信息

 page_source:页面源代码

from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By  # locator strategies: By.ID, By.CSS_SELECTOR, ...
from selenium.webdriver.common.keys import Keys  # keyboard keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait  # wait until some element has loaded

# Basic usage: search on Baidu, wait for the result list, then print the
# page source, the current URL and the cookies.
browser = webdriver.Chrome()
try:
    browser.get('https://www.baidu.com')

    input_tag = browser.find_element_by_id('kw')
    input_tag.send_keys('美女')  # on Python 2 non-ASCII input fails; prefix the string with u
    input_tag.send_keys(Keys.ENTER)  # press Enter

    wait = WebDriverWait(browser, 10)
    # Block until the element with id "content_left" is present, 10 seconds at most.
    wait.until(EC.presence_of_element_located((By.ID, 'content_left')))

    print(browser.page_source)
    print(browser.current_url)
    print(browser.get_cookies())
finally:
    # Always close the window, even when one of the steps above failed.
    browser.close()

四 选择器

 基本选择器查找: 

 find_element_by_id:根据ID查找

 find_element_by_link_text:通过文本查找

 find_element_by_partial_link_text:根据某些文本模糊查找到第一个内容

 find_element_by_class_name:通过class查找

 find_element_by_name:通过name属性查找

  补充:

  presence_of_all_elements_located:相对应的所有元素加载完毕过后

  presence_of_element_located:查找到第一个加载完毕后

  element_to_be_clickable:等待可以点击过后。

  By.CLASS_NAME:class查找的方式

  get_attribute:访问标签的属性

  text:访问文本

  tag_name:访问标签名

# from selenium import webdriver
# from selenium.webdriver import ActionChains
# from selenium.webdriver.common.by import By #按照什么方式查找,By.ID,By.CSS_SELECTOR
# from selenium.webdriver.common.keys import Keys #键盘按键操作
# from selenium.webdriver.support import expected_conditions as EC
# from selenium.webdriver.support.wait import WebDriverWait #等待页面加载某些元素
# import time
#
# try:
# '''
# find_element_by_id
# find_element_by_name
# find_element_by_link_text
# find_element_by_partial_link_text
# find_element_by_tag_name
# find_element_by_class_name
#
# find_element_by_css_selector
# find_element_by_xpath
# '''
# driver = webdriver.Chrome()
# wait=WebDriverWait(driver,3)
# driver.get('https://www.baidu.com/')
#
# # 1、find_element_by_id
# # input_tag=driver.find_element_by_id('kw')
# # print(input_tag.tag_name)
# # print(input_tag.get_attribute('name'))
# # print(input_tag.text)
#
# # 2、find_element_by_link_text
# # login=driver.find_element_by_link_text('登录')
# # login.click()
#
# # 3、find_element_by_partial_link_text
# login=driver.find_element_by_partial_link_text('登')
# login.click()
#
# # 4、find_element_by_class_name
# # login_for_user=driver.find_element_by_class_name('tang-pass-footerBarULogin')
# # login_for_user=wait.until(EC.presence_of_element_located((By.CLASS_NAME,'tang-pass-footerBarULogin')))
# login_for_user=wait.until(EC.element_to_be_clickable((By.CLASS_NAME,'tang-pass-footerBarULogin')))
# # print(login_for_user)
# login_for_user.click()
#
#
# #4、find_element_by_name
# # input_user=driver.find_element_by_name('userName')
# # input_pwd=driver.find_element_by_name('password')
# # button=driver.find_element_by_id('TANGRAM__PSP_10__submit')
# #
# # input_user.send_keys('17094322519')
# # input_pwd.send_keys('11111111111')
# # button.click()
#
#
#
# time.sleep(5)
# finally:
# driver.close()

 以上这些只能够查找出来一个内容,如果想要加载相关的所有内容,将以上查找方式中的element改成elements就可以了。如下:

    find_elements_by_name
find_elements_by_xpath
find_elements_by_link_text
find_elements_by_partial_link_text
find_elements_by_tag_name
find_elements_by_class_name
find_elements_by_css_selector

 find_element(s)_by_xpath:如果在没有一个合适定位的方式的时候就可以使用这个

  /:单斜杠,查找一个标签,可以从根标签一层一层的向内部查找。

  //:双斜杠,从当前页面查找出相对应的所有的标签。

  [数字]:确定查找到哪一个标签。

  [@属性=“属性值”]:属性的查找方式

  [contains(@属性,“属性值的部分内容”)]:属性模糊查找

  //*:所有的标签

  [标签/@属性=“属性值”]:查找有这个标签的标签的值

  ..:两个点,代表的是上一级

# XPath selectors
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By  # locator strategies: By.ID, By.CSS_SELECTOR, ...
from selenium.webdriver.common.keys import Keys  # keyboard keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait  # wait until some element has loaded
import time

# Created before the try block so that `finally` never refers to an unbound
# name when the browser itself fails to start.
driver = webdriver.Chrome()
try:
    # wait = WebDriverWait(driver, 3)
    driver.implicitly_wait(3)
    driver.get('https://doc.scrapy.org/en/latest/_static/selectors-sample1.html')

    # 1. "//" versus "/"
    # tag=driver.find_element_by_xpath('/html/body/div/a')
    # print(tag.tag_name)
    # print(tag.text)
    # print(tag.get_attribute('href'))

    # tag=driver.find_elements_by_xpath('//a')
    # print(tag)

    # tag=driver.find_elements_by_xpath('//div//a')
    # tag=driver.find_elements_by_css_selector('div a')
    # print(len(tag))

    # 2. pick the n-th match
    # tag=driver.find_elements_by_xpath('//div//a[5]')
    # print(tag[0].text)

    # 3. match by attribute
    # tag1=driver.find_element_by_xpath('//a[@href="image4.html"]')
    # tag2=driver.find_element_by_xpath('//a[4]')
    # tag3=driver.find_element_by_xpath('//a[contains(@href,"image4")]')
    #
    # print(tag1.text)
    # print(tag2.text)
    # print(tag3.text)

    # 4. miscellaneous
    # driver.find_elements_by_xpath('//*[@class="xxxxx"]')
    # driver.find_elements_by_xpath('//div[@class="xxxxx"][@class="yyyyy"]')

    # print(driver.find_element_by_xpath('//a[img/@src="data:image2_thumb.jpg"]').text)
    # print(driver.find_element_by_xpath('//a/..').tag_name)

    # print([tag.tag_name for tag in driver.find_elements_by_xpath('//img//..')])

    # Live part of the demo: position and size of the first <img>.
    img = driver.find_element_by_xpath('//img')
    print(img.location)
    print(img.size)

    time.sleep(5)
finally:
    driver.close()

五 交互操作

 location:坐标,横:x;竖:y

 size:大小,也就是内容的长宽

 implicitly_wait:隐式等待。

from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By  # locator strategies: By.ID, By.CSS_SELECTOR, ...
from selenium.webdriver.common.keys import Keys  # keyboard keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait  # wait until some element has loaded

browser = webdriver.Chrome()
try:
    # Implicit wait: applies to every element lookup -- if the element has
    # not been loaded yet, keep waiting for up to 10 seconds.
    browser.implicitly_wait(10)

    browser.get('https://www.baidu.com')

    input_tag = browser.find_element_by_id('kw')
    input_tag.send_keys('美女')
    input_tag.send_keys(Keys.ENTER)

    # Looked up directly, with no explicit wait step; raises if it is not found.
    contents = browser.find_element_by_id('content_left')
    print(contents)
finally:
    # Close the window even when a lookup above raised.
    browser.close()

 显式等待:

from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By  # locator strategies: By.ID, By.CSS_SELECTOR, ...
from selenium.webdriver.common.keys import Keys  # keyboard keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait  # wait until some element has loaded

browser = webdriver.Chrome()
try:
    browser.get('https://www.baidu.com')

    input_tag = browser.find_element_by_id('kw')
    input_tag.send_keys('美女')
    input_tag.send_keys(Keys.ENTER)

    # Explicit wait: wait for one specific element to be loaded.
    wait = WebDriverWait(browser, 10)
    wait.until(EC.presence_of_element_located((By.ID, 'content_left')))

    contents = browser.find_element(By.CSS_SELECTOR, '#content_left')
    print(contents)
finally:
    # Close the window even when the wait above timed out.
    browser.close()

 execute_script:直接写js代码

 clear:清空输入框

 iframe:在一个页面中嵌套一个页面

 switch_to.frame:切换到子页面

 switch_to.parent_frame:切换到父页面

 ActionChains(浏览器对象):拖动

 drag_and_drop(源,目标):从源拖动到目标

 perform():开始执行

 click_and_hold:点击不松手

 move_by_offset:偏移量

 release:松开鼠标

# from selenium import webdriver
# from selenium.webdriver import ActionChains
# from selenium.webdriver.common.by import By #按照什么方式查找,By.ID,By.CSS_SELECTOR
# from selenium.webdriver.common.keys import Keys #键盘按键操作
# from selenium.webdriver.support import expected_conditions as EC
# from selenium.webdriver.support.wait import WebDriverWait #等待页面加载某些元素
# import time
#
#
# try:
# driver=webdriver.Chrome()
# driver.get('https://www.jd.com/')
# driver.implicitly_wait(3)
#
# input_tag=driver.find_element_by_id('key')
# input_tag.send_keys('iphoneX')
# input_tag.send_keys(Keys.ENTER)
#
# time.sleep(3)
# input_tag = driver.find_element_by_id('key')
# input_tag.clear()
# input_tag.send_keys('mac pro')
# input_tag.send_keys(Keys.ENTER)
#
#
# time.sleep(5)
# finally:
# driver.close() #ActionChains
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By  # locator strategies: By.ID, By.CSS_SELECTOR, ...
from selenium.webdriver.common.keys import Keys  # keyboard keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait  # wait until some element has loaded
import time

# Created before the try block so that `finally` never refers to an unbound
# name when the browser itself fails to start.
driver = webdriver.Chrome()
try:
    driver.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
    driver.implicitly_wait(3)
    driver.execute_script('alert("hahha")')  # run arbitrary JavaScript in the page

    # driver.switch_to.frame('iframeResult')
    # driver.switch_to.parent_frame()

    # Approach 1: a single chain, dragging from the source straight onto the target
    # source = driver.find_element_by_id('draggable')
    # target = driver.find_element_by_id('droppable')
    # actions=ActionChains(driver)
    # actions.drag_and_drop(source,target)
    # actions.perform()

    # Approach 2: separate ActionChains, which lets you control the offset moved
    # source = driver.find_element_by_id('draggable')
    # target = driver.find_element_by_id('droppable')
    # distance=target.location['x'] - source.location['x']
    #
    # ActionChains(driver).click_and_hold(source).perform()
    # ActionChains(driver).move_by_offset(xoffset=distance,yoffset=0).perform()
    # ActionChains(driver).release().perform()
    #
    # res=0
    # while res < distance:
    #     ActionChains(driver).move_by_offset(xoffset=1,yoffset=0).perform()
    #     res+=1
    # ActionChains(driver).release().perform()
    # time.sleep(5)
finally:
    driver.close()

六 浏览器的前进和后退:

 back:后退

 forward:前进

#浏览器的前进后退
# import time
# from selenium import webdriver
#
# browser=webdriver.Chrome()
# browser.get('https://www.baidu.com')
# browser.get('https://www.taobao.com')
# browser.get('http://www.python.org/')
#
# time.sleep(3)
# browser.back()
# time.sleep(3)
# browser.forward()
# browser.close()

七 cookies

 get_cookies:获取cookies里面的信息。

# cookies
from selenium import webdriver

browser = webdriver.Chrome()
try:
    browser.get('https://www.zhihu.com/explore')
    print(browser.get_cookies())

    # add_cookie() describes ONE cookie per call and needs the keys 'name'
    # and 'value'; a dict such as {'k1': 'xxx', 'k2': 'yyy'} is rejected.
    browser.add_cookie({'name': 'k1', 'value': 'xxx'})
    browser.add_cookie({'name': 'k2', 'value': 'yyy'})
    print(browser.get_cookies())

    # browser.delete_all_cookies()
finally:
    # The snippet used to leave the browser open; close it when done.
    browser.close()

八 异常处理

from selenium import webdriver
from selenium.common.exceptions import TimeoutException,NoSuchElementException,NoSuchFrameException

# Created before the try block: `finally` calls browser.close(), which would
# itself fail with NameError if the constructor raised inside the try.
browser = webdriver.Chrome()
try:
    browser.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
    # NOTE(review): the frame name looks deliberately misspelled (the page's
    # frame is 'iframeResult') so that the lookup fails -- confirm.
    browser.switch_to.frame('iframssseResult')
except TimeoutException as e:
    print(e)
except NoSuchFrameException as e:
    print(e)
finally:
    browser.close()

九 选项卡管理

# Tab management: a new tab can be opened with JavaScript (window.open) or
# with an OS keyboard shortcut such as Ctrl+T; the JavaScript way is the most
# portable one.
import time
from selenium import webdriver

browser = webdriver.Chrome()
try:
    browser.get('https://www.baidu.com')
    browser.execute_script('window.open()')

    print(browser.window_handles)  # handles of all open tabs
    browser.switch_to.window(browser.window_handles[1])
    browser.get('https://www.taobao.com')
    time.sleep(10)
    browser.switch_to.window(browser.window_handles[0])
    browser.get('https://www.sina.com.cn')
finally:
    # Two tabs are open here; close() would only close the current one and
    # leave the other tab and the driver process behind, so end the session.
    browser.quit()

十 练习

# NOTE: websites change their pages and policies all the time; what matters is
# learning the workflow. The code below worked on 2017-11-07 and is not
# guaranteed to keep working.
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import time

browser = webdriver.Chrome()

try:
    browser.get('http://mail.163.com/')

    wait = WebDriverWait(browser, 5)

    # The login form lives inside an iframe; switch into it first.
    frame = wait.until(EC.presence_of_element_located((By.ID, 'x-URS-iframe')))
    browser.switch_to.frame(frame)

    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.m-container')))

    inp_user = browser.find_element_by_name('email')
    inp_pwd = browser.find_element_by_name('password')
    button = browser.find_element_by_id('dologin')
    inp_user.send_keys('')  # placeholder: fill in the account name
    inp_pwd.send_keys('xxxx')  # placeholder: fill in the password
    button.click()

    # If a captcha shows up, uncomment the next few lines.
    # time.sleep(10)
    # button = browser.find_element_by_id('dologin')
    # button.click()

    wait.until(EC.presence_of_element_located((By.ID, 'dvNavTop')))
    # The second <li> is the "compose" entry.
    write_msg = browser.find_elements_by_css_selector('#dvNavTop li')[1]
    write_msg.click()

    wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'tH0')))
    recv_man = browser.find_element_by_class_name('nui-editableAddr-ipt')
    title = browser.find_element_by_css_selector('.dG0 .nui-ipt-input')
    recv_man.send_keys('378533872@qq.com')
    title.send_keys('圣旨')
    print(title.tag_name)

    # The message body is edited inside another iframe.
    frame = wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'APP-editor-iframe')))
    browser.switch_to.frame(frame)
    body = browser.find_element(By.CSS_SELECTOR, 'body')
    body.send_keys('egon很帅,可以加工资了')

    browser.switch_to.parent_frame()  # back to the parent document
    send_button = browser.find_element_by_class_name('nui-toolbar-item')
    send_button.click()

    # Sleep for a long time so the browser stays open and you can check
    # whether the mail was actually sent.
    time.sleep(10000)

except Exception as e:
    print(e)
finally:
    browser.close()
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, WebDriverException
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By  # locator strategies: By.ID, By.CSS_SELECTOR, ...
from selenium.webdriver.common.keys import Keys  # keyboard keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait  # wait until some element has loaded
import time


def get_goods(driver, max_pages=100):
    """Print every product on the current result page, then follow "next page".

    Args:
        driver: a WebDriver already showing a search-result page.
        max_pages: safety cap on the number of pages visited, so that a
            "next page" link that never disappears cannot keep the crawl
            running forever.

    Pagination used to be recursive with a blanket ``except Exception: pass``;
    it is now a loop, and only Selenium errors end the crawl.
    """
    for _ in range(max_pages):
        try:
            for good in driver.find_elements_by_class_name('gl-item'):
                try:
                    detail_url = good.find_element_by_tag_name('a').get_attribute('href')
                    p_name = good.find_element_by_css_selector('.p-name em').text.replace('\n', '')
                    price = good.find_element_by_css_selector('.p-price i').text
                    p_commit = good.find_element_by_css_selector('.p-commit a').text
                except NoSuchElementException:
                    # An item without one of the fields is skipped instead of
                    # silently aborting the whole crawl.
                    continue

                msg = '''
商品 : %s
链接 : %s
价钱 :%s
评论 :%s
''' % (p_name, detail_url, price, p_commit)
                print(msg, end='\n\n')

            try:
                button = driver.find_element_by_partial_link_text('下一页')
            except NoSuchElementException:
                return  # no "next page" link: this was the last page
            button.click()
            time.sleep(1)
        except WebDriverException as exc:
            # Stay best-effort like the previous version, but say why we stopped.
            print(exc)
            return


def spider(url, keyword):
    """Open *url*, search for *keyword* in the box with id "key", print the results."""
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        driver.implicitly_wait(3)  # use an implicit wait
        input_tag = driver.find_element_by_id('key')
        input_tag.send_keys(keyword)
        input_tag.send_keys(Keys.ENTER)
        get_goods(driver)
    finally:
        driver.close()


if __name__ == '__main__':
    spider('https://www.jd.com/', keyword='iPhone8手机')
#coding=utf-8
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
# from selenium import fire
from selenium.webdriver.common.keys import Keys
import re
from selenium.webdriver.support.ui import Select
import time
from pyquery import PyQuery as pq
from xlwt import *
import calendar
from collections import OrderedDict

# Failures treated as "that click/lookup did not work, try the fallback":
# any Selenium error, or indexing a result list that came back too short.
# (The bare ``except:`` clauses used before also swallowed KeyboardInterrupt.)
_UI_ERRORS = (WebDriverException, IndexError)

_ARROW = 'select2-selection__arrow'
_CLOSE = 'section-compatibility__message__close'
_OPTION_RE = r'<option value=".*?">(.*?)</option>'
_TIP_RE = r'<div class="section-compatibility__message__tip"> <strong>(.*?): </strong><span>(.*?)</span> </div>'


def _read_message(browser):
    """Scrape the compatibility message currently shown.

    Returns [title, status, tag, info]; a part that cannot be found is ''.
    Prints the title, the raw regex matches and the info text along the way.
    """
    hdata = browser.page_source
    try:
        t = browser.find_element_by_class_name('section-compatibility__message__title').text
    except _UI_ERRORS:
        t = ''
    print(t)
    bk = re.findall(_TIP_RE, hdata, re.S)
    print(bk)
    # Each match is a (status, tag) pair; only the first one is used.
    status, tag = bk[0] if bk else ('', '')
    # i1 = r'<div class="section-compatibility__message__body"><p>(.*?)</p>.*?</div>'
    # info = re.findall(i1,hdata,re.S)
    try:
        info = browser.find_element_by_class_name(
            'section-compatibility__message__body').find_element_by_tag_name('p').text
    except _UI_ERRORS:
        info = ''
    print(info)
    return [t, status, tag, info]


def _scrape_compatibility(browser):
    """Walk every manufacturer/model pair on the page and collect its message."""

    def click(class_name):
        browser.find_element_by_class_name(class_name).click()

    browser.get("https://finsix.com/#section-compatibility")
    data = browser.page_source  # page HTML

    lis = [['product','version','title','status','tags','info']]
    re_rule_1 = r'<select class="section-compatibility__manufacturers select2-hidden-accessible" tabindex="-1" aria-hidden="true">(.*?)</select>'
    data_list = re.findall(re_rule_1, data, re.S)
    data = data_list[0]
    datalist = re.findall(_OPTION_RE, data, re.S)
    print(datalist)
    # Starts at 1, i.e. the first <option> is skipped
    # (presumably a placeholder entry -- TODO confirm).
    for i in range(1, len(datalist)):
        try:
            click(_ARROW)
        except _UI_ERRORS:
            # Dismiss the message and retry opening the dropdown.
            click(_CLOSE)
            click(_ARROW)
        s1 = Select(browser.find_element_by_class_name('section-compatibility__manufacturers'))
        s2 = s1.options[i]
        # s1.select_by_index(i)
        s3 = s2.text
        print(s3)
        s2.click()
        try:
            click(_ARROW)
        except _UI_ERRORS:
            # NOTE(review): indentation was lost in this file; the two calls
            # after the retry are assumed to belong to this fallback.
            click(_CLOSE)
            click(_ARROW)
            s2.click()
            click(_ARROW)

        time.sleep(2)

        rule1 = r'<select class="section-compatibility__models select2-hidden-accessible" tabindex="-1" aria-hidden="true">(.*?)</select>'
        data = browser.page_source
        if '<select class="section-compatibility__models select2-hidden-accessible" tabindex="-1" aria-hidden="true">' not in data:
            # The models <select> is rendered with disabled="" in this case.
            rule1 = '<select class="section-compatibility__models select2-hidden-accessible" disabled="" tabindex="-1" aria-hidden="true">(.*?)</select>'
        bullish = re.findall(rule1, data, re.S)
        # print len(bullish),bullish
        if not bullish:
            print(len(bullish), bullish)
            lis.append([s3,'','','','',''])
            continue
        bullish = re.findall(_OPTION_RE, bullish[0], re.S)
        print(bullish)
        for j in range(len(bullish)):
            btn = browser.find_elements_by_class_name('select2-selection')
            try:
                btn[1].click()
            except _UI_ERRORS:
                try:
                    click(_CLOSE)
                    btn[1].click()
                except _UI_ERRORS:
                    # Second attempt at the same recovery.
                    click(_CLOSE)
                    btn[1].click()
            s4 = Select(browser.find_element_by_class_name('section-compatibility__models'))
            s5 = s4.options[j]
            s6 = s5.text
            print(s6)
            s5.click()
            try:
                btn[1].click()
            except _UI_ERRORS:
                try:
                    click(_CLOSE)
                    btn[1].click()
                except _UI_ERRORS:
                    # Could not toggle the dropdown: record what is shown and move on.
                    lis.append([s3, s6] + _read_message(browser))
                    continue
            lis.append([s3, s6] + _read_message(browser))

            try:
                click(_CLOSE)
            except _UI_ERRORS:
                # btn[1].click()
                try:
                    click(_CLOSE)
                except _UI_ERRORS:
                    try:
                        btn[1].click()
                        s5.click()
                        btn[1].click()
                        click(_CLOSE)
                    except _UI_ERRORS:
                        continue
    return lis


def openurl(num):
    """Scrape the finsix.com compatibility table.

    Args:
        num: unused; kept so existing callers keep working.

    Returns:
        A list of rows; the first row is the header
        ['product', 'version', 'title', 'status', 'tags', 'info'].
    """
    browser = webdriver.Chrome(executable_path=r"H:\chromedriver_win32\chromedriver.exe")
    try:
        return _scrape_compatibility(browser)
    finally:
        # The browser used to be left running after the scrape.
        browser.quit()


def zhizuo(lis):
    """Write the rows of *lis* into sheet 'data' and save the workbook.

    Returns the string 'success'.
    """
    book = Workbook(encoding='utf-8')
    table = book.add_sheet('data')
    for i, p in enumerate(lis):
        for j, q in enumerate(p):
            table.write(i, j, q)
    # NOTE(review): xlwt writes an Excel workbook, not CSV text, so the
    # '.csv' name is misleading; kept unchanged so consumers still find it.
    book.save( 'product_info.csv')
    return 'success'


url = 'https://www.xuangubao.cn/'  # not used by the code below
lis = openurl(3)
print(lis)
zhizuo(lis)
# f=open("F:\\text.txt","a")
# for key,values in dict.items():
# f.write((key+"\t"))
# print(key,values)
# f.close()

爬取finsix

破解滑动验证:

 http://www.cnblogs.com/fangjie0410/p/8269219.html

 

上一篇:js实现瀑布流加载图片效果


下一篇:css3多列布局瀑布流加载样式