Scraping JD.com product data with Selenium explicit waits

 

1. Simulating user actions to fetch the data
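
Before walking through the full script, here is a minimal, self-contained sketch of the explicit-wait pattern everything below is built on: WebDriverWait repeatedly polls the page until an expected condition is satisfied (or raises TimeoutException when the timeout expires), which is far more robust than fixed time.sleep() pauses. The #key selector is JD's search box, the same one the script targets.

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

browser = webdriver.Chrome()
browser.get('https://www.jd.com/')
# Block for up to 10 seconds until the search box enters the DOM;
# raises TimeoutException if it never appears
search_box = WebDriverWait(browser, 10).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, '#key'))
)
search_box.send_keys('python')
browser.quit()

The full script follows.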

 

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2021/12/2 21:10
# @Author : Lhtester
# @Site : 
# @File : 爬取京东商品.py
# @Software: PyCharm

import time
import pymongo
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from lxml import etree

browser = webdriver.Chrome()
wait = WebDriverWait(browser, 50)
# The host is left empty here; point it at your MongoDB instance.
# The object bound below is the 'jd' collection inside the 'mydb' database.
collection = pymongo.MongoClient(host='', port=27017, username="root", password="123456")["mydb"]['jd']

def search():
    browser.get('https://www.jd.com/')
    try:
        # Wait until at least one matching element is present in the DOM; returns a list
        input_text = wait.until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#key"))
        )
        # Wait until the element is visible and enabled, i.e. clickable
        submit = wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "#search > div > div.form > button"))
        )
        input_text[0].send_keys('python')
        submit.click()
        # Read the total page count from the pager at the bottom of the results page
        total = wait.until(
            EC.presence_of_all_elements_located(
                (By.CSS_SELECTOR, "#J_bottomPage > span.p-skip > em > b")
            )
        )
        print(total[0].text)
        html = browser.page_source
        parse_html(html)
        return total[0].text
    except TimeoutException:
        # Retry from the start if the page did not load in time
        return search()

def next_page(page_number):
    try:
        # Scroll to the bottom so the remaining 30 lazy-loaded items render
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(10)
        # Click the next-page button
        button = wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "#J_bottomPage > span.p-num > a.pn-next > em"))
        )
        button.click()
        # Wait until all 60 items on the new page are present
        wait.until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#J_goodsList > ul > li:nth-child(60)"))
        )
        # Confirm the page change succeeded: the pager highlights the current page number
        wait.until(
            EC.text_to_be_present_in_element((By.CSS_SELECTOR, "#J_bottomPage > span.p-num > a.curr"), str(page_number))
        )
        html = browser.page_source
        parse_html(html)
    except TimeoutException:
        return next_page(page_number)

def parse_html(html):
    print('parsing page')
    html = etree.HTML(html)
    items = html.xpath('//li[@class="gl-item"]')
    for i in range(len(items)):
        item = {}
        # JD lazy-loads images: before loading, the real URL sits in the img's
        # data-lazy-img attribute; once loaded, it becomes "done" and src holds the URL
        if html.xpath('//div[@class="p-img"]//img')[i].get('data-lazy-img') != "done":
            img = html.xpath('//div[@class="p-img"]//img')[i].get('data-lazy-img')
        else:
            img = html.xpath('//div[@class="p-img"]//img')[i].get('src')
        # Image URL
        item["img"] = img
        # Title
        item["title"] = html.xpath('//div[@class="p-name"]//em')[i].xpath('string(.)')
        # Price
        item["price"] = html.xpath('//div[@class="p-price"]//i')[i].text
        # Comment count
        item["commit"] = html.xpath('//div[@class="p-commit"]//a')[i].text

        save(item)

def save(item):
    try:
        collection.insert_one(item)
        print('inserted successfully')
    except Exception:
        print("failed to save {} to MongoDB".format(str(item)))

def main():
    print("Page", 1, ":")
    total = int(search())
    for i in range(2, total + 1):
        time.sleep(3)
        print("Page", i, ":")
        next_page(i)

if __name__ == "__main__":
    main()
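
After a run completes, it is worth spot-checking what actually landed in MongoDB. A minimal sketch, assuming the database is reachable on localhost with the same credentials and database/collection names as in the script above:

from pymongo import MongoClient

# 'localhost' is an assumption; the script above leaves host='' as a placeholder
client = MongoClient(host='localhost', port=27017, username="root", password="123456")
coll = client["mydb"]["jd"]
print(coll.count_documents({}))       # total number of stored items
for doc in coll.find().limit(3):      # peek at a few records
    print(doc["title"], doc["price"])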

The output looks like this:

[screenshot of the crawl results]
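
If you would rather run the crawl without a visible browser window, Chrome's headless mode is a drop-in replacement for the plain webdriver.Chrome() call at the top of the script. A sketch (not part of the original script; the window-size flag keeps the lazy-load scrolling working in a realistic viewport):

from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument('--headless')               # no visible browser window
options.add_argument('--window-size=1920,1080')  # give scrolling a real viewport
browser = webdriver.Chrome(options=options)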

 
