使用 Selenium 框架爬取携程酒店数据

本文代码全部原创，截至 2021.1.29 可用。爬取到的数据直接写入数据库，数据表的设计方式看代码即可了解。有帮助的话请点个赞，谢谢。

from selenium import webdriver
from time import sleep
import xlwt  # 进行excel操作
import pandas as pd
from sqlalchemy import create_engine


# Scrape a fixed slice of the Ctrip hotel search results with Selenium and
# append the rows to the MySQL table `cityhotel`.
#
# Compatibility notes:
#   * `find_element("xpath", ...)` is used instead of `find_element_by_xpath`,
#     which was removed in Selenium 4.3; the generic form works on Selenium 3
#     and 4 alike ("xpath" is the value of `By.XPATH`).
#   * Rows are collected as dicts and turned into a DataFrame once, because
#     `DataFrame.append` was removed in pandas 2.0 (and copied the whole frame
#     on every call).
#   * The browser session is always closed, even when a lookup fails.

# Search-results page; the query string selects the city (display=东方, 海南,
# 中国) and the check-in / check-out dates.
SEARCH_URL = "https://hotels.ctrip.com/hotels/list?countryId=1&city=48&checkin=2021/02/22&checkout=2021/02/23&optionId=48&optionType=City&directSearch=0&display=%E4%B8%9C%E6%96%B9%2C%20%E6%B5%B7%E5%8D%97%2C%20%E4%B8%AD%E5%9B%BD&crn=1&adult=1&children=0&searchBoxArg=t&travelPurpose=0&ctm_ref=ix_sb_dl&domestic=1&"

# Column order of the resulting table.
RESULT_COLUMNS = ['hotel', 'price', 'address', 'score', 'number', 'image']

# Positions (1-based <li> indexes) of the result entries that are read.
# NOTE(review): presumably the first three <li> entries are not regular hotel
# cards, which is why the range starts at 4 - confirm against the live page.
ITEM_INDEXES = range(4, 14)

# XPath of the card container inside the index-th result entry; the per-field
# suffixes below are relative to it.
CARD_XPATH = "//*[@id='ibu_hotel_container']/div/section/div[2]/ul/li[{index}]/div/div/div"
NAME_SUFFIX = "/div[1]/div[2]/div[1]/div/span[1]"
PRICE_SUFFIX = "/div[2]/div[2]/div[1]/p/span"
SCORE_SUFFIX = "/div[2]/div[1]/div/div[2]/span"
REVIEW_COUNT_SUFFIX = "/div[2]/div[1]/div/div[1]/p[2]/a"
IMAGE_SUFFIX = "/div[1]/div[1]/div/div/div"

# Address scraping is disabled; every row is tagged with the city that the
# search URL asks for.
CITY_NAME = "东方"


def _text_of(driver, xpath):
    """Return the `textContent` attribute of the element matched by *xpath*."""
    return driver.find_element("xpath", xpath).get_attribute("textContent")


def _image_url(style):
    """Build an https URL from an element's inline `style` attribute value.

    Trailing `"`, `)` and `;` characters are stripped, and whatever follows
    the last `//` is prefixed with `https://`.
    """
    return "https://" + style.strip('");').split("//")[-1]


def _scrape_item(driver, index):
    """Read one result entry and return it as a dict keyed by RESULT_COLUMNS.

    Raises whatever Selenium raises when one of the expected elements is
    missing from the entry.
    """
    card = CARD_XPATH.format(index=index)
    hotel = _text_of(driver, card + NAME_SUFFIX).replace('\n', '').replace('\t', '')
    price = _text_of(driver, card + PRICE_SUFFIX)
    score = _text_of(driver, card + SCORE_SUFFIX)
    review_count = _text_of(driver, card + REVIEW_COUNT_SUFFIX)
    # The picture is carried in the element's inline style, not in an <img> tag.
    style = driver.find_element("xpath", card + IMAGE_SUFFIX).get_attribute("style")
    return {
        'hotel': hotel,
        'price': price,
        'address': CITY_NAME,
        'score': score,
        'number': review_count,
        'image': _image_url(style),
    }


driver = webdriver.Chrome()
try:
    driver.get(SEARCH_URL)
    driver.maximize_window()
    # implicitly_wait() does NOT pause the script: it only makes every
    # find_element call poll for up to 30 s before failing. The original
    # author relied on that window to close the login dialog by hand and to
    # scroll down until the blue "view more" button appears, so that the
    # entries read below exist in the page.
    driver.implicitly_wait(30)

    rows = []
    for j in ITEM_INDEXES:
        rows.append(_scrape_item(driver, j))
        sleep(3)  # pause between entries
finally:
    # Always release the browser and the chromedriver process.
    driver.quit()

newData = pd.DataFrame(rows, columns=RESULT_COLUMNS)

data = newData
# NOTE(review): database credentials are hard-coded in the connection URL;
# consider reading them from the environment or a config file.
engine = create_engine('mysql+pymysql://root:root@localhost:3306/myblog')

# The DataFrame index is written as well (to_sql default), so the table gets an
# `index` column in addition to RESULT_COLUMNS.
data.to_sql('cityhotel', engine, if_exists='append')

print("爬取完毕!")

上一篇:Golang XML 序列化忽略父级元素


下一篇:浅谈java开闭原则