十一、学习分布式爬虫的第十一天

selenium实战12306购票

import csv
import time

from selenium import webdriver
from selenium.common.exceptions import (
    ElementNotInteractableException,
    ElementNotVisibleException,
    NoSuchElementException,
    StaleElementReferenceException,
)
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait

# Why is the driver created at module level instead of inside the class?
# Author's rationale: a driver owned by the class would be released together
# with it once the class has finished running, taking the browser with it.
# The path is a raw string so the backslashes are never read as escapes.
# NOTE(review): `executable_path` is the Selenium 3 way of locating the
# chromedriver binary -- confirm against the installed Selenium version.
driver = webdriver.Chrome(executable_path=r"D:\chromedriver\chromedriver.exe")

# Seat-type codes used in the ``trains`` argument:
# 9: business class, M: first class, O: second class,
# 3: hard sleeper, 4: soft sleeper, 1: hard seat
class TrainSpider(object):
    """Drive a browser through the 12306 booking flow.

    The flow is: wait for a manual login, query the remaining tickets,
    click the booking button of the first wanted train that has a wanted
    seat, tick the passengers, pick the seat type and submit the order.

    All browser access goes through the module-level ``driver``.
    """

    # Class-level variables can be read through ``self``.
    login_url = "https://kyfw.12306.cn/otn/resources/login.html"  # login page
    personal_url = "https://kyfw.12306.cn/otn/view/index.html"  # personal-centre page
    left_ticket_url = "https://kyfw.12306.cn/otn/leftTicket/init?linktypeid=dc"  # ticket query page
    confirm_passenger_url = "https://kyfw.12306.cn/otn/confirmPassenger/initDc"  # passenger confirmation page
    # Station name -> station code. Kept on the class for backward
    # compatibility; every instance fills its own dict (see __init__).
    stations_codes = {}
    # Seat code -> index of its remaining-ticket cell in the split row text.
    # Only the two codes the original checks are mapped; any other code is
    # skipped while searching.
    # TODO(review): add the indexes for "O", "3", "4" and "1" once verified
    # against the live result table.
    seat_columns = {"9": 7, "M": 8}
    # Seconds to pause between two query rounds when nothing was bookable.
    retry_interval = 1

    def __init__(self,from_station,to_station,train_date,trains,passengers):
        """
        :param from_station: departure station name
        :param to_station: destination station name
        :param train_date: departure date
        :param trains: wanted trains, e.g. {"G529":["M","O"]} means first or
            second class on train G529 is acceptable
        :param passengers: list of passenger names
        """
        self.from_station = from_station
        self.to_station = to_station
        self.train_date = train_date
        # Per-instance mapping so instances never mutate shared class state.
        self.stations_codes = {}
        self.init_station_code()  # load the station codes as soon as the object exists
        self.trains = trains
        self.passengers = passengers
        self.select_number = None  # train number that was picked

    def init_station_code(self):
        """Load the ``name`` -> ``code`` mapping from ``stations.csv``.

        The file is opened relative to the current working directory and
        must have ``name`` and ``code`` columns.
        """
        with open('stations.csv', 'r', encoding='utf-8') as f:
            for line in csv.DictReader(f):
                self.stations_codes[line["name"]] = line["code"]

    def login(self):
        """Open the login page and block until the personal page is reached."""
        driver.get(self.login_url)
        # Login is treated as successful once the URL becomes the
        # personal-centre URL.
        WebDriverWait(driver,100).until(
            EC.url_to_be(self.personal_url)
        )
        print("登录成功!")

    @staticmethod
    def _set_input_value(element_id, value):
        """Set the ``value`` of the input with the given id via JavaScript."""
        element = driver.find_element(By.ID, element_id)
        # Pass the value as a script argument instead of formatting it into
        # the JavaScript source, so quotes in the value cannot break it.
        driver.execute_script("arguments[0].value = arguments[1];", element, value)

    @classmethod
    def _pick_seat(cls, infos, seat_types):
        """Return the first seat code in ``seat_types`` with tickets left.

        :param infos: cells of one result row (row text split on whitespace)
        :param seat_types: acceptable seat codes, in order of preference
        :return: the matching seat code, or None when none is available
        """
        for seat_type in seat_types:
            column = cls.seat_columns.get(seat_type)
            if column is None or column >= len(infos):
                continue  # unmapped seat code or truncated row
            count = infos[column]
            # A cell is bookable when it shows a number or the marker '有'.
            if count.isdigit() or count == '有':
                return seat_type
        return None

    def _book_first_match(self):
        """Click the booking button of the first bookable wanted train.

        :return: True when a booking button was clicked, otherwise False
        """
        train_trs = driver.find_elements(
            By.XPATH, "//tbody[@id='queryLeftTable']/tr[not(@datatran)]"
        )
        for train_tr in train_trs:
            infos = train_tr.text.replace('\n', " ").split(" ")  # cells of this row
            number = infos[0]  # train number
            if number not in self.trains:
                continue
            if self._pick_seat(infos, self.trains[number]) is None:
                continue
            self.select_number = number  # remember the picked train
            # The leading "." keeps the lookup inside this row; a bare "//"
            # would search the whole document and hit the first button.
            train_tr.find_element(By.XPATH, ".//a[@class='btn72']").click()
            return True
        return False

    # Search the remaining tickets
    def search_left_ticket(self):
        """Query the remaining tickets until a wanted train can be booked.

        Fills the query form, then repeats "query, scan the rows" until a
        booking button has been clicked. ``select_number`` holds the chosen
        train afterwards.

        :raises KeyError: when a station name is not in ``stations_codes``
        """
        driver.get(self.left_ticket_url)
        self._set_input_value("fromStation", self.stations_codes[self.from_station])
        self._set_input_value("toStation", self.stations_codes[self.to_station])
        self._set_input_value("train_date", self.train_date)

        while True:
            # Run the query again on every round; scanning the same rows
            # over and over could never find newly released tickets.
            driver.find_element(By.ID, "query_ticket").click()
            WebDriverWait(driver,100).until(
                EC.presence_of_element_located((By.XPATH,"//tbody[@id='queryLeftTable']/tr"))
            )
            try:
                if self._book_first_match():
                    return
            except StaleElementReferenceException:
                # The table was redrawn while it was being read; query again.
                pass
            time.sleep(self.retry_interval)

    def confirm_passengers(self):
        """Tick the passengers, choose the seat type and submit the order."""
        # After the booking click, wait for the confirmation page to load.
        WebDriverWait(driver,100).until(
            EC.url_to_be(self.confirm_passenger_url)
        )
        # Wait until the passenger labels are present.
        WebDriverWait(driver,100).until(
            EC.presence_of_element_located((By.XPATH,"//ul[@id='normal_passenger_id']/li/label"))
        )
        # Tick every passenger that should get a ticket.
        passenger_labels = driver.find_elements(By.XPATH, "//ul[@id='normal_passenger_id']/li/label")
        for passenger_label in passenger_labels:
            if passenger_label.text in self.passengers:
                passenger_label.click()

        # Choose the seat type: the first acceptable code the dropdown offers.
        seat_select = Select(driver.find_element(By.ID, "seatType_1"))
        for seat_type in self.trains[self.select_number]:
            try:
                seat_select.select_by_value(seat_type)  # raises when the option is missing
            except NoSuchElementException:
                continue
            else:  # no exception: this seat type is offered
                break

        # Wait until the submit-order button can be clicked.
        WebDriverWait(driver,100).until(
            EC.element_to_be_clickable((By.ID,"submitOrder_id"))
        )
        driver.find_element(By.ID, "submitOrder_id").click()

        # Wait for the modal dialog and for its confirm button to be clickable.
        WebDriverWait(driver,100).until(
            EC.presence_of_element_located((By.CLASS_NAME,"w664"))
        )
        WebDriverWait(driver,100).until(
            EC.element_to_be_clickable((By.ID,"qr_submit_id"))
        )
        sub_btn = driver.find_element(By.ID, "qr_submit_id")
        # Keep clicking the confirm button until it can no longer be used:
        # hidden, not interactable, removed, or detached by a page change.
        while sub_btn:
            try:
                sub_btn.click()
                sub_btn = driver.find_element(By.ID, "qr_submit_id")
            except (ElementNotVisibleException, ElementNotInteractableException,
                    NoSuchElementException, StaleElementReferenceException):
                break
        print("恭喜抢票成功!")

    def run(self):
        """Run the whole flow: login, ticket search, order confirmation."""
        # 1. Log in
        self.login()
        # 2. Query the remaining tickets
        self.search_left_ticket()
        # 3. Confirm passengers and seat
        self.confirm_passengers()

def main():
    """Build the spider for one hard-coded trip and run the booking flow."""
    wanted_trains = {"G71": ["9", "M"]}  # accept business or first class on G71
    passenger_names = ["吴育理"]
    spider = TrainSpider(
        from_station="北京",
        to_station="长沙",
        train_date="2020-02-20",
        trains=wanted_trains,
        passengers=passenger_names,
    )
    spider.run()


if __name__ == "__main__":
    main()

破解js实战

首先要找到网站发送请求时所用的 JS 文件,将其美化(格式化)后进行分析;弄清各个请求参数的生成算法之后,再用代码模拟这些算法来构造请求。

import requests
import time
import random
import hashlib

def build_translate_form(word, timestamp_ms, nonce):
    """Build the signed form fields for one translate request.

    :param word: text to translate
    :param timestamp_ms: current time as whole milliseconds
    :param nonce: single random digit (0-9) appended to the timestamp
    :return: dict of form fields, including ``salt``, ``sign`` and ``ts``
    """
    ts = str(int(timestamp_ms))  # whole milliseconds, no fractional part
    salt = ts + str(nonce)
    # sign = md5(client id + word + salt + fixed secret)
    raw = "fanyideskweb" + word + salt + "Nw(nmmbP%A-r6U3EUn]Aj"
    sign = hashlib.md5(raw.encode("utf-8")).hexdigest()
    return {
        'i': word,
        'from': 'AUTO',
        'to': 'AUTO',
        'smartresult': 'dict',
        'client': 'fanyideskweb',
        'salt': salt,
        'sign': sign,
        'ts': ts,
        'bv': '35242348db4225f3512aa00c2f3e7826',
        'doctype': 'json',
        'version': '2.1',
        'keyfrom': 'fanyi.web',
        # Spelling kept byte-for-byte from the original request data;
        # do not "correct" it.
        'action': 'FY_BY_REALTlME',
    }


def main():
    """Read a word from stdin, post it to the translate endpoint, print the result."""
    word = input("请输入需要翻译的单词:")
    url = 'http://fanyi.youdao.com/translate_o?smartresult=dict&smartresult=rule'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36',
        'Referer': 'http://fanyi.youdao.com/',
        'cookie':'DICT_UGC=be3af0da19b5c5e6aa4e17bd8d90b28a|; OUTFOX_SEARCH_USER_ID=-1055718841@150.255.48.175; JSESSIONID=abc3xMazG4OPfa9RueCbx; OUTFOX_SEARCH_USER_ID_NCOO=1135975571.425469; _ntes_nnid=484c695ac7e7a8ec4e01562558fa2f10,1582092612397; ___rl__test__cookies=1582092758715',
        "Host": "fanyi.youdao.com",
        "Origin":"http://fanyi.youdao.com",
    }
    # Integer milliseconds plus ONE random digit: randint is inclusive on
    # both ends, so the upper bound has to be 9, not 10.
    data = build_translate_form(word, int(time.time() * 1000), random.randint(0, 9))
    # A timeout keeps the script from hanging on a dead connection.
    response = requests.post(url, headers=headers, data=data, timeout=10)
    response.raise_for_status()
    result = response.json()  # parse the JSON body of the response
    try:
        print(result['translateResult'][0][0]['tgt'])
    except (KeyError, IndexError, TypeError):
        # The reply has no translation; show the raw payload instead of
        # failing with a bare lookup error.
        print("翻译失败:", result)

if __name__ == '__main__':
    main()
十一、学习分布式爬虫的第十一天十一、学习分布式爬虫的第十一天 Mr_Little_li 发布了12 篇原创文章 · 获赞 0 · 访问量 671 私信 关注
上一篇:leetcode--Gas Station问题


下一篇:Station M2极客主机