Scraping 8684 bus routes with requests

Scrape part of the bus routes in Hubei.

Code example

import re

import requests  # install requests first if it is not already available
from lxml import etree
from bs4 import BeautifulSoup

# Base URL of the Yichang (Hubei) pages on 8684
base_url = 'https://yichang.8684.cn'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 Edg/91.0.864.64',
}
bus_list = []  # unused below; results are written straight to bus.txt instead


def first_url(text):
    # Parse the home page with XPath
    tree = etree.HTML(text)
    # Links to the route groups that start with a digit
    num_list = tree.xpath('/html/body/div[6]/div[2]/div[1]/div/a/@href')
    # Links to the route groups that start with a letter
    eng_list = tree.xpath('/html/body/div[6]/div[2]/div[2]/div/a/@href')
    return num_list + eng_list


def parse_a(alist):
    for al in alist:
        # URL of one bus route
        url = base_url + al['href']
        # Request the detail page of this route
        resp = requests.get(url=url, headers=headers)
        tree = etree.HTML(resp.text)
        # Route name
        xianlu_name = tree.xpath('/html/body/div[7]/div[1]/div[1]/div[2]/h1/text()')[0]
        print('Crawling ' + xianlu_name, url)
        # Operating hours
        start_time = tree.xpath('/html/body/div[7]/div[1]/div[1]/div[2]/ul/li[1]/text()')[0]
        ticket_price = tree.xpath('/html/body/div[7]/div[1]/div[1]/div[2]/ul/li[2]/text()')[0]
        # Pull the fare out with a regex; '免费乘车' (free ride) and '暂无' (not listed) are the literal texts used by the site
        ticket_price = 0 if ticket_price[-4:] == '免费乘车' or ticket_price[-2:] == '暂无' else re.search(r'\d+(?:\.\d+)?', ticket_price).group()
        # Stops on the outbound direction
        shangxing_xianlu = tree.xpath('/html/body/div[7]/div[1]/div[7]/ol/li/a/text()')
        # Stops on the return direction
        xiaxing_xianlu = tree.xpath('/html/body/div[7]/div[1]/div[9]/ol/li/a/text()')
        # Basic information of this route
        bus_dict = {
            'xianlu_name': xianlu_name,
            'start_time': start_time[5:],  # drop the '运行时间：' label in front
            'ticket_price': ticket_price,
            'shangxing_xianlu': shangxing_xianlu,
            'xiaxing_xianlu': xiaxing_xianlu,
            'shangxing_stop': len(shangxing_xianlu),
            'xiaxing_stop': len(xiaxing_xianlu)
        }
        # Append the record to a text file
        with open('bus.txt', 'a', encoding='utf8') as fp:
            fp.write(str(bus_dict) + '\n')
        print('Finished ' + xianlu_name)


def erji_url(totallist):
    # Build and request each second-level URL
    for urllist in totallist:
        # Page of one route group
        erji_req_url = base_url + urllist
        resp = requests.get(url=erji_req_url, headers=headers)
        # Parse with bs4 here (XPath would also work) just to practise different parsers
        soup = BeautifulSoup(resp.text, 'lxml')
        # All <a> tags of this group, e.g. every route whose number starts with 1
        a_list = soup.select(
            'body > div.layout.layout--728-250 > div.layout-left > div.cc-content > div.list.clearfix>a')
        parse_a(a_list)


def main():
    resp = requests.get(url=base_url, headers=headers)
    # First-level URLs (route groups)
    first_total_list = first_url(resp.text)
    # Request each second-level URL and parse the routes
    erji_url(first_total_list)


if __name__ == '__main__':
    main()
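
Every line of bus.txt is written as the str() of a Python dict, so the records can be read back with ast.literal_eval. A minimal sketch, assuming the file produced by the code above:

import ast

# Each line is str(dict); ast.literal_eval parses it back
# (json.loads would fail because of the single quotes).
with open('bus.txt', encoding='utf8') as fp:
    routes = [ast.literal_eval(line) for line in fp if line.strip()]

print(len(routes))
if routes:
    print(routes[0]['xianlu_name'], routes[0]['ticket_price'])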

Key points covered

1. HTTP requests with requests
2. Two parsing methods (a short sketch follows this list)
   xpath
   bs4
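
A minimal side-by-side sketch of the two parsing approaches; the HTML snippet and selectors below are illustrative only, not the real 8684 markup:

from lxml import etree
from bs4 import BeautifulSoup

# Illustrative snippet, not the real page
html = '<div class="list"><a href="/x_1">Route 1</a><a href="/x_2">Route 2</a></div>'

# XPath via lxml: select the href attribute of every <a> inside div.list
tree = etree.HTML(html)
print(tree.xpath('//div[@class="list"]/a/@href'))        # ['/x_1', '/x_2']

# bs4 with a CSS selector: same elements, attribute read per tag
soup = BeautifulSoup(html, 'lxml')
print([a['href'] for a in soup.select('div.list > a')])  # ['/x_1', '/x_2']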
