爬取湖北部分公交线路
代码示例
import re
import requests # 如果没有安装 requests，需要先执行 pip install requests
from lxml import etree
from bs4 import BeautifulSoup
# Base URL for the Yichang (Hubei province) bus-route pages on 8684.cn
base_url = 'https://yichang.8684.cn'
# Browser-style User-Agent header so the site serves normal pages instead of blocking the crawler
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 Edg/91.0.864.64',
}
# NOTE(review): bus_list is never referenced in the visible code — likely leftover
bus_list = []
def first_url(text):
    """Extract the first-level route-index links from the homepage HTML.

    The page groups route indexes into two sibling divs: div[1] holds links
    for routes whose names start with a digit, div[2] for those starting
    with a letter.  Returns both sets of hrefs as a single list.
    """
    doc = etree.HTML(text)
    # Routes grouped by leading digit
    digit_links = doc.xpath('/html/body/div[6]/div[2]/div[1]/div/a/@href')
    # Routes grouped by leading letter
    letter_links = doc.xpath('/html/body/div[6]/div[2]/div[2]/div/a/@href')
    return [*digit_links, *letter_links]
def parse_a(alist):
    """Fetch and persist the details of every bus route in *alist*.

    alist: iterable of <a> tags (bs4 Tag objects) whose 'href' attribute is a
    route-detail path relative to base_url.  For each route, the detail page
    is fetched, parsed with xpath, and the result is appended as one
    dict-literal line to bus.txt.
    """
    # Open the output file once for the whole batch instead of per route
    with open('bus.txt', 'a', encoding='utf8') as fp:
        for al in alist:
            # Absolute URL of this route's detail page
            url = base_url + al['href']
            resp = requests.get(url=url, headers=headers)
            tree = etree.HTML(resp.text)
            # Route name; skip pages that do not match the expected layout
            names = tree.xpath('/html/body/div[7]/div[1]/div[1]/div[2]/h1/text()')
            if not names:
                continue
            xianlu_name = names[0]
            print("正在爬取"+xianlu_name,url)
            # Operating hours (first <li>) and raw fare text (second <li>)
            start_time = tree.xpath('/html/body/div[7]/div[1]/div[1]/div[2]/ul/li[1]/text()')[0]
            ticket_price = tree.xpath('/html/body/div[7]/div[1]/div[1]/div[2]/ul/li[2]/text()')[0]
            # Fare is 0 for free routes / unknown; otherwise extract the number.
            # Use r'\d+' (not '(\d)') so multi-digit fares like 10 are kept whole,
            # and guard against re.search returning None.
            if ticket_price[-4:] == '免费乘车' or ticket_price[-2:] == '暂无':
                ticket_price = 0
            else:
                match = re.search(r'\d+', ticket_price)
                ticket_price = match.group() if match else 0
            # Stops on the up-bound and down-bound directions
            shangxing_xianlu = tree.xpath('/html/body/div[7]/div[1]/div[7]/ol/li/a/text()')
            xiaxing_xianlu = tree.xpath('/html/body/div[7]/div[1]/div[9]/ol/li/a/text()')
            # Collected record for this route
            bus_dict = {
                'xianlu_name': xianlu_name,
                'start_time': start_time[5:],  # drop the leading label prefix
                'ticket_price': ticket_price,
                'shangxing_xianlu': shangxing_xianlu,
                'xiaxing_xianlu': xiaxing_xianlu,
                'shangxing_stop': len(shangxing_xianlu),
                'xiaxing_stop': len(xiaxing_xianlu),
            }
            fp.write(str(bus_dict) + '\n')
            print('结束爬取'+xianlu_name)
def erji_url(totallist):
    """For each first-level index path, fetch its page and crawl every route on it.

    totallist: iterable of relative paths returned by first_url().
    """
    for path in totallist:
        # Second-level page: all routes sharing one leading digit/letter
        resp = requests.get(url=base_url + path, headers=headers)
        # bs4 is used here (rather than xpath) purely to practise both parsers
        soup = BeautifulSoup(resp.text, 'lxml')
        # All <a> tags for one group of routes, e.g. every line starting with "1"
        route_links = soup.select(
            'body > div.layout.layout--728-250 > div.layout-left > div.cc-content > div.list.clearfix>a')
        parse_a(route_links)
def main():
    """Entry point: fetch the homepage, gather index links, then crawl every route."""
    homepage = requests.get(url=base_url, headers=headers)
    # First-level index links (routes grouped by leading digit/letter)
    index_links = first_url(homepage.text)
    # Visit each second-level page and crawl the routes it lists
    erji_url(index_links)


if __name__ == '__main__':
    main()
所涉及的知识点：
1.requests请求
2.两种解析方法
xpath
bs4