# SHOW ME THE CODE!!!
# 首先进行网页分析,具体操作:省略。
# -*- coding: utf-8 -*-
"""
Created on Fri Dec 10 16:25:59 2021
@author: Hider
"""
# 爬虫学习:8684公交路线
# 网站:https://www.8684.cn/
# 公交站点、地铁站点、违章、资讯等等数据
'''
--------- 网页分析 ----------
广州公交:https://guangzhou.8684.cn/
div class="bus-layer depth w120"
第3个 div class="pl10"
市区编码线路:https://guangzhou.8684.cn/line1
div class="list clearfix"
a标签 href title
广州1路公交车路线:https://guangzhou.8684.cn/x_322e21c5
'''
# 上代码!!!
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import random
import time
def get_ua() -> str:
    """Return a User-Agent header value chosen at random from a fixed pool.

    Every call draws independently via ``random.choice``, so consecutive
    requests may or may not present the same browser signature.

    Returns:
        One of the hard-coded User-Agent strings below, unmodified.
    """
    # The strings are kept byte-for-byte (including the trailing spaces on
    # two of them) so the values sent on the wire do not change.
    user_agents = (
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60',
        'Opera/8.0 (Windows NT 5.1; U; en)',
        'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
        'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2 ',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0) ',
    )
    return random.choice(user_agents)
# Step 1: download the Guangzhou landing page and collect the line
# categories it links to. The result is `dict_result`, a mapping of
# category label -> absolute category URL, consumed by the next step.
url = 'https://guangzhou.8684.cn/'
response = requests.get(url=url, headers={'User-Agent': get_ua()}, timeout=10)
# Surface HTTP errors (403/5xx, ...) here instead of failing later with an
# opaque AttributeError while parsing an error page.
response.raise_for_status()

# Parse the page and locate the navigation layer.
soup = BeautifulSoup(response.text, 'lxml')
soup_bus_layer = soup.find('div', class_='bus-layer depth w120')
if soup_bus_layer is None:
    raise RuntimeError(f'bus-layer container not found on {url}')

# Walk the sections of the layer; only sections whose title contains the
# category marker string below contribute links.
dict_result = {}
soup_bus_list = soup_bus_layer.find_all('div', class_='pl10')
for soup_bus in soup_bus_list:
    title_span = soup_bus.find('span', class_='kt')
    if title_span is None or '线路分类' not in title_span.get_text():
        continue
    soup_a_list = soup_bus.find('div', class_='list')
    if soup_a_list is None:
        continue
    for soup_a in soup_a_list.find_all('a'):
        href = soup_a.get('href')
        if not href:
            # An anchor without a target cannot be followed.
            continue
        dict_result[soup_a.get_text()] = 'https://guangzhou.8684.cn' + href
print(dict_result)
# Step 2: visit every category page and collect the bus lines it lists.
# Each entry appended to `bus` is
# [category label, category URL, link title, link text, absolute line URL].
bus = []
for key, value in dict_result.items():
    print('Key is:', key)
    print('Value is:', value)
    response = requests.get(url=value, headers={'User-Agent': get_ua()}, timeout=10)
    # Stop on HTTP errors rather than parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')
    # Container holding the links to the individual lines of this category.
    soup_bus_list = soup.find('div', class_='list clearfix')
    if soup_bus_list is None:
        # No line list on this page: report it and move on instead of
        # failing with an AttributeError.
        print('No line list found at:', value)
        continue
    for soup_a in soup_bus_list.find_all('a'):
        href = soup_a.get('href')
        if not href:
            continue
        text = soup_a.get_text()
        title = soup_a.get('title')
        bus.append([key, value, title, text, 'https://guangzhou.8684.cn' + href])
# Step 3: visit every line page and collect its details and stations.
# Each row appended to `final_bus_result` is
# [category, category URL, line title, line text, line URL, running hours,
#  fare, operating company, last update, {station text: station URL}].
final_bus_result = []
for index, i in enumerate(bus, start=1):
    print(f'正在爬取{i[2]}...')
    if index % 100 == 0:
        # Pause for a random 5-10 s after every 100 pages.
        print('休息一下吧!~ZzzZ~ ')
        time.sleep(random.randint(5, 10))
        # NOTE(review): the original indentation was lost; this progress
        # print is assumed to belong to the pause branch - confirm.
        print(index)
    url = i[4]
    response = requests.get(url=url, headers={'User-Agent': get_ua()}, timeout=10)
    # Stop on HTTP errors rather than parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')

    # Description list: the code reads its first four <li> items as running
    # hours, fare, company and last update, in that order.
    soup_bus_run = soup.find('ul', class_='bus-desc')
    desc_items = soup_bus_run.find_all('li') if soup_bus_run is not None else []
    # First station list on the page.
    soup_bus_station = soup.find('div', class_='bus-lzlist mb15')
    if len(desc_items) < 4 or soup_bus_station is None:
        # Unexpected page layout: report and skip this line so one bad page
        # does not discard everything collected so far.
        print(f'跳过{i[2]}:页面结构异常')
        continue

    bus_run_time = desc_items[0].get_text()
    bus_price = desc_items[1].get_text()
    # The company is taken from the link text when the item contains a
    # link, otherwise from the text of the whole item.
    company_link = desc_items[2].find('a')
    if company_link is not None:
        bus_company = company_link.get_text()
    else:
        bus_company = desc_items[2].get_text()
    # Original author's note: this could be refined to keep only the text
    # and drop the nested div.
    bus_update_time = desc_items[3].get_text()

    # Station text -> absolute station URL.
    bus_station = {}
    for soup_bus in soup_bus_station.find_all('li'):
        station_link = soup_bus.find('a')
        station_href = station_link.get('href') if station_link is not None else None
        if not station_href:
            # A station entry without a link target cannot be recorded.
            continue
        bus_station[soup_bus.get_text()] = 'https://guangzhou.8684.cn' + station_href

    final_bus_result.append([i[0], i[1], i[2], i[3], url, bus_run_time, bus_price, bus_company, bus_update_time, bus_station])
# Step 4: tabulate the collected rows and export them as CSV.
# Naming the columns in the constructor (instead of renaming positional
# columns afterwards) keeps the header row even when no rows were collected.
df = pd.DataFrame(
    final_bus_result,
    columns=[
        '线路分类',
        '线路分类网址',
        '线路',
        '线路名称',
        '线路网址',
        '运行时间',
        '参考票价',
        '公交公司',
        '最后更新',
        '站点信息',
    ],
)
# 'utf-8-sig' writes a BOM at the start of the file; index=False omits the
# DataFrame row index from the output.
df.to_csv(r'C:\Users\Hider\Desktop\bus.csv', index=False, encoding='utf-8-sig')
# 参考链接:手把手教学,正式开始!