爬虫学习笔记:8684公交路线

SHOW ME THE CODE!!!

首先进行网页分析,具体操作:省略。

# -*- coding: utf-8 -*-
"""
Created on Fri Dec 10 16:25:59 2021
@author: Hider
"""

# 爬虫学习:8684公交路线
# 网站:https://www.8684.cn/
# 公交站点、地铁站点、违章、资讯等等数据

'''
--------- 网页分析 ----------
广州公交:https://guangzhou.8684.cn/
div class="bus-layer depth w120"
第3个 div class="pl10"

市区编码线路:https://guangzhou.8684.cn/line1
div class="list clearfix"
a标签 href title

广州1路公交车路线:https://guangzhou.8684.cn/x_322e21c5
'''

上代码!!!

import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import random
import time

def get_ua():
    """Return one User-Agent string drawn at random from a fixed pool.

    The pool holds 17 desktop browser identifiers (Chrome, Firefox, Opera,
    Safari and IE-family variants). The request code in this file sends the
    returned value as the ``User-Agent`` header.
    """
    pool = [
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60',
        'Opera/8.0 (Windows NT 5.1; U; en)',
        'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
        'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2 ',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0) ',
    ]
    # One uniform draw per call; the pool itself is never mutated.
    return random.choice(pool)

# Fetch the Guangzhou landing page with a random User-Agent and a 10 s timeout.
url = 'https://guangzhou.8684.cn/'
response = requests.get(url=url, headers={'User-Agent': get_ua()}, timeout=10)
# Fail fast on an HTTP error instead of parsing an error page and crashing
# later with an unrelated AttributeError on None.
response.raise_for_status()

# Parse the page and locate the navigation panel that holds the line groups.
soup = BeautifulSoup(response.text, 'lxml')
soup_bus_layer = soup.find('div', class_='bus-layer depth w120')
if soup_bus_layer is None:
    raise RuntimeError(f'navigation panel "bus-layer depth w120" not found on {url}')

# Build {category label: absolute category URL} from the section whose
# heading contains the text "线路分类".
dict_result = {}
soup_bus_list = soup_bus_layer.find_all('div', class_='pl10')
for soup_bus in soup_bus_list:
    heading = soup_bus.find('span', class_='kt')
    # Sections without a heading span, or with a different heading, are ignored.
    if heading is None or '线路分类' not in heading.get_text():
        continue
    soup_a_list = soup_bus.find('div', class_='list')
    if soup_a_list is None:
        continue
    for soup_a in soup_a_list.find_all('a'):
        href = soup_a.get('href')
        if not href:
            # An anchor without a target cannot be turned into a URL.
            continue
        # Links are concatenated onto the site root, so href is expected to
        # be site-relative (e.g. "/line1").
        dict_result[soup_a.get_text()] = 'https://guangzhou.8684.cn' + href

print(dict_result)

# Walk every category page and collect one row per bus line:
# [category label, category URL, anchor title, anchor text, line URL].
bus = []

for key, value in dict_result.items():
    print('Key is:', key)
    print('Value is:', value)
    try:
        response = requests.get(url=value, headers={'User-Agent': get_ua()}, timeout=10)
        response.raise_for_status()
    except requests.RequestException as exc:
        # A timeout or HTTP error on one category should not abort the
        # whole crawl; report it and move on.
        print(f'Skipping category {key}: request failed ({exc})')
        continue

    # Parse the category page and locate the container listing its lines.
    soup = BeautifulSoup(response.text, 'lxml')
    soup_bus_list = soup.find('div', class_='list clearfix')
    if soup_bus_list is None:
        print(f'Skipping category {key}: line list not found on page')
        continue

    for soup_a in soup_bus_list.find_all('a'):
        href = soup_a.get('href')
        if not href:
            # An anchor without a target cannot be turned into a URL.
            continue
        text = soup_a.get_text()
        title = soup_a.get('title')
        bus.append([key, value, title, text, 'https://guangzhou.8684.cn' + href])

# Per-line details: operating hours, fare, operator, last update and stations.
final_bus_result = []
# Visit every collected line; index is the 1-based progress counter.
for index, i in enumerate(bus, start=1):
    category, category_url, line_title, line_text, url = i
    print(f'正在爬取{line_title}...')
    if index % 100 == 0:
        print('休息一下吧!~ZzzZ~ ')
        # Pause a random 5-10 s after every 100 lines.
        time.sleep(random.randint(5, 10))
    print(index)

    try:
        response = requests.get(url=url, headers={'User-Agent': get_ua()}, timeout=10)
        response.raise_for_status()
    except requests.RequestException as exc:
        # One failed page must not discard the rows already collected.
        print(f'Skipping {url}: request failed ({exc})')
        continue

    # Parse the page; look up the description list and station blocks once.
    soup = BeautifulSoup(response.text, 'lxml')
    soup_bus_run = soup.find('ul', class_='bus-desc')
    desc_items = soup_bus_run.find_all('li') if soup_bus_run is not None else []
    station_blocks = soup.find_all('div', class_='bus-lzlist mb15')
    if len(desc_items) < 4 or not station_blocks:
        print(f'Skipping {url}: unexpected page layout')
        continue

    # Operating hours
    bus_run_time = desc_items[0].get_text()
    # Reference fare
    bus_price = desc_items[1].get_text()
    # Operator: prefer the link text, fall back to the whole item's text
    # when the item carries no <a>.
    company_link = desc_items[2].find('a')
    bus_company = (company_link if company_link is not None else desc_items[2]).get_text()
    # Last update. NOTE: get_text() also returns the text of nested elements;
    # the original author flagged that this could be trimmed to the item's
    # own content.
    bus_update_time = desc_items[3].get_text()

    # Station name -> station URL, taken from the first station block only.
    # NOTE: names are dict keys, so a name that occurs twice on one line
    # keeps only its last URL.
    bus_station = {}
    for soup_bus in station_blocks[0].find_all('li'):
        station_link = soup_bus.find('a')
        href = station_link.get('href') if station_link is not None else None
        if not href:
            # Items without a usable link are left out of the mapping.
            continue
        bus_station[soup_bus.get_text()] = 'https://guangzhou.8684.cn' + href

    final_bus_result.append([
        category, category_url, line_title, line_text, url,
        bus_run_time, bus_price, bus_company, bus_update_time, bus_station,
    ])



# Positional columns 0..9 of each collected row, in order.
column_names = [
    '线路分类', '线路分类网址', '线路', '线路名称', '线路网址',
    '运行时间', '参考票价', '公交公司', '最后更新', '站点信息',
]
df = pd.DataFrame(final_bus_result)
# Map the default integer labels to the names above.
df = df.rename(columns=dict(enumerate(column_names)))

# Write without the index column; the utf-8-sig codec prepends a BOM.
df.to_csv(r'C:\Users\Hider\Desktop\bus.csv', index=False, encoding='utf-8-sig')

参考链接:手把手教学,正式开始!

上一篇:python爬虫第三章:(二)bs4进行数据解析


下一篇:我与云原生的故事