1. Introduction
Fetch the page source with the requests library, parse the HTML with bs4 (BeautifulSoup), and finally use pandas to save the extracted information to a local Excel file.
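Before the full scraper, here is a minimal sketch of that three-step pipeline. The URL and the parsed tag are placeholders for illustration, not Guazi's actual markup:

import requests
from bs4 import BeautifulSoup
import pandas as pd

# 1. fetch the page source (placeholder URL)
html = requests.get('https://example.com').text
# 2. parse the HTML and extract some text
soup = BeautifulSoup(html, 'lxml')
titles = [h1.text for h1 in soup.find_all('h1')]
# 3. save the extracted data to a local Excel file
pd.DataFrame({'title': titles}).to_excel('demo.xlsx', index=False)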
2. Install the required libraries
pip install requests
pip install bs4
pip install pandas
pip install lxml      # parser passed to BeautifulSoup in the code below
pip install openpyxl  # engine pandas needs to write .xlsx files
3. The code
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import os
import time
import pandas as pd

# Folder the Excel file is written to
download_path = './瓜子二手车'
if not os.path.exists(download_path):
    os.makedirs(download_path)

# Guazi rejects anonymous requests, so send a browser User-Agent plus a
# Cookie copied from your own logged-in browser session; the one below
# has long since expired and must be replaced with your own.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36',
    'Cookie': 'uuid=88550bd8-67ff-47f1-d6f1-750c06bb6603; ganji_uuid=6225037200189058361729; lg=1; antipas=z8655O2183Ou858zk663D81q83; clueSourceCode=%2A%2300; Hm_lvt_bf3ee5b290ce731c7a4ce7a617256354=1610463132,1610770520; sessionid=18fce3de-c20b-4f21-df60-a3737fba8a10; close_finance_popup=2021-01-16; cainfo=%7B%22ca_a%22%3A%22-%22%2C%22ca_b%22%3A%22-%22%2C%22ca_s%22%3A%22seo_baidu%22%2C%22ca_n%22%3A%22default%22%2C%22ca_medium%22%3A%22-%22%2C%22ca_term%22%3A%22-%22%2C%22ca_content%22%3A%22-%22%2C%22ca_campaign%22%3A%22-%22%2C%22ca_kw%22%3A%22-%22%2C%22ca_i%22%3A%22-%22%2C%22scode%22%3A%22-%22%2C%22keyword%22%3A%22-%22%2C%22ca_keywordid%22%3A%22-%22%2C%22display_finance_flag%22%3A%22-%22%2C%22platform%22%3A%221%22%2C%22version%22%3A1%2C%22client_ab%22%3A%22-%22%2C%22guid%22%3A%2288550bd8-67ff-47f1-d6f1-750c06bb6603%22%2C%22ca_city%22%3A%22zibo%22%2C%22sessionid%22%3A%2218fce3de-c20b-4f21-df60-a3737fba8a10%22%7D; _gl_tracker=%7B%22ca_source%22%3A%22-%22%2C%22ca_name%22%3A%22-%22%2C%22ca_kw%22%3A%22-%22%2C%22ca_id%22%3A%22-%22%2C%22ca_s%22%3A%22self%22%2C%22ca_n%22%3A%22-%22%2C%22ca_i%22%3A%22-%22%2C%22sid%22%3A44171002837%7D; cityDomain=bj; user_city_id=12; preTime=%7B%22last%22%3A1610770813%2C%22this%22%3A1610463129%2C%22pre%22%3A1610463129%7D; Hm_lpvt_bf3ee5b290ce731c7a4ce7a617256354=1610770814',
    'Host': 'www.guazi.com'}
base_url = 'https://www.guazi.com/bj/buy'

# Collect the detail-page URLs from one listing page
def get_detail_url(url):
    rq = requests.get(url, headers=headers)
    soup = BeautifulSoup(rq.text, 'lxml')
    # the car-list container; if the request was blocked this is None
    content = soup.find(class_='carlist clearfix js-top')
    links = content.find_all('a')
    detail_url_list = []
    for link in links:
        detail_url_list.append(f"https://www.guazi.com{link['href']}")
    return detail_url_list

# Scrape name, mileage, displacement and gearbox from one detail page
def get_detail(url):
    rq = requests.get(url, headers=headers)
    soup = BeautifulSoup(rq.text, 'lxml')
    content = soup.find(class_='product-textbox')
    title = content.find('h1').text
    info = content.find(class_='assort clearfix')
    # the span order matches the page layout at the time of writing
    span = info.find_all('span')
    info_dic = {'name': title.strip(), 'km': span[1].text, 'displacement': span[2].text, 'gearbox': span[3].text}
    print(f'{url} scraped')
    return info_dic

def main():
    url_list = get_detail_url(base_url)
    name_list = []
    km_list = []
    displacement_list = []
    gearbox_list = []
    for url in url_list:
        infos = get_detail(url)
        name_list.append(infos['name'])
        km_list.append(infos['km'])
        displacement_list.append(infos['displacement'])
        gearbox_list.append(infos['gearbox'])
        time.sleep(1)  # throttle requests to avoid being blocked
    df = pd.DataFrame({'name': name_list, 'km': km_list,
                       'displacement': displacement_list, 'gearbox': gearbox_list})
    df.to_excel(f'{download_path}/guazi.xlsx', index=False)

if __name__ == '__main__':
    main()
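Running the script writes ./瓜子二手车/guazi.xlsx, which you can spot-check afterwards with pd.read_excel(f'{download_path}/guazi.xlsx').

main() only covers the first listing page. Below is a sketch of how it could be extended to several pages, reusing the functions above; the /bj/buy/o<page>/ pagination pattern is an assumption about Guazi's URL scheme, so verify it in your browser before relying on it:

def main_all(pages=5):
    all_infos = []
    for page in range(1, pages + 1):
        # assumed pagination pattern: /bj/buy/o<page>/ -- verify before use
        page_url = f'https://www.guazi.com/bj/buy/o{page}/'
        for url in get_detail_url(page_url):
            all_infos.append(get_detail(url))
            time.sleep(1)  # stay polite between requests
    # a list of dicts becomes a DataFrame with one column per key
    pd.DataFrame(all_infos).to_excel(f'{download_path}/guazi_all.xlsx', index=False)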