# 【爬虫】爬取瓜子网二手车信息(mysql+selenium模拟登录)

# By Vax
# At time - 2021/1/3 15:36
# linked from

import json
import requests, re
from lxml import etree

# Fetch the source of a web page
def get_content(url, headers, timeout=10):
    """Download *url* with a GET request and return the body as text.

    Args:
        url: URL to request.
        headers: Mapping of HTTP headers sent with the request.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` can block indefinitely on a stalled
            connection, which would hang the whole crawl.

    Returns:
        The decoded response body (``response.text``).

    Raises:
        requests.exceptions.Timeout: If the server does not answer within
            *timeout* seconds.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    return response.text


# Extract the car listings from a parsed listing page
def get_info(text):
    """Print and return every car listing found in a parsed page.

    Args:
        text: Parsed HTML tree exposing an ``xpath(query)`` method that
            returns a list of matches (``main`` passes the result of
            ``etree.HTML``).

    Returns:
        A list with one dict per listing. Each dict is a fresh object, so
        callers may keep or modify the results independently. If the page
        yields fewer values for one field than for the others, the extra
        entries are dropped instead of raising ``IndexError``.
    """
    title_list = text.xpath('//ul[@class="carlist clearfix js-top"]/li/a/@title')
    price_list = text.xpath('//div[@class="t-price"]/p/text()')
    year_list = text.xpath('//div[@class="t-i"]/text()[1]')
    millon_list = text.xpath('//div[@class="t-i"]/text()[2]')
    picture_list = text.xpath('//ul[@class="carlist clearfix js-top"]/li/a/img/@src')
    details_list = text.xpath('//ul[@class="carlist clearfix js-top"]/li/a/@href')

    items = []
    # zip stops at the shortest list, so a missing field on the page cannot
    # trigger an out-of-range index.
    rows = zip(title_list, price_list, millon_list, year_list,
               picture_list, details_list)
    for title, price, mileage, year, picture, details in rows:
        # Build a new dict per listing rather than overwriting a shared one.
        item = {
            '标题': title,
            '价格': price + '万',
            '公里数': mileage,
            '年份': year,
            '照片链接': picture,
            '详情页链接': 'https://www.guazi.com' + details,
        }
        print(item)
        items.append(item)
    return items


# Entry point
def main():
    """Crawl result pages 1 and 2 of every brand linked from the index page.

    Downloads the ``/bj/buy/`` index page, collects the brand links from
    it, and for each brand prints the page-URL template and then hands the
    parsed pages ``o1`` and ``o2`` to ``get_info``.
    """
    index_url = 'https://www.guazi.com/bj/buy/'
    index_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36',
        'Cookie': 'track_id=7534369675321344; uuid=c129325e-6fea-4fd0-dea5-3632997e0419; antipas=wL2L859nHt69349594j71850u61; cityDomain=bj; clueSourceCode=10103000312%2300; user_city_id=12; ganji_uuid=6616956591030214317551; sessionid=5f3261c7-27a6-4bd6-e909-f70312d46c39; lg=1; cainfo=%7B%22ca_a%22%3A%22-%22%2C%22ca_b%22%3A%22-%22%2C%22ca_s%22%3A%22pz_baidu%22%2C%22ca_n%22%3A%22tbmkbturl%22%2C%22ca_medium%22%3A%22-%22%2C%22ca_term%22%3A%22-%22%2C%22ca_content%22%3A%22%22%2C%22ca_campaign%22%3A%22%22%2C%22ca_kw%22%3A%22-%22%2C%22ca_i%22%3A%22-%22%2C%22scode%22%3A%2210103000312%22%2C%22keyword%22%3A%22-%22%2C%22ca_keywordid%22%3A%22-%22%2C%22ca_transid%22%3A%22%22%2C%22platform%22%3A%221%22%2C%22version%22%3A1%2C%22track_id%22%3A%227534369675321344%22%2C%22display_finance_flag%22%3A%22-%22%2C%22client_ab%22%3A%22-%22%2C%22guid%22%3A%22c129325e-6fea-4fd0-dea5-3632997e0419%22%2C%22ca_city%22%3A%22bj%22%2C%22sessionid%22%3A%225f3261c7-27a6-4bd6-e909-f70312d46c39%22%7D; preTime=%7B%22last%22%3A1572951901%2C%22this%22%3A1572951534%2C%22pre%22%3A1572951534%7D',
    }
    # The per-brand request headers never change, so build them once up front.
    brand_headers = {
        'Referer': 'https://www.guazi.com/bj/buy/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36',
        'Cookie': 'track_id=7534369675321344; uuid=c129325e-6fea-4fd0-dea5-3632997e0419; antipas=wL2L859nHt69349594j71850u61; cityDomain=bj; clueSourceCode=10103000312%2300; user_city_id=12; ganji_uuid=6616956591030214317551; sessionid=5f3261c7-27a6-4bd6-e909-f70312d46c39; lg=1; cainfo=%7B%22ca_a%22%3A%22-%22%2C%22ca_b%22%3A%22-%22%2C%22ca_s%22%3A%22pz_baidu%22%2C%22ca_n%22%3A%22tbmkbturl%22%2C%22ca_medium%22%3A%22-%22%2C%22ca_term%22%3A%22-%22%2C%22ca_content%22%3A%22%22%2C%22ca_campaign%22%3A%22%22%2C%22ca_kw%22%3A%22-%22%2C%22ca_i%22%3A%22-%22%2C%22scode%22%3A%2210103000312%22%2C%22keyword%22%3A%22-%22%2C%22ca_keywordid%22%3A%22-%22%2C%22ca_transid%22%3A%22%22%2C%22platform%22%3A%221%22%2C%22version%22%3A1%2C%22track_id%22%3A%227534369675321344%22%2C%22display_finance_flag%22%3A%22-%22%2C%22client_ab%22%3A%22-%22%2C%22guid%22%3A%22c129325e-6fea-4fd0-dea5-3632997e0419%22%2C%22ca_city%22%3A%22bj%22%2C%22sessionid%22%3A%225f3261c7-27a6-4bd6-e909-f70312d46c39%22%7D; preTime=%7B%22last%22%3A1572953403%2C%22this%22%3A1572951534%2C%22pre%22%3A1572951534%7D',
    }

    index_tree = etree.HTML(get_content(index_url, index_headers))
    brand_hrefs = index_tree.xpath('//div[@class="dd-all clearfix js-brand js-option-hid-info"]/ul/li/p/a/@href')

    for href in brand_hrefs:
        # Drop the '/#...' fragment and append the page-number placeholder.
        page_template = 'https://www.guazi.com' + href.split('/#')[0] + '/o%s/#bread'
        print(page_template)
        for page_no in (1, 2):
            page_source = get_content(page_template % page_no, headers=brand_headers)
            get_info(etree.HTML(page_source))


# Start the crawl only when this file is run as a script, not when imported.
if __name__ == '__main__':
    main()

 

# 上一篇:Linux安全模型中的3A


# 下一篇:封装一个 员工类 使用preparedStatement 查询数据 (2) 使用 arrayList 集合