爬虫模板几乎一个样儿
运行前请先安装依赖:pandas(存储数据)、requests(请求URL资源)、bs4(BeautifulSoup4,提取数据)、lxml(解析网页),以及 openpyxl(pandas 导出 xlsx 文件时需要)
爬取链家二手房信息
# -*- coding: utf-8 -*-
# @Author : LEHOSO
# @FileName: Lianjia2.py
# @Time : 2021/10/11 16:55
import time
import pandas as pd
import requests
from bs4 import BeautifulSoup
# HTTP request headers sent with every page fetch: a desktop browser
# User-Agent string.
header = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/95.0.4638.54 Safari/537.36 Edg/95.0.1020.30'
    ),
}

# Module-level accumulator holding one dict per scraped listing.
a = []
def get_info(url, timeout=10):
    """Scrape one Lianjia second-hand-housing list page.

    Downloads ``url`` using the module-level ``header``, parses the HTML with
    the ``lxml`` parser and turns every listing found into a dict with the
    keys ``价格``, ``标题``, ``位置``, ``关注`` and ``距今发布日期``.  Each dict is
    printed and appended to the module-level list ``a``.

    Args:
        url: Address of the list page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The list of row dicts scraped from this page (the same dicts that
        were appended to ``a``).

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            4xx/5xx response status.
    """
    # Without an explicit timeout, requests may block forever on a stalled
    # connection.
    wb_data = requests.get(url, headers=header, timeout=timeout)
    # Fail loudly on an error status instead of parsing an error page and
    # silently producing no rows.
    wb_data.raise_for_status()

    # Parse the whole page.
    soup = BeautifulSoup(wb_data.text, 'lxml')

    # Per-listing elements; each list is in document order.
    price_tags = soup.select('div.totalPrice.totalPrice2 > span')
    title_tags = soup.select('div.title > a')
    area_tags = soup.select('div.flood > div > a:nth-child(3)')
    follow_tags = soup.select('div.followInfo')

    rows = []
    # NOTE(review): the lists are still paired purely by position, so a
    # listing that lacks one of these elements would shift later rows.
    for price_tag, title_tag, area_tag, follow_tag in zip(
            price_tags, title_tags, area_tags, follow_tags):
        # 'div.flood > div > a' also matches the nth-child(3) link itself, so
        # a flat page-wide list of those links falls out of step with the
        # other lists as soon as a container holds more than one link.  Take
        # the first link of the area link's own container instead, which keeps
        # both halves of the position from the same listing.
        location_tag = area_tag.parent.find('a', recursive=False)

        # Text has the form "<followers>/<age>".  Strip each half so that
        # whitespace around the separator does not end up in the cells, and
        # tolerate a missing '/' rather than raising IndexError.
        follow_parts = [
            part.strip() for part in follow_tag.get_text().split('/')
        ]
        published = follow_parts[1] if len(follow_parts) > 1 else ''

        data = {
            '价格': price_tag.get_text().strip(),
            '标题': title_tag.get_text().strip(),
            '位置': (location_tag.get_text().strip() + '-'
                   + area_tag.get_text().strip()),
            '关注': follow_parts[0],
            '距今发布日期': published,
        }
        # Store the row in the shared accumulator and echo it for progress.
        a.append(data)
        rows.append(data)
        print(data)
    return rows
if __name__ == '__main__':
    # List pages to scrape.
    urls = ['https://cq.lianjia.com/ershoufang/']

    for page_url in urls:
        get_info(page_url)
        # Wait 2 seconds after each page before moving on to the next one.
        time.sleep(2)

    # Load the collected rows into a DataFrame with a fixed column order.
    columns = ['价格', '标题', '位置', '关注', '距今发布日期']
    df_out = pd.DataFrame(a, columns=columns)

    # Export the table as an .xlsx workbook.
    df_out.to_excel('aaa.xlsx')