# 案例演示 (demo case)
# Site: https://cs.lianjia.com/ — scrape name, position, listing info, price; paginate; save to CSV.
import requests
from lxml import etree
import csv
from fake_useragent import UserAgent
class ScendHouseSpide:
    """Scrape second-hand housing listings from https://cs.lianjia.com/.

    Collects name (名称), position (位置), listing info (房源信息) and total
    price (价格) per listing, and can save the results to a UTF-8 CSV file.
    """

    def __init__(self):
        # BUG FIX: the original template had a leading space
        # (" https://..."), which made requests fail to resolve the host.
        self.base_url = "https://cs.lianjia.com/ershoufang/pg{}/"

    def get_html(self, num):
        """Fetch listing page *num* and return the decoded HTML text.

        Raises requests.HTTPError on a non-2xx response instead of silently
        parsing an error page.
        """
        url = self.base_url.format(num)
        headers = {
            # Random Chrome UA to reduce the chance of being blocked.
            "User-Agent": UserAgent().chrome
        }
        # timeout added so a stalled connection cannot hang the scraper.
        res = requests.get(url, headers=headers, timeout=10)
        res.raise_for_status()
        return res.content.decode()

    def get_data(self, num):
        """Parse listing page *num* and return a list of per-listing dicts."""
        html_element = etree.HTML(self.get_html(num))
        prices = html_element.xpath("//*[@id='content']/div[1]/ul/li/div[1]/div[6]/div[1]/span/text()")
        names = html_element.xpath("//*[@id='content']/div[1]/ul/li/div[1]/div[2]/div/a[1]/text()")
        # BUG FIX: `positions` previously reused the exact xpath of `names`
        # (copy-paste error), so the 位置 column duplicated 名称.  Select the
        # positionInfo block instead.
        # TODO(review): confirm this selector against the live page markup.
        positions = html_element.xpath("//*[@id='content']/div[1]/ul/li/div[1]//div[@class='positionInfo']/a[1]/text()")
        infos = html_element.xpath("//*[@id='content']/div[1]/ul/li/div[1]/div[3]/div/text()")
        # zip truncates to the shortest list, so a missing field on one
        # listing cannot misalign the remaining rows' columns.
        return [
            {"价格": pr + "万", "名称": na, "位置": po, "房源信息": inf}
            for pr, na, po, inf in zip(prices, names, positions, infos)
        ]

    @staticmethod
    def save_data(lst):
        """Write the collected listing dicts to 链家长沙二手房信息.csv (UTF-8)."""
        # BUG FIX: fieldnames was a set, so the CSV column order was
        # nondeterministic; a list keeps it stable.
        fieldnames = ["名称", "位置", "房源信息", "价格"]
        # newline='' is required by the csv module to avoid blank rows on
        # Windows.
        with open(file="链家长沙二手房信息.csv", mode='w', encoding='utf-8', newline='') as f:
            writer = csv.DictWriter(f, fieldnames)
            writer.writeheader()
            writer.writerows(lst)
if __name__ == '__main__':
    # One spider instance reused for every page (the original constructed a
    # new ScendHouseSpide per iteration and once more for saving).
    spider = ScendHouseSpide()
    all_rows = []
    # NOTE: the original comment claimed 5 pages, but range(1, 5) scrapes
    # pages 1-4 (the end bound is exclusive).
    for page in range(1, 5):  # scrape the first 4 pages only
        all_rows += spider.get_data(page)
    spider.save_data(all_rows)