利用Python相关工具爬取链家二手房中的各房源信息,包含小区名称、户型、装修情况、单价和总价
要求:
1、使用工具:urllib.request(请求)、re(解析)、csv(存储)
2、编程范式:面向对象
3、反反爬机制:利用time和random控制请求频次、伪装请求头User-Agent
代码如下:
from urllib import request
import re
import csv
import time
import random


class LianjiaSpider(object):
    """Crawl Lianjia (Hefei) second-hand housing listings and append
    community name, layout, area, decoration state and total price
    to a CSV file.

    Anti-anti-crawler measures: a spoofed User-Agent header and a
    random 1-10 s sleep between page requests.
    """

    def __init__(self, pages=1):
        # {} placeholder is filled with the page number by main().
        self.base_url = 'https://hf.lianjia.com/ershoufang/pg{}'
        # Minimal User-Agent spoofing so the server does not reject urllib.
        self.headers = {'User-Agent': 'Mozilla/5.0'}
        self.pages = pages  # number of listing pages to crawl

    # Fetch one listing page and hand the HTML to the parser.
    def get_page(self, url):
        req = request.Request(url, headers=self.headers)
        res = request.urlopen(req)
        html = res.read().decode('utf-8')
        self.parse_page(html)

    # Extract (community, raw-info, total-price) tuples via regex, then save.
    def parse_page(self, html):
        pattern = re.compile(
            r'<div class="houseInfo".*?data-el="region">(.*?)</a>'
            r'(.*?)</div>.*?<div class="totalPrice".*?<span>(.*?)</span>',
            re.S)
        house_list = pattern.findall(html)
        for house in house_list:
            print(house[1].split('|'))
        self.save_csv(house_list)

    # Reshape the raw regex tuples and append them to the CSV file.
    def save_csv(self, house_list):
        house_new_list = []
        for house in house_list:
            # house[1] is a '|'-separated info string; by position:
            # index 1 = layout, index 2 = area (the trailing two chars,
            # the unit '平米', are sliced off), index 4 = decoration.
            house = (house[0].strip(),
                     house[1].split('|')[1].strip(),
                     house[1].split('|')[2].strip()[:-2],
                     house[1].split('|')[4].strip(),
                     house[2].strip(),)
            house_new_list.append(house)
        with open('lianjia_ershoufang.csv', 'a+',
                  encoding='utf-8', newline='') as f:
            writer = csv.writer(f)
            writer.writerows(house_new_list)

    # Crawl every page with a random delay between requests.
    def main(self):
        for page in range(1, self.pages + 1):
            url = self.base_url.format(str(page))
            self.get_page(url)
            print('第%d页下载成功' % page)
            # Random 1-10 s pause to avoid triggering rate limiting.
            time.sleep(random.randint(1, 10))


if __name__ == '__main__':
    # Write the CSV header row once before crawling starts.
    with open('lianjia_ershoufang.csv', 'a+',
              encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['小区名称', '户型', '面积', '装修状况', '总价'])
    spider = LianjiaSpider(100)
    spider.main()