Python 爬虫入门练习:爬取链家网二手房信息

import requests
from bs4 import BeautifulSoup
import sqlite3

# URL template for one result page; %s is filled with the page number.
PAGE_URL = "https://cs.lianjia.com/ershoufang/pg%s/"

REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)Chrome/64.0.3282.140 Safari/537.36',
}

# Column names that receive the '|'-separated pieces of the "houseInfo" text,
# in order.  The names look like pinyin abbreviations (presumably address,
# layout, size, orientation, decoration, elevator) -- TODO confirm.
HOUSE_INFO_KEYS = ('dz', 'fx', 'dx', 'cx', 'zx', 'dt')

# Parameterized insert: values are bound by sqlite3 instead of being spliced
# into the SQL text, so quotes inside a title or URL cannot break (or inject
# into) the statement.
INSERT_SQL = (
    "insert into rsf(nid,dz,fx,dx,cx,zx,dt,jg,title,url)"
    "values(?,?,?,?,?,?,?,?,?,?)"
)


def split_house_info(parts):
    """Map the split "houseInfo" text onto the six stored columns.

    Args:
        parts: list of strings produced by splitting the houseInfo text on "|".

    Returns:
        dict keyed by HOUSE_INFO_KEYS.  With six parts every key is filled in
        order; with five parts the first five keys are filled and 'dt' is the
        literal string 'None'; with any other count every value stays ''.
    """
    fields = dict.fromkeys(HOUSE_INFO_KEYS, '')
    if len(parts) == 6:
        fields.update(zip(HOUSE_INFO_KEYS, parts))
    elif len(parts) == 5:
        # zip stops after the five available parts; the sixth column gets the
        # same 'None' marker string the script has always stored.
        fields.update(zip(HOUSE_INFO_KEYS, parts))
        fields['dt'] = 'None'
    else:
        # Unknown layout: keep the blanks but show the raw fields so the
        # message is actually useful when debugging.
        print("unexpected houseInfo layout:", parts)
    return fields


def listing_row(item):
    """Build the parameter tuple for INSERT_SQL from one listing element.

    Args:
        item: a BeautifulSoup element for a single listing.

    Returns:
        Tuple ordered as (nid, dz, fx, dx, cx, zx, dt, jg, title, url).

    Raises:
        AttributeError: if one of the expected child elements is missing.
        TypeError, ValueError: if 'data-housecode' is absent or not an integer.
    """
    data_id = item.find(class_="noresultRecommend").get('data-housecode')
    # NOTE(review): the class string keeps its trailing space exactly as in
    # the original script; confirm it still matches the page markup.
    href = item.find(class_="noresultRecommend img ").get('href')
    title = item.find(class_="title").get_text()
    info = split_house_info(item.find(class_="houseInfo").get_text().split("|"))
    price = item.find(class_="totalPrice").get_text()
    return (
        int(data_id),
        info['dz'],
        info['fx'],
        info['dx'],
        info['cx'],
        info['zx'],
        info['dt'],
        price,
        title,
        href,
    )


def crawl_listings(db_path="test.db", first_page=1, last_page=100):
    """Fetch the listing pages and store every listing in the rsf table.

    The visible code never creates the rsf table, so it must already exist in
    the database at db_path.

    Args:
        db_path: SQLite database file to write to.
        first_page: first page number to request (inclusive).
        last_page: last page number to request (inclusive).
    """
    conn = sqlite3.connect(db_path)
    try:
        cur = conn.cursor()
        # One session for the whole crawl instead of a new one per page.
        session = requests.session()
        for num in range(first_page, last_page + 1):
            # NOTE(review): verify=False disables TLS certificate checking;
            # kept from the original script -- confirm it is really needed.
            response = session.get(PAGE_URL % num, headers=REQUEST_HEADERS, verify=False)
            soup = BeautifulSoup(response.text, 'lxml')
            container = soup.find(class_='sellListContent')
            if container is None:
                # No listing container on this page; skip it instead of
                # failing on None.find_all.
                print("page %s: no 'sellListContent' element, skipped" % num)
                continue
            for item in container.find_all(class_='clear LOGCLICKDATA'):
                cur.execute(INSERT_SQL, listing_row(item))
                conn.commit()
    finally:
        # Close the connection even when a request or insert fails.
        conn.close()


# An earlier, disabled variant dumped each page's data as JSON to
# 'info<page>.txt' instead of writing to SQLite.

if __name__ == "__main__":
    crawl_listings()

使用 sqlite3 读取数据

import sqlite3
def price_in_wan(text):
    """Parse the number in front of '万' from a stored price string.

    Args:
        text: price text as stored in the table, e.g. '45.5万'.

    Returns:
        The price as a float, or None when the text is missing or does not
        start with a number.
    """
    try:
        return float(text.split('万')[0])
    except (AttributeError, ValueError):
        # AttributeError covers a NULL (None) column value.
        return None


def listings_in_range(rows, low=30.0, high=50.0):
    """Yield the printable fields of rows whose price lies strictly in range.

    Column 7 is read as the price text and columns 1, 3 and -2 are reported.
    This assumes the table's column order matches the column list of the
    scraper's insert statement (nid, dz, fx, dx, cx, zx, dt, jg, title, url)
    -- confirm against the actual schema.

    Args:
        rows: iterable of row sequences from "SELECT * from rsf".
        low: exclusive lower price bound.
        high: exclusive upper price bound.

    Yields:
        Tuples (row[1], row[3], price, row[-2]) for each matching row.  Rows
        whose price cannot be parsed are skipped.
    """
    for row in rows:
        price = price_in_wan(row[7])
        if price is not None and low < price < high:
            yield row[1], row[3], price, row[-2]


def print_listings_in_range(db_path="test.db", low=30.0, high=50.0):
    """Print a numbered line for every stored listing priced in (low, high)."""
    conn = sqlite3.connect(db_path)
    try:
        rows = conn.cursor().execute("SELECT * from rsf")
        # The counter numbers only the matching rows, starting at 1.
        for k, fields in enumerate(listings_in_range(rows, low, high), 1):
            print(k, *fields)
    finally:
        # Close the connection even if reading a row fails.
        conn.close()


if __name__ == "__main__":
    print_listings_in_range()
上一篇:zookeeper 笔记-机制的特点


下一篇:python网络爬虫抓取动态网页并将数据存入数据库MySQL