Scrape listing-level data for Tianjin second-hand homes on Lianjia.
The data volume is large, so it is best to shrink the page counts and do a test run before crawling everything (a small parsing/dry-run sketch follows the URL list below).
# -*- coding: utf-8 -*-
"""
Spyder Editor
This is a temporary script file.
"""
# imports
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime
from requests.packages.urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
# Lianjia listing URLs by district; the number after each URL is that district's total page count
url ='''https://tj.lianjia.com/ershoufang/heping/ 100,
https://tj.lianjia.com/ershoufang/nankai/ 100,
https://tj.lianjia.com/ershoufang/hexi/ 100,
https://tj.lianjia.com/ershoufang/hongqiao/ 100,
https://tj.lianjia.com/ershoufang/xiqing/ 100,
https://tj.lianjia.com/ershoufang/beichen/ 100,
https://tj.lianjia.com/ershoufang/dongli/ 100,
https://tj.lianjia.com/ershoufang/jinnan/ 100,
https://tj.lianjia.com/ershoufang/tanggu/ 100,
https://tj.lianjia.com/ershoufang/kaifaqutj/ 52,
https://tj.lianjia.com/ershoufang/wuqing/ 100,
https://tj.lianjia.com/ershoufang/binhaixinqu/ 100,
https://tj.lianjia.com/ershoufang/baodi/ 100,
https://tj.lianjia.com/ershoufang/jizhou/ 69,
https://tj.lianjia.com/ershoufang/haihejiaoyuyuanqu/ 43,
https://tj.lianjia.com/ershoufang/jinghai/ 26
'''
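The main block below turns this multi-line string into (base URL, page count) pairs. As a hedged, self-contained sketch of that parsing, and of the dry-run advice above, it boils down to the following; MAX_TEST_PAGES is an assumed name used only for illustration:
# Minimal sketch: split the block on commas, then on the space between URL and
# page count; cap pages at MAX_TEST_PAGES (assumed constant) for a quick test run.
MAX_TEST_PAGES = 2
targets = []
for entry in url.strip().split(','):
    base, pages = entry.strip().split(' ')
    targets.append((base, min(int(pages), MAX_TEST_PAGES)))
# targets[0] == ('https://tj.lianjia.com/ershoufang/heping/', 2)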
def get_urls(urli, n):
    '''
    Collect the paginated list URLs for one district.
    urli: district base URL
    n: total number of pages for that district
    Returns: list of paginated URLs
    '''
    urli = urli.strip()  # defensive: drop any stray whitespace/newline around the base URL
    lst = []
    for page in range(1, n + 1):  # pages 1..n inclusive
        ui = urli + 'pg' + '%i' % page
        lst.append(ui)
    return lst
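As a quick usage note, for the first district this produces URLs of the following form (shown here for three pages):
# get_urls('https://tj.lianjia.com/ershoufang/heping/', 3)
# -> ['https://tj.lianjia.com/ershoufang/heping/pg1',
#     'https://tj.lianjia.com/ershoufang/heping/pg2',
#     'https://tj.lianjia.com/ershoufang/heping/pg3']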
# Parse one list page and collect the detail-page links on it
def get_dataurls(ui, d_h, d_c, ips):
    '''
    Parameters
    ----------
    ui : paginated list URL.
    d_h : user-agent header dict.
    d_c : cookies dict.
    ips : proxy dict (or None).
    Returns
    -------
    List of detail-page URLs.
    '''
    try:
        ri = requests.get(url=ui, headers=d_h, cookies=d_c, proxies=ips, timeout=3)
    except Exception:
        try:
            ri = requests.get(url=ui, headers=d_h, cookies=d_c, proxies=ips, timeout=3)
        except Exception:
            print('request failed 2 times')
            raise  # let the caller's try/except record this page as failed
    # parse the page
    soupi = BeautifulSoup(ri.text, 'lxml')
    ul = soupi.find('ul', class_="sellListContent")
    lis = ul.find_all('li')
    lst = []
    for li in lis:
        lst.append(li.find('a')['href'])  # the first <a> in each card is the detail link
    return lst
# Scrape one detail page
def get_data(ui, d_h, d_c, ips):
    '''
    Parameters
    ----------
    ui : detail-page URL.
    d_h : user-agent header dict.
    d_c : cookies dict.
    ips : proxy dict (or None).
    Returns
    -------
    dict with one listing's fields.
    '''
    try:
        ri = requests.get(url=ui, headers=d_h, cookies=d_c, verify=False, proxies=ips, timeout=3)
    except Exception:
        # logdebug('requests failed one time')
        try:
            ri = requests.get(url=ui, headers=d_h, cookies=d_c, verify=False, proxies=ips, timeout=3)
        except Exception:
            # logdebug('requests failed two times')
            print('requests failed two times')
            raise  # let the caller record this URL in errorlst
    soupi = BeautifulSoup(ri.text, 'lxml')
    dic = {}  # one record; the Chinese keys below become the Excel column names
    dic['房名'] = soupi.find('div', class_="title").h1.text
    dic['总价'] = soupi.find('span', class_="total").text + soupi.find('span', class_="unit").text
    dic['单价'] = soupi.find('span', class_="unitPriceValue").text
    dic['户型'] = soupi.find('div', class_="room").div.text
    dic['朝向'] = soupi.find('div', class_="type").div.text
    dic['面积'] = soupi.find('div', class_="area").div.text
    dic['小区名称'] = soupi.find('div', class_="communityName").a.text
    dic['所在区域'] = soupi.find('span', class_="info").text
    infors = soupi.find('div', class_="introContent").text
    s = re.sub(r' +', '', infors)
    dic['挂牌时间'] = re.search(r'挂牌时间\d+-\d+-\d+', s).group(0)
    # coordinates are parsed here but not stored in dic in this script
    position = re.search(r"resblockPosition:'([\d.]+),([\d.]+)'", ri.text)
    return dic
# If the site starts blocking you, a rotating-IP proxy can be used (roughly 1 RMB
# per hour; search for Abuyun (阿布云) and follow their documentation).
def get_proxies(p_User, p_Pass, p_Host, p_Port):
    '''
    Build the proxy dict for a rotating-IP service.
    Parameters
    ----------
    p_User, p_Pass :
        proxy account credentials.
    p_Host, p_Port :
        proxy server host and port.
    Returns
    -------
    proxy dict usable as the proxies argument of requests.
    '''
    proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
        "host": p_Host,
        "port": p_Port,
        "user": p_User,
        "pass": p_Pass,
    }
    ips = {
        "http": proxyMeta,
        "https": proxyMeta,
    }
    return ips
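A brief usage sketch (the credentials below are placeholders, not real ones): build the dict once and hand it to every requests.get call through the proxies= argument.
# ips = get_proxies('your-user', 'your-pass', 'http-dyn.abuyun.com', '9020')
# ips == {'http':  'http://your-user:your-pass@http-dyn.abuyun.com:9020',
#         'https': 'http://your-user:your-pass@http-dyn.abuyun.com:9020'}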
if __name__ == "__main__":
    # set how many pages to crawl (parsed from the url block above)
    url_lst = url.strip().split(',')
    url_u = []
    for item in url_lst:  # avoid shadowing the built-in str
        url_u.append(item.strip().split(' '))
    ip_dic = None
    # set up the rotating proxy here if you have one;
    # '123' stands for the key you get after paying
    # ip_dic = get_proxies('123',
    #                      '123',
    #                      'http-dyn.abuyun.com',
    #                      '9020')
    urllst1 = []
    # build the paginated list URLs
    for url_p in url_u:
        lst_test = get_urls(url_p[0], int(url_p[1]))
        urllst1.extend(lst_test)
    # u1 = urllst1[0]
    # set the login information;
    # how to obtain dic_h and cookies: see the screenshot and the notes at the end of this script
    dic_h = {'User-Agent': ''}
    dic_c = {}
    cookies = ''' '''  # paste your full cookie string between the quotes
    for i in cookies.split('; '):
        if '=' in i:
            dic_c[i.split('=')[0]] = i.split('=')[1]
    # collect the detail-page URLs from every list page
    urllst2 = []
    for u in urllst1:
        try:
            urllst2.extend(get_dataurls(u, dic_h, dic_c, ip_dic))
            print('page parsed, %i detail URLs collected so far' % (len(urllst2)))
        except Exception:
            print('failed to parse list page:', u)
    # print(urllst2)
    # scrape each detail page
    errorlst = []
    datalst = []
    for u in urllst2:
        try:
            datalst.append(get_data(u, dic_h, dic_c, ip_dic))
            print('record scraped, %i records collected so far' % len(datalst))
        except Exception:
            errorlst.append(u)
            print('failed to scrape detail page:', u)
    # retry the failures once; iterate over a copy so new failures do not
    # extend the list while it is being looped over
    for u in list(errorlst):
        try:
            datalst.append(get_data(u, dic_h, dic_c, ip_dic))
            print('record scraped on retry, %i records collected so far' % len(datalst))
        except Exception:
            print('retry failed for:', u)
    print('crawl finished, writing Excel file')
    datadf = pd.DataFrame(datalst)
    # adjust the output path before running; writing .xlsx needs an Excel engine such as openpyxl installed
    datadf.to_excel(r'F:\I_love_learning\junior\数据挖掘与数据仓库\课程设计\dataultra.xlsx')
    # pd.DataFrame(errorlst).to_excel(r'F:\I_love_learning\junior\数据挖掘与数据仓库\课程设计\errorlst.xlsx')
    # data_room.to_excel(r'F:\I_love_learning\junior\数据挖掘与数据仓库\课程设计\dataroom.xlsx')
'''
ui = 'https://tj.lianjia.com/ershoufang'
ri = requests.get(url=ui,headers = dic_h, cookies = dic_c)
position = re.search(r"resblockPosition",ri.text)
'''
How to obtain the dic_h and cookies values used in the code (both must be strings):
register an account on the Lianjia site and log in,
then refresh the page, right-click → Inspect, and open the Network tab,
scroll down through the request headers:
the User-Agent value is our dic_h (write it as a dict),
and the Cookie value is what goes into cookies;
copy the whole string.
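To make the expected shape concrete (the header string and cookie names/values below are placeholders, not working credentials), the two dicts end up looking roughly like this:
dic_h = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) ...'}
cookies = 'lianjia_uuid=xxxx; select_city=120000; lianjia_token=yyyy'
dic_c = {i.split('=')[0]: i.split('=')[1] for i in cookies.split('; ')}
# dic_c == {'lianjia_uuid': 'xxxx', 'select_city': '120000', 'lianjia_token': 'yyyy'}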
Scrape the per-community (小区) unit-price data for Tianjin Lianjia.
As before, remember to change the Excel output path and fill in your own headers and cookies.
# -*- coding: utf-8 -*-
"""
Spyder Editor
This is a temporary script file.
"""
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime
from requests.packages.urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
# Tianjin Lianjia community (xiaoqu) list URLs; the number after each URL is that district's total page count
url ='''https://tj.lianjia.com/xiaoqu/heping/?from=recpage1 23,
https://tj.lianjia.com/xiaoqu/nankai/?from=recpage1 25,
https://tj.lianjia.com/xiaoqu/hexi/?from=recpage1 23,
https://tj.lianjia.com/xiaoqu/hebei/?from=recpage1 17,
https://tj.lianjia.com/xiaoqu/hedong/?from=recpage1 19,
https://tj.lianjia.com/xiaoqu/hongqiao/?from=recpage1 11,
https://tj.lianjia.com/xiaoqu/xiqing/?from=recpage1 17,
https://tj.lianjia.com/xiaoqu/beichen/?from=recpage1 13,
https://tj.lianjia.com/xiaoqu/dongli/?from=recpage1 15,
https://tj.lianjia.com/xiaoqu/jinnan/?from=recpage1 12,
https://tj.lianjia.com/xiaoqu/tanggu/?from=recpage1 18,
https://tj.lianjia.com/xiaoqu/kaifaqutj/?from=recpage1 5,
https://tj.lianjia.com/xiaoqu/wuqing/?from=recpage1 15,
https://tj.lianjia.com/xiaoqu/binhaixinqu/?from=recpage1 14,
https://tj.lianjia.com/xiaoqu/baodi/?from=recpage1 7,
https://tj.lianjia.com/xiaoqu/jizhou/?from=recpage1 10,
https://tj.lianjia.com/xiaoqu/haihejiaoyuyuanqu/?from=recpage1 2,
https://tj.lianjia.com/xiaoqu/jinghai/?from=recpage1 9
'''
'''
u = url_lst[1].split(' ')[0]
lst = []
n = 1
for page in range(1,n+1):
u = u.replace('\n','')
u = u.split(r'/?')[0] + '/pg' + '%i'%page + '/?' + u.split('/?')[1]
ips = None
ri = requests.get(url=u,headers = dic_h, cookies = dic_c,proxies=ips,timeout = 3)
'''
def get_urls(urli, n):
    '''
    Collect the paginated list URLs for one district.
    urli: district base URL (contains a ?from=... query string)
    n: total number of pages for that district
    Returns: list of paginated URLs
    '''
    urli = urli.replace('\n', '').strip()  # strip the newline left over from splitting the url block
    lst = []
    for page in range(1, n + 1):
        # insert /pgN between the path and the query string
        ui = urli.split('/?')[0] + '/pg' + '%i' % page + '/?' + urli.split('/?')[1]
        lst.append(ui)
    return lst
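For reference, this is the URL shape the function produces (page 2 of the heping community list):
# get_urls('https://tj.lianjia.com/xiaoqu/heping/?from=recpage1', 2)[-1]
# -> 'https://tj.lianjia.com/xiaoqu/heping/pg2/?from=recpage1'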
# Parse one community list page and collect the detail-page links on it
def get_dataurls(ui, d_h, d_c, ips):
    '''
    Parameters
    ----------
    ui : paginated list URL.
    d_h : user-agent header dict.
    d_c : cookies dict.
    ips : proxy dict (or None).
    Returns
    -------
    List of detail-page URLs.
    '''
    try:
        ri = requests.get(url=ui, headers=d_h, cookies=d_c, proxies=ips, timeout=3)
    except Exception:
        try:
            ri = requests.get(url=ui, headers=d_h, cookies=d_c, proxies=ips, timeout=3)
        except Exception:
            print('request failed 2 times')
            raise  # let the caller's try/except record this page as failed
    # parse the page
    soupi = BeautifulSoup(ri.text, 'lxml')
    ul = soupi.find('ul', class_="listContent")
    lis = ul.find_all('li')
    lst = []
    for li in lis:
        lst.append(li.find('a')['href'])  # the first <a> in each card is the detail link
    return lst
'''
ri = requests.get(url = urllst2[:1][0], headers = dic_h, cookies = dic_c, verify=False, proxies=None, timeout=3)
soupi = BeautifulSoup(ri.text,'lxml')
dic = {}  # empty dict to hold the record
dic['单价'] = soupi.find('span',class_="xiaoquUnitPrice").text
dic['小区名称'] = soupi.find('div',class_="detailHeader fl").h1.text
'''
def get_data(ui, d_h, d_c, ips):
    '''
    Parameters
    ----------
    ui : community detail-page URL.
    d_h : user-agent header dict.
    d_c : cookies dict.
    ips : proxy dict (or None).
    Returns
    -------
    dict with one community's fields.
    '''
    try:
        ri = requests.get(url=ui, headers=d_h, cookies=d_c, verify=False, proxies=ips, timeout=3)
    except Exception:
        # logdebug('requests failed one time')
        try:
            ri = requests.get(url=ui, headers=d_h, cookies=d_c, verify=False, proxies=ips, timeout=3)
        except Exception:
            # logdebug('requests failed two times')
            print('requests failed two times')
            raise  # let the caller record this URL in errorlst
    soupi = BeautifulSoup(ri.text, 'lxml')
    dic = {}  # one record; the Chinese keys below become the Excel column names
    dic['单价'] = soupi.find('span', class_="xiaoquUnitPrice").text
    dic['小区名称'] = soupi.find('div', class_="detailHeader fl").h1.text
    return dic
def get_proxies(p_User, p_Pass, p_Host, p_Port):
    '''
    Build the proxy dict for a rotating-IP service.
    Parameters
    ----------
    p_User, p_Pass :
        proxy account credentials.
    p_Host, p_Port :
        proxy server host and port.
    Returns
    -------
    proxy dict usable as the proxies argument of requests.
    '''
    proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
        "host": p_Host,
        "port": p_Port,
        "user": p_User,
        "pass": p_Pass,
    }
    ips = {
        "http": proxyMeta,
        "https": proxyMeta,
    }
    return ips
if __name__ == "__main__":
    # set how many pages to crawl (parsed from the url block above)
    url_lst = url.strip().split(',')
    url_u = []
    for item in url_lst:  # avoid shadowing the built-in str
        url_u.append(item.strip().split(' '))
    ip_dic = None
    # set up the rotating proxy here if you have one
    # ip_dic = get_proxies('123',
    #                      '123',
    #                      'http-dyn.abuyun.com',
    #                      '9020')
    urllst1 = []
    for url_p in url_u:
        lst_test = get_urls(url_p[0], int(url_p[1]))
        urllst1.extend(lst_test)
    # u1 = urllst1[0]
    dic_h = {'User-Agent': ''}
    dic_c = {}
    cookies = ''' '''  # paste your full cookie string between the quotes
    for i in cookies.split('; '):
        if '=' in i:
            dic_c[i.split('=')[0]] = i.split('=')[1]
    # collect the community detail-page URLs from every list page
    urllst2 = []
    for u in urllst1:
        try:
            urllst2.extend(get_dataurls(u, dic_h, dic_c, ip_dic))
            print('page parsed, %i detail URLs collected so far' % (len(urllst2)))
        except Exception:
            print('failed to parse list page:', u)
    # print(urllst2)
    # scrape each community page
    errorlst = []
    datalst = []
    for u in urllst2:
        try:
            datalst.append(get_data(u, dic_h, dic_c, ip_dic))
            print('record scraped, %i records collected so far' % len(datalst))
        except Exception:
            errorlst.append(u)
            print('failed to scrape community page:', u)
    print('crawl finished, writing Excel file')
    datadf = pd.DataFrame(datalst)
    # adjust the output path before running
    datadf.to_excel(r'F:\I_love_learning\junior\数据挖掘与数据仓库\课程设计\dataxiaoqu.xlsx')
'''
ui = 'https://tj.lianjia.com/ershoufang'
ri = requests.get(url=ui,headers = dic_h, cookies = dic_c)
position = re.search(r"resblockPosition",ri.text)
'''
This ran successfully on Python 3.8.
In total there are about 7,000+ community records and roughly 40,000+ house listings.