Scraping Tianjin Lianjia second-hand housing data (requests + BeautifulSoup)

Scrape the data for every listed house on Tianjin Lianjia.

There is a lot of data, so it is best to lower the page counts for a small trial run before crawling everything (a sketch of one way to do this follows).
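As a sketch only (not part of the original script), the page count that follows each URL can be capped before the paging links are built, using the same "URL page_count" format as the url string in the script below:

# Trial-run sketch: cap every district at 2 pages before doing the full crawl.
# The two URLs below are just examples taken from the url string in the script.
url = '''https://tj.lianjia.com/ershoufang/heping/ 100,
https://tj.lianjia.com/ershoufang/nankai/ 100
'''

capped = []
for entry in url.strip().split(','):
    link, pages = entry.split()                 # split "URL page_count"
    capped.append([link, min(int(pages), 2)])   # keep at most 2 pages per district

print(capped)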

# -*- coding: utf-8 -*-
"""
Spyder Editor

This is a temporary script file.
"""

# imports
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime
from requests.packages.urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)


# Lianjia district categories; the number after each URL is that district's total page count


url ='''https://tj.lianjia.com/ershoufang/heping/ 100,
https://tj.lianjia.com/ershoufang/nankai/ 100,
https://tj.lianjia.com/ershoufang/hexi/ 100,
https://tj.lianjia.com/ershoufang/hongqiao/ 100,
https://tj.lianjia.com/ershoufang/xiqing/ 100,
https://tj.lianjia.com/ershoufang/beichen/ 100,
https://tj.lianjia.com/ershoufang/dongli/ 100,
https://tj.lianjia.com/ershoufang/jinnan/ 100,
https://tj.lianjia.com/ershoufang/tanggu/ 100,
https://tj.lianjia.com/ershoufang/kaifaqutj/ 52,
https://tj.lianjia.com/ershoufang/wuqing/ 100,
https://tj.lianjia.com/ershoufang/binhaixinqu/ 100,
https://tj.lianjia.com/ershoufang/baodi/ 100,
https://tj.lianjia.com/ershoufang/jizhou/ 69,
https://tj.lianjia.com/ershoufang/haihejiaoyuyuanqu/ 43,
https://tj.lianjia.com/ershoufang/jinghai/ 26
'''


    
    
def get_urls(urli, n):
    '''
    Build the paging URLs for one district.
    urli: district base URL
    n: total number of pages
    Returns: list of paging URLs
    '''
    lst = []
    for page in range(1, n + 1):   # pages are 1-indexed, so include page n
        ui = urli + 'pg' + '%i' % page
        lst.append(ui)
    return lst



    

# Parse one listing page and collect the detail-page URLs
def get_dataurls(ui, d_h, d_c, ips):
    '''
    Parameters
    ----------
    ui :  paging URL
    d_h : headers (User-Agent)
    d_c : cookies
    ips : proxies (or None)

    Returns
    -------
    list of detail-page URLs
    '''
    try:
        ri = requests.get(url=ui, headers=d_h, cookies=d_c, proxies=ips, timeout=3)
    except requests.RequestException:
        try:
            ri = requests.get(url=ui, headers=d_h, cookies=d_c, proxies=ips, timeout=3)
        except requests.RequestException:
            print('request failed 2 times')
            raise   # let the caller's try/except record the failed URL
    # parse the page
    soupi = BeautifulSoup(ri.text, 'lxml')
    ul = soupi.find('ul', class_="sellListContent")
    lis = ul.find_all('li')
    lst = []
    for li in lis:
        lst.append(li.find('a')['href'])
    return lst
# Fetch one detail page and extract the listing's fields
def get_data(ui, d_h, d_c, ips):
    '''
    Parameters
    ----------
    ui :  detail-page URL
    d_h : headers (User-Agent)
    d_c : cookies
    ips : proxies (or None)

    Returns
    -------
    dict with one listing's fields
    '''
    try:
        ri = requests.get(url=ui, headers=d_h, cookies=d_c, verify=False, proxies=ips, timeout=3)
    except requests.RequestException:
        # logdebug('requests failed one time')
        try:
            ri = requests.get(url=ui, headers=d_h, cookies=d_c, verify=False, proxies=ips, timeout=3)
        except requests.RequestException:
            # logdebug('requests failed two times')
            print('requests failed two times')
            raise   # let the caller record the failed URL

    soupi = BeautifulSoup(ri.text, 'lxml')
    dic = {}  # dict to hold one listing's fields
    dic['房名'] = soupi.find('div', class_="title").h1.text
    dic['总价'] = soupi.find('span', class_="total").text + soupi.find('span', class_="unit").text
    dic['单价'] = soupi.find('span', class_="unitPriceValue").text
    dic['户型'] = soupi.find('div', class_="room").div.text
    dic['朝向'] = soupi.find('div', class_="type").div.text
    dic['面积'] = soupi.find('div', class_="area").div.text
    dic['小区名称'] = soupi.find('div', class_="communityName").a.text
    dic['所在区域'] = soupi.find('span', class_="info").text
    infors = soupi.find('div', class_="introContent").text

    s = re.sub(r' +', '', infors)
    dic['挂牌时间'] = re.search(r'挂牌时间\d+-\d+-\d+', s).group(0)
    # coordinates are captured here but not stored (see the sketch after this script)
    position = re.search(r"resblockPosition:'([\d.]+),([\d.]+)'", ri.text)

    return dic
   
   

# If the target site blocks you, you can switch to rotating proxy IPs
# (about 1 CNY per hour; search for Abuyun / 阿布云 and see their docs for usage)
def get_proxies(p_User, p_Pass, p_Host, p_Port):
    '''
    Build the proxies dict for a rotating-IP service.
    Parameters
    ----------
    p_User, p_Pass :
        proxy account credentials
    p_Host, p_Port :
        proxy server host and port

    Returns
    -------
    proxies dict for requests
    '''
    proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
        "host": p_Host,
        "port": p_Port,
        "user": p_User,
        "pass": p_Pass,
    }
    ips = {
        "http": proxyMeta,
        "https": proxyMeta,
    }
    return ips

    

    
if __name__ == "__main__":
    # how many pages to crawl is set by the numbers in the url string above

    url_lst = url.strip().split(',')
    url_u = []
    for s in url_lst:
        url_u.append(s.split())   # split "URL page_count" (also drops the leading newline)
    ip_dic = None
    # set up proxy IPs if needed
    # '123' stands for the key you get after paying for the service
#    ip_dic = get_proxies('123',
#                         '123',
#                         'http-dyn.abuyun.com',
#                         '9020') 

    urllst1 = []
    # build the paging URLs
    for url_p in url_u:
        lst_test = get_urls(url_p[0], int(url_p[1]))
        urllst1.extend(lst_test)

    #u1 = urllst1[0]
    # set the login info
    # see the screenshots below for how to fill in dic_h and the cookies string
    dic_h = {'User-Agent': ''}
    dic_c = {}
    cookies = ''' '''   # paste your Cookie header string here
    for i in cookies.split('; '):
        dic_c[i.split('=')[0]] = i.split('=')[1]
        # build the cookies dict
        
    urllst2 = []

# collect every house's detail-page URL
    for u in urllst1:
        try:
            urllst2.extend(get_dataurls(u, dic_h, dic_c, ip_dic))
            print('page parsed, %i detail URLs collected so far' % (len(urllst2)))
        except:
            print('failed to parse paging URL:', u)
       # print(urllst2)

# collect the listing data
    errorlst = []
    datalst = []
    for u in urllst2:
        try:
            datalst.append(get_data(u, dic_h, dic_c, ip_dic))
            print('collected %i records in total' % len(datalst))
        except:
            errorlst.append(u)
            print('failed to collect data from:', u)

    # retry the URLs that failed the first time; collect repeat failures in a
    # separate list so errorlst is not extended while it is being iterated
    errorlst2 = []
    for u in errorlst:
        try:
            datalst.append(get_data(u, dic_h, dic_c, ip_dic))
            print('collected %i records in total' % len(datalst))
        except:
            errorlst2.append(u)
            print('failed again:', u)

    data = datalst
    print(12345)
    datadf = pd.DataFrame(datalst)
    # change the output path to your own
    datadf.to_excel(r'F:\I_love_learning\junior\数据挖掘与数据仓库\课程设计\dataultra.xlsx')
 #   pd.DataFrame(errorlst).to_excel(r'F:\I_love_learning\junior\数据挖掘与数据仓库\课程设计\errorlst.xlsx')
 #   data_room.to_excel(r'F:\I_love_learning\junior\数据挖掘与数据仓库\课程设计\dataroom.xlsx')
'''
ui = 'https://tj.lianjia.com/ershoufang'
ri = requests.get(url=ui,headers = dic_h, cookies = dic_c)
position = re.search(r"resblockPosition",ri.text)
'''
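The script above already matches resblockPosition (the listing's coordinates) inside get_data but never stores the result. Below is a minimal sketch of how the captured values could be pulled out, assuming the detail page still embeds resblockPosition:'x,y' in its HTML (the longitude/latitude order is an assumption, not confirmed by the original post):

import re

def get_position(html):
    # Sketch only: extract the coordinates embedded as resblockPosition
    # in a detail page's HTML; returns (None, None) when not found.
    m = re.search(r"resblockPosition:'([\d.]+),([\d.]+)'", html)
    return (m.group(1), m.group(2)) if m else (None, None)

# stub HTML snippet for illustration
print(get_position("resblockPosition:'117.20,39.13'"))   # ('117.20', '39.13')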

How to get the dic_h and cookies values used in the code (note that both must be strings):

Register an account on the Lianjia site, then refresh the page, right-click → Inspect, and open the Network tab.

(screenshot of the Network panel)

Scroll down in the request headers.

(screenshot of the request headers)

The User-Agent value is what goes into dic_h (remember to write it as a dictionary), and the Cookie value is the cookies string; copy the whole thing.
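For reference, here is a minimal sketch of what the filled-in login section looks like; every value below is a placeholder, so paste your own User-Agent and Cookie strings copied from the Network panel:

# Placeholders only -- replace with the values copied from your own browser session.
dic_h = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) ...'}

cookies = '''lianjia_uuid=xxxx; select_city=120000; lianjia_token=xxxx'''
dic_c = {}
for item in cookies.split('; '):
    key, _, value = item.partition('=')   # partition keeps '=' inside cookie values intact
    dic_c[key] = value

print(dic_c)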

Scraping the per-community (xiaoqu) unit-price data from Tianjin Lianjia

As before, remember to change the Excel output path and the login info.

# -*- coding: utf-8 -*-
"""
Spyder Editor

This is a temporary script file.
"""


import re
import requests
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime
from requests.packages.urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
# URLs for Tianjin Lianjia community (xiaoqu) listings; the number after each URL is the total page count
url ='''https://tj.lianjia.com/xiaoqu/heping/?from=recpage1 23,
https://tj.lianjia.com/xiaoqu/nankai/?from=recpage1 25,
https://tj.lianjia.com/xiaoqu/hexi/?from=recpage1 23,
https://tj.lianjia.com/xiaoqu/hebei/?from=recpage1 17,
https://tj.lianjia.com/xiaoqu/hedong/?from=recpage1 19,
https://tj.lianjia.com/xiaoqu/hongqiao/?from=recpage1 11,
https://tj.lianjia.com/xiaoqu/xiqing/?from=recpage1 17,
https://tj.lianjia.com/xiaoqu/beichen/?from=recpage1 13,
https://tj.lianjia.com/xiaoqu/dongli/?from=recpage1 15,
https://tj.lianjia.com/xiaoqu/jinnan/?from=recpage1 12,
https://tj.lianjia.com/xiaoqu/tanggu/?from=recpage1 18,
https://tj.lianjia.com/xiaoqu/kaifaqutj/?from=recpage1 5,
https://tj.lianjia.com/xiaoqu/wuqing/?from=recpage1 15,
https://tj.lianjia.com/xiaoqu/binhaixinqu/?from=recpage1 14,
https://tj.lianjia.com/xiaoqu/baodi/?from=recpage1 7,
https://tj.lianjia.com/xiaoqu/jizhou/?from=recpage1 10,
https://tj.lianjia.com/xiaoqu/haihejiaoyuyuanqu/?from=recpage1 2,
https://tj.lianjia.com/xiaoqu/jinghai/?from=recpage1 9
'''

'''
u = url_lst[1].split(' ')[0]
lst = []
n = 1
for page in range(1,n+1):
    u = u.replace('\n','')
    u = u.split(r'/?')[0] + '/pg' + '%i'%page + '/?' + u.split('/?')[1]
ips = None
ri = requests.get(url=u,headers = dic_h, cookies = dic_c,proxies=ips,timeout = 3)
'''
def get_urls(urli, n):
    '''
    Build the paging URLs for one district's xiaoqu list.
    urli: district base URL (with the ?from=recpage1 query)
    n: total number of pages
    Returns: list of paging URLs
    '''
    lst = []
    urli = urli.replace('\n', '')   # strip the newline left over from splitting the url string
    for page in range(1, n + 1):
        ui = urli.split('/?')[0] + '/pg' + '%i' % page + '/?' + urli.split('/?')[1]
        lst.append(ui)
    return lst



    

# Parse one listing page and collect the community detail-page URLs
def get_dataurls(ui, d_h, d_c, ips):
    '''
    Parameters
    ----------
    ui :  paging URL
    d_h : headers (User-Agent)
    d_c : cookies
    ips : proxies (or None)

    Returns
    -------
    list of detail-page URLs
    '''
    try:
        ri = requests.get(url=ui, headers=d_h, cookies=d_c, proxies=ips, timeout=3)
    except requests.RequestException:
        try:
            ri = requests.get(url=ui, headers=d_h, cookies=d_c, proxies=ips, timeout=3)
        except requests.RequestException:
            print('request failed 2 times')
            raise   # let the caller's try/except record the failed URL
    # parse the page
    soupi = BeautifulSoup(ri.text, 'lxml')
    ul = soupi.find('ul', class_="listContent")
    lis = ul.find_all('li')
    lst = []
    for li in lis:
        lst.append(li.find('a')['href'])
    return lst

'''
ri = requests.get(url = urllst2[:1][0], headers = dic_h, cookies = dic_c, verify=False, proxies=None, timeout=3)
soupi = BeautifulSoup(ri.text,'lxml')
dic = {}  # dict to hold the record
dic['单价'] = soupi.find('span',class_="xiaoquUnitPrice").text
dic['小区名称'] = soupi.find('div',class_="detailHeader fl").h1.text   
'''

def get_data(ui, d_h, d_c, ips):
    '''
    Parameters
    ----------
    ui :  detail-page URL
    d_h : headers (User-Agent)
    d_c : cookies
    ips : proxies (or None)

    Returns
    -------
    dict with one community's fields
    '''
    try:
        ri = requests.get(url=ui, headers=d_h, cookies=d_c, verify=False, proxies=ips, timeout=3)
    except requests.RequestException:
        # logdebug('requests failed one time')
        try:
            ri = requests.get(url=ui, headers=d_h, cookies=d_c, verify=False, proxies=ips, timeout=3)
        except requests.RequestException:
            # logdebug('requests failed two times')
            print('requests failed two times')
            raise   # let the caller record the failed URL

    soupi = BeautifulSoup(ri.text, 'lxml')
    dic = {}  # dict to hold one community's fields
    dic['单价'] = soupi.find('span', class_="xiaoquUnitPrice").text
    dic['小区名称'] = soupi.find('div', class_="detailHeader fl").h1.text
    return dic
   
   


def get_proxies(p_User, p_Pass, p_Host, p_Port):
    '''
    Build the proxies dict for a rotating-IP service.
    Parameters
    ----------
    p_User, p_Pass :
        proxy account credentials
    p_Host, p_Port :
        proxy server host and port

    Returns
    -------
    proxies dict for requests
    '''
    proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
        "host": p_Host,
        "port": p_Port,
        "user": p_User,
        "pass": p_Pass,
    }
    ips = {
        "http": proxyMeta,
        "https": proxyMeta,
    }
    return ips

    

    
if __name__ == "__main__":
    # how many pages to crawl is set by the numbers in the url string above

    url_lst = url.strip().split(',')
    url_u = []
    for s in url_lst:
        url_u.append(s.split())   # split "URL page_count"
    ip_dic = None
    # set up proxy IPs if needed
#    ip_dic = get_proxies('123',
#                         '123',
#                         'http-dyn.abuyun.com',
#                         '9020') 

    urllst1 = []
    for url_p in url_u:
        lst_test = get_urls(url_p[0], int(url_p[1]))
        urllst1.extend(lst_test)

    #u1 = urllst1[0]
    dic_h = {'User-Agent': ''}
    dic_c = {}
    cookies = ''' '''   # paste your Cookie header string here
    for i in cookies.split('; '):
        dic_c[i.split('=')[0]] = i.split('=')[1]
        # build the cookies dict
        
    urllst2 = []

# collect every community's detail-page URL
    for u in urllst1:
        try:
            urllst2.extend(get_dataurls(u, dic_h, dic_c, ip_dic))
            print('page parsed, %i detail URLs collected so far' % (len(urllst2)))
        except:
            print('failed to parse paging URL:', u)
       # print(urllst2)

# collect the community data
    errorlst = []
    datalst = []
    for u in urllst2:
        try:
            datalst.append(get_data(u, dic_h, dic_c, ip_dic))
            print('collected %i records in total' % len(datalst))
        except:
            errorlst.append(u)
            print('failed to collect data from:', u)

    print(12345)
    datadf = pd.DataFrame(datalst)

    # change the output path to your own
    datadf.to_excel(r'F:\I_love_learning\junior\数据挖掘与数据仓库\课程设计\dataxiaoqu.xlsx')


'''
ui = 'https://tj.lianjia.com/ershoufang'
ri = requests.get(url=ui,headers = dic_h, cookies = dic_c)
position = re.search(r"resblockPosition",ri.text)
'''

This ran successfully on Python 3.8.
In total there are a bit over 7,000 community records and roughly 40,000 house records.
