爬虫爬取房王

from selenium import webdriver
from lxml import etree
from selenium.webdriver.chrome.options import Options
import re

# Local Chrome installation and its chromedriver. Raw strings keep the
# backslashes literal instead of relying on unrecognised escape sequences.
CHROME_BINARY = r'C:\Program Files\Google\Chrome\Application\chrome.exe'
CHROMEDRIVER_PATH = r'C:\Program Files\Google\Chrome\Application\chromedriver.exe'

LIST_URL = "https://gz.ihk.cn/myxf/houselist"
DETAIL_URL_PREFIX = "https://gz.ihk.cn/myxfdetail/main/"
OUTPUT_FILE = 'zy.txt'

# Captures whatever follows ``listPageClick`` up to the closing ``">`` of the
# listing <div>. Non-greedy so several listings on one line stay separate.
# The attribute name is plain ASCII "onclick" (the script previously contained
# a Greek omicron here, which can never match real markup).
DETAIL_CALL_RE = re.compile(
    r'<div class="ihknewconlist" onclick="listPageClick(.*?)">'
)

# Characters stripped from the captured call arguments: ( ) '
CALL_PUNCTUATION = str.maketrans('', '', "()'")
# Characters stripped from every text fragment: newline, space, '
FRAGMENT_NOISE = str.maketrans('', '', "\n '")


def extract_detail_ids(html):
    """Return the detail-page identifiers found in the listing page HTML.

    Each identifier is the argument text of a ``listPageClick(...)`` handler
    on an ``ihknewconlist`` div, with parentheses and single quotes removed.
    Returns an empty list when nothing matches.
    """
    return [
        call_args.translate(CALL_PUNCTUATION)
        for call_args in DETAIL_CALL_RE.findall(html)
    ]


def clean_fragments(fragments):
    """Strip newlines, spaces and single quotes; drop fragments left empty."""
    cleaned = (fragment.translate(FRAGMENT_NOISE) for fragment in fragments)
    return [fragment for fragment in cleaned if fragment]


def format_record(title, fragments, contact_text=None):
    """Build the text appended to the output file for one detail page.

    Layout: a dashed title line, one line per fragment, then either the
    contact text followed by a newline, or a single blank line when
    ``contact_text`` is None.
    """
    parts = ["----------" + title + "----------\n"]
    parts.extend(fragment + "\n" for fragment in fragments)
    if contact_text is not None:
        parts.append(contact_text)
    parts.append("\n")
    return ''.join(parts)


def scrape_detail(driver, detail_id):
    """Load one detail page with ``driver`` and return its formatted record.

    Also prints the contact text and the cleaned fragment list, as the
    script has always done, so progress is visible on the console.
    """
    detail_url = DETAIL_URL_PREFIX + detail_id
    driver.get(detail_url)
    tree = etree.HTML(driver.page_source)

    # Fall back to the URL so a page without <title> no longer aborts the run.
    title_nodes = tree.xpath('//title/text()')
    title = title_nodes[0] if title_nodes else detail_url

    fragments = clean_fragments(
        tree.xpath('//*[@id="main-rightbox"]/div/text()')
    )
    contact_text = ''.join(
        tree.xpath('//div[@class="ind-r07"]/dl[1]//text()')
    )

    # The contact text is only written when both of these lookups (named
    # ``name`` and ``phone`` in the earlier version) find something. This
    # replaces the old bare ``except:`` that was used as control flow.
    has_contact = bool(
        tree.xpath('//*[@id="main-rightbox"]/div[11]/dl[1]/dd[1]/strong/text()')
        and tree.xpath('//*[@id="main-rightbox"]/div[11]/dl[1]/dd[3]/a/text()')
    )

    print(contact_text)
    print(fragments)
    return format_record(
        title, fragments, contact_text if has_contact else None
    )


def main():
    """Scrape every listing linked from the list page into ``zy.txt``."""
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    chrome_options.binary_location = CHROME_BINARY

    # One browser for the whole run, with the options actually applied.
    driver = webdriver.Chrome(CHROMEDRIVER_PATH, options=chrome_options)
    try:
        driver.get(LIST_URL)
        for detail_id in extract_detail_ids(driver.page_source):
            record = scrape_detail(driver, detail_id)
            # Append per page so finished records survive a later failure.
            with open(OUTPUT_FILE, 'a', encoding='utf-8') as f:
                f.write(record)
    finally:
        # Always shut the browser down; otherwise Chrome processes pile up.
        driver.quit()


if __name__ == "__main__":
    main()
上一篇:关于sql json数据的处理


下一篇:bgi::detail::minmaxdist用法的测试程序