Python爬虫之Xpath解析

Xpath解析

xpath解析原理

  1. 实例化一个etree对象,且需要将被解析的页面的源码数据加载到该对象中
  2. 调用etree对象的xpath方法,结合xpath表达式实现标签的定位和内容的捕获

环境的安装

pip install lxml

如何实例化一个etree对象

from lxml import etree

1、将本地的html文件中的源码加载到etree对象中(etree.parse默认使用XML解析器,若HTML不够规范,可传入第二个参数 etree.HTMLParser())

etree.parse(filepath)

2、将互联网获取的源码加载到该对象中

etree.HTML(page_text)

xpath实战

58二手房信息爬取

import requests
from lxml import etree
from fake_useragent import UserAgent

# Scrape the second-hand housing listing page and print the title, total
# price and average price of every listing card found on it.
url = 'https://cq.58.com/ershoufang/'
headers = {
    'user-agent': UserAgent().random
}

# Download the listing page and parse the HTML text into an element tree.
response = requests.get(url=url, headers=headers)
page_text = response.text
tree = etree.HTML(page_text)
# Each match is the second <div> inside a listing's <a> element.
room_list = tree.xpath("//section[@class='list']/div/a/div[2]")

for room in room_list:
    # The expressions start with "." so they search inside this card only.
    titles = room.xpath(".//div[@class='property-content-title']//h3/text()")
    prices = room.xpath(
        ".//div[@class='property-price']//span[@class='property-price-total-num']/text()")
    avgs = room.xpath(".//p[@class='property-price-average']/text()")
    if not (titles and prices and avgs):
        # A card missing any field (e.g. an advertisement slot) would make
        # "[0]" raise IndexError and abort the whole loop; skip it instead.
        continue
    title, price, avg = titles[0], prices[0], avgs[0]
    print('标题: {0} 总价: {1}万元 均价: {2}元/m2'.format(title, price, avg))

彼岸图网图片爬取

from lxml import etree
import requests
from fake_useragent import UserAgent
import time


# Random User-Agent generator shared by the requests made in pic_down.
ua = UserAgent()
# Search-result endpoint; pic_down adds "page" and "searchid" as query
# parameters.
url = 'https://pic.netbian.com/e/search/result/'


# Meant to hold the picture detail-page addresses (not referenced by the
# code shown in this script).
picture_index = list()

# Scrape picture links from the pic.netbian.com search results.
def pic_down(page, searchid):
    """Collect the image links found on one search-result page.

    Requests the module-level ``url`` with ``page`` and ``searchid`` as query
    parameters, follows every picture detail link on that page, and writes
    each picture's image URL as one line to the module-level file object
    ``fp`` (which the caller must have opened), echoing it to stdout too.

    Args:
        page: Value sent as the ``page`` query parameter.
        searchid: Value sent as the ``searchid`` query parameter.

    Returns:
        False when the result page answers with a non-200 status code,
        otherwise True.
    """
    # Imported locally so the function does not depend on a file-level import.
    from urllib.parse import urljoin

    param = {
        'page': page,
        'searchid': searchid
    }

    headers = {'User-Agent': ua.random}  # fresh random UA for this request
    response = requests.get(url=url, headers=headers, params=param)
    if response.status_code != 200:
        print("当前状态码为: ", response.status_code)
        return False

    # Collect the detail-page link of every picture on the current page.
    index_etree = etree.HTML(response.text)
    index_list = index_etree.xpath("//ul[@class='clearfix']/li/a/@href")

    for picture_index_url in index_list:
        headers = {'User-Agent': ua.random}  # fresh random UA per picture
        # urljoin leaves absolute hrefs untouched and resolves site-relative
        # ones, which requests.get would otherwise reject as schemeless.
        detail_url = urljoin(url, picture_index_url)
        pic_response = requests.get(url=detail_url, headers=headers)
        pic_etree = etree.HTML(pic_response.text)
        src_list = pic_etree.xpath("//a[@id='img']/img/@src")
        if not src_list:
            # No image element on this detail page; skip it rather than let
            # "[0]" raise IndexError and abort the remaining pictures.
            continue
        # urljoin avoids the doubled slash that plain concatenation produces
        # when the src attribute is root-relative.
        pic_link = urljoin('https://pic.netbian.com/', src_list[0])
        fp.write(pic_link+'\n')
        print(pic_link)

    # The page number must be formatted into the message; passing it as a
    # second print argument left the literal "{}" in the output.
    print('成功爬取第 {} 页\n'.format(page))
    return True


if __name__ == '__main__':
    # The context manager closes the output file even when a request raises.
    # "fp" is still bound at module level, which pic_down relies on when it
    # writes the collected links.
    with open('图片链接.txt', 'w', encoding='utf-8') as fp:
        for i in range(0, 5):
            if pic_down(i, 16):
                # Pause between pages before issuing the next request.
                time.sleep(3)
            else:
                print('爬取失败')
                break

站长免费简历爬取


import requests
from lxml import etree
from fake_useragent import UserAgent
# Scrape the free resume listing page and print the first download link
# found on each linked page.
ua = UserAgent()

url = 'https://sc.chinaz.com/jianli/free.html'


# One random User-Agent is picked here and reused for every request below.
headers = {
    'user-agent': ua.random
}
# Raw bytes (response.content) are handed to the parser instead of decoded
# text -- presumably so lxml handles the page encoding itself.
response = requests.get(url=url, headers=headers)
page_text = response.content
index_tree = etree.HTML(page_text)
# hrefs of the entries inside the "container" div of the listing page.
index_link = index_tree.xpath("//div[@id='container']/div/a/@href")
for link in index_link:
    # Only the scheme is prepended, so each href is assumed to be
    # protocol-relative ("//host/path") -- verify against the live page.
    rel_link = "https:"+link
    response = requests.get(url=rel_link, headers=headers)
    page_text = response.content
    download_tree = etree.HTML(page_text)
    # All download entries listed in the page's "downlist" block.
    download_link = download_tree.xpath(
        "//div[@class='clearfix mt20 downlist']/ul/li/a/@href")
    if not download_link:
        # A page without any download entry would make "[0]" raise
        # IndexError and stop the whole crawl; skip it instead.
        continue
    print(download_link[0])

版权属于:瞌学家 所有,转载请注明出处
本文链接:https://songonline.top/archives/139/
友情提示: 如果文章部分链接出现404,请留言或者联系博主修复。

上一篇:Android-RX子可观察-仅执行一次


下一篇:同源策略?同源策划的解决方法?