XPath Parsing
How XPath parsing works
- Instantiate an etree object and load the source markup of the page to be parsed into it
- Call the etree object's xpath method with an XPath expression to locate elements and capture their content
Environment setup
pip install lxml
How to instantiate an etree object
from lxml import etree
1. Load the source of a local HTML file into the etree object:
etree.parse(filepath)
2. Load page source fetched over the internet into the object:
etree.HTML(page_text)
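Putting the two together, a minimal sketch (the inline HTML string and the local file ./test.html are only illustrative):

from lxml import etree

# from an HTML string fetched over the network (or built inline, as here)
page_text = '<html><body><div class="song">Li Bai</div></body></html>'
tree = etree.HTML(page_text)
# .xpath() always returns a list of matches
print(tree.xpath('//div[@class="song"]/text()'))  # ['Li Bai']

# from a local file; HTMLParser() lets lxml tolerate non-well-formed HTML
# tree = etree.parse('./test.html', etree.HTMLParser())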
XPath in practice
Scraping second-hand housing listings from 58.com
import requests
from lxml import etree
from fake_useragent import UserAgent

url = 'https://cq.58.com/ershoufang/'
headers = {
    'user-agent': UserAgent().random  # random User-Agent to look less like a bot
}
response = requests.get(url=url, headers=headers)
page_text = response.text

tree = etree.HTML(page_text)
# one node per listing card
room_list = tree.xpath("//section[@class='list']/div/a/div[2]")
for room in room_list:
    # expressions starting with "./" are evaluated relative to the current node
    title = room.xpath(".//div[@class='property-content-title']//h3/text()")[0]
    price = room.xpath(
        ".//div[@class='property-price']//span[@class='property-price-total-num']/text()")[0]
    avg = room.xpath(".//p[@class='property-price-average']/text()")[0]
    print('Title: {0}  Total price: {1} x10k CNY  Unit price: {2} CNY/m2'.format(title, price, avg))
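One caveat: .xpath() always returns a list, so the [0] indexing above raises IndexError as soon as a card is missing one of the fields (58.com reshuffles its markup fairly often). A defensive variant of the loop, with a hypothetical first_text helper:

def first_text(node, expr, default=''):
    # take the first xpath hit, stripped, or fall back to a default
    hits = node.xpath(expr)
    return hits[0].strip() if hits else default

for room in room_list:
    title = first_text(room, ".//div[@class='property-content-title']//h3/text()")
    price = first_text(room, ".//span[@class='property-price-total-num']/text()")
    avg = first_text(room, ".//p[@class='property-price-average']/text()")
    print('Title: {0}  Total price: {1} x10k CNY  Unit price: {2} CNY/m2'.format(title, price, avg))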
Scraping images from 彼岸图网 (pic.netbian.com)
from lxml import etree
import requests
from fake_useragent import UserAgent
from urllib.parse import urljoin
import time

url = 'https://pic.netbian.com/e/search/result/'
ua = UserAgent()

# scrape image links from pic.netbian.com
def pic_down(page, searchid):
    param = {
        'page': page,
        'searchid': searchid
    }
    headers = {'User-Agent': ua.random}  # random UA for each request
    response = requests.get(url=url, headers=headers, params=param)
    if response.status_code != 200:
        print("Unexpected status code:", response.status_code)
        return False
    page_text = response.text
    # collect the detail-page link of every picture on the current page
    index_etree = etree.HTML(page_text)
    index_list = index_etree.xpath("//ul[@class='clearfix']/li/a/@href")
    for picture_index_url in index_list:
        headers = {'User-Agent': ua.random}  # random UA
        # the hrefs are site-relative, so join them with the site root
        pic_response = requests.get(url=urljoin('https://pic.netbian.com/', picture_index_url),
                                    headers=headers)
        pic_etree = etree.HTML(pic_response.text)
        # the image's direct link
        pic_link = urljoin('https://pic.netbian.com/',
                           pic_etree.xpath("//a[@id='img']/img/@src")[0])
        fp.write(pic_link + '\n')  # fp is the module-level file handle opened below
        print(pic_link)
    print('Successfully scraped page {}\n'.format(page))
    return True

if __name__ == '__main__':
    fp = open('图片链接.txt', 'w', encoding='utf-8')
    for i in range(0, 5):
        if pic_down(i, 16):
            time.sleep(3)  # be polite: pause between pages
        else:
            print('Scrape failed')
            break
    fp.close()
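The script only collects the direct links into 图片链接.txt; fetching the actual files is a second pass. A minimal sketch (the pics/ output directory and the filename scheme are assumptions):

import os
import requests
from fake_useragent import UserAgent

os.makedirs('pics', exist_ok=True)  # 'pics' is an arbitrary output directory
with open('图片链接.txt', encoding='utf-8') as f:
    for line in f:
        link = line.strip()
        if not link:
            continue
        data = requests.get(link, headers={'User-Agent': UserAgent().random}).content
        # name each file after the last path segment of its URL
        with open(os.path.join('pics', link.rsplit('/', 1)[-1]), 'wb') as img:
            img.write(data)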
Scraping free resumes from 站长素材 (sc.chinaz.com)
import requests
from lxml import etree
from fake_useragent import UserAgent

ua = UserAgent()
url = 'https://sc.chinaz.com/jianli/free.html'
headers = {
    'user-agent': ua.random
}
response = requests.get(url=url, headers=headers)
# use response.content (bytes) so lxml can detect the page encoding itself
page_text = response.content
index_tree = etree.HTML(page_text)
# detail-page link of every free resume template
index_link = index_tree.xpath("//div[@id='container']/div/a/@href")
for link in index_link:
    # the hrefs are protocol-relative ("//sc.chinaz.com/..."), so prepend "https:"
    rel_link = "https:" + link
    response = requests.get(url=rel_link, headers=headers)
    page_text = response.content
    download_tree = etree.HTML(page_text)
    # each <li> in the download list holds a mirror link; take the first one
    download_link = download_tree.xpath(
        "//div[@class='clearfix mt20 downlist']/ul/li/a/@href")
    print(download_link[0])
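To save a template instead of just printing its link, the loop body can end with a binary download. A sketch that continues the loop above (the .rar naming is an assumption about the site's links):

    # replaces the print() at the end of the loop body
    download_url = download_link[0]
    resume_data = requests.get(url=download_url, headers=headers).content
    file_name = download_url.rsplit('/', 1)[-1]  # e.g. '....rar' (naming scheme assumed)
    with open(file_name, 'wb') as fp:
        fp.write(resume_data)
    print('saved:', file_name)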