Parsing the scraped content
XPath parsing
Parsing a local file
The return value of xpath() is a list.
Basic XPath syntax
- Path queries
– // : selects all descendant nodes
– / : selects direct child nodes
- Predicate queries
– //div[@id]
– //div[@id="maincontent"]
- Attribute queries
– //@class
- Fuzzy queries
– //div[contains(@id,"ci")]
– //div[starts-with(@id,"he")]
- Text content queries
– //div/h1/text()
- Logical operators
– //div[@id="head" and @class="s_down"]
– //title | //price (the union of two paths, another way to combine selections)
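If you want to try these expressions without creating a file first, lxml can also parse an HTML string held in memory. A minimal sketch (the snippet and the ids in it are made up purely for illustration):

from lxml import etree

# Hypothetical in-memory HTML, just to exercise the expressions listed above
html = '<div id="head" class="s_down"><h1>hello</h1></div><div id="cinema"></div>'
root = etree.HTML(html)                           # parse the string into an element tree
print(root.xpath('//div[@id]'))                   # predicate query: every div with an id
print(root.xpath('//div[contains(@id,"ci")]'))    # fuzzy query
print(root.xpath('//div/h1/text()'))              # text content query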
from lxml import etree
# Local files are parsed with etree.parse(); server response data is parsed with etree.HTML()
tree = etree.parse('F:/Temp/img/New_file.html')
# tree.xpath('xpath expression')
# Find the li elements under ul
li_list = tree.xpath('//body/ul/li')
# Find all li tags that have an id attribute
li_list = tree.xpath('//ul/li[@id]/text()')
# Find the tag whose id is "l1"
li_list = tree.xpath('//ul/li[@id="l1"]/text()')
# Get the class attribute value of the tag whose id is "l1"
li = tree.xpath('//ul/li[@id="l1"]/@class')
# Find the li tags whose id value contains "l"
li_list = tree.xpath('//ul/li[contains(@id,"l")]/text()')
# Find the li tags whose id value starts with "u"
li_list = tree.xpath('//ul/li[starts-with(@id,"u")]/text()')
# Find the tag whose id is "l1" and whose class is "c2"
li_list = tree.xpath('//ul/li[@id="l1" and @class="c2"]/text()')
# Find the tags whose id is "l3" or "l4"
li_list = tree.xpath('//ul/li[@id="l3" or @id="l4"]/text()')
# li_list = tree.xpath('//ul/li[@id="l3"]/text() | //ul/li[@id="l4"]/text()')
print(li_list)
New_file.html:
<html>
  <head>
    <meta charset="utf-8"/>
    <title></title>
  </head>
  <body>
    <ul>
      <li id="l1" class='c2'>北京</li>
      <li id="l2">上海</li>
      <li class='c1'>广州</li>
      <li class='c2'>深圳</li>
    </ul>
    <ul>
      <li id="l3">成都</li>
      <li id="u1">西安</li>
      <li id="l4">昆明</li>
      <li id="u2">杭州</li>
    </ul>
  </body>
</html>
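Note that etree.parse() uses an XML parser by default, so it only accepts well-formed markup like the sample file above; real-world HTML with unclosed tags will raise a parse error. A sketch that passes an explicit HTML parser instead (same file path as above, assumed to exist):

from lxml import etree

parser = etree.HTMLParser(encoding='utf-8')              # tolerant HTML parser
tree = etree.parse('F:/Temp/img/New_file.html', parser)  # parse the file with it
print(tree.xpath('//ul/li/text()'))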
Parsing a web page
Parse the page directly from the response and pull out the content at the desired location, so there is no need to save the page to a local file first.
Install the XPath Helper extension in the browser. Once installed, the shortcut Ctrl+Shift+X opens two black boxes at the top of the page, one on the left and one on the right. Type an XPath expression into the left box and the matched content appears in the right box, which makes it much easier to work out the right expression while developing.
import urllib.request
from lxml import etree
url = 'https://www.baidu.com'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36'
}
# Fetch the page
request = urllib.request.Request(url=url, headers=headers)
res = urllib.request.urlopen(request)
# handler = urllib.request.HTTPHandler()
# opener = urllib.request.build_opener(handler)
# res = opener.open(request)
content = res.read().decode('utf-8')
# Parse the page source and pull out the data we need
tree = etree.HTML(content)
result = tree.xpath('//input[@id="su"]/@value')
print(result)
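As mentioned above, xpath() returns a list, here a list of attribute strings; to use the single value, index into it while guarding against an empty match. A small sketch:

# result is a list; take the first element if anything matched
value = result[0] if result else None
print(value)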
Downloading images
Download images from the 站长素材 site (sc.chinaz.com): the food pictures from the first ten pages (the script below is set to pages 1 to 5 via start_page/end_page).
import urllib.request
from lxml import etree
''' First look at the URL of each page:
https://sc.chinaz.com/tupian/meishitupian.html
https://sc.chinaz.com/tupian/meishitupian_3.html
https://sc.chinaz.com/tupian/meishitupian_5.html
https://sc.chinaz.com/tupian/meishitupian_10.html
'''
def createRequest(page):
    if page == 1:
        url = 'https://sc.chinaz.com/tupian/meishitupian.html'
    else:
        url = 'https://sc.chinaz.com/tupian/meishitupian_' + str(page) + '.html'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36'
    }
    request = urllib.request.Request(url=url, headers=headers)
    return request

def getContent(request):
    res = urllib.request.urlopen(request)
    content = res.read().decode('utf-8')
    return content

def Download(content):
    tree = etree.HTML(content)
    name_list = tree.xpath('//div[@id="container"]//a/img/@alt')
    # Sites that serve images usually lazy-load them, so the real address is not in src
    src_list = tree.xpath('//div[@id="container"]//a/img/@src2')
    for i in range(len(name_list)):
        name = name_list[i]
        src = src_list[i]
        # src looks like '//scpic2.chinaz.net/Files/pic/pic9/202107/apic34371_s.jpg'; dropping the trailing _s gives the full-size image
        url = 'https:' + src[:-6] + '.jpg'
        # Download the image; with filename=name+".jpg" it would be saved in the current working directory
        urllib.request.urlretrieve(url=url, filename='F:/Temp/Pachong/' + name + ".jpg")

if __name__ == "__main__":
    start_page = 1
    end_page = 5
    for page in range(start_page, end_page + 1):
        # 1. Build the request object
        request = createRequest(page)
        # 2. Fetch the page source
        content = getContent(request)
        # 3. Parse the source and download the images
        Download(content)
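If a single download fails, the whole script stops. A more defensive variant of the inner loop in Download() (a sketch only; the one-second delay is an arbitrary choice to avoid hammering the site):

import time

for i in range(len(name_list)):
    try:
        name = name_list[i]
        src = src_list[i]
        url = 'https:' + src[:-6] + '.jpg'
        urllib.request.urlretrieve(url=url, filename='F:/Temp/Pachong/' + name + ".jpg")
    except Exception as e:
        # skip this image instead of aborting the whole page
        print('failed to download', name, e)
    time.sleep(1)   # pause between downloads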
JsonPath parsing
Parsing a local file
JsonPath parsing here only works on local files, so a web resource has to be downloaded first.
Reference: https://blog.csdn.net/fu_huo_1993/article/details/88350147
import json
import jsonpath
obj = json.load(open('store.json', 'r', encoding='utf-8'))
# Authors of all books in the store
author_list = jsonpath.jsonpath(obj, '$.store.book[*].author')
# All authors anywhere under store
author_list = jsonpath.jsonpath(obj, '$.store..author')
# All prices under store
price_list = jsonpath.jsonpath(obj, '$.store..price')
# All elements directly under store
tag_list = jsonpath.jsonpath(obj, '$.store.*')
# The third book
book = jsonpath.jsonpath(obj, '$..book[2]')
# The last book
book = jsonpath.jsonpath(obj, '$..book[(@.length-1)]')
# The first two books
books = jsonpath.jsonpath(obj, '$..book[0,1]')
books = jsonpath.jsonpath(obj, '$..book[:2]')
# Filter for books that have an isbn
book_list = jsonpath.jsonpath(obj, '$..book[?(@.isbn)]')
# Filter for books that cost more than 10
book_list = jsonpath.jsonpath(obj, '$..book[?(@.price>10)]')
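One thing worth checking with this jsonpath library: when an expression matches nothing, jsonpath.jsonpath() returns False rather than an empty list, so guard before iterating. A small sketch (the query is deliberately one that matches nothing in store.json, since no book there costs more than 100):

result = jsonpath.jsonpath(obj, '$..book[?(@.price>100)]')
if result:
    for book in result:
        print(book['title'])
else:
    print('no match')   # jsonpath returns False, not [], when nothing matches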
store.json:
{ "store": {
"book": [
{ "category": "修真",
"author": "六道",
"title": "坏蛋是怎样练成的",
"price": 8.95
},
{ "category": "修改",
"author": "天蚕土豆",
"title": "斗破苍穹",
"price": 12.99
},
{ "category": "修真",
"author": "唐家三少",
"title": "斗罗大陆",
"isbn": "0-553-21311-3",
"price": 8.99
},
{ "category": "修真",
"author": "南派三叔",
"title": "星辰变",
"isbn": "0-395-19395-8",
"price": 22.99
}
],
"bicycle": {
"color": "黑色",
"price": 19.95
}
}
}
Parsing the cities covered by Taopiaopiao (淘票票)
import json
import jsonpath
import urllib.request
url = 'https://dianying.taobao.com/cityAction.json?activityId&_ksTS=1629789477003_137&jsoncallback=jsonp138&action=cityAction&n_s=new&event_submit_doGetAllRegion=true'
headers = {
'accept': 'text/javascript, application/javascript, application/ecmascript, application/x-ecmascript, */*; q=0.01',
# 'accept-encoding': 'gzip, deflate, br',
'accept-language': 'zh-CN,zh;q=0.9',
'cookie': 'cna=UkO6F8VULRwCAXTqq7dbS5A8; miid=949542021157939863; sgcookie=E100F01JK9XMmyoZRigjfmZKExNdRHQqPf4v9NIWIC1nnpnxyNgROLshAf0gz7lGnkKvwCnu1umyfirMSAWtubqc4g%3D%3D; tracknick=action_li; _cc_=UIHiLt3xSw%3D%3D; enc=dA18hg7jG1xapfVGPHoQCAkPQ4as1%2FEUqsG4M6AcAjHFFUM54HWpBv4AAm0MbQgqO%2BiZ5qkUeLIxljrHkOW%2BtQ%3D%3D; hng=CN%7Czh-CN%7CCNY%7C156; thw=cn; _m_h5_tk=3ca69de1b9ad7dce614840fcd015dcdb_1629776735568; _m_h5_tk_enc=ab56df54999d1d2cac2f82753ae29f82; t=874e6ce33295bf6b95cfcfaff0af0db6; xlly_s=1; cookie2=13acd8f4dafac4f7bd2177d6710d60fe; v=0; _tb_token_=e65ebbe536158; tfstk=cGhRB7mNpnxkDmUx7YpDAMNM2gTGZbWLxUZN9U4ulewe025didli6j5AFPI8MEC..; l=eBrgmF1cOsMXqSxaBO5aFurza77tzIRb8sPzaNbMiInca6OdtFt_rNCK2Ns9SdtjgtfFBetPVKlOcRCEF3apbgiMW_N-1NKDSxJ6-; isg=BBoas2yXLzHdGp3pCh7XVmpja8A8S54lyLj1RySTHq14l7vRDNufNAjpZ2MLRxa9',
'referer': 'https://dianying.taobao.com/',
'sec-ch-ua': '"Chromium";v="92", " Not A;Brand";v="99", "Google Chrome";v="92"',
'sec-ch-ua-mobile': '?0',
'sec-fetch-dest': 'empty',
'sec-fetch-mode': 'cors',
'sec-fetch-site': 'same-origin',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36',
'x-requested-with': 'XMLHttpRequest',
}
request = urllib.request.Request(url=url, headers=headers)
response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')
# The response is wrapped in a jsonp callback; keep only the part between '(' and ')'
content = content.split('(')[1].split(')')[0]
with open('jsonpath解析淘票票.json', 'w', encoding='utf-8') as fp:
    fp.write(content)
obj = json.load(open('jsonpath解析淘票票.json', 'r', encoding='utf-8'))
city_list = jsonpath.jsonpath(obj, '$..regionName')
print(city_list)
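The split('(') / split(')') trick breaks if the JSON body itself ever contains parentheses; a slightly more robust way to strip the jsonp wrapper (a sketch working on the same content variable as above) is to slice between the first '(' and the last ')':

start = content.find('(') + 1
end = content.rfind(')')
content = content[start:end]   # everything between the outermost parentheses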
BeautifulSoup parsing
Like XPath, BeautifulSoup can parse both local files and web pages.
Basic syntax
from bs4 import BeautifulSoup
# A tour of the basic bs4 syntax
soup = BeautifulSoup(open('F:/Temp/img/New_file.html', 'r', encoding='utf-8'), 'lxml')
# Returns the first matching tag
print(soup.a)
# Get the tag's attributes and their values
print(soup.a.attrs)
# ------------------- some bs4 functions
# find() returns a single object
# Returns the first matching tag
print(soup.find('a'))
# Find the tag whose title attribute is "a2"
print(soup.find('a', title='a2'))
# Find the tag whose class is "a1" (note the underscore in class_)
print(soup.find('a', class_='a1'))
print('-'*40)
# find_all() returns a list
# Return all a tags
print(soup.find_all('a'))
# Get several tag names at once
print(soup.find_all(['a', 'span']))
# Only return the first few matches
print(soup.find_all('li', limit=2))
print('--------------------------------')
# select() returns the nodes matching a CSS selector
# Return all a tags
print(soup.select('a'))
# Return the objects whose class is "a1"
print(soup.select('.a1'))
# Return the objects whose id is "l1"
print(soup.select('#l1'))
# Attribute selectors -- find tags by their attributes
# Return the li tags that have an id attribute
print(soup.select('li[id]'))
# Return the li tags whose id is "l2"
print(soup.select('li[id="l2"]'))
print('-----------------------------')
# Hierarchy selectors
# li elements anywhere under a div (descendant combinator)
print(soup.select('div li'))
# li elements that are direct children of ul under div (child combinator)
print(soup.select('div > ul > li'))
# All a tags and all li tags
print(soup.select('a,li'))
# Print a node's text content (select() returns a list, hence the [0])
# If a tag contains only text, both string and get_text() work; if it also contains child tags, only get_text() returns the text
print(soup.select('#app')[0].string)
print(soup.select('#app')[0].get_text())
# Print a node's name and its attributes (attrs is a dict)
print(soup.select("#p1")[0].name)
print(soup.select("#p1")[0].attrs)
# Get the value of the class attribute
print(soup.select('#p1')[0].attrs.get('class'))  # via the dict's get(); the safest option
print(soup.select('#p1')[0].get('class'))
print(soup.select('#p1')[0]['class'])
New_file.html:
<html>
  <head>
    <meta charset="utf-8"/>
    <title></title>
  </head>
  <body>
    <div>
      <ul>
        <li id="l1">白起</li>
        <li id="l2">廉颇</li>
        <li class='c1'>李牧</li>
        <li class='c2'>王翦</li>
        <a href="www.csdn.com" id="" class="a1">CSDN</a>
        <span>你大爷</span>
      </ul>
    </div>
    <a href="www.baidu.com" title="a2">百度</a>
    <div id='app'>
      <span>老大爷</span>
    </div>
    <p id="p1" class="p1">爬虫</p>
  </body>
</html>
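Since select() always returns a list, the [0] indexing above raises an IndexError when nothing matches; bs4 also provides select_one(), which returns the first match or None. A small sketch against the same soup:

node = soup.select_one('#p1')          # first match, or None if there is no match
if node is not None:
    print(node.get_text(), node.get('class'))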
Scraping the Starbucks menu (images and names)
import urllib.request
from bs4 import BeautifulSoup
url = 'https://www.starbucks.com.cn/menu/'
res = urllib.request.urlopen(url)
content = res.read().decode('utf-8')
soup = BeautifulSoup(content, 'lxml')
name_list = soup.select('ul[class="grid padded-3 product"] strong')
src_list = soup.select('ul[class="grid padded-3 product"] div')
for i in range(len(name_list)):
    name = name_list[i].get_text()
    if name.find('/') > 0:
        name = name.replace('/', '')
    # print(src_list[i].attrs) shows a dict like {'class': ['preview', 'circle'], 'style': 'background-image: url("/images/products/affogato.jpg")'}
    src = str(src_list[i].attrs.get('style')).split("\"/")[1].split("\")")[0]
    url = 'https://www.starbucks.com.cn/' + src
    # Download the image; with filename=name+".jpg" it would be saved in the current working directory
    urllib.request.urlretrieve(url=url, filename='F:/Temp/Pachong/' + name + ".jpg")
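The double split on the style string is fragile; given the url("...") format shown in the comment above, a regular expression pulls the image path out more directly. A sketch (same src_list element as in the loop above):

import re

style = src_list[i].attrs.get('style', '')
m = re.search(r'url\("(.+?)"\)', style)      # capture the path inside url("...")
if m:
    src = m.group(1).lstrip('/')             # e.g. images/products/affogato.jpg
    url = 'https://www.starbucks.com.cn/' + src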