Parsing Crawled Content

Once content has been crawled, it needs to be parsed.

XPath parsing

Parsing a local file

xpath() always returns a list (a short sketch follows the syntax summary below).
Basic XPath syntax

  • Path queries
    – // : selects all descendant nodes
    – / : selects direct child nodes
  • Predicate queries
    – //div[@id]
    – //div[@id='maincontent']
  • Attribute queries
    – //@class
  • Fuzzy queries
    – //div[contains(@id,"ci")]
    – //div[starts-with(@id,"he")]
  • Content queries
    – //div/h1/text()
  • Logical operators
    – //div[@id="head" and @class="s_down"]
    – //title | //price (the union operator | is an alternative way to express "or")
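
A minimal sketch of the rules above (the HTML string here is made up for illustration). Note that xpath() returns a list even when only one node matches:

from lxml import etree

# etree.HTML() parses an HTML string into an element tree
root = etree.HTML('<div id="maincontent"><h1>hello</h1></div>')

# path query + predicate + content query combined
print(root.xpath('//div[@id="maincontent"]/h1/text()'))   # ['hello']
# fuzzy query: any div whose id contains "main"
print(root.xpath('//div[contains(@id,"main")]/h1/text()'))   # ['hello']

The listing below applies the same syntax to a local file: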
from lxml import etree
# etree.parse() parses a local file; etree.HTML() parses server response data

tree = etree.parse('F:/Temp/img/New_file.html')

# tree.xpath('XPath expression')
# find the li elements under ul
li_list = tree.xpath('//body/ul/li')

# select all li tags that have an id attribute
li_list = tree.xpath('//ul/li[@id]/text()')

# select the tag whose id is l1
li_list = tree.xpath('//ul/li[@id="l1"]/text()')

# get the class attribute value of the tag whose id is l1
li = tree.xpath('//ul/li[@id="l1"]/@class')

# select li tags whose id contains the letter l
li_list = tree.xpath('//ul/li[contains(@id,"l")]/text()')

# select li tags whose id starts with u
li_list = tree.xpath('//ul/li[starts-with(@id,"u")]/text()')

# select the tag whose id is l1 and whose class is c2
li_list = tree.xpath('//ul/li[@id="l1" and @class="c2"]/text()')

# select the tags whose id is l3 or l4
li_list = tree.xpath('//ul/li[@id="l3" or @id="l4"]/text()')
# equivalent union form:
# li_list = tree.xpath('//ul/li[@id="l3"]/text() | //ul/li[@id="l4"]/text()')


print(li_list)

New_file.html:

<html>
	<head>
		<meta charset="utf-8"/>
		<title></title>
	</head>
	<body>
		<ul>
			<li id="l1" class='c2'>北京</li>
			<li id="l2">上海</li>
			<li class='c1'>广州</li>
			<li class='c2'>深圳</li>
		</ul>
		
		<ul>
			<li id="l3">成都</li>
			<li id="u1">西安</li>
			<li id="l4">昆明</li>
			<li id="u2">杭州</li>
		</ul>
	</body>
</html>

Parsing a web page

Parse the page in memory and pull out the content at the desired location, so there is no need to save the page to a local file first.
Install the XPath Helper extension in your browser. Once installed, the shortcut Ctrl+Shift+X opens two black panels at the top of the page, one on the left and one on the right. Type an XPath expression into the left panel and the matching content appears on the right, which makes testing expressions while developing much easier.

import urllib.request
from lxml import etree

url = 'https://www.baidu.com'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36'
}

# fetch the page
request = urllib.request.Request(url=url,headers=headers)

res = urllib.request.urlopen(request)
# handler = urllib.request.HTTPHandler()
# opener = urllib.request.build_opener(handler)
# res = opener.open(request)

content = res.read().decode('utf-8')

# parse the page source and extract the data we need
tree = etree.HTML(content)
result = tree.xpath('//input[@id="su"]/@value')
print(result)
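
On Baidu's homepage the input with id su is the search button, so this typically prints something like ['百度一下'] (assuming the markup served to urllib matches what the browser sees).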

Downloading images

Task: download the food images from the first ten pages of 站长素材 (sc.chinaz.com). The script below fetches pages 1 through 5 as a demo; raise end_page for the full ten.

import urllib.request
from lxml import etree

''' First, look at the URL of each page:
https://sc.chinaz.com/tupian/meishitupian.html
https://sc.chinaz.com/tupian/meishitupian_3.html
https://sc.chinaz.com/tupian/meishitupian_5.html
https://sc.chinaz.com/tupian/meishitupian_10.html
Page 1 has no suffix; page n ends in _n.html.
'''

def createRequest(page):
    if page==1:
        url = 'https://sc.chinaz.com/tupian/meishitupian.html'
    else:
        url = 'https://sc.chinaz.com/tupian/meishitupian_'+str(page)+'.html'

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36'
    }
    request = urllib.request.Request(url=url,headers=headers)
    return request


def getContent(request):
    res = urllib.request.urlopen(request)
    content = res.read().decode('utf-8')
    return content


def Download(content):
    tree = etree.HTML(content)
    name_list = tree.xpath('//div[@id="container"]//a/img/@alt')
    # image-heavy sites usually lazy-load their pictures, so the real URL is not in src; on this site it is in src2
    src_list = tree.xpath('//div[@id="container"]//a/img/@src2')

    for i in range(len(name_list)):
        name = name_list[i]
        src = src_list[i]
        # src looks like '//scpic2.chinaz.net/Files/pic/pic9/202107/apic34371_s.jpg'; dropping the _s suffix yields the full-resolution image
        url = 'https:'+src[:-6]+'.jpg'
        # download the image; with filename=name+".jpg" it would be saved to the current working directory
        urllib.request.urlretrieve(url=url,filename='F:/Temp/Pachong/'+name+".jpg")


if __name__ == "__main__":
    start_page = 1
    end_page = 5   # set to 10 to cover all ten pages from the task

    for page in range(start_page,end_page+1):
        # 1. build the request object
        request = createRequest(page)
        # 2. fetch the page source
        content = getContent(request)
        # 3. parse the source and download the images
        Download(content)
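
One fragile spot in Download() above: it indexes two parallel lists and silently assumes they have the same length. A small hardening sketch of the loop, pairing the two lists explicitly:

    # drop-in replacement for the loop in Download()
    assert len(name_list) == len(src_list), 'alt/src2 lists differ in length'
    for name, src in zip(name_list, src_list):
        url = 'https:' + src[:-6] + '.jpg'   # strip the "_s" thumbnail suffix
        urllib.request.urlretrieve(url=url, filename='F:/Temp/Pachong/' + name + '.jpg')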

JsonPath parsing

Parsing a local file

JsonPath can only parse local files, so web resources have to be downloaded first (see the 淘票票 example below). The examples here use the jsonpath package, commonly installed with pip install jsonpath.
Reference: https://blog.csdn.net/fu_huo_1993/article/details/88350147

import json
import jsonpath

obj = json.load(open('store.json','r',encoding='utf-8'))

# authors of all books in the store
author_list = jsonpath.jsonpath(obj,'$.store.book[*].author')

# all authors anywhere under store
author_list = jsonpath.jsonpath(obj,'$.store..author')

# all prices under store
price_list = jsonpath.jsonpath(obj,'$.store..price')

# all elements under store
tag_list = jsonpath.jsonpath(obj,'$.store.*')

# the third book
book = jsonpath.jsonpath(obj,'$..book[2]')

# the last book
book = jsonpath.jsonpath(obj,'$..book[(@.length-1)]')

# the first two books
books = jsonpath.jsonpath(obj,'$..book[0,1]')
books = jsonpath.jsonpath(obj,'$..book[:2]')

# filter the books that have an isbn
book_list = jsonpath.jsonpath(obj,'$..book[?(@.isbn)]')

# filter the books priced over 10
book_list = jsonpath.jsonpath(obj,'$..book[?(@.price>10)]')
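
One thing worth knowing before wiring these calls into a pipeline: this jsonpath package returns False rather than an empty list when nothing matches, so guard the result before iterating. A minimal sketch:

# no book in store.json costs more than 100, so this returns False
result = jsonpath.jsonpath(obj,'$..book[?(@.price>100)]')
if result:
    for book in result:
        print(book['title'])
else:
    print('no match')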

store.json:

{ "store": {
    "book": [
      { "category": "修真",
        "author": "六道",
        "title": "坏蛋是怎样练成的",
        "price": 8.95
      },
      { "category": "修改",
        "author": "天蚕土豆",
        "title": "斗破苍穹",
        "price": 12.99
      },
      { "category": "修真",
        "author": "唐家三少",
        "title": "斗罗大陆",
        "isbn": "0-553-21311-3",
        "price": 8.99
      },
      { "category": "修真",
        "author": "南派三叔",
        "title": "星辰变",
        "isbn": "0-395-19395-8",
        "price": 22.99
      }
    ],
    "bicycle": {
      "color": "黑色",
      "price": 19.95
    }
  }
}

Parsing the cities covered by 淘票票 (Taopiaopiao)

import json
import jsonpath
import urllib.request

url = 'https://dianying.taobao.com/cityAction.json?activityId&_ksTS=1629789477003_137&jsoncallback=jsonp138&action=cityAction&n_s=new&event_submit_doGetAllRegion=true'

headers = {
    'accept': 'text/javascript, application/javascript, application/ecmascript, application/x-ecmascript, */*; q=0.01',
    # 'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'zh-CN,zh;q=0.9',
    'cookie': 'cna=UkO6F8VULRwCAXTqq7dbS5A8; miid=949542021157939863; sgcookie=E100F01JK9XMmyoZRigjfmZKExNdRHQqPf4v9NIWIC1nnpnxyNgROLshAf0gz7lGnkKvwCnu1umyfirMSAWtubqc4g%3D%3D; tracknick=action_li; _cc_=UIHiLt3xSw%3D%3D; enc=dA18hg7jG1xapfVGPHoQCAkPQ4as1%2FEUqsG4M6AcAjHFFUM54HWpBv4AAm0MbQgqO%2BiZ5qkUeLIxljrHkOW%2BtQ%3D%3D; hng=CN%7Czh-CN%7CCNY%7C156; thw=cn; _m_h5_tk=3ca69de1b9ad7dce614840fcd015dcdb_1629776735568; _m_h5_tk_enc=ab56df54999d1d2cac2f82753ae29f82; t=874e6ce33295bf6b95cfcfaff0af0db6; xlly_s=1; cookie2=13acd8f4dafac4f7bd2177d6710d60fe; v=0; _tb_token_=e65ebbe536158; tfstk=cGhRB7mNpnxkDmUx7YpDAMNM2gTGZbWLxUZN9U4ulewe025didli6j5AFPI8MEC..; l=eBrgmF1cOsMXqSxaBO5aFurza77tzIRb8sPzaNbMiInca6OdtFt_rNCK2Ns9SdtjgtfFBetPVKlOcRCEF3apbgiMW_N-1NKDSxJ6-; isg=BBoas2yXLzHdGp3pCh7XVmpja8A8S54lyLj1RySTHq14l7vRDNufNAjpZ2MLRxa9',
    'referer': 'https://dianying.taobao.com/',
    'sec-ch-ua': '"Chromium";v="92", " Not A;Brand";v="99", "Google Chrome";v="92"',
    'sec-ch-ua-mobile': '?0',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-origin',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36',
    'x-requested-with': 'XMLHttpRequest',
}

request = urllib.request.Request(url = url, headers = headers)

response = urllib.request.urlopen(request)

content = response.read().decode('utf-8')

# the response is JSONP, e.g. jsonp138({...}); slice out the JSON between the outer parentheses
content = content.split('(')[1].split(')')[0]


with open('jsonpath解析淘票票.json','w',encoding='utf-8')as fp:
    fp.write(content)

obj = json.load(open('jsonpath解析淘票票.json','r',encoding='utf-8'))

city_list = jsonpath.jsonpath(obj,'$..regionName')

print(city_list)
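
The split-based unwrapping above breaks if the JSON payload itself ever contains a '(' or ')'. A slightly more robust sketch of that step, applied to the raw response: a greedy regex that grabs everything between the first '(' and the last ')':

import re

# greedy .* spans to the last closing parenthesis; re.S lets it cross newlines
m = re.search(r'\((.*)\)', content, re.S)
if m:
    content = m.group(1)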

BeautifulSoup parsing

Like XPath, BeautifulSoup can parse both local files and web pages.
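
A minimal sketch of the two entry points (the file path and URL are just placeholders):

from bs4 import BeautifulSoup
import urllib.request

# 1. parse a local file
soup = BeautifulSoup(open('F:/Temp/img/New_file.html','r',encoding='utf-8'),'lxml')

# 2. parse a live page: fetch the source first, then hand it to BeautifulSoup
html = urllib.request.urlopen('https://www.baidu.com').read().decode('utf-8')
soup = BeautifulSoup(html,'lxml')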

Basic syntax

from bs4 import BeautifulSoup

# a tour of the bs4 basics

soup = BeautifulSoup(open('F:/Temp/img/New_file.html','r',encoding='utf-8'),'lxml')

# returns the first matching element
print(soup.a)

# get the tag's attributes and their values
print(soup.a.attrs)

# ------------------- some bs4 functions
# find() returns a single object
# returns the first matching element
print(soup.find('a'))

# find the tag whose title attribute has the given value
print(soup.find('a',title='a2'))

# find the tag by its class value (class_ has a trailing underscore because class is a Python keyword)
print(soup.find('a',class_='a1'))

print('-'*40)

# find_all() returns a list
# return all a tags
print(soup.find_all('a'))

# get several tag types at once
print(soup.find_all(['a','span']))

# limit the results to the first few matches
print(soup.find_all('li',limit=2))

print('--------------------------------')

# select() takes a CSS selector and returns the matching node objects
# return all a tags
print(soup.select('a'))

# return the elements whose class is a1
print(soup.select('.a1'))
# return the element whose id is l1
print(soup.select('#l1'))

# attribute selectors: find tags by their attributes
# return the li tags that have an id attribute
print(soup.select('li[id]'))

# return the li tag whose id is l2
print(soup.select('li[id="l2"]'))

print('-----------------------------')

# hierarchy selectors
# print the li elements anywhere under div (descendant selector)
print(soup.select('div li'))

# print only the direct li children of ul (child selector >)
print(soup.select('div > ul > li'))

# print all a tags and li tags
print(soup.select('a,li'))

# print a node's text content (select() returns a list, hence the [0])
# if the tag contains only text, both string and get_text() work; if it also contains nested tags, only get_text() does
print(soup.select('#app')[0].string)
print(soup.select('#app')[0].get_text())

# print the node's name and attributes (attrs returns a dict)
print(soup.select("#p1")[0].name)
print(soup.select("#p1")[0].attrs)

# get the value of the class attribute
print(soup.select('#p1')[0].attrs.get('class')) # look it up via the dict key; this is the most robust
print(soup.select('#p1')[0].get('class'))
print(soup.select('#p1')[0]['class'])

New_file.html:

<html>
	<head>
		<meta charset="utf-8"/>
		<title></title>
	</head>
	<body>
		<div>
			<ul>
				<li id="l1">白起</li>
				<li id="l2">廉颇</li>
				<li class='c1'>李牧</li>
				<li class='c2'>王翦</li>
				<a href="www.csdn.com" id="" class="a1">CSDN</a>
				<span>你大爷</span>
			</ul>
		</div>
		
		<a href="www.baidu.com" title="a2">百度</a>
		
		
		<div id='app'>
			<span>老大爷</span>
		</div>
		
		<p id="p1" class="p1">爬虫</p>
		
	</body>
</html>

Crawling the Starbucks menu (images and names)

import urllib.request
from bs4 import BeautifulSoup

url = 'https://www.starbucks.com.cn/menu/'
res = urllib.request.urlopen(url)
content = res.read().decode('utf-8')


soup = BeautifulSoup(content,'lxml')
name_list = soup.select('ul[class="grid padded-3 product"] strong')
src_list = soup.select('ul[class="grid padded-3 product"] div')



for i in range(len(name_list)):
    name = name_list[i].get_text()
    # '/' cannot appear in a file name, so strip it before using name as a filename
    if name.find('/') > 0:
        name = name.replace('/', '')

    # print(src_list[i].attrs) shows a dict like {'class': ['preview', 'circle'], 'style': 'background-image: url("/images/products/affogato.jpg")'}
    # slice the image path out of the style value
    src = str(src_list[i].attrs.get('style')).split("\"/")[1].split("\")")[0]
    url = 'https://www.starbucks.com.cn/'+src

    # download the image; with filename=name+".jpg" it would be saved to the current working directory
    urllib.request.urlretrieve(url=url, filename='F:/Temp/Pachong/' + name + ".jpg")
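
The string slicing above is brittle, since it depends on the exact quoting inside the style attribute. A hedged alternative for that step inside the loop, using a regex:

import re

# pull the path out of background-image: url("...") regardless of quoting
style = str(src_list[i].attrs.get('style'))
m = re.search(r'url\("?([^")]+)"?\)', style)
if m:
    url = 'https://www.starbucks.com.cn' + m.group(1)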