Spider--补充--jsonpath的使用

2022-09-29 14:12:25
# 知识点参见：https://blog.csdn.net/muzico425/article/details/102763176

# 示例：爬取示例网站的首页的评论：

# 请求返回的 r.text 是 JSONP 字符串（即 JSON 外面包了一层回调函数调用），去掉外层包装后可以使用 json 库来完成解析：
import json
import requests
url = """https://api-zero.livere.com/v1/comments/list?callback=jQuery112406954584941688864_1592120544800&limit=10&repSeq=4547710&requestPath=%2Fv1%2Fcomments%2Flist&consumerSeq=1020&livereSeq=28583&smartloginSeq=5154&code=1afecb1fc5912d454d80ffc6&_=1592120544802"""

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18362'}
# A timeout keeps the script from blocking forever if the server never answers.
r = requests.get(url, headers=headers, timeout=10)
# Fail with a clear HTTP error instead of a confusing JSON decode error.
r.raise_for_status()

# Because the URL carries a `callback=` parameter, the body is expected to be
# JSONP: the JSON object wrapped in a function call, e.g. `callback({...});`.
# Keep only the text from the first '{' through the last '}' so the wrapper
# (and any trailing whitespace) is dropped before deserialising.
jsonp_text = r.text
json_data_dict = json.loads(jsonp_text[jsonp_text.find('{'):jsonp_text.rfind('}') + 1])
# json_data_dict is a dict of dicts: the outer 'results' key maps to a dict
# whose 'parents' key maps to a list, and each element of that list is again
# a dict describing one comment.

comments_list = json_data_dict['results']['parents']
for comment_dict in comments_list:
    print(comment_dict['content'])

# 或者，使用 jsonpath 库来提取数据：
import json
import requests
import jsonpath
url = """https://api-zero.livere.com/v1/comments/list?callback=jQuery112406954584941688864_1592120544800&limit=10&repSeq=4547710&requestPath=%2Fv1%2Fcomments%2Flist&consumerSeq=1020&livereSeq=28583&smartloginSeq=5154&code=1afecb1fc5912d454d80ffc6&_=1592120544802"""

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18362'}
# A timeout keeps the script from blocking forever if the server never answers.
r = requests.get(url, headers=headers, timeout=10)
# Fail with a clear HTTP error instead of a confusing JSON decode error.
r.raise_for_status()

# Because the URL carries a `callback=` parameter, the body is expected to be
# JSONP: the JSON object wrapped in a function call, e.g. `callback({...});`.
# Keep only the text from the first '{' through the last '}' so the wrapper
# (and any trailing whitespace) is dropped before deserialising.
jsonp_text = r.text
json_data_dict = json.loads(jsonp_text[jsonp_text.find('{'):jsonp_text.rfind('}') + 1])

# A single JSONPath expression replaces the chained dict/list indexing:
# it collects the 'content' field of every element under results.parents.
# jsonpath.jsonpath() returns False (not an empty list) when nothing matches,
# so fall back to [] to keep the loop below safe.
comments_list = jsonpath.jsonpath(json_data_dict, '$.results.parents[*].content') or []
for comment in comments_list:
    print(comment)
码农公寓

相关文章