"""
* project_name stu
* package
* file_name pt_craw_zh
* <p>
* description
* </p>
* @author <a href="mailto:joshualwork@163.com">joshua_liu</a>
* @date 2022/1/4 15:49
"""
import json
import os

import pymongo
import requests
from jsonpath_rw import parse
# Job-search endpoint template. The ``{}`` placeholder after ``pn=`` is filled
# with the page number via ``str.format`` in ``query_json``. The ``city`` value
# is percent-encoded UTF-8 and ``kd`` is the search keyword.
url = (
    'https://www.lagou.com/jobs/v2/positionAjax.json?first=true&needAddtionalResult=false&city=%E9%83%91%E5%B7%9E'
    '&px=new&pn={}&kd=Java'
)

# Used as the *exclusive* stop value of ``range(1, max_pg)`` in ``query_json``,
# so pages 1 through ``max_pg - 1`` are requested.
max_pg = 13
# MongoDB connection used to persist the crawled positions.
#
# The credentials can be supplied through the MONGO_USERNAME / MONGO_PASSWORD
# environment variables so that they do not have to live in source control.
# The literals below are the previously hard-coded values and are kept only as
# backward-compatible fallbacks.
# NOTE(review): rotate this password and remove the fallback literal once the
# environment variables are provisioned.
mongo_client = pymongo.MongoClient(
    'localhost',
    27017,
    username=os.environ.get('MONGO_USERNAME', 'db_python'),
    password=os.environ.get('MONGO_PASSWORD', '**..!123'),
)

# NOTE(review): db_list is not referenced anywhere in the visible code; it is
# kept so the module-level name (and the server call it performs at import
# time) stays available - confirm whether it is still needed.
db_list = mongo_client.list_database_names()

# Database and collection that ``query_json`` writes the positions into.
db = mongo_client.db_python
tb_lg_position = db.tb_lg_position
# HTTP headers sent with every request issued by ``query_json``.
# NOTE(review): "Accept-Encoding" advertises "br"; presumably a Brotli decoder
# must be installed for the response body to be decoded transparently -
# confirm in the deployment environment.
headers = {
    "Cache-Control": "no-cache",
    "Host": "www.lagou.com",
    "Accept": "application/json, text/plain, */*",
    "Accept-Encoding": "gzip, deflate, br",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "x-l-req-header": "{deviceType:1}",
    "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/96.0.4664.110 Safari/537.36"
    ),
}
# Author: joshua_liu <joshualwork@163.com>
# Date: 2022/1/4 21:35
def query_json(timeout=10):
    """Fetch the job listings page by page and store them in MongoDB.

    For every page number in ``range(1, max_pg)`` the function POSTs to
    ``url`` (with the page number substituted in), parses the JSON body,
    extracts ``content.positionResult.result`` with a JSONPath expression and
    inserts each non-empty result list into ``tb_lg_position``.

    :param timeout: seconds to wait for each HTTP request; passed straight to
        ``requests.post`` so a stalled connection cannot block the crawl
        forever.
    :return: None
    :raises requests.exceptions.HTTPError: if a page answers with a 4xx/5xx
        status code.
    :raises requests.exceptions.Timeout: if a page does not answer within
        ``timeout`` seconds.
    """
    # The JSONPath expression does not depend on the page, so compile it once
    # instead of on every iteration.
    positions_expr = parse("content.positionResult.result")

    for page in range(1, max_pg):
        page_url = url.format(page)
        print(page_url)

        response = requests.post(page_url, headers=headers, timeout=timeout)
        # Fail loudly on an HTTP error instead of trying to parse an error page.
        response.raise_for_status()
        payload = json.loads(response.content.decode())

        for match in positions_expr.find(payload):
            positions = match.value
            print(positions)
            # insert_many() rejects an empty document list, so a page without
            # results is skipped rather than aborting the whole crawl.
            if not positions:
                continue
            tb_lg_position.insert_many(positions)
def main() -> None:
    """Script entry point: run the paged crawl by calling ``query_json``."""
    query_json()
# Start the crawl only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
"""
* project_name stu
* package
* file_name pt_craw_zh
* <p>
* description
* </p>
* @author <a href="mailto:joshualwork@163.com">joshua_liu</a>
* @date 2022/1/4 15:49
"""
import json
import pymongo
import requests
from jsonpath_rw import parse
# Job-search endpoint template. The ``{}`` placeholder after ``pn=`` is filled
# with the page number via ``str.format`` in ``query_json``. The ``city`` value
# is percent-encoded UTF-8 and ``kd`` is the search keyword.
url = (
    'https://www.lagou.com/jobs/v2/positionAjax.json?first=true&needAddtionalResult=false&city=%E9%83%91%E5%B7%9E'
    '&px=new&pn={}&kd=Java'
)

# Used as the *exclusive* stop value of ``range(1, max_pg)`` in ``query_json``,
# so pages 1 through ``max_pg - 1`` are requested.
max_pg = 13
# MongoDB connection used to persist the crawled positions.
#
# The credentials can be supplied through the MONGO_USERNAME / MONGO_PASSWORD
# environment variables so that they do not have to live in source control.
# The literals below are the previously hard-coded values and are kept only as
# backward-compatible fallbacks.
# NOTE(review): rotate this password and remove the fallback literal once the
# environment variables are provisioned.
mongo_client = pymongo.MongoClient(
    'localhost',
    27017,
    username=os.environ.get('MONGO_USERNAME', 'db_python'),
    password=os.environ.get('MONGO_PASSWORD', '**..!g1i2t3L'),
)

# NOTE(review): db_list is not referenced anywhere in the visible code; it is
# kept so the module-level name (and the server call it performs at import
# time) stays available - confirm whether it is still needed.
db_list = mongo_client.list_database_names()

# Database and collection that ``query_json`` writes the positions into.
db = mongo_client.db_python
tb_lg_position = db.tb_lg_position
# HTTP headers sent with every request issued by ``query_json``.
# NOTE(review): "Accept-Encoding" advertises "br"; presumably a Brotli decoder
# must be installed for the response body to be decoded transparently -
# confirm in the deployment environment.
headers = {
    "Cache-Control": "no-cache",
    "Host": "www.lagou.com",
    "Accept": "application/json, text/plain, */*",
    "Accept-Encoding": "gzip, deflate, br",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "x-l-req-header": "{deviceType:1}",
    "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/96.0.4664.110 Safari/537.36"
    ),
}
# Author: joshua_liu <joshualwork@163.com>
# Date: 2022/1/4 21:35
def query_json(timeout=10):
    """Fetch the job listings page by page and store them in MongoDB.

    For every page number in ``range(1, max_pg)`` the function POSTs to
    ``url`` (with the page number substituted in), parses the JSON body,
    extracts ``content.positionResult.result`` with a JSONPath expression and
    inserts each non-empty result list into ``tb_lg_position``.

    :param timeout: seconds to wait for each HTTP request; passed straight to
        ``requests.post`` so a stalled connection cannot block the crawl
        forever.
    :return: None
    :raises requests.exceptions.HTTPError: if a page answers with a 4xx/5xx
        status code.
    :raises requests.exceptions.Timeout: if a page does not answer within
        ``timeout`` seconds.
    """
    # The JSONPath expression does not depend on the page, so compile it once
    # instead of on every iteration.
    positions_expr = parse("content.positionResult.result")

    for page in range(1, max_pg):
        page_url = url.format(page)
        print(page_url)

        response = requests.post(page_url, headers=headers, timeout=timeout)
        # Fail loudly on an HTTP error instead of trying to parse an error page.
        response.raise_for_status()
        payload = json.loads(response.content.decode())

        for match in positions_expr.find(payload):
            positions = match.value
            print(positions)
            # insert_many() rejects an empty document list, so a page without
            # results is skipped rather than aborting the whole crawl.
            if not positions:
                continue
            tb_lg_position.insert_many(positions)
def main() -> None:
    """Script entry point: run the paged crawl by calling ``query_json``."""
    query_json()
# Start the crawl only when this file is executed as a script, not on import.
# NOTE(review): an identical entry-point guard appears earlier in this file,
# so running the script as-is would invoke main() twice - confirm whether the
# duplicated content is intentional.
if __name__ == "__main__":
    main()