Python获取拉勾岗位信息

#!/usr/bin/env python
"""
 * project_name stu
 * package 
 * file_name  pt_craw_zh
 * <p>
 * description 
 * </p>
 * @author <a href="mailto:joshualwork@163.com">joshua_liu</a> 
 * @date 2022/1/4 15:49
"""
import json
import pymongo
import requests
from jsonpath_rw import parse

# Lagou position-search endpoint. The "{}" after "pn=" is a str.format()
# placeholder; the city parameter is the URL-encoded form of "Zhengzhou"
# written in Chinese characters.
url = (
    'https://www.lagou.com/jobs/v2/positionAjax.json?first=true&needAddtionalResult=false&city=%E9%83%91%E5%B7%9E'
    '&px=new&pn={}&kd=Java'
)
# Page-number bound used by the crawl loop.
max_pg = 13

# NOTE(review): the credentials are hard-coded in source; consider loading
# them from the environment or an untracked config file.
mongo_client = pymongo.MongoClient(
    host='localhost',
    port=27017,
    username='db_python',
    password='**..!123',
)
# Runs at import time; the result is not used elsewhere in the visible code.
db_list = mongo_client.list_database_names()
db = mongo_client['db_python']
tb_lg_position = db['tb_lg_position']

# HTTP headers sent with every search request.
headers = {
    "Cache-Control": "no-cache",
    "Host": "www.lagou.com",
    "Accept": "application/json, text/plain, */*",
    "Accept-Encoding": "gzip, deflate, br",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "x-l-req-header": "{deviceType:1}",
    "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/96.0.4664.110 Safari/537.36"
    ),
}

'''
 ' @param none
 ' @return None
 ' @description Fetch the data page by page
 ' @author <a href="mailto:joshualwork@163.com">joshua_liu</a>
 ' @date 2022/1/4 21:35
'''


def query_json():
    """Fetch job postings page by page and store them in MongoDB.

    For every page number in ``range(1, max_pg)`` the module-level ``url``
    template is filled in and requested with ``headers``. Each value found
    at ``content.positionResult.result`` in the JSON reply is printed and,
    when non-empty, inserted into ``tb_lg_position``.

    Returns:
        None.

    Raises:
        requests.RequestException: If a request fails, times out, or the
            server answers with an HTTP error status.
        json.JSONDecodeError: If a reply body is not valid JSON.
    """
    # The JSONPath expression never changes, so compile it once up front
    # instead of on every page.
    positions_expr = parse("content.positionResult.result")

    # NOTE(review): range() excludes its stop value, so page number max_pg
    # itself is never requested -- confirm this is intended.
    for page in range(1, max_pg):
        page_url = url.format(page)
        print(page_url)
        # A timeout keeps a stalled connection from hanging the crawl forever.
        response = requests.post(page_url, headers=headers, timeout=30)
        # Fail loudly on HTTP errors instead of parsing an error page.
        response.raise_for_status()
        payload = json.loads(response.content.decode())
        for match in positions_expr.find(payload):
            positions = match.value
            print(positions)
            # insert_many() rejects an empty document list, so skip pages
            # that carry no postings.
            if not positions:
                continue
            tb_lg_position.insert_many(positions)


def main():
    """Script entry point: run ``query_json()``."""
    query_json()


# Only start the crawl when executed as a script, not when imported.
if __name__ == "__main__":
    main()
#!/usr/bin/env python
"""
 * project_name stu
 * package 
 * file_name  pt_craw_zh
 * <p>
 * description 
 * </p>
 * @author <a href="mailto:joshualwork@163.com">joshua_liu</a> 
 * @date 2022/1/4 15:49
"""
import json
import pymongo
import requests
from jsonpath_rw import parse

# Lagou position-search endpoint. The "{}" after "pn=" is a str.format()
# placeholder; the city parameter is the URL-encoded form of "Zhengzhou"
# written in Chinese characters.
url = (
    'https://www.lagou.com/jobs/v2/positionAjax.json?first=true&needAddtionalResult=false&city=%E9%83%91%E5%B7%9E'
    '&px=new&pn={}&kd=Java'
)
# Page-number bound used by the crawl loop.
max_pg = 13

# NOTE(review): the credentials are hard-coded in source; consider loading
# them from the environment or an untracked config file.
mongo_client = pymongo.MongoClient(
    host='localhost',
    port=27017,
    username='db_python',
    password='**..!g1i2t3L',
)
# Runs at import time; the result is not used elsewhere in the visible code.
db_list = mongo_client.list_database_names()
db = mongo_client['db_python']
tb_lg_position = db['tb_lg_position']

# HTTP headers sent with every search request.
headers = {
    "Cache-Control": "no-cache",
    "Host": "www.lagou.com",
    "Accept": "application/json, text/plain, */*",
    "Accept-Encoding": "gzip, deflate, br",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "x-l-req-header": "{deviceType:1}",
    "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/96.0.4664.110 Safari/537.36"
    ),
}

'''
 ' @param none
 ' @return None
 ' @description Fetch the data page by page
 ' @author <a href="mailto:joshualwork@163.com">joshua_liu</a>
 ' @date 2022/1/4 21:35
'''


def query_json():
    """Fetch job postings page by page and store them in MongoDB.

    For every page number in ``range(1, max_pg)`` the module-level ``url``
    template is filled in and requested with ``headers``. Each value found
    at ``content.positionResult.result`` in the JSON reply is printed and,
    when non-empty, inserted into ``tb_lg_position``.

    Returns:
        None.

    Raises:
        requests.RequestException: If a request fails, times out, or the
            server answers with an HTTP error status.
        json.JSONDecodeError: If a reply body is not valid JSON.
    """
    # The JSONPath expression never changes, so compile it once up front
    # instead of on every page.
    positions_expr = parse("content.positionResult.result")

    # NOTE(review): range() excludes its stop value, so page number max_pg
    # itself is never requested -- confirm this is intended.
    for page in range(1, max_pg):
        page_url = url.format(page)
        print(page_url)
        # A timeout keeps a stalled connection from hanging the crawl forever.
        response = requests.post(page_url, headers=headers, timeout=30)
        # Fail loudly on HTTP errors instead of parsing an error page.
        response.raise_for_status()
        payload = json.loads(response.content.decode())
        for match in positions_expr.find(payload):
            positions = match.value
            print(positions)
            # insert_many() rejects an empty document list, so skip pages
            # that carry no postings.
            if not positions:
                continue
            tb_lg_position.insert_many(positions)


def main():
    """Script entry point: run ``query_json()``."""
    query_json()


# Only start the crawl when executed as a script, not when imported.
if __name__ == "__main__":
    main()
上一篇:构建java后端包的镜像


下一篇:通过注解、切面、反射实现返回信息脱敏(二)