# 爬取招聘网站符合关键字的网址

# 爬取华为招聘网站符合关键字的网址

# -*- coding:utf-8 -*-

import requests
import re
from http.cookiejar import CookieJar
import json
from bs4 import BeautifulSoup
from urllib import request,parse
from http import cookiejar
# Shared requests session; login() issues every HTTP call through it.
s = requests.session()
# Result accumulator: login() appends the job-page URL of each matching posting.
urlall = []
def login():
    """Crawl Huawei's job search for "linux" postings and collect matching URLs.

    For result pages 1 through 12 of the job-list service, every ``jobId``
    found in the raw response text is looked up through the job-detail
    service.  A posting is kept when its detail text (case-insensitively)

    * mentions "shell" or "python", and
    * mentions "linux", and
    * does not mention "c++".

    For each kept posting the public job-page URL is appended to the
    module-level ``urlall`` list.  All requests go through the module-level
    session ``s``.

    The raw job-id matches, each job id, each detail response and (after every
    page) the accumulated ``urlall`` list are printed to stdout.

    Returns:
        None. Results are delivered through ``urlall``.
    """
    # Query parameters are passed only through ``params`` so that each one is
    # sent exactly once (the URLs themselves carry no query string).
    # NOTE(review): the "15" path segment is presumably the page size - confirm.
    list_url = "http://career.huawei.com/socRecruitment/services/portal3/portalnew/getJobList/page/15/{0}"
    detail_url = "http://career.huawei.com/socRecruitment/services/portal/portalpub/getJobDetail"
    # The repeated ``keywords`` parameter is kept exactly as in the original link.
    job_page_url = "http://career.huawei.com/socRecruitment/soc_index.html#soc/pages/job/jobterminal.html?jobId={0}&language=cn&keywords=linux&keywords=linux"

    list_params = {
        "jobType": 1,
        "keywords": "linux",
        "orderBy": "P_COUNT_DESC",
    }

    # Compiled once, outside the loops. ``search`` is used below instead of
    # ``match('.*...')`` because ``.`` does not cross newlines, which would
    # silently miss keywords in multi-line responses.
    job_id_re = re.compile(r'"jobId":([0-9]+)')
    script_lang_re = re.compile(r'shell|python', re.I)
    linux_re = re.compile(r'linux', re.I)
    cpp_re = re.compile(r'c\+\+', re.I)

    for page in range(1, 13):
        response = s.get(list_url.format(page), params=list_params, verify=False).text
        id_matches = list(job_id_re.finditer(response))
        # Print the full '"jobId":<n>' matches, as the original did.
        print([m.group(0) for m in id_matches])
        for id_match in id_matches:
            jobid = id_match.group(1)
            print(jobid)
            response2 = s.get(detail_url, params={"jobId": jobid}, verify=False).text
            print(response2)
            if (
                script_lang_re.search(response2)
                and linux_re.search(response2)
                and not cpp_re.search(response2)
            ):
                urlall.append(job_page_url.format(jobid))
        print(urlall)

# Start the crawl as soon as this module is executed (there is no __main__ guard,
# so importing the module runs it as well).
login()

 

# 上一篇:QFileInfoList


# 下一篇:Spark-SubmitTask