爬虫（猫眼电影+校花网+github+今日头条+拉勾）

2022-10-28 18:54:09

Requests+正则表达式爬取猫眼TOP100榜电影信息

MARK：写入文件时以 utf-8 编码打开文件，并在 json.dumps 中设置 ensure_ascii=False，以解决中文乱码；开启进程池可并行抓取各页，大幅缩短爬取时间。

import requests

from requests.exceptions import RequestException

import re

import json

from multiprocessing import Pool

def get_one_page(url, timeout=10):
    """Fetch a page and return its body text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response text when the server answers with HTTP 200,
        otherwise ``None`` (non-200 status or any requests-level error).
    """
    try:
        # Without a timeout a stalled connection would block a pool worker
        # indefinitely.
        response = requests.get(url, timeout=timeout)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None

def parse_one_page(html):
    """Extract the movie entries from one board page.

    Args:
        html: Page markup as returned by ``get_one_page``; may be ``None``
            or empty when the download failed.

    Yields:
        One dict per ``<dd>`` entry with the keys 排行, 图片, 电影, 演员,
        上映信息 and 评分.
    """
    if not html:
        # A failed download yields nothing instead of raising TypeError.
        return
    # Each lazy group is anchored by the closing tag that follows it; without
    # that anchor a lazy ``(.*?)`` always matches the empty string.
    pattern = re.compile(
        r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
        r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
        r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>',
        re.S,
    )
    for rank, image, title, star, release, integer, fraction in pattern.findall(html):
        yield {
            '排行': rank,
            '图片': image,
            '电影': title,
            # Drops the first three characters (presumably the "主演：" label).
            '演员': star.strip()[3:],
            # Drops the first five characters (presumably "上映时间：").
            '上映信息': release.strip()[5:],
            '评分': integer + fraction,
        }

def write_to_file(content):
    """Append ``content`` to result.txt as a single JSON line.

    The file is opened as UTF-8 and ``ensure_ascii=False`` keeps non-ASCII
    characters as-is rather than escaping them.
    """
    line = json.dumps(content, ensure_ascii=False) + '\n'
    with open('result.txt', 'a', encoding='utf-8') as out:
        out.write(line)

def main(offset):
    """Scrape one board page and persist every movie found on it.

    Args:
        offset: Value appended to the board URL's ``offset`` query parameter.
    """
    base = 'http://maoyan.com/board/4?offset='
    page = get_one_page(base + str(offset))
    for movie in parse_one_page(page):
        print(movie)
        write_to_file(movie)

if __name__ == '__main__':
    # Offsets 0, 10, ..., 90; a sequential run would simply call main() on
    # each offset in turn.
    offsets = [i * 10 for i in range(10)]
    # Process pool: pages are fetched in parallel.  The context manager
    # releases the worker processes once map() has returned.
    with Pool() as pool:
        pool.map(main, offsets)

Requests+正则表达式爬取校花网视频

import requests

import re

import os

def get_page(url, timeout=10):
    """Download ``url`` and return its decoded text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response text, or ``None`` when the request fails or the server
        answers with an error status (a message is printed in that case).
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        # Decode with the charset detected from the body.
        response.encoding = response.apparent_encoding
        return response.text
    except requests.RequestException:
        # Only request failures are handled here; programming errors are no
        # longer swallowed by a bare ``except``.
        print("爬取失败")
        return None

def get_url(html):
    """Yield the detail-page links found in a list page.

    Args:
        html: Markup of a list page; ``None`` or empty when the download
            failed.

    Yields:
        Absolute URLs; links that do not start with ``http`` are prefixed
        with the site root.
    """
    if not html:
        # get_page() returns None on failure; treat that as "no links".
        return
    pattern = re.compile('class="items".*?href="(.*?)"', re.S)
    for url in pattern.findall(html):
        if not url.startswith('http'):
            url = 'http://www.xiaohuar.com' + url
        yield url

def get_detail_url(detail_content):
    """Yield the ``.mp4`` sources referenced by a detail page.

    Args:
        detail_content: Markup of a detail page; ``None`` or empty when the
            download failed.

    Yields:
        Every ``src`` value following ``id="media"`` that ends with ``.mp4``.
    """
    if not detail_content:
        # get_page() returns None on failure; treat that as "no videos".
        return
    pattern = re.compile('id="media".*?src="(.*?)"', re.S)
    for url in pattern.findall(detail_content):
        # An empty match never ends with '.mp4', so one check is enough.
        if url.endswith('.mp4'):
            yield url

def download(url, root="D://movie2//"):
    """Download ``url`` into ``root`` unless the file is already there.

    Args:
        url: Address of the file; its last path segment becomes the file name.
        root: Target directory, created when missing.

    Prints a status message for success, an existing file, or failure.
    """
    path = os.path.join(root, url.split('/')[-1])
    try:
        os.makedirs(root, exist_ok=True)
        if os.path.exists(path):
            print("文件已存在")
            return
        # Stream the body so a large video is never held in memory at once.
        response = requests.get(url, stream=True, timeout=30)
        # Do not save an HTTP error page under a video file name.
        response.raise_for_status()
        with open(path, 'wb') as f:
            # iter_content() defaults to 1-byte chunks; ask for larger ones.
            for chunk in response.iter_content(chunk_size=64 * 1024):
                f.write(chunk)
        print("文件保存成功")
    except (requests.RequestException, OSError):
        print("下载失败")

def main(page_num):
    """Download every video linked from one list page.

    Args:
        page_num: Number substituted into the list-page URL.
    """
    list_url = 'http://www.xiaohuar.com/list-3-{0}.html'.format(page_num)
    listing = get_page(list_url)
    for page_url in get_url(listing):
        detail_html = get_page(page_url)
        for video_url in get_detail_url(detail_html):
            download(video_url)

if __name__ == '__main__':
    # Process list pages 0 through 29 one after another.
    for page_index in range(30):
        main(page_index)

Requests+PyQuery模拟登陆github

import requests

from pyquery import PyQuery

import os

LOGIN_URL = 'https://github.com/login'
SESSION_URL = 'https://github.com/session'

# Credentials are read from the environment so that no real account secret is
# stored in the source file.
GITHUB_LOGIN = os.environ['GITHUB_LOGIN']
GITHUB_PASSWORD = os.environ['GITHUB_PASSWORD']

# One session for both requests so cookies set by the login page are sent
# back with the form submission.
session = requests.session()

# Fetch the login page and read the hidden authenticity_token form field.
response = session.get(LOGIN_URL)
text = PyQuery(response.text)
authenticity_token = text(
    '#login > form > div:nth-child(1) > input[type="hidden"]:nth-child(2)'
).attr('value')
if authenticity_token is None:
    # The selector found nothing; posting without a token cannot succeed.
    raise RuntimeError('authenticity_token not found on ' + LOGIN_URL)

data = {
    'commit': 'Sign in',
    'utf8': '✓',
    'authenticity_token': authenticity_token,
    'login': GITHUB_LOGIN,
    'password': GITHUB_PASSWORD,
}

response = session.post(SESSION_URL, data=data)
print(response.status_code)  # the original listing notes 200 here

分析Ajax请求并抓取今日头条街拍美图

配置文件config.py

# Host handed to pymongo.MongoClient by the main crawler file.
MONGO_URL = 'localhost'

# Name of the database the crawler selects on the client.
MONGO_DB = 'toutiao'

# Name of the collection the crawler inserts results into.
MONGO_TABLE = 'toutiao'

# First and last group index (inclusive); the crawler multiplies each index
# by 20 to obtain the request offsets.
GROUP_START = 1

GROUP_END = 20

# Search keyword sent with the index request.
KEYWORD = '街拍'

主爬虫文件

import json

import os

from urllib.parse import urlencode

import pymongo

import requests

from bs4 import BeautifulSoup

from requests.exceptions import ConnectionError

import re

from multiprocessing import Pool

from hashlib import md5

from json.decoder import JSONDecodeError

from config import *

# connect=False is passed because the crawl runs in multiple processes
# (per the original author's note).
client = pymongo.MongoClient(MONGO_URL, connect=False)

# Handle to the database named by MONGO_DB.
db = client[MONGO_DB]

def get_page_index(offset, keyword, timeout=10):
    """Request one page of the search index.

    Args:
        offset: Value of the ``offset`` query parameter.
        keyword: Search keyword.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response text on HTTP 200, otherwise ``None``.
    """
    query = {
        'autoload': 'true',
        'count': 20,
        'cur_tab': 3,
        'format': 'json',
        'keyword': keyword,
        'offset': offset,
    }
    # urlencode turns the dict into the URL's query string.
    url = 'http://www.toutiao.com/search_content/' + '?' + urlencode(query)
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Covers timeouts as well as connection errors.
        print('Error occurred')
        return None
    if response.status_code == 200:
        return response.text
    return None

def download_image(url, timeout=10):
    """Download one image and hand its bytes to ``save_image``.

    Args:
        url: Address of the image.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        Always ``None``; failures are skipped silently.
    """
    print('Downloading', url)
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Also covers timeouts and malformed or missing URLs.
        return None
    if response.status_code == 200:
        save_image(response.content)
    return None

def save_image(content):
    """Write image bytes to the current working directory.

    The file name is the MD5 hex digest of the bytes plus ``.jpg``, so
    identical content maps to the same file and is written only once.

    Args:
        content: Raw image bytes.
    """
    digest = md5(content).hexdigest()
    file_path = '{0}/{1}.{2}'.format(os.getcwd(), digest, 'jpg')
    print(file_path)
    if os.path.exists(file_path):
        return
    with open(file_path, 'wb') as image_file:
        image_file.write(content)

def parse_page_index(text):
    """Yield the article URLs listed in an index response.

    Args:
        text: JSON text returned by ``get_page_index``; ``None`` or empty
            when the request failed.

    Yields:
        The ``article_url`` of every entry under ``data`` that has one.
    """
    if not text:
        # json.loads(None) raises TypeError, which the handler below does
        # not cover.
        return
    try:
        data = json.loads(text)  # JSON string -> Python object
    except JSONDecodeError:
        return
    if not isinstance(data, dict):
        return
    # ``data`` may be missing or null; both mean "no entries".
    for item in data.get('data') or []:
        article_url = item.get('article_url')
        # Entries without a URL would make the detail request fail.
        if article_url:
            yield article_url

def get_page_detail(url, timeout=10):
    """Request an article detail page.

    Args:
        url: Address of the detail page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response text on HTTP 200, otherwise ``None``.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Covers timeouts and malformed URLs as well as connection errors.
        print('Error occurred')
        return None
    if response.status_code == 200:
        return response.text
    return None

def parse_page_detail(html, url):
    """Extract title and gallery images from a detail page and download them.

    Args:
        html: Markup of the detail page; ``None`` or empty when the download
            failed.
        url: Address of the detail page, copied into the result.

    Returns:
        A dict with ``title``, ``url`` and ``images`` when the page embeds a
        gallery with ``sub_images``; otherwise ``None``.
    """
    if not html:
        return None
    soup = BeautifulSoup(html, 'lxml')
    title_tags = soup.select('title')
    title = title_tags[0].get_text() if title_tags else ''
    # Raw string: ``\(`` is not a valid escape in a normal string literal.
    images_pattern = re.compile(r'gallery: JSON.parse\("(.*)"\)', re.S)
    match = images_pattern.search(html)
    if not match:
        return None
    try:
        # The embedded JSON is backslash-escaped; strip the backslashes first.
        data = json.loads(match.group(1).replace('\\', ''))
    except JSONDecodeError:
        return None
    if not (isinstance(data, dict) and 'sub_images' in data):
        return None
    images = [item.get('url') for item in data.get('sub_images') or []]
    for image in images:
        download_image(image)
    return {
        'title': title,
        'url': url,
        'images': images,
    }

def save_to_mongo(result):
    """Insert ``result`` into the configured MongoDB collection.

    Args:
        result: Document (dict) to store.

    Returns:
        ``True`` when the insert produced a document id, else ``False``.
    """
    # Collection.insert() is deprecated and absent from current PyMongo
    # releases; insert_one() is its single-document replacement.
    outcome = db[MONGO_TABLE].insert_one(result)
    if outcome.inserted_id is not None:
        print('Successfully Saved to Mongo', result)
        return True
    return False

def main(offset):
    """Crawl one index page: fetch each listed article and store its result.

    Args:
        offset: Offset passed to the index request.
    """
    index_text = get_page_index(offset, KEYWORD)
    for article_url in parse_page_index(index_text):
        detail_html = get_page_detail(article_url)
        record = parse_page_detail(detail_html, article_url)
        if record:
            save_to_mongo(record)

if __name__ == '__main__':
    pool = Pool()
    # Offsets 20 * GROUP_START .. 20 * GROUP_END in steps of 20.
    groups = list(range(GROUP_START * 20, (GROUP_END + 1) * 20, 20))
    pool.map(main, groups)
    # Stop accepting work, then wait for the workers to exit.
    pool.close()
    pool.join()

拉勾网自动投递简历

import requests

import re

# 1. ============================================ Authentication flow
import os
from urllib.parse import urlencode

# Browser User-Agent string sent with every request below.
USER_AGENT = ('Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
              '(KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36')
LOGIN_PAGE = 'https://passport.lagou.com/login/login.html'


def _forge_tokens(html):
    """Return the (token, code) anti-forgery pair embedded in ``html``.

    Raises:
        RuntimeError: when either value is missing from the page.
    """
    tokens = re.findall("X_Anti_Forge_Token = '(.*?)'", html, re.S)
    codes = re.findall("X_Anti_Forge_Code = '(.*?)'", html, re.S)
    if not tokens or not codes:
        raise RuntimeError('anti-forgery token/code not found in page')
    return tokens[0], codes[0]


session = requests.session()

# Step 1: GET the login page (User-Agent header only) and read the
# anti-forgery pair from its markup.
r1 = session.get(LOGIN_PAGE, headers={'User-Agent': USER_AGENT})
X_Anti_Forge_Token, X_Anti_Forge_Code = _forge_tokens(r1.text)

# Step 2: POST the credentials to login.json.  They come from the environment
# so that no account data is stored in the source.  The original listing sent
# a 32-hex-digit hash as the password, so LAGOU_PASSWORD is presumably that
# hashed form rather than the plain text -- TODO confirm.
session.post('https://passport.lagou.com/login/login.json',
             headers={
                 'Referer': LOGIN_PAGE,
                 'User-Agent': USER_AGENT,
                 'X-Anit-Forge-Code': X_Anti_Forge_Code,
                 'X-Anit-Forge-Token': X_Anti_Forge_Token,
                 'X-Requested-With': 'XMLHttpRequest',
             },
             data={
                 'isValidate': True,
                 'username': os.environ['LAGOU_USERNAME'],
                 'password': os.environ['LAGOU_PASSWORD'],
             })

# Step 3: GET grant.html with Referer and User-Agent headers.
session.get('https://passport.lagou.com/grantServiceTicket/grant.html',
            headers={
                'Referer': LOGIN_PAGE,
                'User-Agent': USER_AGENT,
            })

# Verification: fetch the resume page; inspect response.text to check that
# the login worked.
response = session.get('https://www.lagou.com/resume/myresume.html',
                       headers={'User-Agent': USER_AGENT})

# 2. ============================================ Fetch the job list
# POST to positionAjax.json with filter parameters in the query string.
# Further filters the original listing mentions but leaves disabled:
# gj (experience), xl (education), jd (funding stage), hy (industry).
# NOTE(review): the original notes also list a request body
# (first / pn / kd) that this code never sends -- confirm whether it is needed.
params = {'kw': 'python数据分析'}
# urlencode percent-encodes the keyword; keep only the value part.
res = urlencode(params).split('=')[-1]
url = 'https://www.lagou.com/jobs/list_' + res

response = session.post('https://www.lagou.com/jobs/positionAjax.json',
                        params={
                            'px': 'default',
                            'yx': '15k-25k',
                            'city': '北京',
                            'district': '海淀区',
                        },
                        headers={
                            'User-Agent': USER_AGENT,
                            'Referer': url,
                        })

result = response.json()['content']['positionResult']['result']
for company_info in result:
    fullname = company_info['companyFullName']
    emp_num = company_info['companySize']
    salary = company_info['salary']
    workyear = company_info['workYear']
    positionName = company_info['positionName']
    positionId = company_info['positionId']
    detail_url = 'https://www.lagou.com/jobs/%s.html' % (positionId)

    print(detail_url)
    print(fullname)
    print(emp_num)
    print(salary)
    print(workyear)
    print(positionName)
    print(positionId)
    print()

    # 3. ============================================ Deliver the resume
    # Step 1: GET the detail page to obtain a fresh anti-forgery pair.
    r1 = session.get(detail_url, headers={'User-Agent': USER_AGENT})
    X_Anti_Forge_Token, X_Anti_Forge_Code = _forge_tokens(r1.text)

    # Step 2: POST the delivery request for this position.
    delivery = session.post(
        'https://www.lagou.com/mycenterDelay/deliverResumeBeforce.json',
        headers={
            'User-Agent': USER_AGENT,
            'Referer': detail_url,
            'X-Anit-Forge-Code': X_Anti_Forge_Code,
            'X-Anit-Forge-Token': X_Anti_Forge_Token,
            'X-Requested-With': 'XMLHttpRequest',
        },
        data={
            'positionId': positionId,
            'type': 1,
            'force': True,
        })
    # Report success only when the server accepted the request at HTTP level.
    if delivery.ok:
        print('投递成功', detail_url)
    else:
        print('投递失败', detail_url)

lagou

import requests

import re

# 1. ============================================ Authentication flow
import os
from urllib.parse import urlencode

# Browser User-Agent string sent with every request below.
USER_AGENT = ('Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
              '(KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36')
LOGIN_PAGE = 'https://passport.lagou.com/login/login.html'


def _forge_tokens(html):
    """Return the (token, code) anti-forgery pair embedded in ``html``.

    Raises:
        RuntimeError: when either value is missing from the page.
    """
    tokens = re.findall("X_Anti_Forge_Token = '(.*?)'", html, re.S)
    codes = re.findall("X_Anti_Forge_Code = '(.*?)'", html, re.S)
    if not tokens or not codes:
        raise RuntimeError('anti-forgery token/code not found in page')
    return tokens[0], codes[0]


session = requests.session()

# Step 1: GET the login page (User-Agent header only) and read the
# anti-forgery pair from its markup.
r1 = session.get(LOGIN_PAGE, headers={'User-Agent': USER_AGENT})
X_Anti_Forge_Token, X_Anti_Forge_Code = _forge_tokens(r1.text)

# Step 2: POST the credentials to login.json.  They come from the environment
# so that neither the account's phone number nor its password hash is stored
# in the source.  The original listing sent a 32-hex-digit hash as the
# password, so LAGOU_PASSWORD is presumably that hashed form -- TODO confirm.
session.post('https://passport.lagou.com/login/login.json',
             headers={
                 'Referer': LOGIN_PAGE,
                 'User-Agent': USER_AGENT,
                 'X-Anit-Forge-Code': X_Anti_Forge_Code,
                 'X-Anit-Forge-Token': X_Anti_Forge_Token,
                 'X-Requested-With': 'XMLHttpRequest',
             },
             data={
                 'isValidate': True,
                 'username': os.environ['LAGOU_USERNAME'],
                 'password': os.environ['LAGOU_PASSWORD'],
             })

# Step 3: GET grant.html with Referer and User-Agent headers.
session.get('https://passport.lagou.com/grantServiceTicket/grant.html',
            headers={
                'Referer': LOGIN_PAGE,
                'User-Agent': USER_AGENT,
            })

# Verification: fetch the resume page; inspect response.text to check that
# the login worked.
response = session.get('https://www.lagou.com/resume/myresume.html',
                       headers={'User-Agent': USER_AGENT})

# 2. ============================================ Fetch the job list
# POST to positionAjax.json with filter parameters in the query string.
# Further filters the original listing mentions but leaves disabled:
# gj (experience), xl (education), jd (funding stage), hy (industry).
# NOTE(review): the original notes also list a request body
# (first / pn / kd) that this code never sends -- confirm whether it is needed.
params = {'kw': 'python数据分析'}
# urlencode percent-encodes the keyword; keep only the value part.
res = urlencode(params).split('=')[-1]
url = 'https://www.lagou.com/jobs/list_' + res

response = session.post('https://www.lagou.com/jobs/positionAjax.json',
                        params={
                            'px': 'default',
                            'yx': '15k-25k',
                            'city': '北京',
                            'district': '海淀区',
                        },
                        headers={
                            'User-Agent': USER_AGENT,
                            'Referer': url,
                        })

result = response.json()['content']['positionResult']['result']
for company_info in result:
    fullname = company_info['companyFullName']
    emp_num = company_info['companySize']
    salary = company_info['salary']
    workyear = company_info['workYear']
    positionName = company_info['positionName']
    positionId = company_info['positionId']
    detail_url = 'https://www.lagou.com/jobs/%s.html' % (positionId)

    print(detail_url)
    print(fullname)
    print(emp_num)
    print(salary)
    print(workyear)
    print(positionName)
    print(positionId)
    print()

    # 3. ============================================ Deliver the resume
    # Step 1: GET the detail page to obtain a fresh anti-forgery pair.
    r1 = session.get(detail_url, headers={'User-Agent': USER_AGENT})
    X_Anti_Forge_Token, X_Anti_Forge_Code = _forge_tokens(r1.text)

    # Step 2: POST the delivery request for this position.
    delivery = session.post(
        'https://www.lagou.com/mycenterDelay/deliverResumeBeforce.json',
        headers={
            'User-Agent': USER_AGENT,
            'Referer': detail_url,
            'X-Anit-Forge-Code': X_Anti_Forge_Code,
            'X-Anit-Forge-Token': X_Anti_Forge_Token,
            'X-Requested-With': 'XMLHttpRequest',
        },
        data={
            'positionId': positionId,
            'type': 1,
            'force': True,
        })
    # Report success only when the server accepted the request at HTTP level.
    if delivery.ok:
        print('投递成功', detail_url)
    else:
        print('投递失败', detail_url)

import requests

import re

# 1. ============================================ Authentication flow
import os
from urllib.parse import urlencode

# Browser User-Agent string sent with every request below.
USER_AGENT = ('Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
              '(KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36')
LOGIN_PAGE = 'https://passport.lagou.com/login/login.html'


def _forge_tokens(html):
    """Return the (token, code) anti-forgery pair embedded in ``html``.

    Raises:
        RuntimeError: when either value is missing from the page.
    """
    tokens = re.findall("X_Anti_Forge_Token = '(.*?)'", html, re.S)
    codes = re.findall("X_Anti_Forge_Code = '(.*?)'", html, re.S)
    if not tokens or not codes:
        raise RuntimeError('anti-forgery token/code not found in page')
    return tokens[0], codes[0]


session = requests.session()

# Step 1: GET the login page (User-Agent header only) and read the
# anti-forgery pair from its markup.
r1 = session.get(LOGIN_PAGE, headers={'User-Agent': USER_AGENT})
X_Anti_Forge_Token, X_Anti_Forge_Code = _forge_tokens(r1.text)

# Step 2: POST the credentials to login.json.  They come from the environment
# so that no account data is stored in the source.  The original listing sent
# a 32-hex-digit hash as the password, so LAGOU_PASSWORD is presumably that
# hashed form rather than the plain text -- TODO confirm.
session.post('https://passport.lagou.com/login/login.json',
             headers={
                 'Referer': LOGIN_PAGE,
                 'User-Agent': USER_AGENT,
                 'X-Anit-Forge-Code': X_Anti_Forge_Code,
                 'X-Anit-Forge-Token': X_Anti_Forge_Token,
                 'X-Requested-With': 'XMLHttpRequest',
             },
             data={
                 'isValidate': True,
                 'username': os.environ['LAGOU_USERNAME'],
                 'password': os.environ['LAGOU_PASSWORD'],
             })

# Step 3: GET grant.html with Referer and User-Agent headers.
session.get('https://passport.lagou.com/grantServiceTicket/grant.html',
            headers={
                'Referer': LOGIN_PAGE,
                'User-Agent': USER_AGENT,
            })

# Verification: fetch the resume page; inspect response.text to check that
# the login worked.
response = session.get('https://www.lagou.com/resume/myresume.html',
                       headers={'User-Agent': USER_AGENT})

# 2. ============================================ Fetch the job list
# POST to positionAjax.json with filter parameters in the query string.
# Further filters the original listing mentions but leaves disabled:
# gj (experience), xl (education), jd (funding stage), hy (industry).
# NOTE(review): the original notes also list a request body
# (first / pn / kd) that this code never sends -- confirm whether it is needed.
params = {'kw': 'python数据分析'}
# urlencode percent-encodes the keyword; keep only the value part.
res = urlencode(params).split('=')[-1]
url = 'https://www.lagou.com/jobs/list_' + res

response = session.post('https://www.lagou.com/jobs/positionAjax.json',
                        params={
                            'px': 'default',
                            'yx': '15k-25k',
                            'city': '北京',
                            'district': '海淀区',
                        },
                        headers={
                            'User-Agent': USER_AGENT,
                            'Referer': url,
                        })

result = response.json()['content']['positionResult']['result']
for company_info in result:
    fullname = company_info['companyFullName']
    emp_num = company_info['companySize']
    salary = company_info['salary']
    workyear = company_info['workYear']
    positionName = company_info['positionName']
    positionId = company_info['positionId']
    detail_url = 'https://www.lagou.com/jobs/%s.html' % (positionId)

    print(detail_url)
    print(fullname)
    print(emp_num)
    print(salary)
    print(workyear)
    print(positionName)
    print(positionId)
    print()

    # 3. ============================================ Deliver the resume
    # Step 1: GET the detail page to obtain a fresh anti-forgery pair.
    r1 = session.get(detail_url, headers={'User-Agent': USER_AGENT})
    X_Anti_Forge_Token, X_Anti_Forge_Code = _forge_tokens(r1.text)

    # Step 2: POST the delivery request for this position.
    delivery = session.post(
        'https://www.lagou.com/mycenterDelay/deliverResumeBeforce.json',
        headers={
            'User-Agent': USER_AGENT,
            'Referer': detail_url,
            'X-Anit-Forge-Code': X_Anti_Forge_Code,
            'X-Anit-Forge-Token': X_Anti_Forge_Token,
            'X-Requested-With': 'XMLHttpRequest',
        },
        data={
            'positionId': positionId,
            'type': 1,
            'force': True,
        })
    # Report success only when the server accepted the request at HTTP level.
    if delivery.ok:
        print('投递成功', detail_url)
    else:
        print('投递失败', detail_url)

lagou

码农公寓

Requests+正则表达式爬取猫眼TOP100榜电影信息

Requests+正则表达式爬取校花网视频

Requests+PyQuery模拟登陆github

分析Ajax请求并抓取今日头条街拍美图

拉勾网自动投递简历

相关文章