源码:
import requests
import re
from my_mysql import MysqlConnect
import time,random # 获取招聘详情链接
def get_urls(page, headers):
url = 'https://hr.tencent.com/position.php?lid=&tid=&keywords=python&start=page'.format(page)
response = requests.get(url, headers=headers)
pat = r'href="(position_detail.*?)">'
url_list_bytes = re.findall(pat.encode('utf-8'), response.content)
return url_list_bytes # 获取招聘详情
def get_info(url, headers):
response = requests.get(url, headers=headers)
html_bytes = response.content
# print(html_bytes) # title 标题
pat = r'id="sharetitle">(.*?)</td>'
res = re.search(pat.encode('utf-8'), html_bytes)
title = res.group(1).decode('utf-8')
# address 地点
pat = r'工作地点:</span>(.*?)</td>'
res = re.search(pat.encode('utf-8'), html_bytes)
address = res.group(1).decode('utf-8')
# types 类别
pat = r'职位类别:</span>(.*?)</td>'
res = re.search(pat.encode('utf-8'), html_bytes)
types = res.group(1).decode('utf-8')
# counts 人数
pat = r'招聘人数:</span>(.*?)</td>'
res = re.search(pat.encode('utf-8'), html_bytes)
counts = res.group(1).decode('utf-8')
# duty 职责
pat = r'工作职责.*?<ul class="squareli">(.*?)</ul>'
res = re.search(pat.encode('utf-8'), html_bytes)
duty_str = res.group(1).decode('utf-8')
pat = r'<li>(.*?)</li>'
duty = re.findall(pat,duty_str)
duty = ('\n').join(duty)
# requires 要求
pat = r'工作要求.*?<ul class="squareli">(.*?)</ul>'
res = re.search(pat.encode('utf-8'), html_bytes)
requires_str = res.group(1).decode('utf-8')
pat = r'<li>(.*?)</li>'
requires = re.findall(pat, requires_str)
requires = ('\n').join(requires)
return title,address,types,counts,duty,requires if __name__ == '__main__':
mc = MysqlConnect('127.0.0.1','root','','homework')
sql = "insert into tencentzp(title,address,types,counts,duty,requires) values(%s,%s,%s,%s,%s,%s)"
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
}
for page in range(0,200,10):
url_list_bytes = get_urls(page,headers)
# print(url_list_bytes)
for url in url_list_bytes:
# print(url.decode('utf-8'))
url = 'https://hr.tencent.com/' + url.decode('utf-8')
info = get_info(url,headers)
print(info)
mc.exec_data(sql,info)
time.sleep(random.random()*5)