# Scrape 51job search results for the keyword "数据分析" (data analysis).
import selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
from bs4 import BeautifulSoup
import csv
def page_generator():
    """Yield the HTML of each 51job search-result page for '数据分析'.

    Opens Chrome, searches the 51job home page for the keyword, removes the
    selected location filters until the first entry reads '全国', then walks
    through the result pages by clicking the "next" button.

    Yields:
        str: ``page_source`` of a result page, captured before the click that
        moves on to the following page.

    The browser session is shut down when the generator finishes, raises, or
    is closed early.  If the "next" button cannot be clicked after ``max_try``
    attempts on the same page, iteration stops and that page is not yielded.
    """
    # Imported locally so this block carries the selenium names it relies on.
    from selenium.common.exceptions import (
        ElementNotInteractableException,
        NoSuchElementException,
    )
    from selenium.webdriver.common.by import By

    max_try = 5  # attempts per page before giving up

    web = webdriver.Chrome()
    try:
        web.maximize_window()
        web.get('https://www.51job.com/')

        # find_element(By..., ...) replaces the find_element_by_* helpers,
        # which newer Selenium releases no longer provide.
        search_box = web.find_element(By.ID, 'kwdselectid')
        search_box.send_keys('数据分析')
        search_box.send_keys(Keys.ENTER)

        # Click the first location entry away until it reads '全国'.
        while True:
            location = web.find_element(
                By.CSS_SELECTOR, 'div.clist > a:nth-child(1)')
            if location.text == '全国':
                break
            location.click()
            # Give the page time to refresh before looking at it again.
            time.sleep(1)

        old_url = web.current_url
        try_count = 0
        while True:
            print(web.current_url)
            # NOTE: the result data could alternatively be read with
            # web.execute_script('return window.__SEARCH_RESULT__["engine_search_result"]').
            text = web.page_source
            try:
                # Slow networks / fast paging can leave a page partly loaded;
                # the "next" button is then missing or not yet clickable.
                web.find_element(By.CSS_SELECTOR, 'li.next').click()
            except (ElementNotInteractableException,
                    NoSuchElementException) as e:
                # Retry a limited number of times; repeated failure probably
                # means the page structure changed, so stop instead of looping.
                print('page_url:', web.current_url, '\terror:', e)
                try_count += 1
                if try_count >= max_try:
                    break
                # Pause before retrying, otherwise the same failure repeats.
                time.sleep(1)
                continue

            yield text
            time.sleep(0.1)
            # Clicking "next" on the last page leaves the URL unchanged, which
            # is how the end of the result list is detected.
            if old_url == web.current_url:
                break
            old_url = web.current_url
            try_count = 0
    finally:
        # quit() ends the whole driver session; close() would only close the
        # current window and leave the driver process running.
        web.quit()
def _select_text(node, selector):
    """Return the text of the first match of *selector* under *node*, or ''."""
    found = node.select_one(selector)
    return found.get_text() if found is not None else ''


def _select_attr(node, selector, attr):
    """Return attribute *attr* of the first match of *selector*, or ''."""
    found = node.select_one(selector)
    return found.attrs.get(attr, '') if found is not None else ''


def analysis_data(content: str) -> list:
    """Extract one row per job card from a search-result page.

    Args:
        content: HTML source of a result page.

    Returns:
        A list of rows, each
        ``[job_name, job_href, updatedate, tags, workplace_attribute, salary,
        company_name, company_href, companytype, companyind, jobwelf]``.
        A field whose element is missing from the card is an empty string;
        boxes that contain no link at all are skipped.
    """
    res = []
    soup = BeautifulSoup(content, 'lxml')
    for box in soup.select('div.j_joblist > div'):
        job_link = box.select_one('a')
        if job_link is None:
            # No link means there is nothing to record for this box.
            continue

        res.append([
            _select_text(box, 'a > p.t > span.jname.at'),   # job_name
            job_link.attrs.get('href', ''),                 # job_href
            _select_text(box, 'a > p.t > span.time'),       # updatedate
            _select_attr(box, 'a > p.t > em', 'alt'),       # tags
            # Combined workplace and requirements text.
            _select_text(box, 'a > p.info > span.d.at'),    # workplace_attribute
            _select_text(box, 'a > p.info > span.sal'),     # salary
            _select_text(box, 'div.er > a'),                # company_name
            _select_attr(box, 'div.er > a', 'href'),        # company_href
            # Company type and headcount; the only field that is stripped.
            _select_text(box, 'div.er > p.dc.at').strip(),  # companytype
            _select_text(box, 'div.er > p.int.at'),         # companyind
            _select_attr(box, 'a > p.tags', 'title'),       # jobwelf
        ])
    return res
if __name__ == '__main__':
    # Script entry point: scrape every result page and store the rows in one
    # CSV file, header first.
    import os

    csv_path = r'./csvfile/51job.csv'
    header = ['job_name', 'job_href', 'updatedate', 'tags',
              'workplace_attribute', 'salary', 'company_name', 'company_href',
              'companytype', 'companyind', 'jobwelf']

    # Create the output directory up front so open() cannot fail on it.
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)

    # Header and data rows go to the same file (previously the rows were
    # appended to a differently named file and never met the header).
    with open(csv_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(header)
        try:
            for page_html in page_generator():
                writer.writerows(analysis_data(page_html))
            print('页面加载完毕...')
        finally:
            print('程序运行结束...')
# print('='*40)
# print(next(page))
# with open(r'./temp.html', 'w', encoding='utf-8') as f:
# f.write(next(page))
#
# with open(r'./temp.html', 'r', encoding='utf-8') as f:
# html = f.read()