一、什么是爬虫?
- 简单来说,爬虫就是用程序代替人、模拟浏览器去访问网页。本文要做的,就是让程序模拟打开网页,提取我们需要的数据,并把它们保存到数据库中。
二、爬取代码
代码如下(示例):
# -*- coding: utf-8 -*-
#@time : 2021/12/2 22:53
#@Author : 19310220204
#@File : spider test.py
#@Software: PyCharm
from bs4 import BeautifulSoup #网页解析,获取数据
import re #正则表达式,进行文字匹配
import urllib.request,urllib.error #制定URL,获取网页数据
import sqlite3 #进行SQLite数据库操作
import os
def main():
    """Run the whole job: crawl the listing pages, then persist the rows.

    The search URL prefix is handed to ``getData`` and the rows it returns
    are written by ``saveData_db`` to the SQLite file ``51job.db`` in the
    current working directory.
    """
    db_savepath = "51job.db"
    # URL prefix of the search result pages (without the page suffix).
    baseurl = "https://search.51job.com/list/150300,000000,0000,00,9,99,+,2,"

    print("开始爬取。。。")
    jobs = getData(baseurl)
    saveData_db(jobs, db_savepath)
    print("爬取完毕!")
# --*-- Matching rules --*--
# All patterns are compiled once at import time and shared by the parser.

# One whole job record: the text between the "is_special_job" key and the
# "adid" key.
findjob = re.compile(r'"is_special_job":"",(.*?),"adid":""')

# Job detail link
findjob_L = re.compile(r'"job_href":"(.*?)",')

# Job title
findjob_n = re.compile(r'"job_name":"(.*?)",')

# Company detail link
findcompany_L = re.compile(r'"company_href":"(.*?)",')

# Company name
findcompany_n = re.compile(r'"company_name":"(.*?)",')

# Offered salary
findsalary = re.compile(r'"providesalary_text":"(.*?)",')

# Work area
findarea = re.compile(r'"workarea_text":"(.*?)",')

# Company type
findcompany_t = re.compile(r'"companytype_text":"(.*?)",')

# Job benefits
findjob_w = re.compile(r'"jobwelf":"(.*?)",')

# Company size
findcompany_s = re.compile(r'"companysize_text":"(.*?)",')

# Company main industry (note: this pattern has no trailing comma)
findcompany_i = re.compile(r'"companyind_text":"(.*?)"')

# Other attributes: everything after the key up to, but excluding, the
# first "]"
findjob_a = re.compile(r'"attribute_text":(.*?)]')
# Crawl the listing pages
def _first_group(pattern, text, default=""):
    """Return group 1 of the first match of *pattern* in *text*, else *default*."""
    match = pattern.search(text)
    return match.group(1) if match else default


def _parse_job(record):
    """Convert one raw job record into the 11-value row stored in the ``job`` table.

    A field that cannot be found in *record* becomes an empty string instead
    of raising ``IndexError``, so one incomplete record cannot abort the
    whole crawl.
    """
    # (pattern, strip_backslashes) in table-column order.  Backslashes are
    # removed from exactly the same fields as before (presumably escapes
    # such as "\/" in the embedded data -- confirm against a live page).
    plain_fields = (
        (findjob_L, True),        # job_href
        (findjob_n, True),        # job_name
        (findcompany_L, True),    # company_href
        (findcompany_n, True),    # company_name
        (findsalary, True),       # providesalary
        (findarea, True),         # workarea
        (findcompany_t, False),   # companytype
        (findjob_w, False),       # jobwelf
        (findcompany_s, False),   # companysize_text
        (findcompany_i, True),    # companyind_text
    )
    row = []
    for pattern, strip_backslashes in plain_fields:
        value = _first_group(pattern, record)
        if strip_backslashes:
            value = value.replace("\\", "")
        row.append(value)

    # attribute_text: the pattern stops just before the closing "]", so the
    # bracket is re-appended after the double quotes are dropped.
    attribute_match = findjob_a.search(record)
    if attribute_match:
        row.append(attribute_match.group(1).replace('"', "") + "]")
    else:
        row.append("")
    return row


def getData(baseurl, max_pages=164, max_records=1000):
    """Download the listing pages and extract one row per job record.

    For every page index ``i`` in ``range(max_pages)`` the URL
    ``baseurl + str(i) + ".html"`` is fetched with ``askURL``.  The third
    ``<script type="text/javascript">`` element of the page is searched for
    job records (``findjob``) and each record is parsed into a row.

    Args:
        baseurl: URL prefix of the result pages, without page number/suffix.
        max_pages: Number of page indices to request (default 164).
        max_records: Stop as soon as this many rows were collected
            (default 1000).

    Returns:
        A list of rows; each row is a list of 11 strings in the column
        order of the ``job`` table.
    """
    # NOTE(review): page indices start at 0, as in the original code --
    # confirm the site does not expect 1-based page numbers.
    datalist = []
    for i in range(max_pages):
        # Checked before fetching so no page is downloaded once the cap is
        # reached; ">=" also stops when the count never equals the cap.
        if len(datalist) >= max_records:
            break
        url = baseurl + str(i) + ".html"
        html = askURL(url)  # page source, "" when the request failed
        print("正在爬取第%d页数据..." % i)

        soup = BeautifulSoup(html, "html.parser")
        scripts = soup.find_all("script", type="text/javascript")
        if len(scripts) < 3:
            # Failed download or unexpected layout: skip instead of crashing
            # on scripts[2].
            print("第%d页未找到职位数据,已跳过。" % i)
            continue

        payload = str(scripts[2])
        for record in findjob.findall(payload):
            if len(datalist) >= max_records:
                break
            datalist.append(_parse_job(record))

    print("爬取完毕!")
    return datalist
# Fetch the content of the given URL
def askURL(url, timeout=30):
    """Fetch *url* and return its body decoded as GBK text.

    Args:
        url: The address to request.
        timeout: Seconds to wait on blocking network operations before
            giving up (default 30).  Without it a stalled server would
            block the crawl forever.

    Returns:
        The decoded page text, or ``""`` when the request failed.  Failures
        are printed (HTTP status code and/or reason) rather than raised.
        Bytes that are not valid GBK are replaced with U+FFFD instead of
        aborting the crawl.
    """
    # Browser-like headers sent with the request.  A "Cookie" entry can be
    # added here if the site requires a session.
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36 Edg/96.0.1054.53",
        # Media ranges must not contain spaces inside "type/subtype".
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    }
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        # The context manager closes the response even if read() fails.
        with urllib.request.urlopen(request, timeout=timeout) as response:
            html = response.read().decode("gbk", errors="replace")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    except OSError as e:
        # Read timeouts and connection resets are not always wrapped in
        # URLError.
        print(e)
    return html
# Save data
def init_db(dbpath):
    """Make sure the SQLite file at *dbpath* contains the ``job`` table.

    The statement is ``create table if not exists``, so it is safe to run
    on every call.  It is executed even when the file already exists,
    because an existing file (for example an empty one) does not guarantee
    that the table is there.

    Args:
        dbpath: Path of the SQLite database file; it is created if missing.
    """
    if os.path.exists(dbpath):
        print("数据库文件已存在!")

    sql = '''
        create table if not exists job(
            id integer primary key autoincrement,
            job_href text ,
            job_name text ,
            company_href text ,
            company_name text ,
            providesalary text ,
            workarea text ,
            companytype text ,
            jobwelf text ,
            companysize_text text ,
            companyind_text text ,
            attribute_text text
        )
    '''
    conn = sqlite3.connect(dbpath)
    try:
        conn.execute(sql)
        conn.commit()
    finally:
        # Release the file handle even if the statement fails.
        conn.close()
def saveData_db(datalist, savepath):
    """Insert every row of *datalist* into the ``job`` table of *savepath*.

    Values are bound through ``?`` placeholders rather than pasted into the
    SQL text, so quotes inside scraped values can neither break the
    statement nor be interpreted as SQL.  *datalist* itself is left
    unmodified.

    All rows are written in a single transaction: either every row is
    stored or, if an insert fails, none is.

    Args:
        datalist: Iterable of rows, each holding 11 values in the column
            order listed in the insert statement below.
        savepath: Path of the SQLite database file.
    """
    init_db(savepath)
    sql = '''
        insert into job(
            job_href,job_name,company_href,company_name,providesalary,workarea,companytype,jobwelf,companysize_text,companyind_text,attribute_text)
        values(?,?,?,?,?,?,?,?,?,?,?)'''
    conn = sqlite3.connect(savepath)
    try:
        conn.executemany(sql, datalist)
        conn.commit()
    finally:
        # Always close the connection so the database file is released.
        conn.close()
    print("爬取数据已存储到数据库文件!|", savepath)
# Start the crawl only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()