# -*- coding: utf-8 -*-
# @Time    : 2019/11/12 21:22
# @Author  : AForever
# @Site    :
# @File    : cnblog_002.py
# @Software: PyCharm
"""Scrape python-developer job listings from 51job and store them in MySQL.

Pipeline:
    get_data()       -- download search-result pages into a local HTML cache
    parse_data()     -- extract one row per job listing from the cached pages
    create_table()   -- (re)create the target ``t_job`` table
    save_to_mysql()  -- bulk-insert the parsed rows
"""

from urllib import request
from bs4 import BeautifulSoup
import os
import pymysql


# Directory where the downloaded search-result pages are cached.
DATA_DIR = 'E:\\python_project\\Spider\\cnblogs\\data\\'

# Shared MySQL connection settings.
# NOTE(review): the original script used two different passwords
# ('123456' in create_table, 'lem600@HW' in save_to_mysql) -- confirm
# which one is correct for this server before running.
DB_CONFIG = {
    'host': 'localhost',
    'port': 3306,
    'user': 'root',
    'password': '123456',
    'database': 'python',
    'charset': 'utf8',
}

# Browser-like User-Agent so 51job serves the normal result page.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36"
}


# 获取数据 (fetch the raw pages)
def get_data(pages=7):
    """Download *pages* pages of 51job search results into DATA_DIR.

    Each page N is saved as ``cnblog_pythonjobN.html``.  The site serves
    GBK-encoded HTML, so the files are read back with the same encoding.

    :param pages: number of result pages to fetch (default 7, as before).
    """
    os.makedirs(DATA_DIR, exist_ok=True)  # original assumed the dir existed
    for page in range(1, pages + 1):
        url = ("https://search.51job.com/list/040000,000000,0000,00,9,99,"
               "python%25E5%25BC%2580%25E5%258F%2591,2," + str(page) + ".html")
        file_name = os.path.join(DATA_DIR, 'cnblog_pythonjob' + str(page) + '.html')
        print(file_name, url)
        req = request.Request(url, headers=HEADERS)
        # Context manager closes the HTTP response even if a write fails
        # (original leaked the socket).
        with request.urlopen(req) as response:
            if response.getcode() == 200:
                data = str(response.read(), encoding='gbk')
                with open(file_name, mode="w", encoding="gbk") as f:
                    f.write(data)
    print("*" * 15, "get data success", "*" * 15)


# 解析数据 (parse the cached pages)
def parse_data():
    """Parse every cached result page and return the job rows.

    :return: list of dicts with keys title/company/addr/salary/pubdate.
    """
    result = []
    for filename in os.listdir(DATA_DIR):
        # Only parse the pages we downloaded; the original choked on any
        # stray non-HTML file left in the cache directory.
        if not filename.endswith('.html'):
            continue
        with open(os.path.join(DATA_DIR, filename), mode="r", encoding="gbk") as f:
            html = f.read()
        bs = BeautifulSoup(html, 'html.parser')
        # First '.el' row is the table header, hence the [1:] slice.
        for div in bs.select('#resultList .el')[1:]:
            result.append({
                'title': div.select('.t1')[0].get_text(strip=True),
                'company': div.select('.t2')[0].get_text(strip=True),
                'addr': div.select('.t3')[0].get_text(strip=True),
                'salary': div.select('.t4')[0].get_text(strip=True),
                'pubdate': div.select('.t5')[0].get_text(strip=True),
            })
    print('*' * 15, 'parse data success, ,Congratulations!', '*' * 15)
    return result


# 创建数据表 (re-create the target table)
def create_table():
    """Drop ``t_job`` if it exists and create it afresh.

    The original wrapped the DROP in a bare ``except:``; that is unnecessary
    because ``DROP TABLE IF EXISTS`` does not raise for a missing table, and
    the bare except also hid real errors (bad credentials, lost connection).
    """
    conn = pymysql.connect(**DB_CONFIG)
    try:
        with conn.cursor() as cursor:
            cursor.execute('DROP TABLE IF EXISTS `t_job`;')
            conn.commit()
            print('*' * 15, "drop table success", '*' * 15)

            # Renamed from `create_table` -- the original shadowed this
            # function's own name with the DDL string.
            ddl = '''
                create table t_job(
                    id int primary key auto_increment,
                    title varchar(200),
                    company varchar(200),
                    addr varchar(200),
                    salary varchar(200),
                    pubdate varchar(200)
                )engine=Innodb charset utf8;
            '''
            cursor.execute(ddl)
    finally:
        # Close the connection even when the DDL fails (original leaked it).
        conn.close()
    print('*' * 15, 'create tables success,Congratulations!', '*' * 15)


# 存储数据到mysql (persist the parsed rows)
def save_to_mysql(data):
    """Bulk-insert job rows into ``t_job``.

    :param data: iterable of dicts as produced by :func:`parse_data`.
    """
    conn = pymysql.connect(**DB_CONFIG)
    try:
        with conn.cursor() as cursor:
            sql = '''
                insert into t_job(title, company, addr, salary, pubdate)
                values(%(title)s,%(company)s,%(addr)s,%(salary)s,%(pubdate)s)
            '''
            # executemany maps each row dict onto the named placeholders.
            cursor.executemany(sql, data)
        conn.commit()
    finally:
        conn.close()
    print('*' * 15, 'save data to mysql success ,Congratulations !', '*' * 15)


if __name__ == "__main__":
    get_data()
    create_table()
    save_to_mysql(parse_data())