python+urllib+BeautifulSoup+pymysql

  1 # -*- coding: utf-8 -*-
  2 # @Time : 2019/11/12 21:22
  3 # @Author : AForever
  4 # @Site : 
  5 # @File : cnblog_002.py
  6 # @Software: PyCharm
  7 
  8 from urllib import request
  9 from bs4 import BeautifulSoup
 10 import os
 11 import pymysql
 12 
 13 
 14 # 获取数据
 15 def get_data():
 16     headers = {
 17         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36"
 18     }
 19 
 20     for i in range(7):
 21         url = "https://search.51job.com/list/040000,000000,0000,00,9,99,python%25E5%25BC%2580%25E5%258F%2591,2," + str(i+1) + ".html"
 22         req = request.Request(url, headers=headers)
 23         file_name = 'E:\\python_project\\Spider\\cnblogs\\data\\cnblog_pythonjob' + str(i + 1) + '.html'
 24         print(file_name, url)
 25         response = request.urlopen(req)
 26 
 27         if response.getcode() == 200:
 28             data = response.read()
 29             data = str(data, encoding='gbk')
 30             with open(file_name, mode="w", encoding="gbk") as f:
 31                 f.write(data)
 32     print("*"*15, "get data success", "*"*15)
 33 
 34 
 35 # 解析数据
 36 def parse_data():
 37     path = 'E:\\python_project\\Spider\\cnblogs\\data\\'
 38     filenames = os.listdir(path)
 39     result = []
 40     for filename in filenames:
 42         with open(path+filename, mode="r", encoding="gbk") as f:
 43             html = f.read()
 44             bs = BeautifulSoup(html, 'html.parser')
 45             divs = bs.select('#resultList .el')
 46             for div in divs[1:]:
 47                 title = div.select('.t1')[0].get_text(strip=True)
 48                 company = div.select('.t2')[0].get_text(strip=True)
 49                 addr = div.select('.t3')[0].get_text(strip=True)
 50                 salary = div.select('.t4')[0].get_text(strip=True)
 51                 pubdate = div.select('.t5')[0].get_text(strip=True)
 52                 row = {
 53                     'title': title,
 54                     'company': company,
 55                     'addr': addr,
 56                     'salary': salary,
 57                     'pubdate': pubdate
 58                 }
 59                 result.append(row)
 61 
 62     print('*' * 15, 'parse data success, ,Congratulations!', '*' * 15)
 64     return result
 65 
 66 
 67 # 创建数据表
 68 def create_table():
 69     config = {
 70         'host': 'localhost',
 71         'port': 3306,
 72         'user': 'root',
 73         'password': '123456',
 74         'database': 'python',
 75         'charset': 'utf8'
 76     }
 77     conn = pymysql.connect(**config)
 78     cursor = conn.cursor()
 79     # 如果存在student表,则先删除
 80     try:
 81         cursor.execute('DROP TABLE IF EXISTS `t_job`;')
 82         conn.commit()
 83         print('*' * 15, "drop table success", '*' * 15)
 84     except:
 85         print('*' * 15, 'table dose not exist', '*' * 15)
 86 
 87     create_table = '''
 88         create table t_job(
 89         id int primary key auto_increment,
 90         title varchar(200),
 91         company varchar(200),
 92         addr varchar(200),
 93         salary varchar(200),
 94         pubdate varchar(200)
 95         )engine=Innodb charset utf8;
 96         '''
 97     # 创建数据表
 98     cursor.execute(create_table)
 99     cursor.close()
100     conn.close()
101     print('*' * 15, 'create tables success,Congratulations!', '*' * 15)
102 
103 
104 # 存储数据到mysql
def save_to_mysql(data):
    """Insert the parsed job rows into the ``t_job`` table.

    Args:
        data: Sequence of dicts providing the keys ``title``, ``company``,
            ``addr``, ``salary`` and ``pubdate`` (the named placeholders of
            the INSERT statement). When empty, nothing is written and no
            connection is opened.

    Raises:
        pymysql.MySQLError: If connecting or inserting fails; the pending
            transaction is rolled back before the error propagates.
    """
    if not data:
        # Nothing to insert: avoid a pointless connection and a misleading
        # success message.
        print('*' * 15, 'no data to save', '*' * 15)
        return

    config = {
        'host': 'localhost',
        'port': 3306,
        'user': 'root',
        # MYSQL_PASSWORD overrides the built-in default so the credential
        # need not live in the source.
        'password': os.environ.get('MYSQL_PASSWORD', 'lem600@HW'),
        'database': 'python',
        'charset': 'utf8'
    }
    sql = '''
    insert into t_job(title, company, addr, salary, pubdate)
    values(%(title)s,%(company)s,%(addr)s,%(salary)s,%(pubdate)s)
    '''

    conn = pymysql.connect(**config)
    try:
        with conn.cursor() as cursor:
            cursor.executemany(sql, data)
        conn.commit()
    except pymysql.MySQLError:
        # Do not leave a half-applied batch behind.
        conn.rollback()
        raise
    finally:
        # Always release the connection, even when the insert raised.
        conn.close()
    print('*' * 15, 'save data to mysql success ,Congratulations !', '*' * 15)
126 
127 
if __name__ == "__main__":
    # Pipeline: download the pages, rebuild the table, then parse and store.
    get_data()
    create_table()
    jobs = parse_data()
    save_to_mysql(jobs)

 

上一篇:使用Python中的BS4,Selenium收集动态数据并避免重复


下一篇:20203117邓子啸《Python程序设计》实验四报告