docker-compose的MySQL yaml文件
version: '3'

services:
  mysql:
    image: 'mysql/mysql-server:5.7'
    restart: always
    container_name: mysql
    environment:
      # NOTE(review): plain-text root password — move to an env file or secret in production.
      # Quoted so the value stays a string rather than an integer.
      MYSQL_ROOT_PASSWORD: "123456"
    # mysqld server options (one list item per flag; the stray trailing ';'
    # on max_allowed_packet was removed — it would be sent to mysqld verbatim).
    command:
      - --default-authentication-plugin=mysql_native_password
      - --character-set-server=utf8mb4
      - --collation-server=utf8mb4_general_ci
      - --explicit_defaults_for_timestamp=true
      - --lower_case_table_names=1
      - --max_allowed_packet=128M
    ports:
      # Quoted to avoid the YAML 1.1 sexagesimal (base-60) parsing trap
      # for colon-separated digit pairs.
      - "3306:3306"
pymysql模块的使用
"""
p8
pymysql模块与MySQL数据库交互流程
创建数据库连接对象 db=pymysql.connect(xx,xx,xx,xx,xx)
创建游标对象 cursor=db.cursor()
执行SQL命令 cursor.execute(sql语句,[xx,xx])
提交到数据库执行 db.commit()
关闭游标 cursor.close()
断开数据库连接 db.close()
"""
import pymysql
# #1、创建数据库连接对象 + 游标对象
# db = pymysql.connect(host='10.0.0.6', user='root', password='123456', database='maoyandb', charset='utf8')
# cursor = db.cursor()
# #2、利用游标对象的execute()方法执行SQL命令
# ins = 'insert into maoyantab values(%s,%s,%s)'
# cursor.execute(ins, ['大话西游之月光宝盒', '周星驰', '1993-01-01'])
# #3、提交到数据库执行:commit()
# db.commit()
# #4、关闭游标 + 断开数据库连接
# cursor.close()
# db.close()
"""
pymysql之executemany()方法
作用:
---减少数据库IO次数,提高效率,一次性插入多条表记录
示例:
ins = 'insert into maoyantab values(%s,%s,%s)'
film_li = [
('大话西游之月光宝盒', '周星驰', '1993-01-01'),
('大话西游之月光宝盒', '周星驰', '1994-01-01')
]
cursor.executemany(ins,film_li)
"""
# Use executemany() to insert many rows in one round trip, reducing
# database I/O compared with per-row execute() calls.

# 1. Create the connection object and the cursor object.
db = pymysql.connect(host='10.0.0.6', user='root', password='123456',
                     database='maoyandb', charset='utf8')
cursor = db.cursor()

try:
    # 2. Parameterized INSERT plus the batch of row tuples.
    ins = 'insert into maoyantab values(%s,%s,%s)'
    film_li = [
        ('大话西游之月光宝盒', '周星驰', '1993'),
        ('大话西游之月光宝盒', '周星驰', '1994'),
        ('大话西游之大圣娶亲', '周星驰', '1995'),
        ('大话西游之大圣娶亲', '周星驰', '1996')
    ]
    cursor.executemany(ins, film_li)
    # 3. Commit the whole batch as one transaction.
    db.commit()
except Exception:
    # Undo any partially-applied batch so the table is left unchanged,
    # then re-raise so the failure is visible.
    db.rollback()
    raise
finally:
    # 4. Always release the cursor and close the connection,
    # even when the insert fails.
    cursor.close()
    db.close()
使用pymysql的execute()方法 单次存储抓取到的数据
"""
猫眼电影TOP100抓取 - execute()
"""
from urllib import request
import random
import time
import re
import pymysql
class MaoyanSpider:
    """Scrape the Maoyan Top-100 movie board and store records with execute().

    One parameterized INSERT and one commit are issued per record; compare
    with the executemany() variant, which batches all rows into one call.
    """

    def __init__(self):
        # Board URL template; offset is filled in per page (0, 10, ..., 90).
        self.url = 'https://maoyan.com/board/4?offset={}'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.72 Safari/537.36',
            'Cookie': '__mta=214675752.1627866406865.1627875459110.1627875460018.12; uuid_n_v=v1; uuid=E85FEA50F32D11EB8C9F5D6CCA53AC9DD7DBAF07A29F40DB93EF3FC782A0F81F; _csrf=38f9740349f3f3b55a88970a5164681765e4611ccbd2fc8ef5f526914970614d; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1627866407; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; _lxsdk_cuid=17b04661fb4c8-0813968a22224b-d7e1938-e1000-17b04661fb4c8; _lxsdk=E85FEA50F32D11EB8C9F5D6CCA53AC9DD7DBAF07A29F40DB93EF3FC782A0F81F; __mta=214675752.1627866406865.1627866406865.1627866409991.2; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1627875460; _lxsdk_s=17b04f00a6e-9d7-717-d8%7C%7C9'
        }
        # Counter of records stored so far.
        self.i = 0
        # Database connection + cursor, reused for every inserted record.
        self.db = pymysql.connect(host='10.0.0.6', user='root', password='123456', database='maoyandb', charset='utf8')
        self.cursor = self.db.cursor()

    def get_html(self, url):
        """Fetch one board page and hand the decoded HTML to the parser."""
        req = request.Request(url=url, headers=self.headers)
        res = request.urlopen(req)
        html = res.read().decode()
        # Parse immediately instead of returning the HTML to the caller.
        self.parse_html(html)

    def parse_html(self, html):
        """Extract (title, star, release time) tuples from the page HTML."""
        regex = '<div class="movie-item-info">.*?title="(.*?)".*?">.*?</a></p>.*?class="star">(.*?)</p>.*?class="releasetime">(.*?)</p>'
        pattern = re.compile(regex, re.S)
        r_list = pattern.findall(html)
        # Hand the matches to the storage step.
        self.save_html(r_list)

    def save_html(self, r_list):
        """Strip each matched record and insert it into MySQL, one row at a time."""
        ins = 'insert into maoyantab values(%s,%s,%s)'
        for r in r_list:
            film_li = [
                r[0].strip(),
                r[1].strip(),
                r[2].strip()
            ]
            self.cursor.execute(ins, film_li)
            # Commit per row — intentionally per-record in this demo variant.
            self.db.commit()
            print(film_li)
            self.i += 1

    def run(self):
        """Crawl all ten board pages (offset 0..90 step 10) and store results."""
        for page in range(0, 91, 10):
            url = self.url.format(page)
            self.get_html(url)
            # Throttle requests to reduce the chance of being blocked.
            time.sleep(random.randint(1, 2))
        # Release the cursor and connection once every page is done.
        self.cursor.close()
        self.db.close()


if __name__ == '__main__':
    spider = MaoyanSpider()
    spider.run()
    print('电影数量:', spider.i)
使用pymysql的executemany()方法 一次性存储抓取到的数据
"""
猫眼电影TOP100抓取 - executemany()
"""
from urllib import request
import random
import time
import re
import pymysql
class MaoyanSpider:
    """Scrape the Maoyan Top-100 movie board and store records with executemany().

    Records from every page are accumulated in one list and inserted into
    MySQL in a single batched call at the end of the crawl.
    """

    def __init__(self):
        # Board URL template; offset is filled in per page (0, 10, ..., 90).
        self.url = 'https://maoyan.com/board/4?offset={}'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.72 Safari/537.36',
            'Cookie': '__mta=214675752.1627866406865.1627875459110.1627875460018.12; uuid_n_v=v1; uuid=E85FEA50F32D11EB8C9F5D6CCA53AC9DD7DBAF07A29F40DB93EF3FC782A0F81F; _csrf=38f9740349f3f3b55a88970a5164681765e4611ccbd2fc8ef5f526914970614d; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1627866407; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; _lxsdk_cuid=17b04661fb4c8-0813968a22224b-d7e1938-e1000-17b04661fb4c8; _lxsdk=E85FEA50F32D11EB8C9F5D6CCA53AC9DD7DBAF07A29F40DB93EF3FC782A0F81F; __mta=214675752.1627866406865.1627866406865.1627866409991.2; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1627875460; _lxsdk_s=17b04f00a6e-9d7-717-d8%7C%7C9'
        }
        # Counter of records scraped so far.
        self.i = 0
        # Database connection + cursor, plus the list that accumulates
        # every movie tuple for the final executemany() call.
        self.db = pymysql.connect(host='10.0.0.6', user='root', password='123456', database='maoyandb', charset='utf8')
        self.cursor = self.db.cursor()
        self.all_film_list = []

    def get_html(self, url):
        """Fetch one board page and hand the decoded HTML to the parser."""
        req = request.Request(url=url, headers=self.headers)
        res = request.urlopen(req)
        html = res.read().decode()
        # Parse immediately instead of returning the HTML to the caller.
        self.parse_html(html)

    def parse_html(self, html):
        """Extract (title, star, release time) tuples from the page HTML."""
        regex = '<div class="movie-item-info">.*?title="(.*?)".*?">.*?</a></p>.*?class="star">(.*?)</p>.*?class="releasetime">(.*?)</p>'
        pattern = re.compile(regex, re.S)
        r_list = pattern.findall(html)
        # Hand the matches to the accumulation step.
        self.save_html(r_list)

    def save_html(self, r_list):
        """Strip each matched record and append it to the batch list."""
        for r in r_list:
            film_t = (
                r[0].strip(),
                r[1].strip(),
                r[2].strip()
            )
            # Collect tuples for the single executemany() at the end of run().
            self.all_film_list.append(film_t)
            print(film_t)
            self.i += 1

    def run(self):
        """Crawl all ten board pages, then insert everything in one batch."""
        for page in range(0, 91, 10):
            url = self.url.format(page)
            self.get_html(url)
            # Throttle requests to reduce the chance of being blocked.
            time.sleep(random.randint(1, 2))
        # All pages scraped: write the whole batch to MySQL at once.
        ins = 'insert into maoyantab values(%s,%s,%s)'
        self.cursor.executemany(ins, self.all_film_list)
        self.db.commit()
        self.cursor.close()
        self.db.close()


if __name__ == '__main__':
    spider = MaoyanSpider()
    spider.run()
    print('电影数量:', spider.i)