爬取猫眼电影榜单TOP100榜-以mysql数据库保存

在数据库中建库建表

# 连接到mysql数据库
mysql -h127.0.0.1 -uroot -p123456
# 建库建表
create database maoyandb charset utf8;
use maoyandb;
create table filmtab(
name varchar(100),
star varchar(300),
time varchar(50)
)charset=utf8;

 

回顾pymysql基本使用

import pymysql

# 创建2个对象
db = pymysql.connect('localhost','root','123456','maoyandb',charset='utf8')
cursor = db.cursor()

# 执行SQL命令并提交到数据库执行
# execute()方法第二个参数为列表传参补位
ins = 'insert into filmtab values(%s,%s,%s)'
cursor.execute(ins,['霸王别姬','张国荣','1993'])
db.commit()

# 关闭
cursor.close()
db.close()

 

来试试高效的executemany()方法?

import pymysql

# 创建2个对象
db = pymysql.connect('192.168.153.137','tiger','123456','maoyandb',charset='utf8')
cursor = db.cursor()

# 抓取的数据
film_list = [('月光宝盒','周星驰','1994'),('大圣娶亲','周星驰','1994')]

# 执行SQL命令并提交到数据库执行
# execute()方法第二个参数为列表传参补位
cursor.executemany('insert into filmtab values(%s,%s,%s)',film_list)
db.commit()

# 关闭
cursor.close()
db.close()

 

将电影信息存入MySQL数据库(尽量使用executemany方法)

from urllib import request
import re
import time
import random
from useragents import ua_list
import pymysql

class MaoyanSpider(object):
  def __init__(self):
    self.url = 'https://maoyan.com/board/4?offset={}'
    # 计数
    self.num = 0
    # 创建2个对象
    self.db = pymysql.connect(
      'localhost','root','123456','maoyandb',charset='utf8'
    )
    self.cursor = self.db.cursor()

  def get_html(self,url):
    headers = {
      'User-Agent' : random.choice(ua_list)
    }
    req = request.Request(url=url,headers=headers)
    res = request.urlopen(req)
    html = res.read().decode('utf-8')
    # 直接调用解析函数
    self.parse_html(html)

  def parse_html(self,html):
    re_bds = r'<div class="movie-item-info">.*?title="(.*?)".*?class="star">(.*?)</p>.*?releasetime">(.*?)</p>'
    pattern = re.compile(re_bds,re.S)
    # film_list: [('霸王别姬','张国荣','1993'),()]
    film_list = pattern.findall(html)
    # 直接调用写入函数
    self.write_html(film_list)

  # # mysql - execute()
  # def write_html(self,film_list):
  #   ins = 'insert into filmtab values(%s,%s,%s)'
  #   for film in film_list:
  #     L = [
  #       film[0].strip(),
  #       film[1].strip(),
  #       film[2].strip()[5:15]
  #     ]
  #     self.cursor.execute(ins,L)
  #     # 千万别忘了提交到数据库执行
  #     self.db.commit()

    # mysql - executemany([(),(),()])
  def write_html(self, film_list):
    L = []
    ins = 'insert into filmtab values(%s,%s,%s)'
    for film in film_list:
      t = (
        film[0].strip(),
        film[1].strip(),
        film[2].strip()[5:15]
      )
      L.append(t)

    self.cursor.executemany(ins, L)
    # 千万别忘了提交到数据库执行
    self.db.commit()

  def main(self):
    for offset in range(0,31,10):
      url = self.url.format(offset)
      self.get_html(url)
      time.sleep(random.randint(1,2))

    # 断开数据库连接
    self.cursor.close()
    self.db.close()

if __name__ == '__main__':
  start = time.time()
  spider = MaoyanSpider()
  spider.main()
  end = time.time()
  print('执行时间:%.2f' % (end-start))

 

做个SQL查询

1、查询20年以前的电影的名字和上映时间
  select name,time from filmtab where
  time<(now()-interval 20 year);
2、查询1990-2000年的电影名字和上映时间
  select name,time from filmtab where
  time>='1990-01-01' and time<='2000-12-31';
上一篇:初入Python(猫眼top100)


下一篇:python爬虫入门新手向实战 - 爬取猫眼电影Top100排行榜