Python3爬虫（sqlite3存储信息）--AGE动漫网站排行榜

2022-10-31 11:11:22

目标

1.爬虫代码

爬取来源链接：https://m.agefans.org/rank

目标

爬取排行榜第1~100名动漫的排名、播放链接、名称和热度，将这些信息存储到sqlite3数据库中，最终用GUI展示爬取结果。

1.爬虫代码

import requests
import re
import sqlite3

def get_page(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server. Passing a timeout keeps
            the request from blocking forever on an unresponsive host.

    Returns:
        The response text when the server answers with HTTP 200,
        otherwise the sentinel string '失败'.

    Raises:
        requests.RequestException: Connection failures and timeouts are
            not swallowed; they propagate to the caller.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code == 200:
        return response.text
    return '失败'

# Matches one ranking entry; the four groups are rank, play link, title
# and popularity value.  Written as raw strings so that ``\d`` is a regex
# escape rather than an invalid string escape; re.S lets ``.`` span the
# line breaks inside a single <li> element.
_RANK_ITEM_RE = re.compile(
    r'<li class="row mb-1 rank-item">.*?rank.*?>(\d+)</span>'
    r'.*?href="(.*?)"'
    r'.*?age-text-blue small.*?>(.*?)</span>'
    r'.*?col-3 small text-truncate.*?>(.*?)</span>',
    re.S,
)


def parse_html(html):
    """Extract the ranking entries from the page source *html*.

    Args:
        html: HTML source of the ranking page.

    Returns:
        A list of ``(rank, link, title, rating)`` string tuples, one per
        matched entry, in document order.  The list is empty when nothing
        matches.
    """
    return _RANK_ITEM_RE.findall(html)

def link_html():
    """Fetch the AGE ranking page and return its parsed entries."""
    # Download the page source first, then hand it to the parser.
    page_source = get_page('https://m.agefans.org/rank')
    return parse_html(page_source)

def save_db():
    """Scrape the ranking page and store the rows in the SQLite ``data`` table.

    The table is dropped and recreated on every run, so it always holds
    only the rows from the latest scrape.

    Raises:
        RuntimeError: If the scrape produced no rows.  This is raised
            before the database is opened, so previously stored data is
            left untouched.
        sqlite3.Error: If any SQL statement fails.
    """
    rows = link_html()
    if not rows:
        raise RuntimeError('no ranking entries were scraped; database left unchanged')

    con = sqlite3.connect(r'...\AGE.db')
    try:
        # IF EXISTS: on the very first run there is no table to drop yet.
        con.execute("DROP TABLE IF EXISTS data")
        con.execute("create table data (rank primary key,link,title,rating)")
        con.executemany("insert into data(rank,link,title,rating) values(?,?,?,?)", rows)
        con.commit()
    finally:
        # Close the connection even when one of the statements above fails.
        con.close()

# Run the scrape-and-store step only when this file is executed as a script.
if __name__ == '__main__':
    save_db()

1.1运行结果

用DB Browser for SQLite查看AGE.db爬取的内容（展示1~35条信息）

1.2爬虫难点

1.2.1编写正则表达式:

排名：.*?rank.*?>(\d+)</span>

链接：.*?href="(.*?)"

名称：.*?age-text-blue small.*?>(.*?)</span>

热度：.*?col-3 small text-truncate.*?>(.*?)</span>

每个网站的页面元素结构各不相同，需要根据实际要爬取的网站编写对应的正则表达式。在浏览器中按 CTRL+SHIFT+I 打开开发者工具，即可查看页面元素。

1.3爬虫中的不足

1.3.1抓取的动漫播放链接不够完整

因为网站<a>标签内的href是相对路径而不是绝对路径，爬取出来的链接link并不能直接跳转。当时没有找到将相对路径转换成绝对路径的方法；实际上可以用标准库的 urllib.parse.urljoin，例如 urljoin('https://m.agefans.org/rank', link)，把相对路径拼接成绝对路径。

2.GUI展现爬虫内容

import tkinter
import tkinter.messagebox
from tkinter.messagebox import *
import tkinter.ttk
import tkinter as tk
import sqlite3
from PIL import ImageTk, Image
from tkinter import ttk
import pymysql
# Create the root window; every widget below is attached to it.
win=tkinter.Tk()

# Window size and title
win.geometry("1390x750")
win.title('AGE排行榜')

# Heading label, packed first
label=tkinter.Label(win,compound = 'center',text='AGE动漫排行榜',font=('黑体',40),fg='#db7093',bg='#add8e6',width='500')
label.pack()

# Background image drawn on a canvas packed below the heading
imgpath = (r'...\1.jpg')# path of the background image
img = Image.open(imgpath)
canvas = tk.Canvas(win, width=2500, height=1000, bd=0)
# Convert the PIL image for Tk; the canvas item below refers to `photo`.
photo = ImageTk.PhotoImage(img)
canvas.create_image(690, 280, image=photo)
canvas.pack()

# NOTE(review): star import placed mid-script; it supplies the bare
# Label/StringVar/Entry names used below and may rebind names imported
# earlier -- confirm before moving or removing it.
from tkinter import *
# Keyword search row: caption plus an entry bound to `selecttitle`
Label(win, text="关键字查询：",bg='#add8e6',font=('黑体',15)).place(x=500, y=80, width=120, height=25)
selecttitle = StringVar()
Entry(win, textvariable=selecttitle).place(x=650, y=80, width=300, height=25)


# Location of the SQLite database file
database = (r'...\AGE.db')

# 显示函数
def showAllInfo():
    """Reload the tree view with every row of the ``data`` table."""
    # Remove whatever is currently displayed.
    for item in dataTreeview.get_children():
        dataTreeview.delete(item)

    con = sqlite3.connect(database)
    try:
        cur = con.cursor()
        cur.execute("select * from data")
        rows = cur.fetchall()
        cur.close()
    finally:
        # Release the connection even if the query fails.
        con.close()

    for row in rows:
        # "end" always appends; a fixed index of 100 would put later rows
        # in front of earlier ones once more than 100 rows are shown.
        dataTreeview.insert("", "end", text="line1", values=row)

#按标题查询
def showTitle():
    """Show the rows whose title contains the keyword typed by the user.

    An empty keyword triggers an error dialog and leaves the view as is.
    Otherwise the view is cleared and a substring match is run against
    the ``title`` column; a dialog reports either the number of rows
    found or that nothing matched.
    """
    keyword = selecttitle.get()
    if keyword == "":
        showerror(title='提示', message='输入不能为空')
        return

    # Remove whatever is currently displayed.
    for item in dataTreeview.get_children():
        dataTreeview.delete(item)

    con = sqlite3.connect(database)
    try:
        cur = con.cursor()
        # Bind the keyword as a parameter instead of splicing it into the
        # SQL text, so quotes in the input can neither break nor alter
        # the statement.
        cur.execute("select * from data where title like ?", ("%" + keyword + "%",))
        rows = cur.fetchall()
        cur.close()
    finally:
        # Release the connection even if the query fails.
        con.close()

    if not rows:
        # Nothing matched: tell the user.
        showerror(title='提示', message='此动漫暂未上榜，或检查输入信息是否正确')
        return

    showinfo(title='提示', message='查询到'+str(len(rows))+"条数据")
    for row in rows:
        # "end" always appends, keeping the rows in query order.
        dataTreeview.insert("", "end", text="line1", values=row)

# Action buttons: one lists every row, the other runs the title search.
# NOTE(review): `font=(12)` is the integer 12, not a tuple -- confirm whether
# a (family, size) pair was intended.
tkinter.Button(win,text='查询全部',width=40,command=showAllInfo,font=(12)).place(x=800, y=125, width=120, height=30)
Button(win, text="按标题查询", command=showTitle,font=(12)).place(x=550, y=125, width=120, height=30)

# Table that displays the SQLite rows; one column per field of `data`
dataTreeview = ttk.Treeview(win, show='headings', column=('rank','link', 'title', 'rating'))
dataTreeview.column('rank', width=2, anchor="center")
dataTreeview.column('link', width=20, anchor="center")
dataTreeview.column('title', width=350, anchor="center")
dataTreeview.column('rating', width=15, anchor="center")

# Column captions shown in the heading row
dataTreeview.heading('rank', text='排名')
dataTreeview.heading('link', text='链接')
dataTreeview.heading('title', text='名称')
dataTreeview.heading('rating', text='热度')
dataTreeview.place(x=200, y=180, width=1000, height=300)

# Vertical scrollbar placed inside the table and linked to it both ways
s = tkinter.Scrollbar(dataTreeview, command=dataTreeview.yview)
s.pack(side="right", fill="y")
dataTreeview.config(yscrollcommand=s.set)
# Enter the Tk event loop
win.mainloop()

2.1思路

根据前面爬取的数据得到的数据库，实现GUI界面与SQLite数据库相连，即可查看排行榜信息，并且实现了能够查看全部信息，或者根据动漫名称关键字搜索可得相关信息。

除了可以按照标题查找，还能在此基础上拓展出按照热度、链接查找等功能。

2.2运行结果

2.3GUI设计难点

2.3.1按标题查询--模糊查询

content="'%"+selecttitle.get()+"%'"

码农公寓

目标

1.爬虫代码

1.1运行结果

1.2爬虫难点

1.2.1编写正则表达式:

1.3爬虫中的不足

1.3.1抓取的动漫播放链接不够完整

2.GUI展现爬虫内容

2.1思路

2.2运行结果

2.3GUI设计难点

2.3.1按标题查询--模糊查询

相关文章