(一)选题背景
当今世界电子小说阅读走进千家万户,其中各大网络电子小说网站更是琳琅满目。为了探寻网站签约作家各小说作品的热度对比情况,
我选择了纵横小说网的无罪作家来作为我本学期爬虫程序设计作业。
(二)主题式网络爬虫设计方案
1.主题式网络爬虫名称:
对纵横小说网签约作家作品数据进行爬取与分析
2.主题式网络爬虫爬取的内容与数据特征分析
http://home.zongheng.com/show/userInfo/110992.html
爬取纵横小说网中无罪作家的各本小说的书名和点击量
3.主题式网络爬虫设计方案概述(包括实现思路与技术难点)
从源代码页面爬取书籍url地址:
爬取书籍名称和点击量:
然后把名称和点击量等数据制成折线图,直方图,饼图。
(三)主题页面的结构特征分析
1.主题页面的结构与特征分析
2.Htmls 页面解析
(四)网络爬虫程序设计
数据爬取与采集和清洗
# URL of the site to crawl
url = 'http://www.zongheng.com/'

def gethtml(url):
    """Download *url* and return its decoded HTML text.

    Returns " " (a single space, the original fallback value) on any
    request failure so callers always receive a string.

    Bug fixed: the original read `def gethtml(url):'http://...'` — the
    stray string literal became the entire one-line function body and made
    the indented lines below a SyntaxError. The literal is removed here.
    """
    # Pretend to be a normal browser so the site serves the page
    info = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'}
    try:
        data = requests.get(url, headers=info)
        data.raise_for_status()
        # Use the detected encoding so Chinese text decodes correctly
        data.encoding = data.apparent_encoding
        return data.text
    except requests.RequestException:
        # Narrowed from a bare `except:` — only network/HTTP errors are expected here
        return " "
-
def urlinfo(url):
    """Collect the href of every book link on the author's profile page."""
    soup = BeautifulSoup(gethtml(url), "html.parser")
    # Each book title sits in a <p class="tit"> wrapping an <a> link
    return [tag.a.attrs['href'] for tag in soup.find_all("p", attrs="tit")]
def numsinfo(html):
    """Return a one-element list holding the integer part of the click count.

    The count is the text of the third <i> tag inside the first
    <div class="nums"> element; returns [] if fewer than three exist.
    """
    soup = BeautifulSoup(html, 'html.parser')
    nums_div = soup.find_all("div", attrs='nums')[0]
    clicks = []
    for idx, tag in enumerate(nums_div.find_all("i")):
        if idx == 2:
            # Drop the fractional part of values like "123.4"
            clicks.append(tag.string.split('.')[0])
            break
    return clicks
def namesinfo(html):
    """Extract the Chinese book title from the first 'book-name' div."""
    soup = BeautifulSoup(html, 'html.parser')
    divs = soup.find_all("div", attrs='book-name')
    # Pull runs of CJK characters out of the tag's raw HTML
    return re.findall(r"[\u4e00-\u9fa5]+", str(divs[0]))
文本分析
def file(book, nums, address):
    """Write book names and click counts to an .xls workbook at *address*.

    Column 0 holds the names, column 1 the counts; row 0 is the header.

    Bugs fixed: the second loop iterated `number`, which is undefined
    (the parameter is `nums`) and would raise NameError; both loops also
    started at index 1, silently dropping the first book and first count.
    """
    excel = xlwt.Workbook(encoding='utf-8')
    sheet1 = excel.add_sheet(u'One', cell_overwrite_ok=True)
    # Header row
    sheet1.write(0, 0, 'book')
    sheet1.write(0, 1, 'number')
    # Data rows start at row 1, below the header
    for i, name in enumerate(book):
        sheet1.write(i + 1, 0, name)
    for j, count in enumerate(nums):
        sheet1.write(j + 1, 1, count)
    excel.save(address)
数据分析与可视化
# Bar chart of click counts per book
from matplotlib import pyplot as plt

plt.rcParams['font.sans-serif'] = ['SimHei']  # render CJK glyphs instead of boxes
titles = ['渡劫之王', '巴山剑扬', '平天策', '神仙职员', '剑王朝', '流氓高手', '仙侠世界', '仙魔变',
          '罗浮', '冰火破坏神', '众神王座']
clicks = [3255112, 400640, 3062812, 913820, 2325362, 1113306,
          2723772, 2807917, 2436869, 2224430, 1007224]
plt.bar(titles, clicks)
plt.title('纵横小说网无罪')
plt.xlabel('作品')
plt.ylabel('点击量')
plt.xticks(rotation=45)  # tilt long titles so they stay readable
plt.show()
将以上各部分的代码汇总,附上完整程序代码
- #导入相关库
- from bs4 import BeautifulSoup
- import requests
- import matplotlib
- import re
- import xlwt
- import matplotlib.pyplot as plt
# URL of the site to crawl
url = 'http://www.zongheng.com/'

def gethtml(url):
    """Download *url* and return its decoded HTML text.

    Returns " " (a single space, the original fallback value) on any
    request failure so callers always receive a string.

    Bug fixed: the original read `def gethtml(url):'http://...'` — the
    stray string literal became the entire one-line function body and made
    the indented lines below a SyntaxError. The literal is removed here.
    """
    # Pretend to be a normal browser so the site serves the page
    info = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'}
    try:
        data = requests.get(url, headers=info)
        data.raise_for_status()
        # Use the detected encoding so Chinese text decodes correctly
        data.encoding = data.apparent_encoding
        return data.text
    except requests.RequestException:
        # Narrowed from a bare `except:` — only network/HTTP errors are expected here
        return " "
def urlinfo(url):
    """Collect the href of every book link on the author's profile page."""
    soup = BeautifulSoup(gethtml(url), "html.parser")
    # Each book title sits in a <p class="tit"> wrapping an <a> link
    return [tag.a.attrs['href'] for tag in soup.find_all("p", attrs="tit")]
def numsinfo(html):
    """Return a one-element list holding the integer part of the click count.

    The count is the text of the third <i> tag inside the first
    <div class="nums"> element; returns [] if fewer than three exist.
    """
    soup = BeautifulSoup(html, 'html.parser')
    nums_div = soup.find_all("div", attrs='nums')[0]
    clicks = []
    for idx, tag in enumerate(nums_div.find_all("i")):
        if idx == 2:
            # Drop the fractional part of values like "123.4"
            clicks.append(tag.string.split('.')[0])
            break
    return clicks
def namesinfo(html):
    """Extract the Chinese book title from the first 'book-name' div."""
    soup = BeautifulSoup(html, 'html.parser')
    divs = soup.find_all("div", attrs='book-name')
    # Pull runs of CJK characters out of the tag's raw HTML
    return re.findall(r"[\u4e00-\u9fa5]+", str(divs[0]))
# Fix CJK labels rendering as empty boxes
matplotlib.rcParams['font.sans-serif'] = ['SimHei']
matplotlib.rcParams['font.family'] = 'sans-serif'
matplotlib.rcParams['axes.unicode_minus'] = False

# Bar chart of click counts per book
from matplotlib import pyplot as plt

plt.rcParams['font.sans-serif'] = ['SimHei']
titles = ['渡劫之王', '巴山剑扬', '平天策', '神仙职员', '剑王朝', '流氓高手', '仙侠世界', '仙魔变',
          '罗浮', '冰火破坏神', '众神王座']
clicks = [3255112, 400640, 3062812, 913820, 2325362, 1113306,
          2723772, 2807917, 2436869, 2224430, 1007224]
plt.bar(titles, clicks)
plt.title('纵横小说网无罪')
plt.xlabel('作品')
plt.ylabel('点击量')
plt.xticks(rotation=45)  # tilt long titles so they stay readable
plt.show()
def file(book, nums, address):
    """Write book names and click counts to an .xls workbook at *address*.

    Column 0 holds the names, column 1 the counts; row 0 is the header.

    Bugs fixed: the second loop iterated `number`, which is undefined
    (the parameter is `nums`) and would raise NameError; both loops also
    started at index 1, silently dropping the first book and first count.
    """
    excel = xlwt.Workbook(encoding='utf-8')
    sheet1 = excel.add_sheet(u'One', cell_overwrite_ok=True)
    # Header row
    sheet1.write(0, 0, 'book')
    sheet1.write(0, 1, 'number')
    # Data rows start at row 1, below the header
    for i, name in enumerate(book):
        sheet1.write(i + 1, 0, name)
    for j, count in enumerate(nums):
        sheet1.write(j + 1, 1, count)
    excel.save(address)
def convert(lista):
    """Flatten a list of one-element lists into a list of first elements."""
    return [item[0] for item in lista]
def main():
    """Crawl the author's book pages, save an .xls report and show a bar chart."""
    # Author profile page on zongheng.com
    author = 'http://home.zongheng.com/show/userInfo/110992.html'
    user = '无罪'
    urls = urlinfo(author)
    namelist = []
    countlist = []
    for url in urls:
        html = gethtml(url)
        namelist.append(namesinfo(html))
        countlist.append(numsinfo(html))
    # Each helper returns a one-element list; flatten both
    namelist = convert(namelist)
    countlist = convert(countlist)
    countlist = [int(c) for c in countlist]
    # Save path. Bug fixed: the original f-string interpolated `xxj`,
    # a name that is defined nowhere (NameError); use the author name.
    addr = f'D:\\{user}.xls'
    file(namelist, countlist, addr)
    # Bug fixed: the original called Bar(...), which is never defined in
    # this file; draw the bar chart directly with matplotlib instead.
    plt.bar(namelist, countlist)
    plt.title(user)
    plt.xlabel('作品')
    plt.ylabel('点击量')
    plt.xticks(rotation=45)
    plt.show()

if __name__ == '__main__':
    main()
五、总结
通过这次对python爬虫的实战运用我深刻认识到了自己学习上的不足,期间查阅借鉴了很多学长学姐的资料,从头开始一步步慢慢摸索最终完成了这次的课程设计。
期间在matplotlib绘制图形的学习上下了很大功夫,对网站的页面解析也是非常棘手,但本次学习让我深刻认识到python工具的强大,让我对编程语言产生了兴趣。希
望自己在接下来的学习中可以不断进步,在爬虫方面的能力也有所提升。