今天学习了一些简单的爬虫知识,并应用这些知识撸了一个爬取古诗的程序
主要使用的第三方库:requests,bs4
直接上代码:
spider.py:
点击查看代码
# -*- coding:utf-8 -*-
# spider.py
import sys
import bs4
import requests
import re
from poem import Poem
def getPoem(poemText):
    """Build and return a Poem from one scraped poem text string."""
    return Poem(poemText)
def page_turning():
    """Set poem_url to the listing URL for the current page, then advance page.

    Reads the module-level ``poem_url_0`` (base URL) and ``page`` (page
    number), stores ``poem_url_0 + '&page=<page>'`` in the module-level
    ``poem_url``, and increments ``page`` by one.
    """
    global page, poem_url
    poem_url = ''.join((poem_url_0, '&page=', str(page)))
    page = page + 1
# Module-level crawl state shared by page_turning() and spider().
# poem_url_0 is the listing URL without a page parameter; page_turning()
# appends '&page=<n>' to it and stores the result in poem_url.
poem_url_0 = (
    'https://so.gushiwen.cn/shiwens/default.aspx'
    '?tstr=&astr=&cstr=&xstr=%e8%af%97'
)
page, poem_url = 1, ''
# Build the URL of the first page (leaves page == 2 for the next turn).
page_turning()
def _fetch_poems():
    """Download the page at the module-level ``poem_url`` and parse its poems.

    Returns:
        A list with one object (built by ``getPoem``) per poem text matched
        on the page; empty when nothing matches.

    Raises:
        requests.RequestException: if the HTTP request fails or times out.
    """
    # A timeout keeps the program from hanging forever on a stalled connection.
    response = requests.get(poem_url, timeout=10)
    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    # The poem texts are carried by the page's <textarea> tags.
    textarea_tags = soup.find_all('textarea')
    # Applied to the stringified tag list: captures the text between '>' and
    # '<' that contains a 《title》 and ends with 'aspx'.
    poem_text_ptn = re.compile(r'>(.[^w]*?《[\u4e00-\u9fa5·]+?》.+?aspx)<')
    return [getPoem(text) for text in poem_text_ptn.findall(str(textarea_tags))]


def spider():
    """Run the interactive crawl session.

    Repeatedly fetches the page at the module-level ``poem_url``, lists the
    poems found, and prompts for a number:

    * ``1..N``  show the details of the N-th listed poem
    * ``0``     advance to the next page (via ``page_turning``) and re-crawl
    * ``-1``    re-crawl the current URL
    * ``-2``    exit the program via ``sys.exit``

    Re-crawling is done with a loop rather than by calling ``spider()``
    recursively, so the call stack does not grow with every page turn.
    Non-numeric input and failed downloads are reported instead of crashing.
    """
    while True:
        try:
            poem_lst = _fetch_poems()
        except requests.RequestException as err:
            # Keep the menu usable so the user can retry with -1.
            print('错误:网页获取失败: %s' % err)
            poem_lst = []

        if poem_lst:
            print('共爬取到 %d 首古诗词:' % poem_lst[0].poemCout)
        else:
            # Nothing matched: avoid indexing into an empty list.
            print('未爬取到古诗词.')
        for p in poem_lst:
            print('#%2d %s' % (p.index, p.title))
        print('#0 爬取下一页')
        print('#-1 重新爬取')
        print('#-2 退出')

        while True:
            try:
                num = int(input('输入编号查看对应详细内容或进行其他操作:'))
            except ValueError:
                print('错误:输入的号码有误.')
                continue
            # NOTE(review): selection is by 1-based position in this page's
            # list; assumes the p.index printed above matches that position —
            # confirm against the Poem class.
            if 1 <= num <= len(poem_lst):
                poem_lst[num - 1].showPoem()
            elif num == 0:
                page_turning()
                break  # back to the outer loop to crawl the next page
            elif num == -1:
                break  # back to the outer loop to crawl the same URL again
            elif num == -2:
                sys.exit()
            else:
                print('错误:输入的号码有误.')
# Script entry point: start the interactive crawl session.
spider()