Python爬虫-爬取古诗

今天学习了一些简单的爬虫知识,并应用这些知识撸了一个爬取古诗的程序

主要使用的第三方库:requests,bs4

直接上代码:
spider.py:

点击查看代码
# -*- coding:utf-8 -*-
# spider.py

import sys
import bs4
import requests
import re
from poem import Poem


def getPoem(poemText):
    """Wrap one extracted poem text in a ``Poem`` object and return it.

    Exists so the constructor can be handed to ``map`` as a plain function.
    """
    return Poem(poemText)


def page_turning():
    """Select the current page as the crawl target, then advance the counter.

    Rebinds the module-level ``poem_url`` to ``poem_url_0`` followed by
    ``'&page=<page>'`` and afterwards increments the module-level ``page``,
    so the next call selects the following page.
    """
    global page, poem_url
    # Build the URL first so a failure here leaves the counter untouched.
    poem_url = ''.join((poem_url_0, '&page=', str(page)))
    page = page + 1


# Base URL of the poem listing; page_turning() appends '&page=<n>' to it.
poem_url_0 = 'https://so.gushiwen.cn/shiwens/default.aspx?tstr=&astr=&cstr=&xstr=%e8%af%97'
# Page number that the next page_turning() call will select.
page = 1
# URL fetched by spider(); filled in by page_turning().
poem_url = ''

# Build the URL for page 1 before the first crawl.
page_turning()


def spider():
    """Interactively crawl and browse poems from the listing pages.

    Fetches the page at the module-level ``poem_url``, extracts the poem
    texts found in its ``<textarea>`` tags, wraps each one with ``getPoem``
    and lists the titles. It then reads menu choices from standard input:

    * ``1..N`` -- show the poem with that number,
    * ``0``    -- advance to the next page (``page_turning``) and crawl it,
    * ``-1``   -- crawl the current URL again,
    * ``-2``   -- leave the program via ``sys.exit()``.

    Anything else, including non-numeric input, prints an error and asks
    again. The function does not return normally; it ends through
    ``sys.exit()`` or an uncaught exception (for example a network error
    or timeout raised by ``requests``).
    """
    # Pattern that pulls one poem's text out of the stringified tag list.
    poemText_ptn = re.compile(r'>(.[^w]*?《[\u4e00-\u9fa5·]+?》.+?aspx)<')

    # One iteration per crawl. A loop is used instead of calling spider()
    # recursively so that repeated paging cannot exhaust the call stack.
    while True:
        # poem_url is re-read every time because page_turning() rebinds it.
        # The timeout keeps a stalled connection from hanging the program.
        poem_html = requests.get(poem_url, timeout=10)
        soup = bs4.BeautifulSoup(poem_html.text, 'html.parser')
        textareaTag_lst = soup.find_all('textarea')
        poemText_lst = poemText_ptn.findall(str(textareaTag_lst))
        poem_lst = list(map(getPoem, poemText_lst))

        if poem_lst:
            # poemCout is read from a Poem instance; presumably a counter
            # maintained by the Poem class -- confirm in poem.py.
            print('共爬取到 %d 首古诗词:' % poem_lst[0].poemCout)
            for p in poem_lst:
                print('#%2d %s' % (p.index, p.title))
        else:
            # Nothing matched on this page: report it instead of failing on
            # poem_lst[0], and still offer the menu so the user can retry.
            print('错误:未爬取到任何古诗词.')

        print('#0 爬取下一页')
        print('#-1 重新爬取')
        print('#-2 退出')

        while True:
            try:
                num = int(input('输入编号查看对应详细内容或进行其他操作:'))
            except ValueError:
                # Non-numeric input is just another invalid choice.
                print('错误:输入的号码有误.')
                continue

            if 1 <= num <= len(poem_lst):
                poem_lst[num - 1].showPoem()
            elif num == 0:
                page_turning()
                break  # crawl the newly selected page
            elif num == -1:
                break  # crawl the same URL again
            elif num == -2:
                sys.exit()
            else:
                print('错误:输入的号码有误.')


# Start the interactive crawler as soon as this script is executed.
spider()

上一篇:Lombok - 消除冗长的 java 代码


下一篇:14-bs4基本使用---爬取菜价