20 古诗文网站诗文爬取(正则方法)

 

 1 """古诗文网爬虫"""
 2 
 3 
 4 import re
 5 import requests
 6 
 7 def parse_page(url):
 8     headers = {
 9         User-Agent: Mozilla/5.0,
10     }
11 
12     response = requests.get(url, headers)
13     # print(response.text)
14     text = response.text
15 
16     # re解析
17     titles = re.findall(r<div\sclass="cont">.*?<b>(.*?)</b>, text, re.DOTALL)     # .本不会匹配\n,加上参数re.DOTALL即对任何字符都有效
18     # print(titles)
19     dynasties = re.findall(r<p class="source">.*?<a.*?>(.*?)</a>, text, re.DOTALL)
20     # print(dynasties)
21     authors = re.findall(r<p class="source">.*?<a.*?>.*?<a.*?>(.*?)</a>, text, re.DOTALL)
22     # print(authors)
23     content_tags = re.findall(r<div class="contson" .*?>(.*?)</div>, text, re.DOTALL)
24     # print(content_tags)
25     contents = []
26     for content in content_tags:
27         x = re.sub(r<.*>, "", content).strip()
28         contents.append(x)
29     poems = []
30     for value in zip(titles, dynasties, authors, contents):
31         title, dynasty, author, content = value
32         poem = {
33             title: title,
34             dynasty: dynasty,
35             author: author,
36             content: content
37         }
38         poems.append(poem)
39     
40     # 输出诗文记录
41     for poem in poems:
42         print(poem)
43 
44 
def main():
    """Crawl listing pages 1 through 10 by handing each URL to parse_page."""
    page_template = "https://www.gushiwen.org/default_{}.aspx"
    # Build each page address lazily from the numbered URL template.
    page_urls = (page_template.format(page_no) for page_no in range(1, 11))
    for page_url in page_urls:
        parse_page(page_url)
50 
# Start the crawl only when this file is run as a script, not when imported.
if __name__ == "__main__":
    main()

 

20 古诗文网站诗文爬取(正则方法)

上一篇:GXYCTF2019 Ping Ping Ping


下一篇:git 上传文件到 gitee 码云远程仓库(强制上传)