# 通过requests拿到网页源代码
# 通过re来提取想要的有效信息
import requests
import re
# Request headers sent with every HTTP request: a mobile Chrome User-Agent
# string is supplied in place of the HTTP client's default one.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Linux; Android 8.0; Pixel 2 Build/OPD3.170816.012) "
        "AppleWebKit/537.36 (HTML, like Gecko) "
        "Chrome/90.0.4430.93 Mobile Safari/537.36 "
    ),
}
# Pre-compile every regular expression once, before the crawl loop starts.

# Matches one <li> entry of a news listing page and captures the article's
# link (content_url), its title, and the text of the trailing <span> (time).
sub = re.compile(
    r'<li id=".*?"><i class=".*?"></i><a href="(?P<content_url>.*?)" '
    r'target=".*?" title="(?P<title>.*?)">.*?</a>'
    r'<span>(?P<time>.*?)</span></li>',
    re.S,
)

# Captures the article body: everything between an `<img border="0" ...></p>`
# element and the `<div id="div_vote_id">` marker.
# An earlier variant anchored on `<div id="vsb_content_501">` was assigned to
# this same name and immediately overwritten, so it never took effect and has
# been dropped.
# NOTE(review): `.*` is greedy and re.S lets it span newlines, so the match
# anchors on the LAST `"></p>` that precedes the vote marker; pages without
# such an <img> produce no match at all — confirm this is intended.
detail_1 = re.compile(
    r'<img border="0".*"></p>(?P<detail>.*?)<div id="div_vote_id">',
    re.S,
)
# Crawl the news listing pages and append every article to one text file.
#
# Listing pages are requested as <base>/kyyw/<i>.htm for i = 389 down to 1 and
# labelled 1..389 (390 - i) in the output file and on the console.

# File that all output is appended to ("a" mode: earlier runs are kept).
_OUTPUT_PATH = "科院新闻.txt"

# Seconds to wait for the server before giving up; without a timeout
# requests.get() can block forever on a stalled connection.
_REQUEST_TIMEOUT = 10

# Single characters deleted from the article body during cleaning: the markup
# characters & < > / and every lowercase ASCII letter (which removes tag and
# attribute names).
# NOTE(review): this also deletes any lowercase Latin text that is part of the
# article itself — confirm that is acceptable.
_STRIP_CHARS = str.maketrans("", "", "&<>/abcdefghijklmnopqrstuvwxyz")

# Punctuation fragments left over once the characters above are gone. They
# are removed in this exact order; '=":' appears twice on purpose because
# removing ';-:' can create a new occurrence of it.
_LEFTOVER_FRAGMENTS = ('=":', ';-:', '=":', ': "', '="-', '="_"', '\r\n')


def _clean_detail(raw: str) -> str:
    """Return the article body *raw* with markup residue removed.

    The '(0, 0, 0)' literal is removed first (while its surrounding text is
    still intact), then the single characters in ``_STRIP_CHARS`` are deleted
    in one pass, and finally the leftover fragments are removed in order.
    """
    text = raw.replace('(0, 0, 0)', '')
    text = text.translate(_STRIP_CHARS)
    for fragment in _LEFTOVER_FRAGMENTS:
        text = text.replace(fragment, '')
    return text


# Walk the 389 listing pages.
for i in range(389, 0, -1):
    page_label = "第" + str(390 - i) + "页"

    # The file is opened once per page. Writes may sit in an in-memory buffer
    # until the file is closed; the `with` block guarantees the close (and so
    # the flush to disk) even if a request below raises.
    with open(_OUTPUT_PATH, "a", encoding='utf-8') as news:
        # Separator that marks where each page's articles begin.
        news.write("*" * 10 + page_label + "*" * 10 + "\n" * 2)

        # Progress report on the console.
        print(page_label + "新闻下载中")

        # Fetch the listing page source.
        url = "https://news.hist.edu.cn/kyyw/" + str(i) + ".htm"
        resp = requests.get(url, headers=headers, timeout=_REQUEST_TIMEOUT)
        listing_html = resp.content.decode()

        # One match per news entry on the listing page.
        for it in sub.finditer(listing_html):
            title = it.group("title")
            published = it.group("time")
            half = it.group("content_url")

            # Build the article URL from the link found in the listing.
            whole = "https://news.hist.edu.cn/" + half

            # Fetch the article page source.
            article = requests.get(whole, headers=headers, timeout=_REQUEST_TIMEOUT)
            article_html = article.content.decode()

            # Write title, cleaned body and date for every body match; an
            # article whose page does not match detail_1 writes nothing.
            for mor in detail_1.finditer(article_html):
                body = _clean_detail(mor.group("detail"))
                news.write("《" + title + "》" + "\n")
                news.write(body + "\n")
                news.write(published + "\n" * 3)
# 自己看完视频敲的,有很多操作可能有些沙雕,欢迎大佬在评论区指手画脚