# 近日爬取疫情新闻数据，顺便把代码整理了一下，分享出来。
import csv
import re

import requests
from bs4 import BeautifulSoup
def get_content(url, timeout=10):
    """Download an article page and return the text of its paragraphs.

    Args:
        url: Address of the article page to fetch.
        timeout: Seconds to wait for the server; passed straight to
            ``requests.get``. Without it a stalled connection would block
            the whole crawl indefinitely.

    Returns:
        The text of every ``<p>`` element on the page except the last four,
        concatenated with no separator. An empty string when the page has
        four or fewer ``<p>`` elements.

    Raises:
        requests.exceptions.RequestException: propagated from
            ``requests.get`` (including a timeout).
    """
    res = requests.get(url, timeout=timeout)
    # NOTE(review): setting ``encoding`` only influences ``res.text``; the
    # parser below is fed the raw bytes in ``res.content`` and detects the
    # charset on its own. Kept as in the original - confirm which was meant.
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.content, 'html.parser')
    paragraphs = soup.find_all('p')
    # The trailing four <p> elements are dropped - presumably footer
    # boilerplate on the article pages; TODO confirm against a live page.
    return ''.join(p.text for p in paragraphs[:-4])
# Crawl Sina news search result pages 25-38 for the query "肺炎疫情", keep the
# headlines whose title contains the highlighted keyword "疫情", download each
# article body with get_content() and dump the rows to '新浪2020<page>.csv'.
#
# NOTE(review): the original source had lost all indentation. The CSV is
# written inside the page loop here because ``filename`` is built per page;
# ``data`` is never reset, so every file holds the rows of all pages crawled
# so far (the last file holds everything). Confirm this nesting is intended.

# Captures, from the HTML of one <h2> search hit: the article link, the title
# markup (which must contain the red-highlighted keyword) and the second
# space-separated token of the "fgray_time" span, written to the time column.
HEADLINE_PATTERN = re.compile(
    '<h2><a href="(.*?)" target="_blank">(.*?<font color="red">疫情</font>.*?)</a>(?:.|\n)<span class="fgray_time">(?:.|\n).*? (.*?) .*?</span></h2>'
)
# Matches the <font ...> / </font> tags the search page wraps around keywords.
FONT_TAG_PATTERN = re.compile(r'</?font[^>]*>')

data = []
for page in range(25, 39):
    filename = '新浪2020' + str(page) + '.csv'
    newsurl = 'https://search.sina.com.cn/?q=%e8%82%ba%e7%82%8e%e7%96%ab%e6%83%85&c=news&from=&col=1_3&range=title&source=&country=&size=10&stime=m-01-01+00%3A00%3A00&etime=m-12-31+23%3A59%3A59&time=m&dpc=0&a=&ps=0&pf=0&page={}'.format(page)
    # Alternative query (keyword "疫情" only, no column or date filter):
    # https://search.sina.com.cn/?q=%e7%96%ab%e6%83%85&c=news&from=&col=&range=title&source=&country=&size=10&stime=&etime=&time=&dpc=0&a=&ps=0&pf=0&page=2
    # A timeout keeps one stalled request from hanging the whole crawl.
    res = requests.get(newsurl, timeout=10)
    soup = BeautifulSoup(res.text, 'html.parser')
    for heading in soup.find_all('h2'):
        for link, raw_title, timestamp in HEADLINE_PATTERN.findall(str(heading)):
            # Strip every highlight tag so the stored title is plain text,
            # even when more than one keyword was highlighted.
            title = FONT_TAG_PATTERN.sub('', raw_title)
            data.append([timestamp, title, link, get_content(link)])
    with open(filename, 'w', encoding='utf_8_sig', newline="") as f:
        # csv.writer quotes fields containing commas, quotes or newlines,
        # which article text routinely does; '\n' keeps the original
        # line ending.
        writer = csv.writer(f, lineterminator='\n')
        writer.writerow(['时间', '标题', '链接', '内容'])
        writer.writerows(data)
    print("已保存文件")
# 爬取网站
# 爬取数据部分展示