脚本运行在树莓派上:爬虫使用Python编写,定时发送通过sh脚本和crontab定时任务完成,邮件发送使用mutt。
爬虫的编写
import requests
from lxml import etree
import json
import re
from ast import literal_eval
import datetime
from prettytable import PrettyTable
# Browser User-Agent string shared by both header sets below.
_BROWSER_USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/88.0.4324.182 Safari/537.36 Edg/88.0.705.81'
)

# Request headers sent with the Baidu homepage request.
headers = {
    'Host': 'www.baidu.com',
    'Referer': 'https://www.baidu.com/',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': _BROWSER_USER_AGENT,
}

# Request headers sent with the Hupu request (no Host/Referer pinning).
headers_hupu = {
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'none',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': _BROWSER_USER_AGENT,
}
def get_weather(city_code="101010200", timeout=10):
    """Fetch the forecast data embedded in a weather.com.cn one-day page.

    Args:
        city_code: Station code inserted into the page URL. The default,
            "101010200", is the Haidian page the script originally
            hard-coded.
        timeout: Seconds to wait for the server before giving up, so an
            unattended run cannot hang indefinitely.

    Returns:
        The Python value parsed from the page's ``"1d"`` field (the text
        between ``"1d":`` and ``,"23d"``).

    Raises:
        requests.RequestException: On network failure, timeout, or an
            HTTP error status.
        ValueError: If the page does not contain the expected ``"1d"``
            field, or its content is not a valid Python literal.
    """
    url = "http://www.weather.com.cn/weather1d/%s.shtml" % city_code
    res = requests.get(url, timeout=timeout)
    res.raise_for_status()
    res.encoding = 'utf-8'

    # The data lives in an inline script, so a regex over the raw text is
    # enough; no HTML parse is needed.
    match = re.search(r'"1d":(.*?),"23d"', res.text)
    if match is None:
        raise ValueError('no "1d" forecast data found in %s' % url)

    # literal_eval accepts only Python literals, so page content is parsed
    # as data and never executed.
    return literal_eval(match.group(1))
def get_baidutop10(timeout=10):
    """Fetch the hot-search data embedded in the Baidu homepage.

    Args:
        timeout: Seconds to wait for the server before giving up, so an
            unattended run cannot hang indefinitely.

    Returns:
        The decoded JSON found in the ``<textarea id="hotsearch_data">``
        element of the page.

    Raises:
        requests.RequestException: On network failure, timeout, or an
            HTTP error status.
        ValueError: If the page has no ``hotsearch_data`` textarea or its
            content is not valid JSON (``json.JSONDecodeError`` is a
            ``ValueError`` subclass).
    """
    url = 'https://www.baidu.com/'
    res = requests.get(url, headers=headers, timeout=timeout)
    res.raise_for_status()
    res.encoding = 'utf-8'

    html = etree.HTML(res.text)
    payload = html.xpath('//textarea[@id="hotsearch_data"]/text()')
    if not payload:
        # Report the missing element explicitly rather than failing with a
        # bare IndexError.
        raise ValueError('hotsearch_data textarea not found in %s' % url)
    return json.loads(payload[0])
def get_weibotop10(timeout=10):
    """Fetch links and titles from the Weibo real-time hot-search table.

    Args:
        timeout: Seconds to wait for the server before giving up, so an
            unattended run cannot hang indefinitely.

    Returns:
        A ``(links, titles)`` tuple of lists taken from the first 11 table
        rows of ``#pl_top_realtimehot``. Either list may be empty if the
        page does not contain the table.

    Raises:
        requests.RequestException: On network failure, timeout, or an
            HTTP error status.
    """
    url = 'https://s.weibo.com/top/summary'
    res = requests.get(url, timeout=timeout)
    res.raise_for_status()
    res.encoding = 'utf-8'

    html = etree.HTML(res.text)
    # Both queries target the same anchors; only the extracted part differs.
    anchors = '//*[@id="pl_top_realtimehot"]/table/tbody/tr[position()<=11]/td[2]/a'
    top10_link = html.xpath(anchors + '/@href')
    top10_title = html.xpath(anchors + '/text()')
    return (top10_link, top10_title)
def get_huputop10(timeout=10):
    """Fetch links and titles of hot threads from the Hupu board page.

    Args:
        timeout: Seconds to wait for the server before giving up, so an
            unattended run cannot hang indefinitely.

    Returns:
        A ``(links, titles)`` tuple of lists, each capped at 10 entries,
        taken from the first three ``<li>`` items of every list under
        ``div.bbsHotPit``. Either list may be empty if the page layout
        does not match.

    Raises:
        requests.RequestException: On network failure, timeout, or an
            HTTP error status.
    """
    url = 'https://bbs.hupu.com/all-gambia'
    res = requests.get(url, headers=headers_hupu, timeout=timeout)
    res.raise_for_status()
    res.encoding = 'utf-8'

    html = etree.HTML(res.text)
    # Both queries target the same anchors; only the extracted attribute differs.
    anchors = '//div[@class="bbsHotPit"]/div/ul/li[position()<=3]/span/a'
    top10_title = html.xpath(anchors + '/@title')[:10]
    top10_link = html.xpath(anchors + '/@href')[:10]
    return (top10_link, top10_title)
def main():
    """Collect today's weather and trending topics into today_info.txt.

    Calls get_weather, get_weibotop10, get_huputop10 and get_baidutop10,
    formats their results as tab-separated text, and writes the report to
    ``today_info.txt`` in the current working directory.

    The file is rewritten on every run. The mail step sends the whole file
    under this fixed name, so appending would re-send every earlier report
    each day. All data is fetched before the file is opened, so a failed
    fetch raises without truncating the previous report.

    Raises:
        Whatever the fetch helpers raise; nothing is caught here.
    """
    today = datetime.datetime.today()

    weather_list = get_weather()
    wbtop_list = get_weibotop10()[1]
    hptop_list = get_huputop10()[1]
    bdtop_json = get_baidutop10()

    weather_tplt = "{0:10}\t{1:5}\t{2:5}\t{3:5}\t{4:5}"
    rank_tplt = "{0:5}\t{1:30}"

    lines = [
        '今天是%d年%d月%d日,下面是今天的天气与各平台热点。\n'
        % (today.year, today.month, today.day)
    ]

    lines.append("\n今日天气:\n")
    for each in weather_list:
        fields = each.split(",")
        if len(fields) < 6:
            # Skip malformed entries instead of crashing on a missing column.
            continue
        # Field 1 of each entry is deliberately not printed.
        lines.append(
            weather_tplt.format(fields[0], fields[2], fields[3], fields[4], fields[5]) + '\n'
        )

    lines.append("\n今日百度热点时事新闻:\n")
    # Slicing tolerates a list shorter than 10 entries.
    for rank, item in enumerate(bdtop_json['hotsearch'][:10], start=1):
        lines.append(rank_tplt.format(str(rank), item['pure_title']) + '\n')

    lines.append("\n今日微博热搜:\n")
    if wbtop_list:
        # The first row is the pinned entry; ranked entries start at 1 after it.
        lines.append(rank_tplt.format('置顶', wbtop_list[0]) + '\n')
        for rank, title in enumerate(wbtop_list[1:], start=1):
            lines.append(rank_tplt.format(str(rank), title) + '\n')

    lines.append("\n今日虎扑步行街热点:\n")
    for rank, title in enumerate(hptop_list, start=1):
        lines.append(rank_tplt.format(str(rank), title) + '\n')

    # Explicit UTF-8: the report is Chinese text and must not depend on the
    # locale of the environment the script is started from.
    with open('today_info.txt', 'w', encoding='utf-8') as f:
        f.writelines(lines)
# Run the crawler only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
爬虫主要涉及到中国天气网、虎扑步行街、微博热搜和百度首页热搜榜。基本没有碰到动态页面,爬取方式也比较简单,唯一要注意的是百度和虎扑的请求必须加上headers(浏览器请求头)才能爬取到内容。
使用mutt进行邮件发送
爬虫爬取下来的内容保存在today_info.txt中。mutt发送邮件的命令如下(请将$YOUREMAILADRESS$替换为收件人的邮箱地址):
mutt $YOUREMAILADRESS$ -s "今日天气与要闻" < today_info.txt
使用sh+crontab进行定时发送
首先写一个简单的sh脚本,运行python脚本和调用mutt命令:
# Stop at the first failing command so a failed crawl never triggers a mail
# with a stale or incomplete report.
set -e
# Step 1: run the crawler, which writes today_info.txt in the current directory.
python3 /home/lin/文档/everydaynews.py
# Step 2: mail the report; the file content becomes the message body.
mutt $YOUREMAILADRESS$ -s "今日天气与要闻" < today_info.txt
随后在crontab中进行计划任务:
sudo crontab -e
-----------
30 7 * * * bash /home/lin/文档/everydaynews.sh
每天早上七点半,树莓派会运行python脚本爬取内容,并用mutt把爬取内容发送到邮箱里。