一、shell脚本:统计cpu、内存、磁盘、I/O、连接数、历史登录用户、计划任务等信息(health_check.sh)
#!/bin/bash
#
# health_check.sh - collect CPU, memory, disk, I/O, connection, login-history
# and crontab information and write it, together with a PASS/FAILED verdict
# per section, to $Result_file.
#
# Every section ends with a line "Check_Result:PASS" or "Check_Result:FAILED"
# framed by dashed separator lines.
#
# The script is deliberately best-effort (no `set -e`): a missing tool such as
# iostat or sar must not prevent the remaining sections from being written.

#Result_file=/usr/local/script/Health_check/test/result.txt
Result_file=/tmp/system_tmp/systeminfo.txt # file the check results are stored in

# Make sure the target directory exists, then start from an empty result file.
mkdir -p "$(dirname "$Result_file")"
: >"$Result_file"

# Disk usage (%): a filesystem above this value fails the disk check.
disk_judge=75
# CPU load: a load average above "number of cores * 50%" fails the CPU check.
cpu_judge=$(grep -c "processor" /proc/cpuinfo | awk '{print $1 * 0.50}')
# Memory usage (fraction of total): above this value the memory check fails.
Mem_judge=0.80
# I/O utilisation (%): above this value the I/O check fails; reference only.
Io_judge=80
# SSH logins: source addresses matching this regex are treated as intranet and
# ignored. The trailing dots keep "^10" from also matching 100.x-109.x.
# NOTE(review): "^172\." still covers all of 172.0.0.0/8, not only the private
# 172.16.0.0/12 range - confirm whether that is intended.
Last_judge="^172\.|^10\."
# Crontab entries matching this regex (log rotation scripts etc.) are ignored.
Cron_judge="Scp_Tool|first_pvuv2|logservicerestart|cut_log|log_cut|log_gzip|clear_log"

#######################################
# Append a separator line to the result file.
# Arguments: $1 - "a" for a titled banner, anything else for a dashed line
#            $2 - banner title (only used with "a")
#######################################
Interval() {
  if [[ "$1" == a ]]; then
    echo "********************************$2********************************" >>"$Result_file"
  else
    echo "----------------------------------------" >>"$Result_file"
  fi
}

# Succeeds (status 0) when $1 is numerically greater than $2.
_exceeds() {
  awk -v num1="$1" -v num2="$2" 'BEGIN { exit !(num1 + 0 > num2 + 0) }'
}

# Append the framed "Check_Result:<verdict>" footer; $1 is PASS or FAILED.
_write_result() {
  Interval b
  echo "Check_Result:$1" >>"$Result_file"
  Interval b
}

# Fail when any filesystem (tmpfs and boot excluded) is above disk_judge percent.
Disk_Check() {
  local Result="PASS"
  local usage
  while read -r usage; do
    # Skip anything that is not a plain number (localized header, "-" ...).
    if [[ "$usage" =~ ^[0-9]+$ ]] && ((10#$usage > disk_judge)); then
      Result=FAILED
    fi
  done < <(df -hP | grep -Ev 'Filesystem|tmpfs|boot' | awk '{print $5}' | sed 's/%//g')

  Interval a "DISK INFO"
  df -h >>"$Result_file"
  _write_result "$Result"
}

# Fail when any of the load averages reported by uptime exceeds cpu_judge.
Cpu_Check() {
  local Result="PASS"
  local Load_average load
  local -a loads=()
  Load_average=$(uptime | awk -F "average:" '{print $2}')
  # " 0.10, 0.20, 0.30" -> (0.10 0.20 0.30)
  read -r -a loads <<<"${Load_average//,/ }"
  for load in "${loads[@]}"; do
    if _exceeds "$load" "$cpu_judge"; then
      Result=FAILED
    fi
  done

  Interval a "CPU INFO"
  iostat | grep -A1 avg-cpu >>"$Result_file"
  echo "load_average:$Load_average" >>"$Result_file"
  _write_result "$Result"
}

# Record the number of TCP connections per state. There is no threshold for
# this section, so the verdict is always PASS (previously it echoed whatever
# verdict the preceding check had left in the shared Result variable).
Connection_Check() {
  local Result="PASS"
  Interval a "CONNECTTION INFO"
  netstat -n | awk '/^tcp/ {++S[$NF]} END {for(a in S) print a, S[a]}' >>"$Result_file"
  _write_result "$Result"
}

# Fail when the used-memory fraction exceeds Mem_judge.
Mem_Check() {
  local Result="PASS"
  local Mem_use
  # Newer `free` prints an "available" column; use it when present. Otherwise
  # fall back to the older layout where columns 4, 6 and 7 of the Mem row are
  # free, buffers and cached. The header row has no row label, hence the +1.
  Mem_use=$(free -m | awk '
    NR == 1 {
      for (i = 1; i <= NF; i++) if ($i == "available") avail = i + 1
      next
    }
    /^Mem/ && $2 > 0 {
      if (avail) print 1 - $avail / $2
      else print 1 - ($4 + $6 + $7) / $2
    }')
  if [[ -n "$Mem_use" ]] && _exceeds "$Mem_use" "$Mem_judge"; then
    Result=FAILED
  fi

  Interval a "Memory INFO"
  free -m >>"$Result_file"
  _write_result "$Result"
}

# Fail when the value in column 10 of any "Average:" row of `sar -dp 1 3`
# exceeds Io_judge.
# NOTE(review): column 10 is presumed to be %util - confirm for the installed
# sysstat version.
Io_Check() {
  local Result="PASS"
  local sar_output util
  Interval a "IO INFO"
  sar_output=$(sar -dp 1 3 | grep "Average")
  if [[ -n "$sar_output" ]]; then
    printf '%s\n' "$sar_output" >>"$Result_file"
  fi
  # Evaluate the captured sar rows directly instead of re-reading the whole
  # result file; rows containing "util" are the column header.
  while read -r util; do
    if [[ -n "$util" ]] && _exceeds "$util" "$Io_judge"; then
      Result=FAILED
    fi
  done < <(awk '/Average:/ && !/util/ {print $10}' <<<"$sar_output")
  _write_result "$Result"
}

# Fail when `last` shows a login whose source does not match Last_judge.
Last_Check() {
  local Result="PASS"
  local lastcheck
  lastcheck=$(last | grep -Ev "tty|reboot|wtmp" | awk '{print $3}' \
    | grep -Ev "$Last_judge" | grep -c .)
  if [[ "$lastcheck" -ne 0 ]]; then
    Result=FAILED
  fi

  Interval a "LAST INFO"
  last -an 10 | uniq -f 9 >>"$Result_file"
  _write_result "$Result"
}

# Fail when /etc/crontab or any user crontab contains an entry that is not
# covered by Cron_judge.
Crontab_Check() {
  local Result="PASS"
  local system_crons user_crons crocheck1 crocheck2
  # /etc/crontab without comments, environment settings and whitelisted jobs.
  system_crons=$(grep -Ev "^#|HOME|MAILTO|PATH|SHELL|$Cron_judge" /etc/crontab | sed '/^$/d')
  # Crontabs of every account in /etc/passwd, collected once.
  user_crons=$(cut -f 1 -d : /etc/passwd | xargs -I {} crontab -l -u {})
  crocheck1=$(grep -c . <<<"$system_crons")
  # Comment and blank lines are not jobs; ignore them like for /etc/crontab.
  crocheck2=$(grep -Ev "^[[:space:]]*(#|\$)|$Cron_judge" <<<"$user_crons" | grep -c .)
  if [[ "$crocheck1" -ne 0 || "$crocheck2" -ne 0 ]]; then
    Result=FAILED
  fi

  Interval a "CRONTAB INFO"
  if [[ -z "$user_crons" && "$crocheck1" -eq 0 ]]; then
    echo "no crontab for all" >>"$Result_file"
  else
    if [[ -n "$user_crons" ]]; then
      printf '%s\n' "$user_crons" >>"$Result_file"
    fi
    if [[ -n "$system_crons" ]]; then
      printf '%s\n' "$system_crons" >>"$Result_file"
    fi
  fi
  _write_result "$Result"
}

# Run every check in the original order.
main() {
  Disk_Check
  Cpu_Check
  Connection_Check
  Mem_Check
  Io_Check
  Last_Check
  Crontab_Check
}

main "$@"
二、使用ansible批量执行并返回执行结果(systemcheck.yml)
# systemcheck.yml - copy health_check.sh to each host, run it, and fetch the
# resulting systeminfo.txt back to the control node under /tmp/checklog/.

# Directory that holds both the script and its result file on the remote host.
- name: create dir
  file:
    path: /tmp/system_tmp
    state: directory
    mode: 0755

- name: copy health_check.sh to remote machine
  copy:
    src: "health_check.sh"
    dest: /tmp/system_tmp/health_check.sh
    owner: root
    group: root
    mode: a+x
    remote_src: False

# The script uses Bash arrays, so it is run with bash explicitly; `sh` only
# works on hosts where /bin/sh happens to be Bash.
- name: write health check results to /tmp/system_tmp/systeminfo.txt
  shell: "bash /tmp/system_tmp/health_check.sh"

# fetch stores the file per host below dest, i.e.
# /tmp/checklog/<inventory_hostname>/tmp/system_tmp/systeminfo.txt
- name: store file into /tmp/checklog/
  fetch:
    src: /tmp/system_tmp/systeminfo.txt
    dest: /tmp/checklog/
    # NOTE(review): `mode` does not appear among fetch's documented
    # parameters - confirm it is accepted by the Ansible version in use.
    mode: 0644
三、对每台服务器返回的结果进行分析,并格式化输出
#!/usr/bin/python3.6
# -*- coding: utf-8 -*-
"""Build a Word report from the per-host health check results.

Reads /tmp/checklog/<host>/tmp/system_tmp/systeminfo.txt for every entry in
/tmp/checklog/, counts hosts with at least one "Check_Result:FAILED" line,
and writes a summary page followed by one page per host to Result_file.
"""
import os
import sys
import datetime
import docx
from docx import Document
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
from docx.shared import Pt
from docx.shared import RGBColor

now_time = datetime.datetime.now()  # current time
# File the report is written to.
Result_file = "/usr/local/script/Health_check/hostcheck_result.docx"

# Directory holding the per-host files returned by ansible.
Path = "/tmp/checklog/"

# Read every host's result file once. Hosts are processed in sorted order so
# the report layout is reproducible between runs.
Listdir = []      # hosts that have a result file
Host_lines = {}   # host -> lines of its systeminfo.txt, newline stripped
for Dir in sorted(os.listdir(Path)):
    File = os.path.join(Path, Dir, "tmp/system_tmp/systeminfo.txt")
    if not os.path.isfile(File):
        # A stray entry or a host without a result file must not abort the
        # whole report.
        print("skip %s: %s not found" % (Dir, File), file=sys.stderr)
        continue
    with open(File, "r", encoding="utf-8") as f:
        Host_lines[Dir] = [line.strip('\n') for line in f]
    Listdir.append(Dir)

# Hosts with at least one failed check. Each host is listed once, so the
# failed-host count is a number of hosts, not a number of failed checks.
Fail_list = [
    Dir for Dir in Listdir
    if any("Check_Result:FAILED" in line for line in Host_lines[Dir])
]
# Number of hosts checked.
Total_count = len(Listdir)
# Number of hosts that did not pass.
Fail_count = len(Fail_list)

Title = "系统健康检测报告"
Check_user = "封孝立"
Check_time = now_time.strftime("%Y-%m-%d")
Check_hosts = int(Total_count)
Pass_host = int(Total_count) - int(Fail_count)
Failed_host = int(Fail_count)

# Create the Word document.
doc = Document()
# Title, horizontally centred.
Report_Title = doc.add_heading(Title, 0)
paragraph_format = Report_Title.paragraph_format
paragraph_format.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER

Checkinfo = '''
检测人:%s
检测时间:%s
检测项:cpu、内存、磁盘、I/O、连接数、历史登录用户、计划任务
检测标准:cpu负载>Cpu_core*0.5,内存使用率>0.8 ,磁盘使用率>0.75, IO>util*0.8,ssh外网登录
检测主机总数:%d
检测通过主机数:%d
检测故障主机数:%d
故障主机IP列表:
''' % (Check_user, Check_time, Check_hosts, Pass_host, Failed_host)
# Summary of the check run.
Basic_info = doc.add_paragraph(Checkinfo)

# Failed hosts, one per line, in red.
for failip in Fail_list:
    Failip = doc.add_paragraph('').add_run(u'' + failip.strip('\n'))
    Failip.font.color.rgb = RGBColor(250, 0, 0)
# Start the per-host details on a new page.
doc.add_page_break()

for Dir in Listdir:
    # Use the host name/IP as the heading and enlarge it.
    Ip_title = doc.add_heading('', 0).add_run(u"HOST IP:" + Dir)
    Ip_title.font.size = Pt(20)
    for line in Host_lines[Dir]:
        if "Check_Result:PASS" in line:
            # Passed verdicts in green.
            paragraph = doc.add_paragraph('').add_run(u'' + line)
            paragraph.font.size = Pt(12)
            paragraph.font.color.rgb = RGBColor(0, 255, 0)
            # paragraph.bold = True
        elif "Check_Result:FAILED" in line:
            # Failed verdicts in red.
            paragraph = doc.add_paragraph('').add_run(u'' + line)
            paragraph.font.size = Pt(12)
            paragraph.font.color.rgb = RGBColor(250, 0, 0)
            # paragraph.bold = True
        else:
            paragraph = doc.add_paragraph(line)
    # One page per host.
    doc.add_page_break()

doc.save(Result_file)
四、检测结果示例