Prometheus警报规则

groups:
# Recording rules that precompute per-instance aggregates from node metrics.
- name: node_rules
  rules:
  # CPU utilisation (%) per instance: 100 minus the average idle share.
  # rate() is used rather than irate() so the series is a real 5-minute
  # average, as the record name states, and is stable enough to alert on.
  - record: instance:node_cpu:avg_rate5m
    expr: >-
      100 - avg by (instance) (
      rate(node_cpu_seconds_total{job="node_prod",mode="idle"}[5m])
      ) * 100
  # Number of idle-mode CPU series per instance, used as the CPU count.
  - record: instance:node_cpus:count
    expr: count by (instance) (node_cpu_seconds_total{mode="idle"})
  # Keeps node_load1 only for instances whose 1-minute load exceeds twice
  # the number of idle-mode CPU series; no sample is recorded otherwise.
  - record: instance:node_cpu_saturation_load1
    expr: >-
      node_load1 > on (instance)
      2 * count by (instance) (node_cpu_seconds_total{job="node_prod",mode="idle"})
  # Memory in use (%): total minus (free + cached + buffers), over total.
  - record: instance:node_memory_usage:percentage
    expr: >-
      (
      node_memory_MemTotal_bytes
      - (node_memory_MemFree_bytes + node_memory_Cached_bytes + node_memory_Buffers_bytes)
      ) / node_memory_MemTotal_bytes{job="node_prod"} * 100
  # Sum of the 1-minute page-in and page-out rates per instance, times 1024.
  - record: instance:node_memory_swap_io_bytes:sum_rate
    expr: >-
      1024 * sum by (instance) (
      rate(node_vmstat_pgpgin[1m]) + rate(node_vmstat_pgpgout[1m])
      )
  # Used space (%) of the filesystem mounted at "/".
  - record: instance:root:node_filesystem_usage:percentage
    expr: >-
      (node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"})
      / node_filesystem_size_bytes{mountpoint="/"} * 100
# Node health alerts. This entry continues the `groups:` list opened at the
# top of the file; repeating the top-level `groups:` key is a duplicate
# mapping key, which is invalid YAML.
- name: node_alerts
  rules:
  # CPU utilisation above 90% for a full hour.
  - alert: HighNodeCPU(CPU使用率)
    expr: instance:node_cpu:avg_rate5m > 90
    for: 60m
    labels:
      name: CPU
      severity: warning
    annotations:
      summary: 5分钟内的节点平均CPU使用率在至少60分钟内超过90%
      description: "CPU使用率过高,5分钟内平均CPU使用率为 {{ humanize $value}}%"
  # Fires whenever the saturation recording rule yields a sample for 5 minutes.
  - alert: HighNodeLoad(CPU饱和度)
    expr: instance:node_cpu_saturation_load1
    for: 5m
    labels:
      name: Load
      severity: warning
    annotations:
      # Summary now states "twice the CPU count", matching the description.
      summary: CPU负载平均数超过了CPU数量的两倍
      description: CPU平均负载至少5分钟内超过主机CPU数量的两倍
  # Memory usage above 95% for 5 minutes.
  - alert: HighNodeMem(内存使用率)
    expr: instance:node_memory_usage:percentage > 95
    for: 5m
    labels:
      name: Memory
      severity: warning
    annotations:
      summary: 使用的内存百分比至少在5分钟内超过95%
      description: "内存使用率过高,目前值为{{ humanize $value}}%"
  # Root filesystem usage above 95% for 5 minutes.
  - alert: DiskUsage(磁盘使用量)
    expr: instance:root:node_filesystem_usage:percentage > 95
    for: 5m
    labels:
      name: Disk
      severity: warning
    annotations:
      summary: "{{$labels.device}}磁盘使用量超过95%"
      description: "{{$labels.instance}}的{{ $labels.mountpoint }}使用量为{{ humanize $value}}%"
  # Linear extrapolation of the last hour of free space on "/" reaches zero
  # within the next 4 hours.
  - alert: DiskWillFillIn4Hours(线性回归预测磁盘空间将耗尽)
    expr: predict_linear(node_filesystem_free_bytes{mountpoint="/"}[1h], 4*3600) < 0
    for: 5m
    labels:
      name: Disk
      severity: critical
    annotations:
      summary: 基于最后一小时的增长历史记录,根文件系统的磁盘空间将在接下来的四小时内耗尽
      description: "{{$labels.instance}}挂载在 {{ $labels.mountpoint }}的设备 {{$labels.device}}磁盘空间可能将用完"
  # Fires when 100 minus the average io_time rate (%) drops below 40, i.e.
  # the busy share is above 60%. rate() replaces irate() so that a single
  # sample pair cannot reset the `for` timer.
  - alert: DiskIO(磁盘IO操作耗时)
    expr: >-
      100 - (avg by (instance) (rate(node_disk_io_time_seconds_total{job="node_prod"}[1m])) * 100) < 40
    for: 5m
    labels:
      name: Disk
      severity: critical
    annotations:
      summary: "{{$labels.instance}}磁盘IO使用率过高,磁盘IO大于60%"
      description: "{{$labels.instance}}:磁盘IO空闲百分比为:{{humanize $value}}%"
  # A single target of the job reports up == 0.
  - alert: InstanceDown(实例已停止响应抓取)
    expr: up{job="node_prod"} == 0
    for: 10s
    labels:
      severity: critical
    annotations:
      summary: "Host {{ $labels.instance }} is down!"
      description: "实例 {{ $labels.instance }}已停止响应抓取"
  # Average of `up` across the job is below 0.75.
  - alert: InstancesDown(作业中至少25%的实例无法响应抓取)
    expr: avg by (job) (up{job="node_prod"}) < 0.75
    for: 10s
    labels:
      severity: critical
    annotations:
      summary: 作业中25%以上的实例停止响应抓取
      description: "作业 {{$labels.job}}中至少25%的实例无法响应抓取"
  # No `up` series exists for the job at all. absent() only carries labels
  # taken from the selector's equality matchers (here: job), so the
  # description refers to the job rather than to an instance label that
  # would always render empty.
  - alert: InstancesGone(UP指标缺失警报)
    expr: absent(up{job="node_prod"})
    for: 10s
    labels:
      severity: critical
    annotations:
      summary: 节点作业中的UP指标消失
      description: "作业 {{ $labels.job }} 的UP指标消失"
# Network throughput alerts. This entry continues the `groups:` list opened
# at the top of the file; repeating the top-level `groups:` key is a
# duplicate mapping key, which is invalid YAML.
- name: network_alerts
  rules:
  # Receive throughput above 20 MiB/s on any interface not excluded by the
  # device regex. rate() over 5m gives the 5-minute average that the
  # description reports; irate() would only look at the last two samples.
  - alert: HostNetwork_receive(网卡接收流量异常)
    expr: >-
      rate(node_network_receive_bytes_total{job="node_prod",device!~"lo|qb.*|qv.*|tap.*|bond[0-9]|cbr[0-9]|veth.*|virbr.*|ovs-system"}[5m])
      / 1024 / 1024 > 20
    for: 1m
    labels:
      name: Network_receive
      severity: warning
    annotations:
      summary: "{{$labels.instance}} 网卡接收流量异常"
      description: "{{$labels.instance}} 网卡{{$labels.device}} 5分钟平均接收流量为 {{ humanize $value }}MB/s"
  # Transmit throughput above 20 MiB/s, same device filter as above.
  - alert: hostNetwork_transmit(网卡流出流量异常)
    expr: >-
      rate(node_network_transmit_bytes_total{job="node_prod",device!~"lo|qb.*|qv.*|tap.*|bond[0-9]|cbr[0-9]|veth.*|virbr.*|ovs-system"}[5m])
      / 1024 / 1024 > 20
    for: 1m
    labels:
      name: Network_transmit
      severity: warning
    annotations:
      summary: "{{$labels.instance}} 网卡流出流量异常"
      # The description previously said "receive" for this transmit alert.
      description: "{{$labels.instance}} 网卡{{$labels.device}} 5分钟平均流出流量为 {{ humanize $value }}MB/s"
# TLS certificate expiry alert. This entry continues the `groups:` list
# opened at the top of the file; repeating the top-level `groups:` key is a
# duplicate mapping key, which is invalid YAML.
- name: SSL证书状态
  rules:
  # Fires when the earliest certificate expiry is less than 3 days away
  # (seconds until expiry divided by 86400) for at least 1 hour.
  - alert: "SSL证书过期警告"
    expr: (probe_ssl_earliest_cert_expiry - time()) / 86400 < 3
    for: 1h
    labels:
      # Uses the same severity vocabulary (warning/critical) as every other
      # alert in this file so severity-based matching treats it alike.
      severity: warning
    annotations:
      description: '域名{{$labels.instance}}的证书还有{{ printf "%.1f" $value }}天就过期了,请尽快更新证书'
      summary: "SSL证书过期警告"
# Probe reachability alert. This entry continues the `groups:` list opened
# at the top of the file; repeating the top-level `groups:` key is a
# duplicate mapping key, which is invalid YAML.
- name: blackbox_network_stats
  rules:
  # Fires when a probe has reported probe_success == 0 for 3 minutes.
  - alert: blackbox_network_stats
    expr: probe_success == 0
    for: 3m
    labels:
      severity: critical
    annotations:
      summary: "接口/主机/端口 {{ $labels.instance }}  无法联通"
      description: "请尽快检测"
# Alerts about Prometheus itself. This entry continues the `groups:` list
# opened at the top of the file; repeating the top-level `groups:` key is a
# duplicate mapping key, which is invalid YAML.
- name: prometheus_alerts
  rules:
  # The last configuration reload has been reported unsuccessful for 10 minutes.
  - alert: PrometheusConfigReloadFailed(Prometheus配置重载失败)
    expr: prometheus_config_last_reload_successful == 0
    for: 10m
    labels:
      severity: warning
    annotations:
      # Quoted because the text contains an apostrophe and a template.
      description: "Reloading Prometheus' configuration has failed on {{ $labels.instance }}."
  # Fewer than one Alertmanager has been discovered for 10 minutes.
  - alert: PrometheusNotConnectedToAlertmanagers(Prometheus没有发现任何Alertmanager)
    expr: prometheus_notifications_alertmanagers_discovered < 1
    for: 10m
    labels:
      severity: warning
    annotations:
      description: "Prometheus {{ $labels.instance }} is not connected to any Alertmanagers"
上一篇:Linux Shell 学习笔记


下一篇:shell中$[]、(())、[[]]及expr表达式