Prometheus配置告警规则

https://www.cnblogs.com/linuxk/p/12036193.html

  • 修改prometheus配置文件
指定prometheus的规则文件路径或者文件名
vim prometheus.yml
# Rule files loaded by Prometheus; glob patterns are accepted.
rule_files:
  # Load every file in rules/ whose name ends with _rules.yml.
  - "rules/*_rules.yml"
  # Alternative entries, kept disabled for reference:
  # - 'prometheus_rules.yml'
  # - "./rule/*.yaml"
  # - "first_rules.yml"
  # - "second_rules.yml"
chown -R prometheus:prometheus /opt/prometheus/rules/

mv prometheus_rules.yml rules/

此配置所有规则都写入一个文件里面。

  • 重启prometheus
systemctl restart prometheus

journalctl -u prometheus -fn 200
#停止
#ps -ef | grep prometheus | grep  -v grep | awk '{print $2}' | xargs kill -9
#或者
#curl -XPOST http://localhost:9090/-/quit

#重载
#curl -XPOST http://localhost:9090/-/reload
vim prometheus_rules.yml
# Recording rules: each rule precomputes an expression and stores the result
# as a new series that dashboards and other rule files can query by name.
groups:
  - name: alive
    rules:
      # Re-exports the `up` series under a stable recorded name.
      - record: node:ping:total
        expr: up
  - name: cpu
    rules:
      # CPU usage in percent: 100 minus the averaged idle rate per
      # instance/ip/hostname. Other rule files can use this series directly.
      - record: node:cpu_usage:ratio
        expr: ((100 - (avg by(instance,ip,hostname) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)))
  - name: mem
    rules:
      # Memory usage in percent: 100 minus the share of total memory that is
      # free, buffered or cached. The three reclaimable terms are grouped in
      # parentheses so that they are all subtracted from the total; written
      # as "Total - Free + Buffers + Cached" the buffers and cache would be
      # added to used memory and the result could fall below zero.
      - record: node:memory_usage:ratio
        expr: 100 - (node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes) / node_memory_MemTotal_bytes * 100
  • 检查配置
[root@prometheus prometheus]# ./promtool check config prometheus.yml
Checking prometheus.yml
  SUCCESS: 1 rule files found

Checking prometheus_rules.yml
  FAILED:
     prometheus_rules.yml: groupname: "alive" is repeated in the same file

Prometheus配置告警规则

systemctl restart prometheus
journalctl -u prometheus -fn 200
vim rules/disk_rules.yml
# Alerting rules for filesystem capacity.
groups:
  - name: disk-monitor
    rules:
      # Fires when the "/" filesystem (ext*/xfs, job "node") that is not
      # mounted read-only has had less than 30% space available for 1 minute.
      - alert: HostOutOfDiskSpace
        expr: '(node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes{job="node",fstype=~"ext.*|xfs",mountpoint ="/"} < 30 and ON (instance, device, mountpoint) node_filesystem_readonly == 0'
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: 'Host out of disk space (instance {{ $labels.instance }})'
          # Literal block scalar: the two detail lines keep their two-space
          # indent and no trailing newline is appended.
          description: |-
            Disk is almost full (< 30% left)
              VALUE = {{ $value }}
              LABELS = {{ $labels }}
vim rules/cpu_rules.yml
# Alerting rules for CPU utilisation.
groups:
  - name: cpu-monitor
    rules:
      # Fires when the per-instance non-idle CPU percentage (2m rate) has
      # stayed above 80 for 5 minutes.
      - alert: HostHighCpuLoad
        expr: '100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Host high CPU load (instance {{ $labels.instance }})'
          # Alternative English description, kept disabled:
          # description: "CPU load is > 80%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
          description: '服务器5分钟内CPU使用率超过80%!(当前值: {{ $value }}%)'
vim rules/alertmanager_rules.yml
# Alerting rules for the Prometheus-to-Alertmanager connection.
groups:
  - name: alertmanager-monitor
    rules:
      # Fires immediately (for: 0m) when Prometheus reports fewer than one
      # discovered Alertmanager.
      - alert: PrometheusNotConnectedToAlertmanager
        expr: 'prometheus_notifications_alertmanagers_discovered < 1'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: 'Prometheus not connected to alertmanager (instance {{ $labels.instance }})'
          # Literal block scalar: the two detail lines keep their two-space
          # indent and no trailing newline is appended.
          description: |-
            Prometheus cannot connect the alertmanager
              VALUE = {{ $value }}
              LABELS = {{ $labels }}
vim rules/memory_rules.yml
# Alerting rules for host memory.
groups:
  - name: memory-monitor
    rules:
      # Fires when available memory has been below 20% of total memory for
      # 2 minutes.
      - alert: HostOutOfMemory
        expr: 'node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 20'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 'Host out of memory (instance {{ $labels.instance }})'
          # Literal block scalar: the two detail lines keep their two-space
          # indent and no trailing newline is appended.
          description: |-
            Node memory is filling up (< 20% left)
              VALUE = {{ $value }}
              LABELS = {{ $labels }}
上一篇:prometheus监控主机


下一篇:prometheus快速部署