https://www.cnblogs.com/linuxk/p/12036193.html
- 修改prometheus配置文件
指定prometheus的规则文件路径或者文件名
vim prometheus.yml
# Rule files referenced from prometheus.yml; each entry may be a glob.
rule_files:
  - 'rules/*_rules.yml'
  # Alternatives kept for reference (disabled):
  # - 'prometheus_rules.yml'
  # - "./rule/*.yaml"
  # - "first_rules.yml"
  # - "second_rules.yml"
# Give the prometheus user/group ownership of the rules directory, then move
# the rule file into it.
# NOTE(review): chown runs before mv, so the moved file keeps whatever owner it
# had before — confirm its ownership after the move.
chown -R prometheus:prometheus /opt/prometheus/rules/
mv prometheus_rules.yml rules/
此配置所有规则都写入一个文件里面。
- 重启prometheus
# Restart the service, then follow its journal starting from the last 200 lines.
systemctl restart prometheus
journalctl -u prometheus -fn 200
# Stop (force-kill every process matching "prometheus"):
# ps -ef | grep prometheus | grep -v grep | awk '{print $2}' | xargs kill -9
# Or:
# curl -XPOST http://localhost:9090/-/quit
# Reload:
# curl -XPOST http://localhost:9090/-/reload
vim prometheus_rules.yml
# Recording rules: precompute per-node series so other rule files can refer to
# them by name instead of repeating the expressions.
groups:
  - name: alive
    rules:
      # Scrape health per target as reported by the built-in `up` series.
      - record: node:ping:total
        expr: up
  - name: cpu
    rules:
      # CPU usage in percent: 100 minus the idle share, averaged across CPUs
      # per (instance, ip, hostname).
      # Other rule files can use this recorded series directly.
      - record: node:cpu_usage:ratio
        expr: ((100 - (avg by(instance,ip,hostname) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)))
  - name: mem
    rules:
      # Memory usage in percent: (total - (free + buffers + cached)) / total.
      # The inner parentheses matter: without them buffers and cached are
      # added to the used amount instead of subtracted, and the result is
      # no longer a usage percentage.
      - record: node:memory_usage:ratio
        expr: >-
          (node_memory_MemTotal_bytes
          - (node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes))
          / node_memory_MemTotal_bytes * 100
- 检查配置
[root@prometheus prometheus]# ./promtool check config prometheus.yml
Checking prometheus.yml
SUCCESS: 1 rule files found
Checking prometheus_rules.yml
FAILED:
prometheus_rules.yml: groupname: "alive" is repeated in the same file
systemctl restart prometheus
journalctl -u prometheus -fn 200
vim rules/disk_rules.yml
# Alert when the root filesystem of a node is running low on free space.
groups:
  - name: disk-monitor
    rules:
      - alert: HostOutOfDiskSpace
        # Percentage of bytes still available on "/" (ext*/xfs, job "node")
        # is below 30; the `and` clause keeps only filesystems whose
        # read-only flag is 0.
        expr: >-
          (node_filesystem_avail_bytes * 100)
          / node_filesystem_size_bytes{job="node",fstype=~"ext.*|xfs",mountpoint ="/"} < 30
          and ON (instance, device, mountpoint) node_filesystem_readonly == 0
        # The condition must hold for one minute before the alert fires.
        for: 1m
        labels:
          severity: warning
        annotations:
          # Quoted so the template braces are always read as a plain string.
          summary: 'Host out of disk space (instance {{ $labels.instance }})'
          description: "Disk is almost full (< 30% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
vim rules/cpu_rules.yml
# Alert when a host's CPU usage stays above 80%.
groups:
  - name: cpu-monitor
    rules:
      - alert: HostHighCpuLoad
        # 100 minus the idle percentage (2m rate, averaged per instance).
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80
        # The condition must hold for five minutes before the alert fires.
        for: 5m
        labels:
          severity: warning
        annotations:
          # Quoted so the template braces are always read as a plain string.
          summary: 'Host high CPU load (instance {{ $labels.instance }})'
          # Previous English wording, kept for reference:
          # description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
          description: "服务器5分钟内CPU使用率超过80%!(当前值: {{ $value }}%)"
vim rules/alertmanager_rules.yml
# Alert when Prometheus has not discovered any Alertmanager to notify.
groups:
  - name: alertmanager-monitor
    rules:
      - alert: PrometheusNotConnectedToAlertmanager
        # Fewer than one discovered Alertmanager.
        expr: prometheus_notifications_alertmanagers_discovered < 1
        # No hold time: fires on the first evaluation where the condition is true.
        for: 0m
        labels:
          severity: critical
        annotations:
          # Quoted so the template braces are always read as a plain string.
          summary: 'Prometheus not connected to alertmanager (instance {{ $labels.instance }})'
          description: "Prometheus cannot connect the alertmanager\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
vim rules/memory_rules.yml
# Alert when a host has less than 20% of its memory available.
groups:
  - name: memory-monitor
    rules:
      - alert: HostOutOfMemory
        # Available memory as a percentage of total memory.
        expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 20
        # The condition must hold for two minutes before the alert fires.
        for: 2m
        labels:
          severity: warning
        annotations:
          # Quoted so the template braces are always read as a plain string.
          summary: 'Host out of memory (instance {{ $labels.instance }})'
          description: "Node memory is filling up (< 20% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"