Prometheus、Alertmanager、Grafana 监控 Linux 与 MySQL

//检查各个端口的放行

//部署各个模块与应用

cd /usr/local/Prometheus_compose
vim docker-compose.yml
version: "3"

services:
  # Prometheus server. Its configuration directory is bind-mounted from
  # ./prometheus on the host, and the web UI/API is published on port 9090.
  prom:
    container_name: prometheus
    image: quay.io/prometheus/prometheus:latest
    # Started after the local node-exporter service.
    depends_on:
      - exporter
    environment:
      TZ: Asia/Shanghai
    ports:
      - "9090:9090"
    volumes:
      - "./prometheus:/etc/prometheus"
    # Same two flags as the single-string form, one flag per list item.
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'

  # node-exporter for the monitoring host, published on port 9100.
  #
  # Fix: the exporter is meant to report on the host, not on its own
  # container. Without the host root filesystem and PID namespace it only
  # sees the container's view, so the host root is mounted read-only at /host
  # and the exporter is pointed at it with --path.rootfs.
  exporter:
    image: prom/node-exporter:latest
    container_name: node-exporter
    hostname: cicd
    pid: host
    command:
      - '--path.rootfs=/host'
    volumes:
      - '/:/host:ro,rslave'
    ports:
      - "9100:9100"
    environment:
      - TZ=Asia/Shanghai

  # Grafana UI, published on port 3000 and started after Prometheus.
  #
  # Fix: the environment entries and the first volume used typographic quotes
  # (“ ”). YAML does not treat those as quoting characters, so they became
  # part of the variable names/values and of the container mount path.
  # They are replaced with plain ASCII quoting.
  grafana:
    image: grafana/grafana
    container_name: grafana
    ports:
      - "3000:3000"
    environment:
      # NOTE(review): the admin password is hard-coded here; consider
      # supplying it from an env file or secret instead.
      - "GF_SECURITY_ADMIN_PASSWORD=123123"
      - "GF_INSTALL_PLUGINS=alexanderzobnin-zabbix-app"
      - TZ=Asia/Shanghai
    restart: "always"
    volumes:
      - ./grafana:/etc/grafana/
      # The edited grafana.ini is mounted over the default configuration file.
      - ./grafana/conf/grafana.ini:/etc/grafana/grafana.ini
      - ./grafana/data:/var/lib/grafana:rw
      - ./grafana/plugins:/var/lib/grafana/plugins:rw
      - /etc/localtime:/etc/localtime
    depends_on:
      - prom

  # Alertmanager, published on port 9093. The configuration (including the
  # e-mail templates) is bind-mounted from ./alertmanager/config.
  alertmanager:
    container_name: alertmanager
    hostname: alertmanager
    image: prom/alertmanager:latest
    restart: always
    environment:
      TZ: Asia/Shanghai
    ports:
      - "9093:9093"
    volumes:
      - "./alertmanager/config:/etc/alertmanager"
      - "./alertmanager/data:/alertmanager/data"
    command:
      - "--config.file=/etc/alertmanager/alertmanager.yml"

  # cAdvisor container metrics for the monitoring host, published on
  # port 8080. The host paths below are mounted so it can inspect the host
  # and its Docker state.
  cadvisor:
    container_name: cadvisors
    image: google/cadvisor
    privileged: true
    restart: always
    environment:
      TZ: Asia/Shanghai
    ports:
      - "8080:8080"
    volumes:
      - "/:/rootfs:ro"
      - "/var/run:/var/run:rw"
      - "/sys:/sys:ro"
      - "/var/lib/docker/:/var/lib/docker:ro"

  # grafana-reporter, published on port 8686. The -ip flag passes the
  # address of the Grafana instance it reads dashboards from.
  grafana-reporter:
    container_name: grafana_reporter
    image: izakmarais/grafana-reporter
    command: "-ip grafana.mitaiot.com"
    environment:
      TZ: Asia/Shanghai
    ports:
      - "8686:8686"

//编辑报警模块的配置文件

cd /usr/local/Prometheus_compose/alertmanager/config
cat alertmanager.yml
# Alertmanager configuration: every alert is routed to a single e-mail
# receiver whose recipient and HTML body come from the template file below.
global:
  resolve_timeout: 5m
  # Outgoing mail server and credentials.
  smtp_smarthost: "smtp.sina.com:587"
  smtp_hello: "sina.com"
  smtp_from: "123456789@sina.com"
  smtp_auth_username: "123456789@sina.com"
  smtp_auth_password: "aabbccdd"
  smtp_require_tls: false

# Template file defining "email.to" and "email.to.html".
templates:
  - "/etc/alertmanager/alertmanager-tmpl/email.tmpl"

route:
  receiver: "email"
  group_by:
    - "alertname"
  group_wait: 5s
  group_interval: 5s
  repeat_interval: 5m

receivers:
  - name: "email"
    email_configs:
      - to: '{{ template "email.to" . }}'
        html: '{{ template "email.to.html" . }}'
        # Also send a notification when the alert resolves.
        send_resolved: true

# Mute "warning" alerts while a "critical" alert with the same alertname,
# dev and instance labels is firing.
# NOTE(review): the alert rules in this document set `severity: 1`, so this
# rule may never match as written - confirm the intended severity values.
inhibit_rules:
  - source_match:
      severity: "critical"
    target_match:
      severity: "warning"
    equal:
      - "alertname"
      - "dev"
      - "instance"

//编辑发送的邮件模板


cd /usr/local/Prometheus_compose/alertmanager/config/alertmanager-tmpl
cat email.tmpl 
{{/*
  Alertmanager e-mail templates.
    "email.from" / "email.to" : sender and recipient addresses.
    "email.to.html"           : HTML body, one section per alert in .Alerts.

  Fix: Go time layouts must be written using the reference time
  "2006-01-02 15:04:05". The previous layout "2019-08-04 16:58:15" is not a
  valid layout and produced a garbled trigger time.
*/}}
{{ define "email.from" }}123456789@sina.com{{ end }}
{{ define "email.to" }}123456789@sina.com{{ end }}
{{ define "email.to.html" }}
{{ range .Alerts }}
=========start==========<br>
告警程序: prometheus_alert <br>
告警级别: {{ .Labels.severity }} 级 <br>
告警类型: {{ .Labels.alertname }} <br>
故障主机: {{ .Labels.instance }} <br>
告警主题: {{ .Annotations.summary }} <br>
告警详情: {{ .Annotations.description }} <br>
触发时间: {{ .StartsAt.Format "2006-01-02 15:04:05" }} <br>
=========end==========<br>
{{ end }}
{{ end }}

cd /usr/local/Prometheus_compose/grafana/conf
vim grafana.ini    # 配置文件太长,标出修改部分
# Excerpt of grafana.ini: only the modified settings are listed.

# Anonymous visitors get read-only (Viewer) access to the "Main Org." org.
[auth.anonymous]
enabled = true
org_name = Main Org.
org_role = Viewer

# Outgoing mail settings used by Grafana.
[smtp]
enabled = true
host = smtp.sina.com:587
user = 123456789@sina.com
password =dc28ac6ec64af9c1
skip_verify = true
from_address = 123456789@sina.com
from_name = Grafana
ehlo_identity =

# Fix: default_timezone is a [date_formats] option; under [smtp] it had no
# effect. The explanatory note is kept on its own line so that it cannot be
# read as part of the value.
# Time zone setting added:
[date_formats]
default_timezone = Asia/Shanghai

//修改 Prometheus 的配置文件

cd /usr/local/Prometheus_compose/prometheus
vim prometheus.yml

# Prometheus server configuration (prometheus.yml).
global:
  # Scrape targets every 5 seconds (Prometheus default: every 1 minute).
  scrape_interval: 5s
  # Evaluate rules every 5 seconds (Prometheus default: every 1 minute).
  evaluation_interval: 5s

# Alert rule files, loaded from the mounted rules directory.
rule_files:
  - "/etc/prometheus/rules/*.yml"

# Alertmanager instance that receives fired alerts.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - "172.16.225.154:9093"

scrape_configs:
  # Prometheus scraping itself.
  - job_name: "prometheus"
    static_configs:
      - targets:
          - "172.16.225.154:9090"

  # All exporters share the single "node" job: node-exporter (9100),
  # cAdvisor (8080/8085) and mysqld-exporter (9104), plus any targets found
  # in the file-based service discovery JSON files.
  - job_name: "node"
    file_sd_configs:
      - files:
          - "/etc/prometheus/groups/nodegroups/*.json"
    static_configs:
      - targets:
          - "172.16.225.154:9100"
          - "172.16.225.156:9100"
          - "172.16.225.155:9100"
          - "172.16.225.157:9100"
          - "172.16.225.156:8085"
          - "172.16.225.154:8080"
          - "172.16.225.155:8085"
          - "172.16.225.157:8085"
          - "172.16.225.157:9104"

//配置报警规则
cd /usr/local/Prometheus_compose/prometheus/rules
groups:
# Fires when a target of the "node" scrape job has been down for 15 seconds.
- name: node-up
  rules:
    - alert: node-up
      expr: 'up{job="node"} == 0'
      for: 15s
      annotations:
        description: "{{ $labels.instance }} 检测到异常停止!请重点关注!!!"
        summary: "{{ $labels.instance }} 已停止运行!"
      labels:
        team: node
        severity: "1"

# Fires when CPU usage (100 minus the average idle percentage) stays above
# 90% for 1 minute.
- name: node-cpu
  rules:
    - alert: node-cpu
      expr: '100 - ((avg by (instance,job,env)(irate(node_cpu_seconds_total{mode="idle"}[30s]))) *100) > 90'
      for: 1m
      annotations:
        description: "{{ $labels.instance }} 检测CPU连续1分钟占用率超出90%!请重点关注!!!"
        summary: "{{ $labels.instance }} CPU使用率超过 百分之90!"
      labels:
        team: node
        level: warning
        severity: "1"

# Fires when memory usage is above 90%.
# Used memory is computed as total - free - buffers - cached.
#
# Fix: the description annotation was copied from the CPU alert and reported
# CPU usage; it now describes memory usage.
- name: node-mem
  rules:
  - alert: node-mem
    expr: ((node_memory_MemTotal_bytes -(node_memory_MemFree_bytes+node_memory_Buffers_bytes+node_memory_Cached_bytes) )/node_memory_MemTotal_bytes ) * 100 > 90
    for: 5s
    labels:
      severity: 1
      team: node
      level: warning
    annotations:
      summary: "{{ $labels.instance }} MEM使用率超过 百分之90!"
      description: "{{ $labels.instance }} 检测到内存使用率超出90%!请重点关注!!!"

# Fires when an ext3/ext4/xfs filesystem has been more than 90% used for
# 1 minute.
#
# Fix: the old inline comment said 95 although the threshold is 90, and the
# messages misspelled "挂载分区" (mounted partition) as "挂在分区".
- name: node-disk_used
  rules:
  - alert: node-disk_used
    expr: 100 - (node_filesystem_free_bytes{fstype=~"ext3|ext4|xfs"} / node_filesystem_size_bytes{fstype=~"ext3|ext4|xfs"} * 100) > 90
    for: 1m
    labels:
      severity: 1
      team: node
      level: warning
    annotations:
      summary: "{{ $labels.instance }} 挂载分区使用率超过 百分之90!"
      description: "{{ $labels.instance }} 挂载分区使用率超出90%!请重点关注!!!"

# 如需监控 MySQL、容器和主机信息,需要在被监控的主机上部署 prom/node-exporter、cadvisor 和 prom/mysqld-exporter。


version: "3"

services:
  # node-exporter for the monitored host, published on port 9100.
  #
  # Fix: the exporter is meant to report on the host, not on its own
  # container. Without the host root filesystem and PID namespace it only
  # sees the container's view, so the host root is mounted read-only at /host
  # and the exporter is pointed at it with --path.rootfs.
  exporter:
    image: prom/node-exporter:latest
    container_name: node-exporter
    hostname: db01
    pid: host
    command:
      - '--path.rootfs=/host'
    volumes:
      - '/:/host:ro,rslave'
    ports:
      - "9100:9100"

  # cAdvisor container metrics for the monitored host. Container port 8080
  # is published on host port 8085.
  cadvisor:
    container_name: cadvisor
    image: google/cadvisor
    privileged: true
    restart: always
    ports:
      - "8085:8080"
    volumes:
      - "/:/rootfs:ro"
      - "/var/run:/var/run:rw"
      - "/sys:/sys:ro"
      - "/var/lib/docker/:/var/lib/docker:ro"

  # mysqld-exporter for the MySQL instance at 172.16.225.157:3306, published
  # on port 9104.
  #
  # Fix: the image tag is pinned. An untagged image resolves to the latest
  # release, and mysqld-exporter v0.15.0 and later no longer read the
  # DATA_SOURCE_NAME variable used below, so the connection settings would
  # not be picked up.
  mysqld-exporter:
    image: prom/mysqld-exporter:v0.14.0
    ports:
      - "9104:9104"
    restart: always
    container_name: mysql_exporter
    hostname: db01
    environment:
      # NOTE(review): this connects as root with the password in clear text;
      # consider a dedicated low-privilege monitoring account.
      - 'DATA_SOURCE_NAME=root:0GXwwchW4rP@(172.16.225.157:3306)/'
      - TZ=Asia/Shanghai

在 Grafana 中导入仪表盘模板,模板 ID 分别是:8919、7362
更多模板请访问 Grafana 官方仪表盘页面(grafana.com/grafana/dashboards),只需导入对应的 ID 即可

上一篇:Prometheus+Alertmanager配置邮件报警


下一篇:2.Prometheus邮件报警配置