===============================================
2021/2/9_第1次修改 ccb_warlock
===============================================
在很多年前整理过的容器监控方案(https://www.cnblogs.com/straycats/p/9281889.html)中,曾经采用在docker swarm中运行cAdvisor、Prometheus、Grafana来实现对容器与宿主机的监控。因为懂docker,上个月又被当作运维要求通过在kubernetes上实现监控系统的解决方案,于是我需要实现在kubernetes上运行这套解决方案。
在使用grafana的demo时,了解到监控k8s资源有个比cAdvisor更好用的服务Kube-state-metrics。
cAdvisor:
Kube-state-metrics:
Prometheus:
Grafana:
一、部署kubernetes
centos7可以参考:https://www.cnblogs.com/straycats/p/14322995.html
写教程时部署的版本是v1.20.1
二、部署cAdvisor
因为在kubernetes上运行,而kubelet已经集成了cAdvisor,所以不需要额外安装,直接使用kubelet即可。
三、部署Kube-state-metrics
3.1 创建编排脚本
# 创建目录
mkdir -p /opt/yaml
# 创建编排脚本
vi /opt/yaml/kube-state-metrics.yaml
将下面的内容保存到kube-state-metrics.yaml中,wq保存。
---
# RBAC: read-only (list/watch) access to the resource kinds kube-state-metrics exports metrics for.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  labels:
    app.kubernetes.io/name: kube-state-metrics
    app.kubernetes.io/version: v2.0.0-beta
  name: kube-state-metrics
rules:
  - apiGroups: [""]
    resources:
      - configmaps
      - secrets
      - nodes
      - pods
      - services
      - resourcequotas
      - replicationcontrollers
      - limitranges
      - persistentvolumeclaims
      - persistentvolumes
      - namespaces
      - endpoints
    verbs:
      - list
      - watch
  # NOTE(review): the `extensions` group is deprecated (its resources moved to
  # apps/ and networking.k8s.io/); kept for compatibility with older clusters,
  # harmless on v1.20.
  - apiGroups:
      - extensions
    resources:
      - daemonsets
      - deployments
      - replicasets
      - ingresses
    verbs:
      - list
      - watch
  - apiGroups:
      - apps
    resources:
      - statefulsets
      - daemonsets
      - deployments
      - replicasets
    verbs:
      - list
      - watch
  - apiGroups:
      - batch
    resources:
      - cronjobs
      - jobs
    verbs:
      - list
      - watch
  - apiGroups:
      - autoscaling
    resources:
      - horizontalpodautoscalers
    verbs:
      - list
      - watch
  # create on tokenreviews/subjectaccessreviews is needed for the optional
  # authn/authz filters of kube-state-metrics' own endpoints.
  - apiGroups:
      - authentication.k8s.io
    resources:
      - tokenreviews
    verbs:
      - create
  - apiGroups:
      - authorization.k8s.io
    resources:
      - subjectaccessreviews
    verbs:
      - create
  - apiGroups:
      - policy
    resources:
      - poddisruptionbudgets
    verbs:
      - list
      - watch
  - apiGroups:
      - certificates.k8s.io
    resources:
      - certificatesigningrequests
    verbs:
      - list
      - watch
  - apiGroups:
      - storage.k8s.io
    resources:
      - storageclasses
      - volumeattachments
    verbs:
      - list
      - watch
  - apiGroups:
      - admissionregistration.k8s.io
    resources:
      - mutatingwebhookconfigurations
      - validatingwebhookconfigurations
    verbs:
      - list
      - watch
  - apiGroups:
      - networking.k8s.io
    resources:
      - networkpolicies
    verbs:
      - list
      - watch
---
apiVersion: v1
kind: ServiceAccount
metadata:
  labels:
    app.kubernetes.io/name: kube-state-metrics
    app.kubernetes.io/version: v2.0.0-beta
  name: kube-state-metrics
  namespace: monit
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  labels:
    app.kubernetes.io/name: kube-state-metrics
    app.kubernetes.io/version: v2.0.0-beta
  name: kube-state-metrics
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: kube-state-metrics
subjects:
  - kind: ServiceAccount
    name: kube-state-metrics
    namespace: monit
---
# Service exposing the two kube-state-metrics ports:
#   8080 = resource metrics, 8081 = self/telemetry metrics.
apiVersion: v1
kind: Service
metadata:
  annotations:
    prometheus.io/scrape: "true"  # fixed: was typographic quotes ‘true‘, invalid YAML quoting
  labels:
    app.kubernetes.io/name: kube-state-metrics
    app.kubernetes.io/version: v2.0.0-beta
  name: kube-state-metrics
  namespace: monit
spec:
  type: NodePort
  ports:
    - name: http-metrics
      port: 8080
      targetPort: http-metrics
      # nodePort: 30001
    - name: telemetry
      port: 8081
      targetPort: telemetry
      # nodePort: 30002
  selector:
    app.kubernetes.io/name: kube-state-metrics
---
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    app.kubernetes.io/name: kube-state-metrics
    app.kubernetes.io/version: v2.0.0-beta
  name: kube-state-metrics
  namespace: monit
spec:
  replicas: 1
  selector:
    matchLabels:
      app.kubernetes.io/name: kube-state-metrics
  template:
    metadata:
      labels:
        app.kubernetes.io/name: kube-state-metrics
        app.kubernetes.io/version: v2.0.0-beta
    spec:
      containers:
        - image: k8s.gcr.io/kube-state-metrics/kube-state-metrics:v2.0.0-beta
          livenessProbe:
            httpGet:
              path: /healthz
              port: 8080
            initialDelaySeconds: 5
            timeoutSeconds: 5
          name: kube-state-metrics
          ports:
            - containerPort: 8080
              name: http-metrics
            - containerPort: 8081
              name: telemetry
          readinessProbe:
            httpGet:
              path: /
              port: 8081
            initialDelaySeconds: 5
            timeoutSeconds: 5
      nodeSelector:
        # fixed: beta.kubernetes.io/os is deprecated; the GA label is set on
        # every node since k8s 1.14 and the tutorial targets v1.20.1.
        kubernetes.io/os: linux
      serviceAccountName: kube-state-metrics
PS.如何获取镜像k8s.gcr.io/kube-state-metrics/kube-state-metrics:v2.0.0-beta,我会另外补一篇文章来描述操作。
3.2 部署
# 执行编排脚本
cd /opt/yaml
kubectl apply -f kube-state-metrics.yaml
四、部署Prometheus
4.1 创建数据持久化目录
mkdir -p /opt/vol/prometheus/data
4.2 创建编排脚本
# 创建目录
mkdir -p /opt/vol/yaml
# 创建编排脚本
vi /opt/vol/yaml/prometheus.yaml
将下面的内容保存到prometheus.yaml中,wq保存。
---
# RBAC so Prometheus (SA "warlock") can discover and scrape nodes/services/pods.
# Fixed: `namespace:` removed from ClusterRole/ClusterRoleBinding metadata —
# they are cluster-scoped, the field was ignored.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: warlock
rules:
  - apiGroups: [""]
    resources:
      - nodes
      - nodes/proxy
      - nodes/metrics
      - services
      - services/proxy
      - endpoints
      - endpoints/proxy
      - pods
      - pods/proxy
    verbs: ["get", "list", "watch"]
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: warlock
  namespace: monit
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: warlock
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: warlock
subjects:
  - kind: ServiceAccount
    name: warlock
    namespace: monit
---
# Prometheus UI/API exposed on every node at port 30003.
apiVersion: v1
kind: Service
metadata:
  name: prometheus-service
  namespace: monit
  labels:
    app: prometheus-service
    kubernetes.io/cluster-service: "true"
    addonmanager.kubernetes.io/mode: EnsureExists
spec:
  type: NodePort
  ports:
    - port: 9090
      targetPort: 9090
      nodePort: 30003
  selector:
    app: prometheus-deployment
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-config
  namespace: monit
data:
  prometheus.yml: |
    global:
      scrape_interval: 15s
      evaluation_interval: 15s
    alerting:
      alertmanagers:
        - static_configs:
            - targets:
                - alertmanager-service:9093
    rule_files:
      - "node.yml"
    scrape_configs:
      # fixed: job names/targets used typographic quotes (‘…‘), which break YAML parsing
      - job_name: 'prometheus'
        static_configs:
          - targets: ['localhost:9090']
      # cAdvisor is built into the kubelet; scrape it via the node role over HTTPS.
      - job_name: 'k8s-cadvisor'
        metrics_path: /metrics/cadvisor
        kubernetes_sd_configs:
          - role: node
        relabel_configs:
          - action: labelmap
            regex: __meta_kubernetes_node_label_(.+)
        scheme: https
        tls_config:
          insecure_skip_verify: true
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        metric_relabel_configs:
          - source_labels: [instance]
            separator: ;
            regex: (.+)
            target_label: node
            replacement: $1
            action: replace
          - source_labels: [pod_name]
            separator: ;
            regex: (.+)
            target_label: pod
            replacement: $1
            action: replace
          - source_labels: [container_name]
            separator: ;
            regex: (.+)
            target_label: container
            replacement: $1
            action: replace
      - job_name: kube-state-metrics
        kubernetes_sd_configs:
          - role: endpoints
            namespaces:
              names:
                # fixed: was ops-monit, but kube-state-metrics is deployed in monit
                - monit
        relabel_configs:
          - source_labels: [__meta_kubernetes_service_label_app_kubernetes_io_name]
            regex: kube-state-metrics
            replacement: $1
            action: keep
          - action: labelmap
            regex: __meta_kubernetes_service_label_(.+)
          - source_labels: [__meta_kubernetes_namespace]
            action: replace
            target_label: k8s_namespace
          - source_labels: [__meta_kubernetes_service_name]
            action: replace
            target_label: k8s_sname
---
# Alerting rules; fixed: was namespace ops-monit while the Deployment's
# projected volume can only reference ConfigMaps in its own namespace (monit).
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-node
  namespace: monit
data:
  node.yml: |
    groups:
      - name: node
        rules:
          - alert: PrometheusEndpointDown
            expr: up == 0
            for: 10s
            labels:
              source: prometheus
            annotations:
              title: "Endpoint({{$labels.instance}}) Down"
              content: "The endpoint({{$labels.instance}}) of target({{$labels.job}}) has been down for more than 10 seconds."
---
# fixed: was namespace ops-monit — it referenced ServiceAccount "warlock" and
# ConfigMaps "prometheus-config"/"prometheus-node" that live in monit, so the
# pod could never be scheduled; all resources now share namespace monit.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: prometheus-deployment
  namespace: monit
spec:
  replicas: 1
  selector:
    matchLabels:
      app: prometheus-deployment
  template:
    metadata:
      labels:
        app: prometheus-deployment
    spec:
      serviceAccountName: warlock
      securityContext:
        runAsUser: 0  # run as root so the hostPath data dir is writable
      volumes:
        - name: config
          projected:
            sources:
              - configMap:
                  name: prometheus-config
              - configMap:
                  name: prometheus-node
        - name: data-vol
          hostPath:
            path: /opt/vol/prometheus/data
      containers:
        - name: prometheus
          image: prom/prometheus:v2.24.0
          imagePullPolicy: IfNotPresent  # Always / Never
          env:
            - name: TZ
              value: "Asia/Shanghai"
          volumeMounts:
            - name: config
              mountPath: "/etc/prometheus/prometheus.yml"
              subPath: prometheus.yml
              readOnly: true
            - name: config
              mountPath: "/etc/prometheus/node.yml"
              subPath: node.yml
              readOnly: true
            - name: data-vol
              mountPath: /prometheus
          ports:
            - containerPort: 9090
4.3 部署
# 执行编排脚本
cd /opt/vol/yaml
kubectl apply -f prometheus.yaml
五、部署Alertmanager
休息了,明天继续写