A while ago I read Ma Ge's new Kubernetes book; the last few chapters cover EFK. I tried deploying it and ran into quite a few problems, so here are my improvements, written up as notes.
Preparation
All components are deployed with Helm 3, so first add a couple of repositories.
helm repo add bitnami https://charts.bitnami.com/bitnami
helm repo add fluent https://fluent.github.io/helm-charts
Storage uses an NFS-backed StorageClass; set that up yourself.
The cluster is a Kubernetes cluster with three worker nodes, 4 GB of memory each.
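A quick sanity check before installing, assuming the NFS provisioner exposes the managed-nfs-storage StorageClass referenced in the values files below and that everything goes into a logging namespace (as in the helm install commands later):

helm repo update
kubectl get storageclass managed-nfs-storage      # the StorageClass the values files point at
kubectl create namespace logging                  # all installs below use -n logging
kubectl top nodes                                 # needs metrics-server; just to confirm the 4G nodes have headroom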
Deploying Elasticsearch
[root@bjzb-lecar-ops-jenkins-master-33 cluster-log]# cat bitnami-elasticsearch-values.yaml
clusterDomain: cluster.local   # Kubernetes cluster domain name
name: elasticsearch            # Elasticsearch cluster name
master:                        # master-eligible node settings
  name: master
  replicas: 2                  # number of instances
  heapSize: 512m               # heap size
  resources:
    limits: {}
    #  cpu: 1000m
    #  memory: 2048Mi
    requests:
      cpu: 200m
      memory: 512Mi
  persistence:                 # persistent volume settings
    enabled: true              # when disabled, an emptyDir volume is used instead
    storageClass: "managed-nfs-storage"           # dynamically provision PVs from this StorageClass
    # existingClaim: my-persistent-volume-claim   # use an existing PVC
    # existingVolume: my-persistent-volume        # use an existing PV
    accessModes:
      - ReadWriteOnce
    size: 8Gi
  service:                     # service settings
    type: ClusterIP
    port: 9300                 # port for inter-node transport traffic
coordinating:                  # coordinating-only node settings
  replicas: 2                  # number of instances
  heapSize: 128m
  resources:
    requests:
      cpu: 250m
      memory: 512Mi
  service:                     # coordinating-only service; this is also the entry point for Elasticsearch client requests
    type: ClusterIP
    port: 9200
    # nodePort:
    # loadBalancerIP:
data:                          # data node settings
  name: data
  replicas: 2
  heapSize: 512m
  resources:                   # data nodes are CPU- and IO-intensive; set requests and limits carefully
    limits: {}
    #  cpu: 100m
    #  memory: 2176Mi
    requests:
      cpu: 250m
      memory: 512Mi
  persistence:
    enabled: true
    storageClass: "managed-nfs-storage"
    # existingClaim: my-persistent-volume-claim
    # existingVolume: my-persistent-volume
    accessModes:
      - ReadWriteOnce
    size: 10Gi
ingest:                        # ingest node settings
  enabled: false               # disabled by default
  name: ingest
  replicas: 2
  heapSize: 128m
  resources:
    limits: {}
    #  cpu: 100m
    #  memory: 384Mi
    requests:
      cpu: 500m
      memory: 512Mi
  service:
    type: ClusterIP
    port: 9300
curator:                       # curator settings
  enabled: false
  name: curator
  cronjob:                     # schedule and related settings
    # At 01:00 every day
    schedule: "0 1 * * *"
    concurrencyPolicy: ""
    failedJobsHistoryLimit: ""
    successfulJobsHistoryLimit: ""
    jobRestartPolicy: Never
metrics:                       # exporter that exposes metrics
  enabled: true
  name: metrics
  service:
    type: ClusterIP
    annotations:               # annotations for metrics scraping
      prometheus.io/scrape: "true"
      prometheus.io/port: "9114"
  resources:
    limits: {}
    #  cpu: 100m
    #  memory: 128Mi
    requests:
      cpu: 100m
      memory: 128Mi
  podAnnotations:              # pod annotations to support metrics scraping
    prometheus.io/scrape: "true"
    prometheus.io/port: "8080"
  serviceMonitor:              # ServiceMonitor settings
    enabled: false
    namespace: monitoring
    interval: 10s
    scrapeTimeout: 10s
helm install es -f bitnami-elasticsearch-values.yaml bitnami/elasticsearch -n logging
This step hits all kinds of problems: slow image pulls, insufficient cluster resources (even though I already dialed the requests in the YAML down very low), and storage permission issues. Just keep an eye out.
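A few commands I use to see which of those problems is biting; this is just a sketch, and the pod name es-elasticsearch-master-0 is illustrative of what the es release of this chart produces:

kubectl get pods -n logging -o wide                           # anything Pending / ImagePullBackOff / CrashLoopBackOff?
kubectl get pvc -n logging                                    # master/data PVCs should be Bound
kubectl describe pod es-elasticsearch-master-0 -n logging     # events show scheduling and volume-mount errors
kubectl logs es-elasticsearch-master-0 -n logging             # permission errors on the NFS volume show up here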
Deploying fluent-bit
[root@bj-k8s-master efk]# cat fluent-fluent-bit-values.yaml
# kind -- DaemonSet or Deployment
kind: DaemonSet

image:
  repository: fluent/fluent-bit
  pullPolicy: IfNotPresent

service:
  type: ClusterIP
  port: 2020
  annotations:
    prometheus.io/path: "/api/v1/metrics/prometheus"
    prometheus.io/port: "2020"
    prometheus.io/scrape: "true"

resources: {}
  # limits:
  #   cpu: 100m
  #   memory: 128Mi
  # requests:
  #   cpu: 100m
  #   memory: 128Mi

tolerations:
  - key: node-role.kubernetes.io/master
    effect: NoSchedule

config:
  service: |
    [SERVICE]
        Flush 3
        Daemon Off
        #Log_Level info
        Log_Level debug
        Parsers_File custom_parsers.conf
        Parsers_File parsers.conf
        HTTP_Server On
        HTTP_Listen 0.0.0.0
        HTTP_Port 2020

  inputs: |
    [INPUT]
        Name tail
        Path /var/log/containers/*.log
        Parser docker
        Tag kube.*
        Mem_Buf_Limit 5MB
        Skip_Long_Lines On
        Refresh_Interval 10
    [INPUT]
        Name tail
        Path /var/log/containers/nginx-demo*.log
        Parser docker
        Tag nginx-demo.*
        Mem_Buf_Limit 5MB
        Skip_Long_Lines On
        Refresh_Interval 10
    [INPUT]
        Name tail
        Path /var/log/containers/ingress-nginx-controller*.log
        Parser docker
        Tag ingress-nginx-controller.*
        Mem_Buf_Limit 5MB
        Skip_Long_Lines On
        Refresh_Interval 10

  filters: |
    [FILTER]
        Name kubernetes
        Match kube.*
        Kube_URL https://kubernetes.default.svc:443
        Kube_CA_File /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        Kube_Token_File /var/run/secrets/kubernetes.io/serviceaccount/token
        Kube_Tag_Prefix kube.var.log.containers.
        Merge_Log On
        Keep_Log Off
        K8S-Logging.Exclude On
        K8S-Logging.Parser On
    [FILTER]
        Name kubernetes
        Match ingress-nginx-controller.*
        Kube_URL https://kubernetes.default.svc:443
        Kube_CA_File /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        Kube_Token_File /var/run/secrets/kubernetes.io/serviceaccount/token
        Kube_Tag_Prefix kube.var.log.containers.
        Merge_Log On
        Merge_Parser ingress-nginx
        Keep_Log Off
        K8S-Logging.Exclude On
        K8S-Logging.Parser On

  outputs: |
    [OUTPUT]
        Name es
        Match kube.*
        Host es-elasticsearch-coordinating-only.logging.svc.cluster.local.
        Logstash_Format On
        Logstash_Prefix k8s-cluster
        Type flb_type
        Replace_Dots On
    [OUTPUT]
        Name es
        Match nginx-demo.*
        Host es-elasticsearch-coordinating-only.logging.svc.cluster.local.
        Logstash_Format On
        Logstash_Prefix nginx-demo
        Type flb_type
        Replace_Dots On
    [OUTPUT]
        Name es
        Match ingress-nginx-controller.*
        Host es-elasticsearch-coordinating-only.logging.svc.cluster.local.
        Logstash_Format On
        Logstash_Prefix ingress-nginx-controller
        Type flb_type
        Replace_Dots On

  customParsers: |
    [PARSER]
        Name docker_no_time
        Format json
        Time_Keep Off
        Time_Key time
        Time_Format %Y-%m-%dT%H:%M:%S.%L
    [PARSER]
        Name ingress-nginx
        Format regex
        Regex ^(?<message>(?<remote>[^ ]*) - (?<user>[^ ]*) \[(?<time>[^\]]*)\] "(?<method>\S+)(?: +(?<path>[^\"]*?)(?: +\S*)?)?" (?<code>[^ ]*) (?<size>[^ ]*) "(?<referer>[^\"]*)" "(?<agent>[^\"]*)" (?<request_length>[^ ]*) (?<request_time>[^ ]*) \[(?<proxy_upstream_name>[^ ]*)\] \[(?<proxy_alternative_upstream_name>[^ ]*)\] (?<upstream_addr>[^ ]*) (?<upstream_response_length>[^ ]*) (?<upstream_response_time>[^ ]*) (?<upstream_status>[^ ]*) (?<req_id>[^ ]*).*)$
        Time_Key time
        Time_Format %d/%b/%Y:%H:%M:%S %z
helm install fb -f fluent-fluent-bit-values.yaml fluent/fluent-bit -n logging
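To confirm the DaemonSet is running and actually shipping logs, something like the following should work; the object name fb-fluent-bit and the label selector are what I'd expect from the fb release of this chart, so adjust if yours differ:

kubectl get daemonset fb-fluent-bit -n logging                                  # one pod per worker node (plus masters, given the toleration)
kubectl logs -n logging -l app.kubernetes.io/name=fluent-bit --tail=20          # with Log_Level debug you should see flushes to the es output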
Pay attention to the Elasticsearch Host address: if your namespace differs from mine, remember to change it; I've been burned by this. The config provided in the book writes everything into a single index and doesn't parse the nginx-ingress logs either. I spent quite a while on fluent-bit and this is about as far as I got: tag logs based on the different Docker log file names, so that each application gets its own index in Elasticsearch. Of course, if the volume is small, writing everything into one index and querying a given application's logs via the label fields works too.
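To check that the per-application indices are actually being created, a quick look at the coordinating service should do; a sketch, assuming the service name and namespace from my values above:

kubectl port-forward -n logging svc/es-elasticsearch-coordinating-only 9200:9200 &
curl -s 'http://127.0.0.1:9200/_cat/indices?v' | grep -E 'k8s-cluster|nginx-demo|ingress-nginx-controller'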
Deploying Kibana
[root@bj-k8s-master efk]# cat bitnami-kibana-values.yaml
replicaCount: 1

updateStrategy:
  type: RollingUpdate

plugins:
  - https://github.com/pjhampton/kibana-prometheus-exporter/releases/download/7.8.1/kibana-prometheus-exporter-7.8.1.zip

persistence:
  enabled: true
  storageClass: "managed-nfs-storage"
  # existingClaim: your-claim
  accessMode: ReadWriteOnce
  size: 10Gi

service:
  port: 5601
  type: ClusterIP
  # nodePort:
  externalTrafficPolicy: Cluster
  annotations: {}
  # loadBalancerIP:
  # extraPorts:

ingress:
  enabled: true
  certManager: false
  annotations:
    kubernetes.io/ingress.class: nginx
  hostname: kibana.ilinux.io
  path: /
  tls: false
  # tlsHosts:
  #   - www.kibana.local
  #   - kibana.local
  # tlsSecret: kibana.local-tls

configuration:
  server:
    basePath: ""
    rewriteBasePath: false

metrics:
  enabled: true
  service:
    annotations:
      prometheus.io/scrape: "true"
      prometheus.io/port: "80"
      prometheus.io/path: "_prometheus/metrics"
  serviceMonitor:
    enabled: false
    # namespace: monitoring
    # interval: 10s
    # scrapeTimeout: 10s
    # selector:
    #   prometheus: my-prometheus

elasticsearch:
  hosts:
    - es-elasticsearch-coordinating-only.logging.svc.cluster.local.
    # - elasticsearch-2
  port: 9200
helm install kib -f bitnami-kibana-values.yaml bitnami/kibana -n logging
As above, if your namespace differs from mine, remember to change the Elasticsearch address. There's a pitfall in the book here: I pulled the chart down and looked, and the values.yaml differs from what the book shows, presumably because the chart version was different when the book was written.
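When the values in a book or blog post don't match the chart you're installing, compare against the defaults shipped with your chart version, for example:

helm show values bitnami/kibana > kibana-default-values.yaml
# or pull the whole chart and read it locally
helm pull bitnami/kibana --untar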
Configuring Kibana
Edit your hosts file so the hostname resolves to the Ingress address, then open Kibana.
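Roughly like this, assuming the ingress-nginx controller is reachable at some node or load-balancer IP; 192.168.1.100 is only a placeholder for whatever your entry point is:

kubectl get ingress -n logging                          # shows the kibana.ilinux.io host and its address
echo '192.168.1.100  kibana.ilinux.io' >> /etc/hosts    # replace 192.168.1.100 with your ingress entry point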
Add the matching index patterns.
Each application's logs land in their own index.
The ingress-nginx logs are parsed into fields.
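If you want to double-check the parsing outside Kibana, pull one document from the ingress-nginx-controller index and look for the extracted fields; this reuses the port-forward from earlier, and the index wildcard is a sketch based on the Logstash_Prefix above:

curl -s 'http://127.0.0.1:9200/ingress-nginx-controller-*/_search?size=1&pretty' | grep -E '"(method|path|code|upstream_addr)"'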
fluent-bit official documentation
https://docs.fluentbit.io/
Online regex tester
https://rubular.com/