环境简介
- k8s版本:1.15.2,三台主机
- 安装方式:kubeadm
主机说明:
| 操作系统 | IP地址 | 角色 | CPU | 内存(GB) | 主机名 |
| --- | --- | --- | --- | --- | --- |
| centos7 | 192.168.50.13 | master | 2 | 2 | k8s-master |
| centos7 | 192.168.50.14 | node | 2 | 2 | k8s-node1 |
| centos7 | 192.168.50.15 | node | 2 | 2 | k8s-node2 |
名称空间
[root@k8s-master ~]# cd prometheus/prometheus/
[root@k8s-master prometheus]# vim ns.yaml
# Dedicated namespace for all Prometheus monitoring components.
apiVersion: v1
kind: Namespace
metadata:
  name: monitoring
[root@k8s-master prometheus]# kubectl apply -f ns.yaml
部署Prometheus
rbac授权
相当于在k8s中创建一个有权限的用户,Prometheus通过这个用户获取数据
[root@k8s-master prometheus]# vim rbac.yaml
# ServiceAccount that Prometheus pods run as.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: prometheus
  namespace: monitoring
---
# Cluster-wide read-only access so Prometheus can discover and scrape
# nodes, services, endpoints, pods and ingresses.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: prometheus
rules:
  - apiGroups:
      - ""
    resources:
      - nodes
      - services
      - endpoints
      - pods
      - nodes/proxy
    verbs:
      - get
      - list
      - watch
  - apiGroups:
      - "extensions"
    resources:
      - ingresses
    verbs:
      - get
      - list
      - watch
  - apiGroups:
      - ""
    resources:
      - configmaps
      - nodes/metrics
    verbs:
      - get
  # Non-resource endpoint used to scrape the apiserver's own metrics.
  - nonResourceURLs:
      - /metrics
    verbs:
      - get
---
# Bind the ClusterRole to the prometheus ServiceAccount.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: prometheus
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: prometheus
subjects:
  - kind: ServiceAccount
    name: prometheus
    namespace: monitoring
---
# Long-lived token Secret for the ServiceAccount.
# NOTE(review): the namespace was missing in the original; a Secret is
# namespaced and must live in the SA's namespace, and the
# `kubectl describe secret prometheus-secrets -n monitoring` step below
# only works if it is created in "monitoring".
apiVersion: v1
kind: Secret
metadata:
  name: prometheus-secrets
  namespace: monitoring
  annotations:
    kubernetes.io/service-account.name: "prometheus"
type: kubernetes.io/service-account-token
查看token
kubectl describe secret prometheus-secrets -n monitoring
使用configMap 创建报警规则文件和配置文件
[root@k8s-master prometheus]# vim prometheus-rule.yaml
# ConfigMap holding Prometheus alerting rules for node-exporter metrics.
# NOTE: the original paste had every Go-template expression broken across
# two lines ("{ / { $labels... }}"); they are rejoined to valid
# "{{ $labels.instance }}" / "{{ $value }}" form here.
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-rule
  namespace: monitoring
data:
  example01.yaml: |
    groups:
    - name: node
      rules:
      # Target has stopped answering scrapes for 3 minutes.
      - alert: NodeDown
        expr: up == 0
        for: 3m
        labels:
          severity: critical
        annotations:
          summary: "{{ $labels.instance }}: down"
          description: "{{ $labels.instance }} has been down for more than 3m"
          value: "{{ $value }}"
      # Average CPU busy time above 75% for 5 minutes.
      - alert: NodeCPUHigh
        expr: (1 - avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m]))) * 100 > 75
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "{{ $labels.instance }}: High CPU usage"
          description: "{{ $labels.instance }}: CPU usage is above 75%"
          value: "{{ $value }}"
      # CPU spending more than half its time waiting on I/O.
      - alert: NodeCPUIowaitHigh
        expr: avg by (instance) (irate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 50
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "{{ $labels.instance }}: High CPU iowait usage"
          description: "{{ $labels.instance }}: CPU iowait usage is above 50%"
          value: "{{ $value }}"
      # Less than 10% of memory available for 5 minutes.
      - alert: NodeMemoryUsageHigh
        expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 > 90
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "{{ $labels.instance }}: High memory usage"
          description: "{{ $labels.instance }}: Memory usage is above 90%"
          value: "{{ $value }}"
      # Root filesystem more than 80% full for 10 minutes.
      - alert: NodeDiskRootLow
        expr: (1 - node_filesystem_avail_bytes{fstype=~"ext.*|xfs",mountpoint="/"} / node_filesystem_size_bytes{fstype=~"ext.*|xfs",mountpoint="/"}) * 100 > 80
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "{{ $labels.instance }}: Low disk(the / partition) space"
          description: "{{ $labels.instance }}: Disk(the / partition) usage is above 80%"
          value: "{{ $value }}"
      # /boot filesystem more than 80% full for 10 minutes.
      - alert: NodeDiskBootLow
        expr: (1 - node_filesystem_avail_bytes{fstype=~"ext.*|xfs",mountpoint="/boot"} / node_filesystem_size_bytes{fstype=~"ext.*|xfs",mountpoint="/boot"}) * 100 > 80
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "{{ $labels.instance }}: Low disk(the /boot partition) space"
          description: "{{ $labels.instance }}: Disk(the /boot partition) usage is above 80%"
          value: "{{ $value }}"
      # 5-minute load average above 2x the CPU core count.
      - alert: NodeLoad5High
        expr: (node_load5) > (count by (instance) (node_cpu_seconds_total{mode='system'}) * 2)
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "{{ $labels.instance }}: Load(5m) High"
          description: "{{ $labels.instance }}: Load(5m) is 2 times the number of CPU cores"
          value: "{{ $value }}"
[root@k8s-master prometheus]# vim prometheus-configMap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: prometheus-config
namespace: monitoring
data:
prometheus.yml: |
global:
scrape_interval: 15s
evaluation_interval: 15s
alerting:
alertmanagers:
- static_configs:
- targets:
- alertmanager.monitoring.svc.cluster.local:9093
rule_files:
- "rules/*.yaml"
scrape_configs:
- job_name: 'prometheus'
static_configs:
- targets: ['localhost:9090']
- job_name: 'kubernetes-nodes-cadvisor'
metrics_path: /metrics
scheme: https
kubernetes_sd_configs:
- role: node
api_server: https://192.168.50.13:6443
bearer_token_file: /opt/prometh