Grafana Prometheus Alertmanager 监控系统
基本概念
Prometheus 是一套开源的系统监控、报警与时间序列数据库的组合,最初由 SoundCloud 开发,后来随着越来越多的公司使用,便独立成为开源项目。Alertmanager 主要用于接收 Prometheus 发送的告警信息,它支持丰富的告警通知渠道,例如邮件、微信、钉钉、Slack 等常用沟通工具,而且很容易对告警信息进行去重、降噪、分组等处理,是一款很好用的告警通知系统。
安装Grafana服务
下载地址 https://grafana.com/grafana/download
[root@jk ~]# sudo yum install -y https://dl.grafana.com/enterprise/release/grafana-enterprise-11.4.0-1.x86_64.rpm
[root@jk ~]# systemctl enable --now grafana-server.service
Created symlink /etc/systemd/system/multi-user.target.wants/grafana-server.service → /usr/lib/systemd/system/grafana-server.service.
[root@jk ~]#
[root@jk ~]# systemctl status grafana-server.service
安装Prometheus服务
下载地址 https://github.com/prometheus/prometheus/releases/
wget https://mirrors.chenby.cn/https://github.com/prometheus/prometheus/releases/download/v3.0.1/prometheus-3.0.1.linux-amd64.tar.gz
[root@jk ~]# tar xvf prometheus-3.0.1.linux-amd64.tar.gz
[root@jk ~]# mv prometheus-3.0.1.linux-amd64 /prometheus
[root@jk ~]#
进行全局配置
[root@jk ~]# vim /prometheus/prometheus.yml
[root@jk ~]# cat /prometheus/prometheus.yml
# Prometheus全局配置项
global:
scrape_interval: 15s # 设定抓取数据的周期,默认为1min
evaluation_interval: 15s # 设定评估告警规则(rules)的周期,默认为1min
scrape_timeout: 15s # 设定抓取数据的超时时间,默认为10s
external_labels: # 额外的标签,在与外部系统(联邦、远程存储、Alertmanager)通信时附加到时间序列和告警上
monitor: 'codelab_monitor'
# Alertmanager配置
alerting:
alertmanagers:
- static_configs:
- targets: ["127.0.0.1:9093"] # 设定alertmanager和prometheus交互的接口,即alertmanager监听的ip地址和端口
# rule配置,规则文件在启动或重载配置时加载,之后按evaluation_interval设定的周期进行评估
rule_files:
- "dist/*.yml"
# scrape配置
scrape_configs:
- job_name: 'prometheus' # job_name默认写入timeseries的labels中,可以用于查询使用
scrape_interval: 15s # 抓取周期,默认采用global配置
static_configs: # 静态配置
- targets: ['127.0.0.1:9090'] # prometheus所要抓取数据的地址,即instance实例项
- job_name: 'web'
scrape_interval: 15s
static_configs:
- targets: ['192.168.1.130:9200']
- job_name: 'node-exporter'
scrape_interval: 15s
file_sd_configs:
- files:
- "static_conf/*.yaml"
refresh_interval: 1s
- job_name: server1_db
static_configs:
- targets: ['192.168.1.130:9104']
- job_name: mysql # To get metrics about the mysql exporter’s targets
params:
# Not required. Will match value to child in config file. Default value is `client`.
auth_module: [client.servers]
static_configs:
- targets:
# All mysql hostnames or unix sockets to monitor.
- 192.168.1.130:3306
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
# The mysqld_exporter host:port
replacement: 192.168.1.130:9104
[root@jk ~]#
进行写入动态配置文件
内容写需要监控的主机即可
root@cby:~# mkdir /prometheus/static_conf/
root@cby:~# vim /prometheus/static_conf/file.yaml
root@cby:~# cat /prometheus/static_conf/file.yaml
- targets: ['192.168.1.20:9200']
- targets: ['192.168.1.31:9200']
- targets: ['192.168.1.32:9200']
- targets: ['192.168.1.33:9200']
- targets: ['192.168.1.34:9200']
- targets: ['192.168.1.35:9200']
- targets: ['192.168.1.36:9200']
- targets: ['192.168.1.99:9200']
- targets: ['192.168.1.100:4445']
- targets: ['192.168.1.100:9182']
- targets: ['192.168.1.120:9200']
- targets: ['192.168.1.123:9200']
- targets: ['192.168.1.130:9200']
root@cby:~#
配置开机自启服务
cat > /etc/systemd/system/prometheus.service <<EOF
[Unit]
Description=Prometheus
After=network-online.target
[Service]
Type=simple
ExecStart=/prometheus/prometheus --config.file=/prometheus/prometheus.yml
Restart=on-failure
ExecStop=/bin/kill -9 \$MAINPID
[Install]
WantedBy=multi-user.target
EOF
[root@jk ~]# systemctl daemon-reload
[root@jk ~]#
[root@jk ~]# systemctl enable --now prometheus.service
[root@jk ~]#
[root@jk ~]# systemctl status prometheus.service
安装Node_exporter监控组件
下载地址 https://github.com/prometheus/node_exporter/releases/
wget https://mirrors.chenby.cn/https://github.com/prometheus/node_exporter/releases/download/v1.8.2/node_exporter-1.8.2.linux-amd64.tar.gz
[root@jk ~]# tar xvf node_exporter-1.8.2.linux-amd64.tar.gz
[root@jk ~]# mv node_exporter-1.8.2.linux-amd64 /node_exporter
设置为开机自启
cat > /etc/systemd/system/node_exporter.service <<EOF
[Unit]
Description=node_exporter
After=network-online.target
[Service]
Type=simple
ExecStart=/node_exporter/node_exporter --web.listen-address=":9200"
Restart=on-failure
ExecStop=/bin/kill -9 \$MAINPID
[Install]
WantedBy=multi-user.target
EOF
[root@jk ~]# systemctl daemon-reload
[root@jk ~]# systemctl enable --now node_exporter.service
[root@jk ~]# systemctl status node_exporter.service
安装mysqld_exporter监控组件
下载地址 https://github.com/prometheus/mysqld_exporter/releases/
wget https://mirrors.chenby.cn/https://github.com/prometheus/mysqld_exporter/releases/download/v0.16.0/mysqld_exporter-0.16.0.linux-amd64.tar.gz
[root@jk ~]# tar xvf mysqld_exporter-0.16.0.linux-amd64.tar.gz
[root@jk ~]# mv mysqld_exporter-0.16.0.linux-amd64 /mysqld_exporter
cat > /mysqld_exporter/my.cnf <<EOF
[client]
host=127.0.0.1
port=3306
user=root
password=Cby123..
EOF
设置为开机自启
cat > /etc/systemd/system/mysqld_exporter.service <<EOF
[Unit]
Description=mysqld_exporter
After=network-online.target
[Service]
Type=simple
ExecStart=/mysqld_exporter/mysqld_exporter --config.my-cnf=/mysqld_exporter/my.cnf --web.listen-address=:9104
Restart=on-failure
ExecStop=/bin/kill -9 \$MAINPID
[Install]
WantedBy=multi-user.target
EOF
[root@jk ~]# systemctl daemon-reload
[root@jk ~]# systemctl enable --now mysqld_exporter.service
[root@jk ~]# systemctl status mysqld_exporter.service
下载安装alertmanager服务
下载地址 https://github.com/prometheus/alertmanager/releases/
wget https://mirrors.chenby.cn/https://github.com/prometheus/alertmanager/releases/download/v0.27.0/alertmanager-0.27.0.linux-amd64.tar.gz
[root@jk ~]# tar xvf alertmanager-0.27.0.linux-amd64.tar.gz
[root@jk ~]# mv alertmanager-0.27.0.linux-amd64 /alertmanager
[root@jk ~]#
全局配置
cat > /alertmanager/alertmanager.yml <<EOF
global:
resolve_timeout: 5m
smtp_from: 'alert@example.com' # 发件邮箱,请替换为实际地址
smtp_smarthost: 'smtp.qiye.aliyun.com:465'
smtp_auth_username: 'alert@example.com' # SMTP登录账号,请替换为实际地址
smtp_auth_password: 'xxxxxxxx'
smtp_require_tls: false
smtp_hello: 'chenby.cn'
route:
group_by: ['alertname']
group_wait: 5s
group_interval: 5s
repeat_interval: 5m
receiver: 'email'
receivers:
- name: 'email'
email_configs:
- to: 'ops@example.com' # 收件邮箱,请替换为实际地址
send_resolved: true
inhibit_rules:
- source_match:
severity: 'critical'
target_match:
severity: 'warning'
equal: ['alertname', 'dev', 'instance']
EOF
配置告警规则
规则模板建议在此网站找适合自己的
https://awesome-prometheus-alerts.grep.to/
举例
groups:
- name: test-rules
rules:
- alert: InstanceDown # 告警名称
expr: up == 0 # 告警的判定条件,参考Prometheus高级查询来设定
for: 2m # 满足告警条件持续时间多久后,才会发送告警
labels: #标签项
team: node
annotations: # 解析项,详细解释告警信息
summary: "{{$labels.instance}}: has been down"
description: "{{$labels.instance}}: job {{$labels.job}} has been down "
value: "{{$value}}"
我的告警配置,直接把他的仓库克隆了下来
[root@jk ~]# mkdir /prometheus/dist/
[root@jk ~]# vim /prometheus/dist/123.yml
[root@jk ~]# cat /prometheus/dist/123.yml
groups:
- name: generals.rules
rules:
- alert: PrometheusJobMissing
expr: absent(up{job="prometheus"})
for: 0m
labels:
severity: warning
annotations:
summary: Prometheus job missing (instance {{ $labels.instance }})
description: "A Prometheus job has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusTargetMissing
expr: up == 0
for: 0m
labels:
severity: critical
annotations:
summary: Prometheus target missing (instance {{ $labels.instance }})
description: "A Prometheus target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusAllTargetsMissing
expr: sum by (job) (up) == 0
for: 0m
labels:
severity: critical
annotations:
summary: Prometheus all targets missing (instance {{ $labels.instance }})
description: "A Prometheus job does not have living target anymore.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusTargetMissingWithWarmupTime
expr: sum by (instance, job) ((up == 0) * on (instance) group_right(job) (node_time_seconds - node_boot_time_seconds > 600))
for: 0m
labels:
severity: critical
annotations:
summary: Prometheus target missing with warmup time (instance {{ $labels.instance }})
description: "Allow a job time to start up (10 minutes) before alerting that it's down.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusConfigurationReloadFailure
expr: prometheus_config_last_reload_successful != 1
for: 0m
labels:
severity: warning
annotations:
summary: Prometheus configuration reload failure (instance {{ $labels.instance }})
description: "Prometheus configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusTooManyRestarts
expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2
for: 0m
labels:
severity: warning
annotations:
summary: Prometheus too many restarts (instance {{ $labels.instance }})
description: "Prometheus has restarted more than twice i