1、官网下载安装包
本次使用alertmanager-0.28.1.linux-amd64.tar.gz
2、解压
[root@k8smaster promethus]# tar -zxvf alertmanager-0.28.1.linux-amd64.tar.gz
alertmanager-0.28.1.linux-amd64/
alertmanager-0.28.1.linux-amd64/LICENSE
alertmanager-0.28.1.linux-amd64/alertmanager
alertmanager-0.28.1.linux-amd64/alertmanager.yml
alertmanager-0.28.1.linux-amd64/amtool
alertmanager-0.28.1.linux-amd64/NOTICE
[root@k8smaster promethus]# ls
alertmanager-0.28.1.linux-amd64
3、编辑配置文件
切换到安装目录,备份配置文件后,进行配置,注意配置成自己的邮箱。
[root@k8smaster promethus]# cd alertmanager-0.28.1.linux-amd64/
[root@k8smaster alertmanager-0.28.1.linux-amd64]# ls
alertmanager alertmanager.yml amtool LICENSE NOTICE
[root@k8smaster alertmanager-0.28.1.linux-amd64]# cp alertmanager.yml alertmanager.ymlbak
[root@k8smaster alertmanager-0.28.1.linux-amd64]# vim alertmanager.yml
#一、邮箱配置
global:
  #告警超过该时间未再更新,则视为已恢复(resolved)
  resolve_timeout: 5m
  #发件人邮箱
  smtp_from: 'xx@qq.com'
  #邮箱官方地址及端口
  smtp_smarthost: 'smtp.qq.com:465'
  #发件人邮箱
  smtp_auth_username: 'xx@qq.com'
  #发件人邮箱授权码
  smtp_auth_password: 'xx'
  #发送信息是否tls加密
  smtp_require_tls: false
  smtp_hello: 'qq.com'
#二、报警间隔配置;
route:
  group_by: ['alertname']
  group_wait: 5s
  group_interval: 5s
  #重复报警的间隔时间如5分钟;
  repeat_interval: 5m
  #采用什么报警方式;
  receiver: 'email'
#三、接收告警
receivers:
#定义接收者方式
- name: 'email'
  email_configs:
  #【收件人】
  - to: 'xx@qq.com'
    send_resolved: true
inhibit_rules:
- source_match:
    #匹配的告警级别
    severity: 'critical'
  target_match:
    severity: 'warning'
  equal: ['alertname', 'dev', 'instance']
检查alertmanager.yml配置文件语法正确性(下面第一次检查失败,是因为配置中把 true 误写成了 ture;修正后再次检查即通过)
[root@k8smaster alertmanager-0.28.1.linux-amd64]# ./amtool check-config alertmanager.yml
Checking 'alertmanager.yml' FAILED: yaml: unmarshal errors:
line 13: cannot unmarshal !!str `ture` into bool
amtool: error: failed to validate 1 file(s)
[root@k8smaster alertmanager-0.28.1.linux-amd64]# vim alertmanager.yml
[root@k8smaster alertmanager-0.28.1.linux-amd64]# ./amtool check-config alertmanager.yml
Checking 'alertmanager.yml' SUCCESS
Found:
- global config
- route
- 1 inhibit rules
- 1 receivers
- 0 templates
4、编写启动alertmanager文件
编写systemd服务文件:vim /etc/systemd/system/alertmanager.service。保存后执行 systemctl daemon-reload 加载配置,再执行 systemctl start alertmanager.service 启动服务,并用 systemctl enable alertmanager.service 设置开机自启。示例中用户和组使用root,建议自行创建专用的用户和组,并根据实际情况修改配置文件路径。
[Unit]
Description=Alertmanager
After=network.target

[Service]
User=root
Group=root
ExecStart=/home/admin/promethus/alertmanager-0.28.1.linux-amd64/alertmanager \
  --config.file=/home/admin/promethus/alertmanager-0.28.1.linux-amd64/alertmanager.yml \
  --storage.path=/home/admin/promethus/alertmanager-0.28.1.linux-amd64
Restart=on-failure
RestartSec=10s
TimeoutStartSec=30

[Install]
WantedBy=multi-user.target
5、验证alertmanager告警工具是否启动成功
浏览器访问地址:http://localhost:9093,注意换成自己的IP地址访问。
6、Prometheus配置测试告警通知
编写Prometheus规则文件,测试告警工具能正常发送告警通知。
编辑prometheus.yml,添加告警地址和规则文件路径,根据自己的路径和IP进行修改。
# my global config
global:
  scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 10.xx.3:9093

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  # - "first_rules.yml"
  - "/home/admin/promethus/prometheus-3.4.1.linux-amd64/rules.yaml"

# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: "prometheus"

    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.

    static_configs:
      - targets: ["localhost:9090"]
        # The label name is added as a label `label_name=<label_value>` to any timeseries scraped from this config.
        labels:
          app: "prometheus"
  - job_name: "node-exporter"
    static_configs:
      #被监控的数据抓取地址;
      - targets: ["10.xx.3:9100"]
编辑规则文件 vim /home/admin/promethus/prometheus-3.4.1.linux-amd64/rules.yaml(需与prometheus配置中rule_files填写的路径一致),根据自己的路径和IP进行修改,添加告警规则。
groups:
- name: prometheus-alert
  rules:
  - alert: xx节点挂掉啦
    #当PromQL这个语句=0时(节点挂掉),开始报警
    expr: up{instance="10.xx.3:9100"} == 0
    #连续3s=0才触发报警;
    for: 3s
    labels:
      prometheus: node-exporter
      #被监控节点ip
      node: 10.xx.3 #自己监控的节点IP
    annotations:
      summary: "{{ $labels.instance }} 已停止运行超过 10s!"
检查语法正确性:./promtool check rules rules.yaml
重启Prometheus服务:systemctl restart prometheus.service,并查看状态是否正常。然后在一台被监控机器上停止node-exporter以触发告警:systemctl stop node-exporter.service。
7、查看邮件是否收到告警通知
成功截图