Grafana Prometheus Alertmanager 监控-CSDN博客

本文链接：https://blog.csdn.net/qq_33921750/article/details/144732237

Grafana Prometheus Alertmanager 监控系统

基本概念

Prometheus 是一套开源的系统监控、报警、时间序列数据库的组合，最初由 SoundCloud 开发，后来随着越来越多的公司使用，便独立成为开源项目。Alertmanager 主要用于接收 Prometheus 发送的告警信息，它支持丰富的告警通知渠道，例如邮件、微信、钉钉、Slack 等常用沟通工具，而且很容易对告警信息进行去重、降噪、分组等处理，是一款很好用的告警通知系统。

安装Grafana服务

下载地址 https://grafana.com/grafana/download

[root@jk ~]# sudo yum install -y https://dl.grafana.com/enterprise/release/grafana-enterprise-11.4.0-1.x86_64.rpm


[root@jk ~]# systemctl enable --now grafana-server.service
Created symlink /etc/systemd/system/multi-user.target.wants/grafana-server.service → /usr/lib/systemd/system/grafana-server.service.
[root@jk ~]# 
[root@jk ~]# systemctl status grafana-server.service

安装Prometheus服务

下载地址 https://github.com/prometheus/prometheus/releases/

wget https://mirrors.chenby.cn/https://github.com/prometheus/prometheus/releases/download/v3.0.1/prometheus-3.0.1.linux-amd64.tar.gz

[root@jk ~]# tar xvf prometheus-3.0.1.linux-amd64.tar.gz
[root@jk ~]# mv prometheus-3.0.1.linux-amd64 /prometheus
[root@jk ~]#

进行全局配置

[root@jk ~]# vim /prometheus/prometheus.yml
[root@jk ~]# cat /prometheus/prometheus.yml
# Global Prometheus settings
global:
  scrape_interval: 15s     # how often targets are scraped (Prometheus default: 1m)
  evaluation_interval: 15s # how often alerting/recording rules are evaluated (default: 1m)
  scrape_timeout: 15s      # per-scrape timeout (default: 10s); must not exceed scrape_interval
  external_labels:         # labels attached when talking to external systems (alerts, remote write, federation)
    monitor: 'codelab_monitor'

# Alertmanager instances that receive the alerts fired by this server
alerting:
  alertmanagers:
    - static_configs:
        - targets: ["127.0.0.1:9093"] # address:port Alertmanager listens on

# Rule files are read at startup and on every configuration reload; the rules
# they contain are then evaluated once per evaluation_interval.
# The glob is resolved relative to the directory of this config file.
rule_files:
  - "dist/*.yml"

# Scrape configuration
scrape_configs:
  - job_name: 'prometheus' # becomes the `job` label of every scraped series
    scrape_interval: 15s   # per-job override; global.scrape_interval applies when omitted
    static_configs:        # statically listed targets
      - targets: ['127.0.0.1:9090'] # Prometheus scraping itself; becomes the `instance` label

  - job_name: 'web'
    scrape_interval: 15s
    static_configs:
      - targets: ['192.168.1.130:9200']

  - job_name: 'node-exporter'
    scrape_interval: 15s
    file_sd_configs: # targets are read from files, so hosts can be added without a restart
      - files:
          - "static_conf/*.yaml"
        refresh_interval: 1s # how often the files are re-read

  - job_name: server1_db
    static_configs:
      - targets: ['192.168.1.130:9104']

  - job_name: mysql # Multi-target probing of MySQL servers through mysqld_exporter
    # /probe is the exporter's multi-target endpoint. On the default /metrics
    # path the `target` parameter produced by the relabeling below is ignored.
    metrics_path: /probe
    params:
      # Section of the exporter's my.cnf that supplies the credentials.
      # `client` is the exporter's default; switch to e.g. `client.servers`
      # only if my.cnf defines a matching [client.servers] section.
      auth_module: [client]
    static_configs:
      - targets:
          # All mysql hostnames or unix sockets to monitor.
          - 192.168.1.130:3306
    relabel_configs:
      # Pass the MySQL address to the exporter as ?target=<address> ...
      - source_labels: [__address__]
        target_label: __param_target
      # ... keep it as the `instance` label of the resulting series ...
      - source_labels: [__param_target]
        target_label: instance
      # ... and send the actual HTTP request to the mysqld_exporter host:port.
      - target_label: __address__
        replacement: 192.168.1.130:9104
[root@jk ~]#

进行写入动态配置文件

内容写需要监控的主机即可

root@cby:~# mkdir /prometheus/static_conf/
root@cby:~# vim /prometheus/static_conf/file.yaml 
root@cby:~# cat /prometheus/static_conf/file.yaml                             
# Target groups for file-based service discovery. The 'node-exporter' job in
# prometheus.yml loads every file matching static_conf/*.yaml, so adding a
# host here is enough to have it scraped.
# Each entry is a single-element group of one host:port.
- targets: ['192.168.1.20:9200']
- targets: ['192.168.1.31:9200']
- targets: ['192.168.1.32:9200']
- targets: ['192.168.1.33:9200']
- targets: ['192.168.1.34:9200']
- targets: ['192.168.1.35:9200']
- targets: ['192.168.1.36:9200']
- targets: ['192.168.1.99:9200']
- targets: ['192.168.1.100:4445']
- targets: ['192.168.1.100:9182']
- targets: ['192.168.1.120:9200']
- targets: ['192.168.1.123:9200']
- targets: ['192.168.1.130:9200']

root@cby:~#

配置开机自启服务

# Install the systemd unit that runs Prometheus.
#
# The heredoc delimiter is quoted ('EOF') so the shell copies the text
# verbatim. With an unquoted delimiter the shell expands $MAINPID itself
# (to an empty string) and the unit ends up with a broken ExecStop= line.
#
# Restart=on-failure restarts the service after a crash or non-zero exit.
# ExecStop sends SIGTERM instead of SIGKILL so Prometheus can shut down
# cleanly; systemd still escalates to SIGKILL after its stop timeout.
# Wants= pulls in network-online.target, which After= alone only orders against.
cat > /etc/systemd/system/prometheus.service <<'EOF'
[Unit]
Description=Prometheus
Wants=network-online.target
After=network-online.target

[Service]
Type=simple
ExecStart=/prometheus/prometheus  --config.file=/prometheus/prometheus.yml
Restart=on-failure
ExecStop=/bin/kill -s TERM $MAINPID

[Install]
WantedBy=multi-user.target
EOF

[root@jk ~]# systemctl daemon-reload
[root@jk ~]# 
[root@jk ~]# systemctl enable --now prometheus.service 
[root@jk ~]# 
[root@jk ~]# systemctl status prometheus.service

安装Node_exporter监控组件

下载地址 https://github.com/prometheus/node_exporter/releases/

wget https://mirrors.chenby.cn/https://github.com/prometheus/node_exporter/releases/download/v1.8.2/node_exporter-1.8.2.linux-amd64.tar.gz

[root@jk ~]# tar xvf node_exporter-1.8.2.linux-amd64.tar.gz
[root@jk ~]# mv node_exporter-1.8.2.linux-amd64 /node_exporter

设置为开机自启

# Install the systemd unit that runs node_exporter on port 9200.
#
# The heredoc delimiter is quoted ('EOF') so the shell copies the text
# verbatim. With an unquoted delimiter the shell expands $MAINPID itself
# (to an empty string) and the unit ends up with a broken ExecStop= line.
#
# Restart=on-failure restarts the service after a crash or non-zero exit.
# ExecStop sends SIGTERM rather than SIGKILL so the process can exit
# cleanly; systemd still escalates to SIGKILL after its stop timeout.
# Wants= pulls in network-online.target, which After= alone only orders against.
cat > /etc/systemd/system/node_exporter.service <<'EOF'
[Unit]
Description=node_exporter
Wants=network-online.target
After=network-online.target

[Service]
Type=simple
ExecStart=/node_exporter/node_exporter  --web.listen-address=":9200"
Restart=on-failure
ExecStop=/bin/kill -s TERM $MAINPID

[Install]
WantedBy=multi-user.target
EOF

[root@jk ~]# systemctl daemon-reload
[root@jk ~]# systemctl enable --now node_exporter.service 
[root@jk ~]# systemctl status node_exporter.service

安装mysql_exporter监控组件

下载地址 https://github.com/prometheus/mysqld_exporter/releases/

wget https://mirrors.chenby.cn/https://github.com/prometheus/mysqld_exporter/releases/download/v0.16.0/mysqld_exporter-0.16.0.linux-amd64.tar.gz

[root@jk ~]# tar xvf mysqld_exporter-0.16.0.linux-amd64.tar.gz
[root@jk ~]# mv mysqld_exporter-0.16.0.linux-amd64 /mysqld_exporter

# Write the credentials mysqld_exporter uses to log in to MySQL.
#
# The file holds a plaintext password, so it is created under umask 077 and
# then forced to mode 600 (the chmod also covers a pre-existing file).
# The quoted 'EOF' delimiter keeps the shell from expanding any `$` that a
# real password might contain.
# NOTE: replace user/password with your own; a dedicated low-privilege MySQL
# account is preferable to root for monitoring.
(
  umask 077
  cat > /mysqld_exporter/my.cnf <<'EOF'
[client]
host=127.0.0.1
port=3306
user=root
password=Cby123..
EOF
)
chmod 600 /mysqld_exporter/my.cnf

设置为开机自启

# Install the systemd unit that runs mysqld_exporter on port 9104, reading
# its MySQL credentials from /mysqld_exporter/my.cnf.
#
# The heredoc delimiter is quoted ('EOF') so the shell copies the text
# verbatim. With an unquoted delimiter the shell expands $MAINPID itself
# (to an empty string) and the unit ends up with a broken ExecStop= line.
#
# Restart=on-failure restarts the service after a crash or non-zero exit.
# ExecStop sends SIGTERM rather than SIGKILL so the process can exit
# cleanly; systemd still escalates to SIGKILL after its stop timeout.
# Wants= pulls in network-online.target, which After= alone only orders against.
cat > /etc/systemd/system/mysqld_exporter.service <<'EOF'
[Unit]
Description=mysqld_exporter
Wants=network-online.target
After=network-online.target

[Service]
Type=simple
ExecStart=/mysqld_exporter/mysqld_exporter --config.my-cnf=/mysqld_exporter/my.cnf --web.listen-address=:9104
Restart=on-failure
ExecStop=/bin/kill -s TERM $MAINPID

[Install]
WantedBy=multi-user.target
EOF

[root@jk ~]# systemctl daemon-reload
[root@jk ~]# systemctl enable --now mysqld_exporter.service 
[root@jk ~]# systemctl status mysqld_exporter.service

下载安装alertmanager服务

下载地址 https://github.com/prometheus/alertmanager/releases/

wget https://mirrors.chenby.cn/https://github.com/prometheus/alertmanager/releases/download/v0.27.0/alertmanager-0.27.0.linux-amd64.tar.gz

[root@jk ~]# tar xvf alertmanager-0.27.0.linux-amd64.tar.gz
[root@jk ~]# mv alertmanager-0.27.0.linux-amd64 /alertmanager
[root@jk ~]#

全局配置

# Write the Alertmanager configuration: one e-mail receiver, alerts grouped
# by alertname, and critical alerts inhibiting matching warning alerts.
#
# The heredoc delimiter is quoted ('EOF') so the shell writes the YAML
# verbatim, even if the SMTP password contains `$` or backticks.
# The e-mail addresses and the password are placeholders: replace them with
# your own sender, SMTP login and recipient before starting Alertmanager.
cat > /alertmanager/alertmanager.yml <<'EOF'
global:
  resolve_timeout: 5m
  smtp_from: 'alert@example.com'            # sender address (placeholder)
  smtp_smarthost: 'smtp.qiye.aliyun.com:465'
  smtp_auth_username: 'alert@example.com'   # SMTP login (placeholder)
  smtp_auth_password: 'xxxxxxxx'            # SMTP password (placeholder)
  smtp_require_tls: false
  smtp_hello: 'chenby.cn'
route:
  group_by: ['alertname']
  group_wait: 5s
  group_interval: 5s
  repeat_interval: 5m
  receiver: 'email'
receivers:
- name: 'email'
  email_configs:
  - to: 'ops@example.com'                   # recipient address (placeholder)
    send_resolved: true
inhibit_rules:
  # While a critical alert is firing, mute warning alerts that carry the
  # same alertname, dev and instance labels.
  # source_matchers/target_matchers replace the deprecated
  # source_match/target_match keys.
  - source_matchers:
      - severity="critical"
    target_matchers:
      - severity="warning"
    equal: ['alertname', 'dev', 'instance']
EOF

配置告警规则

规则模板建议在此网站找适合自己的
https://awesome-prometheus-alerts.grep.to/

举例

# Example rule file: a single alert that fires when a scrape target is down.
groups:
  - name: test-rules
    rules:
      - alert: InstanceDown # alert name
        expr: up == 0 # firing condition, written in PromQL
        for: 2m # how long the condition must hold before the alert fires
        labels: # extra labels attached to the alert
          team: node
        annotations: # human-readable details; values are Go templates
          summary: "{{$labels.instance}}: has been down"
          description: "{{$labels.instance}}: job {{$labels.job}} has been down "
          # Quoted on purpose: an unquoted value starting with `{{` is parsed
          # by YAML as a flow mapping and makes the rule file invalid.
          value: "{{$value}}"

我的告警配置，直接把他的仓库克隆了下来

[root@jk ~]# mkdir /prometheus/dist/
[root@jk ~]# vim /prometheus/dist/123.yml 
[root@jk ~]# cat /prometheus/dist/123.yml 
groups:
  - name: generals.rules
    rules:
    - alert: PrometheusJobMissing
      expr: absent(up{job="prometheus"})
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Prometheus job missing (instance {{ $labels.instance }})
        description: "A Prometheus job has disappeared\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: PrometheusTargetMissing
      expr: up == 0
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Prometheus target missing (instance {{ $labels.instance }})
        description: "A Prometheus target has disappeared. An exporter might be crashed.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: PrometheusAllTargetsMissing
      expr: sum by (job) (up) == 0
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Prometheus all targets missing (instance {{ $labels.instance }})
        description: "A Prometheus job does not have living target anymore.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: PrometheusTargetMissingWithWarmupTime
      expr: sum by (instance, job) ((up == 0) * on (instance) group_right(job) (node_time_seconds - node_boot_time_seconds > 600))
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Prometheus target missing with warmup time (instance {{ $labels.instance }})
        description: "Allow a job time to start up (10 minutes) before alerting that it's down.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: PrometheusConfigurationReloadFailure
      expr: prometheus_config_last_reload_successful != 1
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Prometheus configuration reload failure (instance {{ $labels.instance }})
        description: "Prometheus configuration reload error\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: PrometheusTooManyRestarts
      expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Prometheus too many restarts (instance {{ $labels.instance }})
        description: "Prometheus has restarted more than twice i