prometheus报警规则

原创

已于 2024-08-30 17:53:42 修改 · 629 阅读

3 ·

CC 4.0 BY-SA版权

文章标签：

#prometheus

于 2024-08-30 10:50:19 首次发布

注意事项：要判断下面的报警规则是否适用于自己的环境，请到 Prometheus 的主页面查看，那里列出了实际可用的监控项和监控参数。
在这里插入图片描述

mysql

# MySQL alerting rules (Prometheus rule file).
# The expressions read the `mysql_up` and `mysql_slave_status_*` series;
# presumably these come from mysqld_exporter -- confirm against your targets.
groups:
  - name: Mysql-rules
    rules:
      # Fires when mysql_up has been 0 for 5s.
      - alert: "Mysql status"
        expr: mysql_up == 0
        for: 5s
        labels:
          severity: error
        annotations:
          # The template action must be written as an unbroken `{{ ... }}`,
          # otherwise it is emitted as literal text instead of the label value.
          summary: "您的 {{ $labels.instance }} 的 Mysql 已停止运行！"
          description: "Mysql数据库宕机，请检查"

      # Fires when the replication IO thread metric has been 0 for 5s.
      - alert: "Mysql slave io thread status"
        expr: mysql_slave_status_slave_io_running == 0
        for: 5s
        labels:
          severity: error
        annotations:
          summary: "您的 {{ $labels.instance }} Mysql slave io thread 已停止"
          description: "Mysql主从IO线程故障，请检测"

      # Fires when the replication SQL thread metric has been 0 for 5s.
      - alert: "Mysql slave sql thread status"
        expr: mysql_slave_status_slave_sql_running == 0
        for: 5s
        labels:
          severity: error
        annotations:
          summary: "您的 {{ $labels.instance }} Mysql slave sql thread 已停止"
          description: "Mysql主从sql线程故障，请检测"

nginx

# Nginx alerting rules (Prometheus rule file).
groups:
  - name: nginx
    rules:
      # Fires per target when a scrape of job="nginx" has been failing for 1m.
      # Evaluated per instance (rather than sum(up) < 2) so the alert keeps the
      # `instance` label used in the summary and does not hard-code the number
      # of Nginx targets.
      - alert: "nginx status"
        expr: up{job="nginx"} == 0
        for: 1m
        labels:
          severity: error
        annotations:
          summary: "您的 {{ $labels.instance }} 的 Nginx 已停止运行！"
          description: "Nginx宕机，请检查"

      # Fires when more than 5% of requests returned a 4xx status for 5m.
      # Aggregated `by (instance)` so that `$labels.instance` in the summary
      # is populated; a bare sum() would drop every label.
      - alert: NginxHighHttp4xxErrorRate
        expr: |
          sum by (instance) (rate(nginx_http_requests_total{status=~"^4.."}[1m]))
            / sum by (instance) (rate(nginx_http_requests_total[1m]))
            * 100 > 5
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Nginx high HTTP 4xx error rate (instance {{ $labels.instance }})"
          description: "Too many HTTP requests with status 4xx (> 5%)\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

      # Fires when more than 5% of requests returned a 5xx status for 5m.
      # Same per-instance aggregation as the 4xx rule above.
      - alert: NginxHighHttp5xxErrorRate
        expr: |
          sum by (instance) (rate(nginx_http_requests_total{status=~"^5.."}[1m]))
            / sum by (instance) (rate(nginx_http_requests_total[1m]))
            * 100 > 5
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Nginx high HTTP 5xx error rate (instance {{ $labels.instance }})"
          description: "Too many HTTP requests with status 5xx (> 5%)\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

      # Fires when the p99 request latency stays above 10 seconds for 5m.
      # `le` must be kept in the aggregation: histogram_quantile() needs the
      # bucket boundaries to compute a quantile.
      # NOTE(review): only `host` and `node` survive the aggregation, so
      # `$labels.instance` in the summary renders empty; the surviving labels
      # are still shown through `{{ $labels }}` in the description.
      - alert: NginxLatencyHigh
        expr: |
          histogram_quantile(0.99, sum(rate(nginx_http_request_duration_seconds_bucket[30m])) by (host, node, le)) > 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Nginx latency high (instance {{ $labels.instance }})"
          description: "Nginx p99 latency is higher than 10 seconds\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

node-exporter

# Server resource alerting policy
groups:
- name: 服务器资源监控
  rules:
  # Fires when memory usage, computed as
  # 100 - (MemFree + Cached + Buffers) / MemTotal * 100, stays above 90.
  - alert: 内存使用率过高
    expr: 100 - (node_memory_MemFree_bytes+node_memory_Cached_bytes+node_memory_Buffers_bytes) / node_memory_MemTotal_bytes * 100 > 90
    for: 5m  # The condition must hold this long before the alert is sent to Alertmanager
    labels:
      severity: 严重告警
    annotations:
      # Template actions must be written as an unbroken `{{ ... }}`.
      summary: "{{ $labels.instance }} 内存使用率过高，请尽快处理！"
      description: "{{ $labels.instance }}内存使用率超过90%,当前使用率{{ $value }}%."
          
  # Fires for any scrape target whose `up` series has been 0 for 3 minutes.
  - alert: 服务器宕机
    expr: up == 0
    for: 3m
    labels:
      severity: 严重告警
    annotations:
      # Template actions must be written as an unbroken `{{ ... }}`.
      summary: "{{ $labels.instance }} 服务器宕机，请尽快处理！"
      description: "{{ $labels.instance }} 服务器延时超过3分钟，当前状态{{ $value }}. "
 
  - alert: CPU高负荷
    expr: 100 - (avg by