注意事项:如何判断报警规则是否适用于自己的环境?请到 Prometheus 的主页面查看,那里列出了监控项和监控参数。
mysql
---
# MySQL alerting rules in Prometheus rule-file format.
# Written as its own YAML document because the top-level `groups:` key is
# repeated for every rule set in this file; a single document cannot hold
# duplicate keys.
groups:
  - name: Mysql-rules
    rules:
      # Fires once `mysql_up == 0` has held for 5s.
      # NOTE(review): a `for` shorter than the rule evaluation interval
      # effectively means "two consecutive failing evaluations" - confirm
      # that 5s is intended.
      - alert: "Mysql status"
        expr: mysql_up == 0
        for: 5s
        labels:
          severity: error
        annotations:
          # The template delimiter `{{` must be written contiguously on one
          # line; when split it is plain text and the label is not expanded.
          summary: "您的 {{ $labels.instance }} 的 Mysql 已停止运行!"
          description: "Mysql数据库宕机,请检查"

      # Fires once the replica IO-thread metric has been 0 for 5s.
      - alert: "Mysql slave io thread status"
        expr: mysql_slave_status_slave_io_running == 0
        for: 5s
        labels:
          severity: error
        annotations:
          summary: "您的 {{ $labels.instance }} Mysql slave io thread 已停止"
          description: "Mysql主从IO线程故障,请检测"

      # Fires once the replica SQL-thread metric has been 0 for 5s.
      - alert: "Mysql slave sql thread status"
        expr: mysql_slave_status_slave_sql_running == 0
        for: 5s
        labels:
          severity: error
        annotations:
          summary: "您的 {{ $labels.instance }} Mysql slave sql thread 已停止"
          description: "Mysql主从sql线程故障,请检测"
nginx
---
# Nginx alerting rules in Prometheus rule-file format.
# Written as its own YAML document because the top-level `groups:` key is
# repeated for every rule set in this file; a single document cannot hold
# duplicate keys.
groups:
  - name: nginx
    rules:
      # Fires per target when an `nginx` job target has been down for 1m.
      # Evaluated per series (not `sum(...) < 2`) so that the `instance`
      # label used in the summary survives and no instance count is
      # hard-coded into the rule.
      - alert: "nginx status"
        expr: 'up{job="nginx"} == 0'
        for: 1m
        labels:
          severity: error
        annotations:
          summary: "您的 {{ $labels.instance }} 的 Nginx 已停止运行!"
          description: "Nginx宕机,请检查"

      # Share of 4xx responses over the last minute, per instance, above 5%
      # for 5m. Both sides are grouped by `instance` so the ratio matches
      # series one-to-one and the annotations have a label to print.
      - alert: NginxHighHttp4xxErrorRate
        expr: |
          sum by (instance) (rate(nginx_http_requests_total{status=~"^4.."}[1m]))
            / sum by (instance) (rate(nginx_http_requests_total[1m]))
            * 100 > 5
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Nginx high HTTP 4xx error rate (instance {{ $labels.instance }})"
          description: "Too many HTTP requests with status 4xx (> 5%)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

      # Share of 5xx responses over the last minute, per instance, above 5%
      # for 5m. Grouping mirrors the 4xx rule.
      - alert: NginxHighHttp5xxErrorRate
        expr: |
          sum by (instance) (rate(nginx_http_requests_total{status=~"^5.."}[1m]))
            / sum by (instance) (rate(nginx_http_requests_total[1m]))
            * 100 > 5
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Nginx high HTTP 5xx error rate (instance {{ $labels.instance }})"
          description: "Too many HTTP requests with status 5xx (> 5%)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

      # p99 request latency over 30m, per (host, node), above 10 seconds
      # for 5m. `le` must be kept in the grouping: histogram_quantile needs
      # the bucket upper bounds to compute a quantile.
      # NOTE(review): the result carries only `host` and `node`, so
      # `$labels.instance` in the summary renders empty - confirm which
      # label should be shown.
      - alert: NginxLatencyHigh
        expr: |
          histogram_quantile(
            0.99,
            sum by (host, node, le) (rate(nginx_http_request_duration_seconds_bucket[30m]))
          ) > 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Nginx latency high (instance {{ $labels.instance }})"
          description: "Nginx p99 latency is higher than 10 seconds\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
node-exporter
---
# Server resource alerting policy (Prometheus rule-file format).
# Written as its own YAML document because the top-level `groups:` key is
# repeated for every rule set in this file; a single document cannot hold
# duplicate keys.
groups:
  - name: 服务器资源监控
    rules:
      # Memory usage above 90%, computed as
      # 100 - (free + cached + buffers) / total * 100.
      - alert: 内存使用率过高
        expr: >-
          100 - (node_memory_MemFree_bytes + node_memory_Cached_bytes + node_memory_Buffers_bytes)
          / node_memory_MemTotal_bytes * 100 > 90
        # How long the condition must hold before the alert is sent to
        # Alertmanager.
        for: 5m
        labels:
          severity: 严重告警
        annotations:
          # The template delimiter `{{` must be written contiguously on one
          # line; when split it is plain text and the value is not expanded.
          summary: "{{ $labels.instance }} 内存使用率过高,请尽快处理!"
          description: "{{ $labels.instance }}内存使用率超过90%,当前使用率{{ $value }}%."

      # Any scrape target whose `up` series has been 0 for 3m.
      - alert: 服务器宕机
        expr: up == 0
        for: 3m
        labels:
          severity: 严重告警
        annotations:
          summary: "{{ $labels.instance }} 服务器宕机,请尽快处理!"
          description: "{{ $labels.instance }} 服务器延时超过3分钟,当前状态{{ $value }}. "
- alert: CPU高负荷
expr: 100 - (avg by