Files

134 lines
4.1 KiB
YAML
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
groups:
  # Alerting rules carrying the label service=user-management.
  # The group is evaluated every 30 seconds.
  - name: user-ms-alerts
    interval: 30s
    rules:
      # High error rate: the share of 5xx responses among all HTTP requests
      # over the last 5 minutes stays above 5% for 5 minutes.
      - alert: HighErrorRate
        expr: |
          (
            sum(rate(http_requests_total{status=~"5.."}[5m]))
            /
            sum(rate(http_requests_total[5m]))
          ) > 0.05
        for: 5m
        labels:
          severity: critical
          service: user-management
        annotations:
          summary: "高错误率告警"
          description: "过去5分钟错误率超过5%,当前值: {{ $value | humanizePercentage }}"

      # High response time: the per-path P95 request latency, estimated from
      # the duration histogram buckets, stays above 1 second for 5 minutes.
      - alert: HighResponseTime
        expr: |
          histogram_quantile(0.95,
            sum(rate(http_request_duration_seconds_bucket[5m])) by (le, path)
          ) > 1
        for: 5m
        labels:
          severity: warning
          service: user-management
        annotations:
          summary: "高响应时间告警"
          description: "API P95响应时间超过1秒,路径: {{ $labels.path }},当前值: {{ $value }}s"

      # Low cache hit rate: hits divided by all cache operations over the
      # last 5 minutes stays below 70% for 10 minutes.
      - alert: LowCacheHitRate
        expr: |
          (
            sum(rate(cache_hits_total[5m]))
            /
            sum(rate(cache_operations_total[5m]))
          ) < 0.7
        for: 10m
        labels:
          severity: warning
          service: user-management
        annotations:
          summary: "低缓存命中率告警"
          description: "缓存命中率低于70%,当前值: {{ $value | humanizePercentage }}"

      # High CPU usage: per-series rate of consumed CPU seconds above 0.8.
      # NOTE(review): the rate of a CPU-seconds counter is measured in cores,
      # so a multi-threaded process can exceed 1.0; confirm that 0.8 of one
      # core is the intended threshold.
      - alert: HighCPUUsage
        expr: rate(process_cpu_seconds_total[5m]) > 0.8
        for: 5m
        labels:
          severity: warning
          service: user-management
        annotations:
          summary: "高CPU使用率告警"
          description: "CPU使用率超过80%,当前值: {{ $value | humanizePercentage }}"

      # High memory usage: used bytes divided by total bytes above 85%.
      # NOTE(review): the two metrics presumably come from different
      # exporters. The division only yields samples when both sides carry
      # identical label sets; verify, and add on(...)/ignoring(...) if not.
      - alert: HighMemoryUsage
        expr: |
          (
            system_memory_usage_bytes /
            (node_memory_MemTotal_bytes)
          ) > 0.85
        for: 5m
        labels:
          severity: critical
          service: user-management
        annotations:
          summary: "高内存使用率告警"
          description: "内存使用率超过85%,当前值: {{ $value | humanizePercentage }}"

      # Database connection pool nearly exhausted: active connections exceed
      # 90% of the configured maximum for 3 minutes.
      - alert: DatabaseConnectionPoolExhausted
        expr: |
          (
            db_connections_active /
            db_connections_max
          ) > 0.9
        for: 3m
        labels:
          severity: critical
          service: user-management
        annotations:
          summary: "数据库连接池耗尽告警"
          description: "数据库连接池使用率超过90%,当前值: {{ $value | humanizePercentage }}"

      # Low online users: the 5-minute active-user gauge stays below 10 for
      # 30 minutes.
      - alert: LowOnlineUsers
        expr: active_users{period="5m"} < 10
        for: 30m
        labels:
          severity: info
          service: user-management
        annotations:
          summary: "在线用户数告警"
          description: "过去5分钟活跃用户数低于10,当前值: {{ $value }}"

      # High login failure rate: failed logins divided by all logins over the
      # last 5 minutes stays above 30% for 5 minutes.
      - alert: HighLoginFailureRate
        expr: |
          (
            sum(rate(user_logins_total{status="failed"}[5m]))
            /
            sum(rate(user_logins_total[5m]))
          ) > 0.3
        for: 5m
        labels:
          severity: warning
          service: user-management
        annotations:
          summary: "高登录失败率告警"
          description: "登录失败率超过30%,可能存在暴力破解,当前值: {{ $value | humanizePercentage }}"

      # Unusual API request rate: the current request rate deviates from its
      # own 1-hour average by more than 50%.
      # The average is taken with avg_over_time over a subquery ([1h:1m] =
      # 1-hour range sampled every minute); PromQL has no "over 1h" syntax,
      # and avg() aggregates across series, not across time.
      - alert: UnusualAPIRequestRate
        expr: |
          abs(
            sum(rate(http_requests_total[5m]))
            -
            avg_over_time(sum(rate(http_requests_total[5m]))[1h:1m])
          )
          /
          avg_over_time(sum(rate(http_requests_total[5m]))[1h:1m])
          > 0.5
        for: 5m
        labels:
          severity: info
          service: user-management
        annotations:
          summary: "API请求量异常告警"
          description: "API请求量与1小时平均值偏差超过50%,当前值: {{ $value | humanizePercentage }}"