---
# Prometheus alerting rules for the user-management service.
# Every rule carries a `service: user-management` label and a `severity`
# label (critical / warning / info) for use by downstream routing.
groups:
  - name: user-ms-alerts
    # Evaluation interval for every rule in this group.
    interval: 30s
    rules:
      # High error rate: share of 5xx responses among all HTTP requests,
      # measured over a 5-minute rate window, above 5% for 5 minutes.
      - alert: HighErrorRate
        expr: |
          (
            sum(rate(http_requests_total{status=~"5.."}[5m]))
            /
            sum(rate(http_requests_total[5m]))
          ) > 0.05
        for: 5m
        labels:
          severity: critical
          service: user-management
        annotations:
          summary: "高错误率告警"
          description: "过去5分钟错误率超过5%,当前值: {{ $value | humanizePercentage }}"

      # High response time: per-path P95 latency, computed from the request
      # duration histogram, above 1 second for 5 minutes.
      - alert: HighResponseTime
        expr: |
          histogram_quantile(0.95,
            sum(rate(http_request_duration_seconds_bucket[5m])) by (le, path)
          ) > 1
        for: 5m
        labels:
          severity: warning
          service: user-management
        annotations:
          summary: "高响应时间告警"
          description: "API P95响应时间超过1秒,路径: {{ $labels.path }},当前值: {{ $value }}s"

      # Low cache hit rate: hits divided by all cache operations, below 70%
      # for 10 minutes.
      - alert: LowCacheHitRate
        expr: |
          (
            sum(rate(cache_hits_total[5m]))
            /
            sum(rate(cache_operations_total[5m]))
          ) < 0.7
        for: 10m
        labels:
          severity: warning
          service: user-management
        annotations:
          summary: "低缓存命中率告警"
          description: "缓存命中率低于70%,当前值: {{ $value | humanizePercentage }}"

      # High CPU usage: per-series rate of consumed CPU seconds above 0.8
      # for 5 minutes.
      # NOTE(review): the rate of a CPU-seconds counter is measured in cores,
      # so a multi-threaded process can exceed 1.0 — confirm that 0.8 is the
      # intended threshold for this deployment.
      - alert: HighCPUUsage
        expr: rate(process_cpu_seconds_total[5m]) > 0.8
        for: 5m
        labels:
          severity: warning
          service: user-management
        annotations:
          summary: "高CPU使用率告警"
          description: "CPU使用率超过80%,当前值: {{ $value | humanizePercentage }}"

      # High memory usage: used bytes divided by total bytes, above 85%
      # for 5 minutes.
      # NOTE(review): a binary operator only matches series whose label sets
      # are identical on both sides; verify that both metrics carry the same
      # labels, otherwise this expression returns nothing.
      - alert: HighMemoryUsage
        expr: |
          (
            system_memory_usage_bytes
            /
            (node_memory_MemTotal_bytes)
          ) > 0.85
        for: 5m
        labels:
          severity: critical
          service: user-management
        annotations:
          summary: "高内存使用率告警"
          description: "内存使用率超过85%,当前值: {{ $value | humanizePercentage }}"

      # Database connection pool: active connections divided by the pool
      # maximum, above 90% for 3 minutes.
      - alert: DatabaseConnectionPoolExhausted
        expr: |
          (
            db_connections_active
            /
            db_connections_max
          ) > 0.9
        for: 3m
        labels:
          severity: critical
          service: user-management
        annotations:
          summary: "数据库连接池耗尽告警"
          description: "数据库连接池使用率超过90%,当前值: {{ $value | humanizePercentage }}"

      # Low online users: the active_users series labelled period="5m" stays
      # below 10 for 30 minutes.
      - alert: LowOnlineUsers
        expr: active_users{period="5m"} < 10
        for: 30m
        labels:
          severity: info
          service: user-management
        annotations:
          summary: "在线用户数告警"
          description: "过去5分钟活跃用户数低于10,当前值: {{ $value }}"

      # High login failure rate: failed logins divided by all logins, above
      # 30% for 5 minutes.
      - alert: HighLoginFailureRate
        expr: |
          (
            sum(rate(user_logins_total{status="failed"}[5m]))
            /
            sum(rate(user_logins_total[5m]))
          ) > 0.3
        for: 5m
        labels:
          severity: warning
          service: user-management
        annotations:
          summary: "高登录失败率告警"
          description: "登录失败率超过30%,可能存在暴力破解,当前值: {{ $value | humanizePercentage }}"

      # Unusual API request rate: relative deviation of the current request
      # rate from its 1-hour average, above 50% for 5 minutes.
      # The 1-hour average is taken with a subquery (`[1h:]`) wrapped in
      # avg_over_time(); PromQL has no `avg(... over 1h)` form. With the
      # resolution omitted, the subquery is evaluated at the default
      # evaluation interval.
      - alert: UnusualAPIRequestRate
        expr: |
          abs(
            sum(rate(http_requests_total[5m]))
            -
            avg_over_time(sum(rate(http_requests_total[5m]))[1h:])
          )
          /
          avg_over_time(sum(rate(http_requests_total[5m]))[1h:])
          > 0.5
        for: 5m
        labels:
          severity: info
          service: user-management
        annotations:
          summary: "API请求量异常告警"
          description: "API请求量与1小时平均值偏差超过50%,当前值: {{ $value | humanizePercentage }}"