# deployment/alertmanager/alerts.yml

groups:
  # =========================================================================
  # SLO 燃烧率告警（基于错误预算，替代简单阈值告警）
  # 参考：Google SRE Book - Alerting on SLOs
  # =========================================================================
  - name: ums-slo-burn-rate
    interval: 30s
    rules:
      # -----------------------------------------------------------------------
      # SLO-1: API 可用性 (目标: 99.9% / 30天错误预算: 43.2分钟)
      # -----------------------------------------------------------------------
      # Fast burn: the 5m (short) and 1h (long) windows must both exceed a
      # 14.4x burn rate.
      # Meaning: 14.4x sustained for 1 hour spends 2% of the 30-day error budget
      # (14.4 * 1h / 720h = 2%); at that rate the whole budget lasts ~50 hours.
      - alert: APIAvailability_FastBurn
        # Error ratio = 5xx request rate / total request rate, compared against
        # (1 - SLO target) * burn rate in each window.
        expr: |
          (
            sum(rate(http_requests_total{status=~"5.."}[5m]))
            /
            sum(rate(http_requests_total[5m]))
          ) > (1 - 0.999) * 14.4
          AND
          (
            sum(rate(http_requests_total{status=~"5.."}[1h]))
            /
            sum(rate(http_requests_total[1h]))
          ) > (1 - 0.999) * 14.4
        for: 2m
        labels:
          severity: critical
          slo: api-availability
          page: "true"
          service: user-management
        annotations:
          summary: "🔴 [P0] API 可用性 SLO 快速燃烧 — 立即响应"
          # $value is the left-hand operand of AND, i.e. the 5m error ratio.
          description: |
            错误预算正在以 14.4x 速率消耗（正常速率的14倍）
            当前5分钟错误率: {{ $value | humanizePercentage }}
            若持续1小时，将消耗30天错误预算的约 2%（约50小时耗尽全部预算）
            SLO 目标: 99.9% (30天允许宕机: 43.2分钟)
            运维手册: docs/sre/runbooks/api-availability.md
          dashboard_url: "http://grafana:3000/d/ums-slo"

      # Slow burn: the 30m (short) and 6h (long) windows must both exceed a
      # 6x burn rate.
      # Meaning: 6x sustained for 6 hours spends 5% of the 30-day error budget
      # (6 * 6h / 720h = 5%); a full day at this rate would spend 20%.
      - alert: APIAvailability_SlowBurn
        # Same error-ratio formula as the fast-burn rule, with longer windows
        # and a lower multiplier.
        expr: |
          (
            sum(rate(http_requests_total{status=~"5.."}[30m]))
            /
            sum(rate(http_requests_total[30m]))
          ) > (1 - 0.999) * 6
          AND
          (
            sum(rate(http_requests_total{status=~"5.."}[6h]))
            /
            sum(rate(http_requests_total[6h]))
          ) > (1 - 0.999) * 6
        for: 15m
        labels:
          severity: warning
          slo: api-availability
          page: "false"
          service: user-management
        annotations:
          summary: "🟡 [P2] API 可用性 SLO 缓慢燃烧 — 需在工作时间内关注"
          # $value is the left-hand operand of AND, i.e. the 30m error ratio.
          description: |
            错误预算正在以 6x 速率缓慢消耗
            若持续6小时，将消耗30天错误预算的 5%
            当前30分钟错误率: {{ $value | humanizePercentage }}

      # -----------------------------------------------------------------------
      # SLO-2: API 延迟 (目标: P99 < 500ms 覆盖 99% 请求)
      # -----------------------------------------------------------------------
      # Latency SLO check: fires when the service-wide P99 is above 0.5 s in both
      # the 5m and the 1h window, held for 5 minutes.
      - alert: APILatency_FastBurn
        expr: |
          histogram_quantile(0.99, sum by (le) (rate(http_request_duration_seconds_bucket[5m]))) > 0.5
          and
          histogram_quantile(0.99, sum by (le) (rate(http_request_duration_seconds_bucket[1h]))) > 0.5
        for: 5m
        labels:
          service: user-management
          slo: api-latency
          severity: critical
          page: 'true'
        annotations:
          summary: '🔴 [P0] API 延迟 SLO 违规 — P99 超过 500ms'
          # $value is the left-hand operand of `and`, i.e. the 5m P99.
          description: |
            当前 P99 延迟: {{ $value | humanizeDuration }}
            SLO 目标: P99 < 500ms
            请检查慢查询和数据库连接池

      # Per-path P99 latency for the authentication endpoints (login / refresh).
      # Fires when any matching path stays above 0.3 s for 3 minutes.
      - alert: APILatency_CriticalPath
        # Grouping by `path` keeps one series per endpoint, so the alert and its
        # annotation identify which path is slow.
        expr: |
          histogram_quantile(0.99,
            sum(rate(http_request_duration_seconds_bucket{
              path=~".*auth/login.*|.*auth/refresh.*"
            }[5m])) by (le, path)
          ) > 0.3
        for: 3m
        labels:
          severity: critical
          slo: api-latency-auth
          # Every other critical [P0] rule in this file carries page: "true";
          # added here so this one agrees with them.
          page: "true"
          service: user-management
        annotations:
          summary: "🔴 [P0] 认证关键路径延迟超标"
          description: |
            路径 {{ $labels.path }} 的 P99 延迟: {{ $value | humanizeDuration }}
            认证路径 SLO: P99 < 300ms

      # -----------------------------------------------------------------------
      # SLO-3: 登录成功率 (目标: 99% 非攻击流量)
      # -----------------------------------------------------------------------
      # Login success ratio over 10 minutes: successful logins / all logins.
      # NOTE(review): the alert threshold (0.9) is looser than the 99% SLO quoted
      # in the description, and there is no minimum-traffic guard like the one in
      # BruteForceAttackDetected — confirm both are intentional.
      - alert: LoginSuccessRate_Degraded
        expr: |
          sum(rate(user_logins_total{status="success"}[10m]))
            / sum(rate(user_logins_total[10m]))
          < 0.9
        for: 5m
        labels:
          service: user-management
          slo: login-success-rate
          severity: warning
        annotations:
          summary: '🟡 [P2] 登录成功率下降'
          description: |
            当前10分钟登录成功率: {{ $value | humanizePercentage }}
            SLO 目标: 99%
            注意：高失败率可能是暴力破解也可能是系统问题，请结合安全事件判断

  # =========================================================================
  # 基础设施告警（阈值型，高置信度）
  # =========================================================================
  - name: ums-infrastructure
    interval: 30s
    rules:
      # Service instance down (highest priority): the scrape target for job
      # "user-management" has reported up == 0 for one minute.
      - alert: ServiceDown
        expr: 'up{job="user-management"} == 0'
        for: 1m
        labels:
          service: user-management
          severity: critical
          page: 'true'
        annotations:
          summary: '🚨 [P0] 用户管理服务实例宕机'
          description: '实例 {{ $labels.instance }} 已离线超过 1 分钟，健康检查失败'

      # Database unavailable, inferred from a high rate of HTTP 503 responses
      # (more than 1 per second over 2 minutes).
      - alert: DatabaseConnectionFailed
        expr: 'sum(rate(http_requests_total{status="503"}[2m])) > 1'
        for: 1m
        labels:
          service: user-management
          severity: critical
          page: 'true'
        annotations:
          summary: '🚨 [P0] 数据库连接失败，服务不可用'
          description: |
            大量 503 响应，可能是数据库连接池耗尽或数据库宕机
            运维手册: docs/sre/runbooks/database-down.md

      # Database connection pool utilisation: active / max above 80% for 3 minutes.
      - alert: DatabaseConnectionPoolHigh
        expr: 'db_connections_active / db_connections_max > 0.8'
        for: 3m
        labels:
          service: user-management
          severity: warning
        annotations:
          summary: '🟡 数据库连接池使用率超过 80%'
          # $value is the utilisation ratio (0-1), rendered as a percentage.
          description: |
            活跃连接: {{ $value | humanizePercentage }} 使用率
            若持续增长，可能导致连接拒绝
            建议：检查慢查询，或增加连接池大小

      # High memory usage: reported usage above 800,000,000 bytes (800 MB decimal)
      # for 5 minutes.
      - alert: HighMemoryUsage
        expr: 'system_memory_usage_bytes > 800000000'
        for: 5m
        labels:
          service: user-management
          severity: warning
        annotations:
          summary: '🟡 内存使用超过 800MB'
          # humanize1024 renders binary prefixes (Ki/Mi/Gi); the trailing "B"
          # completes the unit.
          description: '当前内存使用: {{ $value | humanize1024 }}B，请检查内存泄漏'

      # Abnormal goroutine count: more than 1000 for 10 minutes suggests a leak.
      - alert: GoroutineLeakSuspected
        expr: 'system_goroutines > 1000'
        for: 10m
        labels:
          service: user-management
          severity: warning
        annotations:
          summary: '🟡 Goroutine 数量异常，疑似泄漏'
          description: '当前 goroutine 数量: {{ $value }}，超过 1000'

      # High response time, kept as an absolute-threshold backstop: per-path P95
      # above 2 s for 5 minutes.
      - alert: HighResponseTime_Absolute
        expr: |
          histogram_quantile(0.95, sum by (le, path) (rate(http_request_duration_seconds_bucket[5m]))) > 2
        for: 5m
        labels:
          service: user-management
          severity: warning
        annotations:
          summary: '🟡 API P95 响应时间超过 2 秒'
          description: '路径 {{ $labels.path }} 响应时间 P95: {{ $value }}s，超过绝对阈值 2s'

  # =========================================================================
  # 安全事件告警
  # =========================================================================
  - name: ums-security
    interval: 30s
    rules:
      # Brute-force detection: more than half of login attempts fail over 5
      # minutes, and total login traffic exceeds 1 attempt per second (the traffic
      # floor avoids firing on a handful of requests).
      - alert: BruteForceAttackDetected
        expr: |
          (
            sum(rate(user_logins_total{status="failed"}[5m]))
              / sum(rate(user_logins_total[5m]))
            > 0.5
          )
          and
          (sum(rate(user_logins_total[5m])) > 1)
        for: 3m
        labels:
          service: user-management
          category: security
          severity: critical
          page: 'true'
        annotations:
          summary: '🔐 [P0-SEC] 疑似暴力破解攻击'
          # $value is the left-hand operand of `and`, i.e. the failure ratio.
          description: |
            登录失败率: {{ $value | humanizePercentage }}，超过 50%
            请立即检查来源 IP 并确认封禁是否生效
            运维手册: docs/sre/runbooks/brute-force.md

      # Anomaly-detection spike: more than 5 detected anomalies per second
      # (5m rate), held for 2 minutes.
      - alert: AnomalyDetectionSpike
        expr: 'sum(rate(anomaly_detected_total[5m])) > 5'
        for: 2m
        labels:
          service: user-management
          category: security
          severity: warning
        annotations:
          summary: '🔐 [P2-SEC] 异常登录检测激增'
          description: |
            每秒检测到 {{ $value | humanize }} 个异常事件
            可能存在地理位置异常、未知设备或账号泄露

      # Token refresh failure spike: more than 10 failed refreshes per second
      # (5m rate), held for 2 minutes.
      # NOTE(review): rate() yields a per-second value, so the threshold is
      # 10/s (= 600/min). The description previously labelled $value as
      # "per minute"; the firing condition is left unchanged and only the unit
      # in the text is corrected. If 10 per minute was intended, the threshold
      # should be 10/60 instead — confirm with the owning team.
      - alert: TokenRefreshFailureSpike
        expr: |
          sum(rate(token_refresh_total{status="failure"}[5m])) > 10
        for: 2m
        labels:
          severity: warning
          category: auth
          service: user-management
        annotations:
          summary: "🟡 Token 刷新失败激增"
          description: |
            每秒 Token 刷新失败: {{ $value | humanize }}
            可能原因：JWT Secret 轮换、时钟偏差、Redis 不可用

      # Account lockout spike: lockout rate above 0.5 per second (10m rate),
      # held for 5 minutes.
      # NOTE(review): rate() yields a per-second value, so 0.5 means 0.5/s
      # (= 30/min). The description previously labelled $value as "per minute";
      # the firing condition is left unchanged and only the unit in the text is
      # corrected. Unlike the sibling rules there is no sum(), so this evaluates
      # per series — confirm that is intended.
      - alert: AccountLockoutSpike
        expr: |
          rate(account_lock_total[10m]) > 0.5
        for: 5m
        labels:
          severity: warning
          category: security
          service: user-management
        annotations:
          summary: "🔐 账号锁定事件激增"
          description: "每秒账号锁定: {{ $value | humanize }}，可能存在针对性攻击"

  # =========================================================================
  # 缓存健康告警
  # =========================================================================
  - name: ums-cache
    interval: 60s
    rules:
      # Low cache hit ratio: hits / operations below 60% over 10 minutes, only
      # while the cache handles more than 1 operation per second.
      - alert: LowCacheHitRate
        expr: |
          (
            sum(rate(cache_hits_total[10m]))
              / sum(rate(cache_operations_total[10m]))
            < 0.6
          )
          and
          (sum(rate(cache_operations_total[10m])) > 1)
        for: 15m
        labels:
          service: user-management
          severity: warning
        annotations:
          summary: '🟡 缓存命中率低于 60%'
          # $value is the left-hand operand of `and`, i.e. the hit ratio.
          description: |
            当前命中率: {{ $value | humanizePercentage }}
            可能导致数据库压力增大
            请检查缓存 TTL 配置和热点 Key 分布

  # =========================================================================
  # 业务异常告警（信息类）
  # =========================================================================
  - name: ums-business
    interval: 60s
    rules:
      # Request-volume anomaly, using relative deviation rather than an absolute
      # value: the current 5m request rate divided by its average over the last
      # hour (subquery sampled every 5m). Fires above 3x or below 0.1x.
      - alert: APIRequestVolumeAnomaly
        expr: |
          (
            sum(rate(http_requests_total[5m]))
              / avg_over_time(sum(rate(http_requests_total[5m]))[1h:5m])
            > 3
          )
          or
          (
            sum(rate(http_requests_total[5m]))
              / avg_over_time(sum(rate(http_requests_total[5m]))[1h:5m])
            < 0.1
          )
        for: 5m
        labels:
          service: user-management
          severity: info
        annotations:
          summary: '📊 API 请求量异常偏离基线'
          # $value is the current-to-baseline ratio from whichever branch matched.
          description: |
            当前请求量是过去1小时均值的 {{ $value | humanize }} 倍
            可能是流量突增（>3x）或流量断崖（<0.1x）
-												docs: project docs, scripts, deployment configs, and evidence

											
										
										
											2026-04-02 11:22:17 +08:00
+								groups:
-												refactor: 整理项目根目录结构

整理内容:
- 删除 60+ 临时测试输出文件 (*.txt)
- 移动二进制文件到 bin/ 目录
- 移动 Shell 脚本到 scripts/ 目录
  - scripts/dev/: check_gitea.sh, check_sub2api.sh, run_tests.sh
  - scripts/deploy/: deploy_*.sh, simple_deploy.sh
  - scripts/ops/: fix_nginx.sh, fix_ssl.sh, install_docker.sh
  - scripts/test/: test_*.sh, test_*.bat
- 移动批处理文件到 scripts/
- 移动 Python 脚本到 tools/
- 清理临时日志文件

保留根目录必要文件:
- go.mod, go.sum, go.work
- Makefile, docker-compose.yml
- .env.example, .gitignore
- README.md, AGENTS.md, DEPLOY_GUIDE.md

验证: go build ./... && go test ./... 通过

											
										
										
											2026-04-07 18:10:36 +08:00
+								  # =========================================================================
 								  # SLO 燃烧率告警（基于错误预算，替代简单阈值告警）
 								  # 参考：Google SRE Book - Alerting on SLOs
 								  # =========================================================================
 								  - name: ums-slo-burn-rate
-												docs: project docs, scripts, deployment configs, and evidence

											
										
										
											2026-04-02 11:22:17 +08:00
+								    interval: 30s
 								    rules:
-												refactor: 整理项目根目录结构

整理内容:
- 删除 60+ 临时测试输出文件 (*.txt)
- 移动二进制文件到 bin/ 目录
- 移动 Shell 脚本到 scripts/ 目录
  - scripts/dev/: check_gitea.sh, check_sub2api.sh, run_tests.sh
  - scripts/deploy/: deploy_*.sh, simple_deploy.sh
  - scripts/ops/: fix_nginx.sh, fix_ssl.sh, install_docker.sh
  - scripts/test/: test_*.sh, test_*.bat
- 移动批处理文件到 scripts/
- 移动 Python 脚本到 tools/
- 清理临时日志文件

保留根目录必要文件:
- go.mod, go.sum, go.work
- Makefile, docker-compose.yml
- .env.example, .gitignore
- README.md, AGENTS.md, DEPLOY_GUIDE.md

验证: go build ./... && go test ./... 通过

											
										
										
											2026-04-07 18:10:36 +08:00
+								      # -----------------------------------------------------------------------
 								      # SLO-1: API 可用性 (目标: 99.9% / 30天错误预算: 43.8分钟)
 								      # -----------------------------------------------------------------------
 								      # 快速燃烧：5m + 1h 双窗口确认，燃烧率 14.4x
 								      # 含义：若持续，将在 2小时内 消耗本月 2% 错误预算
 								      - alert: APIAvailability_FastBurn
-												docs: project docs, scripts, deployment configs, and evidence

											
										
										
											2026-04-02 11:22:17 +08:00
+								        expr: |
 								          (
-												refactor: 整理项目根目录结构

整理内容:
- 删除 60+ 临时测试输出文件 (*.txt)
- 移动二进制文件到 bin/ 目录
- 移动 Shell 脚本到 scripts/ 目录
  - scripts/dev/: check_gitea.sh, check_sub2api.sh, run_tests.sh
  - scripts/deploy/: deploy_*.sh, simple_deploy.sh
  - scripts/ops/: fix_nginx.sh, fix_ssl.sh, install_docker.sh
  - scripts/test/: test_*.sh, test_*.bat
- 移动批处理文件到 scripts/
- 移动 Python 脚本到 tools/
- 清理临时日志文件

保留根目录必要文件:
- go.mod, go.sum, go.work
- Makefile, docker-compose.yml
- .env.example, .gitignore
- README.md, AGENTS.md, DEPLOY_GUIDE.md

验证: go build ./... && go test ./... 通过

											
										
										
											2026-04-07 18:10:36 +08:00
+								            sum(rate(http_requests_total{status=~"5.."}[5m]))
 								            /
-												docs: project docs, scripts, deployment configs, and evidence

											
										
										
											2026-04-02 11:22:17 +08:00
+								            sum(rate(http_requests_total[5m]))
-												refactor: 整理项目根目录结构

整理内容:
- 删除 60+ 临时测试输出文件 (*.txt)
- 移动二进制文件到 bin/ 目录
- 移动 Shell 脚本到 scripts/ 目录
  - scripts/dev/: check_gitea.sh, check_sub2api.sh, run_tests.sh
  - scripts/deploy/: deploy_*.sh, simple_deploy.sh
  - scripts/ops/: fix_nginx.sh, fix_ssl.sh, install_docker.sh
  - scripts/test/: test_*.sh, test_*.bat
- 移动批处理文件到 scripts/
- 移动 Python 脚本到 tools/
- 清理临时日志文件

保留根目录必要文件:
- go.mod, go.sum, go.work
- Makefile, docker-compose.yml
- .env.example, .gitignore
- README.md, AGENTS.md, DEPLOY_GUIDE.md

验证: go build ./... && go test ./... 通过

											
										
										
											2026-04-07 18:10:36 +08:00
+								          ) > (1 - 0.999) * 14.4
 								          AND
 								          (
 								            sum(rate(http_requests_total{status=~"5.."}[1h]))
 								            /
 								            sum(rate(http_requests_total[1h]))
 								          ) > (1 - 0.999) * 14.4
 								        for: 2m
 								        labels:
 								          severity: critical
 								          slo: api-availability
 								          page: "true"
 								          service: user-management
 								        annotations:
 								          summary: "🔴 [P0] API 可用性 SLO 快速燃烧 — 立即响应"
 								          description: |
 								            错误预算正在以 14.4x 速率消耗（正常速率的14倍）
 								            当前5分钟错误率: {{ $value | humanizePercentage }}
 								            若持续2小时，将消耗本月约 2% 错误预算（约50分钟）
 								            SLO 目标: 99.9% (月度允许宕机: 43.8分钟)
 								            运维手册: docs/sre/runbooks/api-availability.md
 								          dashboard_url: "http://grafana:3000/d/ums-slo"
 								      # 慢速燃烧：30m + 6h 双窗口确认，燃烧率 6x
 								      # 含义：若持续，将在 1天内 消耗本月 5% 错误预算
 								      - alert: APIAvailability_SlowBurn
 								        expr: |
 								          (
 								            sum(rate(http_requests_total{status=~"5.."}[30m]))
 								            /
 								            sum(rate(http_requests_total[30m]))
 								          ) > (1 - 0.999) * 6
 								          AND
 								          (
 								            sum(rate(http_requests_total{status=~"5.."}[6h]))
 								            /
 								            sum(rate(http_requests_total[6h]))
 								          ) > (1 - 0.999) * 6
 								        for: 15m
 								        labels:
 								          severity: warning
 								          slo: api-availability
 								          page: "false"
 								          service: user-management
 								        annotations:
 								          summary: "🟡 [P2] API 可用性 SLO 缓慢燃烧 — 需在工作时间内关注"
 								          description: |
 								            错误预算正在以 6x 速率缓慢消耗
 								            若持续1天，将消耗本月 5% 错误预算
 								            当前30分钟错误率: {{ $value | humanizePercentage }}
 								      # -----------------------------------------------------------------------
 								      # SLO-2: API 延迟 (目标: P99 < 500ms 覆盖 99% 请求)
 								      # -----------------------------------------------------------------------
 								      - alert: APILatency_FastBurn
 								        expr: |
 								          histogram_quantile(0.99,
 								            sum(rate(http_request_duration_seconds_bucket[5m])) by (le)
 								          ) > 0.5
 								          AND
 								          histogram_quantile(0.99,
 								            sum(rate(http_request_duration_seconds_bucket[1h])) by (le)
 								          ) > 0.5
-												docs: project docs, scripts, deployment configs, and evidence

											
										
										
											2026-04-02 11:22:17 +08:00
+								        for: 5m
 								        labels:
 								          severity: critical
-												refactor: 整理项目根目录结构

整理内容:
- 删除 60+ 临时测试输出文件 (*.txt)
- 移动二进制文件到 bin/ 目录
- 移动 Shell 脚本到 scripts/ 目录
  - scripts/dev/: check_gitea.sh, check_sub2api.sh, run_tests.sh
  - scripts/deploy/: deploy_*.sh, simple_deploy.sh
  - scripts/ops/: fix_nginx.sh, fix_ssl.sh, install_docker.sh
  - scripts/test/: test_*.sh, test_*.bat
- 移动批处理文件到 scripts/
- 移动 Python 脚本到 tools/
- 清理临时日志文件

保留根目录必要文件:
- go.mod, go.sum, go.work
- Makefile, docker-compose.yml
- .env.example, .gitignore
- README.md, AGENTS.md, DEPLOY_GUIDE.md

验证: go build ./... && go test ./... 通过

											
										
										
											2026-04-07 18:10:36 +08:00
+								          slo: api-latency
 								          page: "true"
-												docs: project docs, scripts, deployment configs, and evidence

											
										
										
											2026-04-02 11:22:17 +08:00
+								          service: user-management
 								        annotations:
-												refactor: 整理项目根目录结构

整理内容:
- 删除 60+ 临时测试输出文件 (*.txt)
- 移动二进制文件到 bin/ 目录
- 移动 Shell 脚本到 scripts/ 目录
  - scripts/dev/: check_gitea.sh, check_sub2api.sh, run_tests.sh
  - scripts/deploy/: deploy_*.sh, simple_deploy.sh
  - scripts/ops/: fix_nginx.sh, fix_ssl.sh, install_docker.sh
  - scripts/test/: test_*.sh, test_*.bat
- 移动批处理文件到 scripts/
- 移动 Python 脚本到 tools/
- 清理临时日志文件

保留根目录必要文件:
- go.mod, go.sum, go.work
- Makefile, docker-compose.yml
- .env.example, .gitignore
- README.md, AGENTS.md, DEPLOY_GUIDE.md

验证: go build ./... && go test ./... 通过

											
										
										
											2026-04-07 18:10:36 +08:00
+								          summary: "🔴 [P0] API 延迟 SLO 违规 — P99 超过 500ms"
 								          description: |
 								            当前 P99 延迟: {{ $value | humanizeDuration }}
 								            SLO 目标: P99 < 500ms
 								            请检查慢查询和数据库连接池
-												docs: project docs, scripts, deployment configs, and evidence

											
										
										
											2026-04-02 11:22:17 +08:00
-												refactor: 整理项目根目录结构

整理内容:
- 删除 60+ 临时测试输出文件 (*.txt)
- 移动二进制文件到 bin/ 目录
- 移动 Shell 脚本到 scripts/ 目录
  - scripts/dev/: check_gitea.sh, check_sub2api.sh, run_tests.sh
  - scripts/deploy/: deploy_*.sh, simple_deploy.sh
  - scripts/ops/: fix_nginx.sh, fix_ssl.sh, install_docker.sh
  - scripts/test/: test_*.sh, test_*.bat
- 移动批处理文件到 scripts/
- 移动 Python 脚本到 tools/
- 清理临时日志文件

保留根目录必要文件:
- go.mod, go.sum, go.work
- Makefile, docker-compose.yml
- .env.example, .gitignore
- README.md, AGENTS.md, DEPLOY_GUIDE.md

验证: go build ./... && go test ./... 通过

											
										
										
											2026-04-07 18:10:36 +08:00
+								      - alert: APILatency_CriticalPath
-												docs: project docs, scripts, deployment configs, and evidence

											
										
										
											2026-04-02 11:22:17 +08:00
+								        expr: |
-												refactor: 整理项目根目录结构

整理内容:
- 删除 60+ 临时测试输出文件 (*.txt)
- 移动二进制文件到 bin/ 目录
- 移动 Shell 脚本到 scripts/ 目录
  - scripts/dev/: check_gitea.sh, check_sub2api.sh, run_tests.sh
  - scripts/deploy/: deploy_*.sh, simple_deploy.sh
  - scripts/ops/: fix_nginx.sh, fix_ssl.sh, install_docker.sh
  - scripts/test/: test_*.sh, test_*.bat
- 移动批处理文件到 scripts/
- 移动 Python 脚本到 tools/
- 清理临时日志文件

保留根目录必要文件:
- go.mod, go.sum, go.work
- Makefile, docker-compose.yml
- .env.example, .gitignore
- README.md, AGENTS.md, DEPLOY_GUIDE.md

验证: go build ./... && go test ./... 通过

											
										
										
											2026-04-07 18:10:36 +08:00
+								          histogram_quantile(0.99,
 								            sum(rate(http_request_duration_seconds_bucket{
 								              path=~".*auth/login.*|.*auth/refresh.*"
 								            }[5m])) by (le, path)
 								          ) > 0.3
 								        for: 3m
 								        labels:
 								          severity: critical
 								          slo: api-latency-auth
 								          service: user-management
 								        annotations:
 								          summary: "🔴 [P0] 认证关键路径延迟超标"
 								          description: |
 								            路径 {{ $labels.path }} 的 P99 延迟: {{ $value | humanizeDuration }}
 								            认证路径 SLO: P99 < 300ms
 								      # -----------------------------------------------------------------------
 								      # SLO-3: 登录成功率 (目标: 99% 非攻击流量)
 								      # -----------------------------------------------------------------------
 								      - alert: LoginSuccessRate_Degraded
 								        expr: |
 								          (
 								            sum(rate(user_logins_total{status="success"}[10m]))
 								            /
 								            sum(rate(user_logins_total[10m]))
 								          ) < 0.9
-												docs: project docs, scripts, deployment configs, and evidence

											
										
										
											2026-04-02 11:22:17 +08:00
+								        for: 5m
 								        labels:
 								          severity: warning
-												refactor: 整理项目根目录结构

整理内容:
- 删除 60+ 临时测试输出文件 (*.txt)
- 移动二进制文件到 bin/ 目录
- 移动 Shell 脚本到 scripts/ 目录
  - scripts/dev/: check_gitea.sh, check_sub2api.sh, run_tests.sh
  - scripts/deploy/: deploy_*.sh, simple_deploy.sh
  - scripts/ops/: fix_nginx.sh, fix_ssl.sh, install_docker.sh
  - scripts/test/: test_*.sh, test_*.bat
- 移动批处理文件到 scripts/
- 移动 Python 脚本到 tools/
- 清理临时日志文件

保留根目录必要文件:
- go.mod, go.sum, go.work
- Makefile, docker-compose.yml
- .env.example, .gitignore
- README.md, AGENTS.md, DEPLOY_GUIDE.md

验证: go build ./... && go test ./... 通过

											
										
										
											2026-04-07 18:10:36 +08:00
+								          slo: login-success-rate
-												docs: project docs, scripts, deployment configs, and evidence

											
										
										
											2026-04-02 11:22:17 +08:00
+								          service: user-management
 								        annotations:
-												refactor: 整理项目根目录结构

整理内容:
- 删除 60+ 临时测试输出文件 (*.txt)
- 移动二进制文件到 bin/ 目录
- 移动 Shell 脚本到 scripts/ 目录
  - scripts/dev/: check_gitea.sh, check_sub2api.sh, run_tests.sh
  - scripts/deploy/: deploy_*.sh, simple_deploy.sh
  - scripts/ops/: fix_nginx.sh, fix_ssl.sh, install_docker.sh
  - scripts/test/: test_*.sh, test_*.bat
- 移动批处理文件到 scripts/
- 移动 Python 脚本到 tools/
- 清理临时日志文件

保留根目录必要文件:
- go.mod, go.sum, go.work
- Makefile, docker-compose.yml
- .env.example, .gitignore
- README.md, AGENTS.md, DEPLOY_GUIDE.md

验证: go build ./... && go test ./... 通过

											
										
										
											2026-04-07 18:10:36 +08:00
+								          summary: "🟡 [P2] 登录成功率下降"
 								          description: |
 								            当前10分钟登录成功率: {{ $value | humanizePercentage }}
 								            SLO 目标: 99%
 								            注意：高失败率可能是暴力破解也可能是系统问题，请结合安全事件判断
-												docs: project docs, scripts, deployment configs, and evidence

											
										
										
											2026-04-02 11:22:17 +08:00
-												refactor: 整理项目根目录结构

整理内容:
- 删除 60+ 临时测试输出文件 (*.txt)
- 移动二进制文件到 bin/ 目录
- 移动 Shell 脚本到 scripts/ 目录
  - scripts/dev/: check_gitea.sh, check_sub2api.sh, run_tests.sh
  - scripts/deploy/: deploy_*.sh, simple_deploy.sh
  - scripts/ops/: fix_nginx.sh, fix_ssl.sh, install_docker.sh
  - scripts/test/: test_*.sh, test_*.bat
- 移动批处理文件到 scripts/
- 移动 Python 脚本到 tools/
- 清理临时日志文件

保留根目录必要文件:
- go.mod, go.sum, go.work
- Makefile, docker-compose.yml
- .env.example, .gitignore
- README.md, AGENTS.md, DEPLOY_GUIDE.md

验证: go build ./... && go test ./... 通过

											
										
										
											2026-04-07 18:10:36 +08:00
+								  # =========================================================================
 								  # 基础设施告警（阈值型，高置信度）
 								  # =========================================================================
 								  - name: ums-infrastructure
 								    interval: 30s
 								    rules:
 								      # 服务宕机（最高优先级）
 								      - alert: ServiceDown
 								        expr: up{job="user-management"} == 0
 								        for: 1m
 								        labels:
 								          severity: critical
 								          page: "true"
 								          service: user-management
 								        annotations:
 								          summary: "🚨 [P0] 用户管理服务实例宕机"
 								          description: "实例 {{ $labels.instance }} 已离线超过 1 分钟，健康检查失败"
 								      # 数据库不可用（通过高 503 率推断）
 								      - alert: DatabaseConnectionFailed
-												docs: project docs, scripts, deployment configs, and evidence

											
										
										
											2026-04-02 11:22:17 +08:00
+								        expr: |
-												refactor: 整理项目根目录结构

整理内容:
- 删除 60+ 临时测试输出文件 (*.txt)
- 移动二进制文件到 bin/ 目录
- 移动 Shell 脚本到 scripts/ 目录
  - scripts/dev/: check_gitea.sh, check_sub2api.sh, run_tests.sh
  - scripts/deploy/: deploy_*.sh, simple_deploy.sh
  - scripts/ops/: fix_nginx.sh, fix_ssl.sh, install_docker.sh
  - scripts/test/: test_*.sh, test_*.bat
- 移动批处理文件到 scripts/
- 移动 Python 脚本到 tools/
- 清理临时日志文件

保留根目录必要文件:
- go.mod, go.sum, go.work
- Makefile, docker-compose.yml
- .env.example, .gitignore
- README.md, AGENTS.md, DEPLOY_GUIDE.md

验证: go build ./... && go test ./... 通过

											
										
										
											2026-04-07 18:10:36 +08:00
+								          sum(rate(http_requests_total{status="503"}[2m])) > 1
 								        for: 1m
 								        labels:
 								          severity: critical
 								          page: "true"
 								          service: user-management
 								        annotations:
 								          summary: "🚨 [P0] 数据库连接失败，服务不可用"
 								          description: |
 								            大量 503 响应，可能是数据库连接池耗尽或数据库宕机
 								            运维手册: docs/sre/runbooks/database-down.md
 								      # 数据库连接池使用率
 								      - alert: DatabaseConnectionPoolHigh
 								        expr: |
 								          (db_connections_active / db_connections_max) > 0.8
 								        for: 3m
-												docs: project docs, scripts, deployment configs, and evidence

											
										
										
											2026-04-02 11:22:17 +08:00
+								        labels:
 								          severity: warning
 								          service: user-management
 								        annotations:
-												refactor: 整理项目根目录结构

整理内容:
- 删除 60+ 临时测试输出文件 (*.txt)
- 移动二进制文件到 bin/ 目录
- 移动 Shell 脚本到 scripts/ 目录
  - scripts/dev/: check_gitea.sh, check_sub2api.sh, run_tests.sh
  - scripts/deploy/: deploy_*.sh, simple_deploy.sh
  - scripts/ops/: fix_nginx.sh, fix_ssl.sh, install_docker.sh
  - scripts/test/: test_*.sh, test_*.bat
- 移动批处理文件到 scripts/
- 移动 Python 脚本到 tools/
- 清理临时日志文件

保留根目录必要文件:
- go.mod, go.sum, go.work
- Makefile, docker-compose.yml
- .env.example, .gitignore
- README.md, AGENTS.md, DEPLOY_GUIDE.md

验证: go build ./... && go test ./... 通过

											
										
										
											2026-04-07 18:10:36 +08:00
+								          summary: "🟡 数据库连接池使用率超过 80%"
 								          description: |
 								            活跃连接: {{ $value | humanizePercentage }} 使用率
 								            若持续增长，可能导致连接拒绝
 								            建议：检查慢查询，或增加连接池大小
-												docs: project docs, scripts, deployment configs, and evidence

											
										
										
											2026-04-02 11:22:17 +08:00
-												refactor: 整理项目根目录结构

整理内容:
- 删除 60+ 临时测试输出文件 (*.txt)
- 移动二进制文件到 bin/ 目录
- 移动 Shell 脚本到 scripts/ 目录
  - scripts/dev/: check_gitea.sh, check_sub2api.sh, run_tests.sh
  - scripts/deploy/: deploy_*.sh, simple_deploy.sh
  - scripts/ops/: fix_nginx.sh, fix_ssl.sh, install_docker.sh
  - scripts/test/: test_*.sh, test_*.bat
- 移动批处理文件到 scripts/
- 移动 Python 脚本到 tools/
- 清理临时日志文件

保留根目录必要文件:
- go.mod, go.sum, go.work
- Makefile, docker-compose.yml
- .env.example, .gitignore
- README.md, AGENTS.md, DEPLOY_GUIDE.md

验证: go build ./... && go test ./... 通过

											
										
										
											2026-04-07 18:10:36 +08:00
+								      # 高内存使用
 								      - alert: HighMemoryUsage
 								        expr: |
 								          system_memory_usage_bytes > 800000000  # 800MB
-												docs: project docs, scripts, deployment configs, and evidence

											
										
										
											2026-04-02 11:22:17 +08:00
+								        for: 5m
 								        labels:
 								          severity: warning
 								          service: user-management
 								        annotations:
-												refactor: 整理项目根目录结构

整理内容:
- 删除 60+ 临时测试输出文件 (*.txt)
- 移动二进制文件到 bin/ 目录
- 移动 Shell 脚本到 scripts/ 目录
  - scripts/dev/: check_gitea.sh, check_sub2api.sh, run_tests.sh
  - scripts/deploy/: deploy_*.sh, simple_deploy.sh
  - scripts/ops/: fix_nginx.sh, fix_ssl.sh, install_docker.sh
  - scripts/test/: test_*.sh, test_*.bat
- 移动批处理文件到 scripts/
- 移动 Python 脚本到 tools/
- 清理临时日志文件

保留根目录必要文件:
- go.mod, go.sum, go.work
- Makefile, docker-compose.yml
- .env.example, .gitignore
- README.md, AGENTS.md, DEPLOY_GUIDE.md

验证: go build ./... && go test ./... 通过

											
										
										
											2026-04-07 18:10:36 +08:00
+								          summary: "🟡 内存使用超过 800MB"
 								          description: "当前内存使用: {{ $value | humanize1024 }}B，请检查内存泄漏"
-												docs: project docs, scripts, deployment configs, and evidence

											
										
										
											2026-04-02 11:22:17 +08:00
-												refactor: 整理项目根目录结构

整理内容:
- 删除 60+ 临时测试输出文件 (*.txt)
- 移动二进制文件到 bin/ 目录
- 移动 Shell 脚本到 scripts/ 目录
  - scripts/dev/: check_gitea.sh, check_sub2api.sh, run_tests.sh
  - scripts/deploy/: deploy_*.sh, simple_deploy.sh
  - scripts/ops/: fix_nginx.sh, fix_ssl.sh, install_docker.sh
  - scripts/test/: test_*.sh, test_*.bat
- 移动批处理文件到 scripts/
- 移动 Python 脚本到 tools/
- 清理临时日志文件

保留根目录必要文件:
- go.mod, go.sum, go.work
- Makefile, docker-compose.yml
- .env.example, .gitignore
- README.md, AGENTS.md, DEPLOY_GUIDE.md

验证: go build ./... && go test ./... 通过

											
										
										
											2026-04-07 18:10:36 +08:00
+								      # Goroutine 数量异常
 								      - alert: GoroutineLeakSuspected
 								        expr: system_goroutines > 1000
 								        for: 10m
 								        labels:
 								          severity: warning
 								          service: user-management
 								        annotations:
 								          summary: "🟡 Goroutine 数量异常，疑似泄漏"
 								          description: "当前 goroutine 数量: {{ $value }}，超过 1000"
 								      # 高响应时间（保留，作为绝对阈值兜底）
 								      - alert: HighResponseTime_Absolute
-												docs: project docs, scripts, deployment configs, and evidence

											
										
										
											2026-04-02 11:22:17 +08:00
+								        expr: |
-												refactor: 整理项目根目录结构

整理内容:
- 删除 60+ 临时测试输出文件 (*.txt)
- 移动二进制文件到 bin/ 目录
- 移动 Shell 脚本到 scripts/ 目录
  - scripts/dev/: check_gitea.sh, check_sub2api.sh, run_tests.sh
  - scripts/deploy/: deploy_*.sh, simple_deploy.sh
  - scripts/ops/: fix_nginx.sh, fix_ssl.sh, install_docker.sh
  - scripts/test/: test_*.sh, test_*.bat
- 移动批处理文件到 scripts/
- 移动 Python 脚本到 tools/
- 清理临时日志文件

保留根目录必要文件:
- go.mod, go.sum, go.work
- Makefile, docker-compose.yml
- .env.example, .gitignore
- README.md, AGENTS.md, DEPLOY_GUIDE.md

验证: go build ./... && go test ./... 通过

											
										
										
											2026-04-07 18:10:36 +08:00
+								          histogram_quantile(0.95,
 								            sum(rate(http_request_duration_seconds_bucket[5m])) by (le, path)
 								          ) > 2
-												docs: project docs, scripts, deployment configs, and evidence

											
										
										
											2026-04-02 11:22:17 +08:00
+								        for: 5m
 								        labels:
-												refactor: 整理项目根目录结构

整理内容:
- 删除 60+ 临时测试输出文件 (*.txt)
- 移动二进制文件到 bin/ 目录
- 移动 Shell 脚本到 scripts/ 目录
  - scripts/dev/: check_gitea.sh, check_sub2api.sh, run_tests.sh
  - scripts/deploy/: deploy_*.sh, simple_deploy.sh
  - scripts/ops/: fix_nginx.sh, fix_ssl.sh, install_docker.sh
  - scripts/test/: test_*.sh, test_*.bat
- 移动批处理文件到 scripts/
- 移动 Python 脚本到 tools/
- 清理临时日志文件

保留根目录必要文件:
- go.mod, go.sum, go.work
- Makefile, docker-compose.yml
- .env.example, .gitignore
- README.md, AGENTS.md, DEPLOY_GUIDE.md

验证: go build ./... && go test ./... 通过

											
										
										
											2026-04-07 18:10:36 +08:00
+								          severity: warning
-												docs: project docs, scripts, deployment configs, and evidence

											
										
										
											2026-04-02 11:22:17 +08:00
+								          service: user-management
 								        annotations:
-												refactor: 整理项目根目录结构

整理内容:
- 删除 60+ 临时测试输出文件 (*.txt)
- 移动二进制文件到 bin/ 目录
- 移动 Shell 脚本到 scripts/ 目录
  - scripts/dev/: check_gitea.sh, check_sub2api.sh, run_tests.sh
  - scripts/deploy/: deploy_*.sh, simple_deploy.sh
  - scripts/ops/: fix_nginx.sh, fix_ssl.sh, install_docker.sh
  - scripts/test/: test_*.sh, test_*.bat
- 移动批处理文件到 scripts/
- 移动 Python 脚本到 tools/
- 清理临时日志文件

保留根目录必要文件:
- go.mod, go.sum, go.work
- Makefile, docker-compose.yml
- .env.example, .gitignore
- README.md, AGENTS.md, DEPLOY_GUIDE.md

验证: go build ./... && go test ./... 通过

											
										
										
											2026-04-07 18:10:36 +08:00
+								          summary: "🟡 API P95 响应时间超过 2 秒"
 								          description: "路径 {{ $labels.path }} 响应时间 P95: {{ $value }}s，超过绝对阈值 2s"
-												docs: project docs, scripts, deployment configs, and evidence

											
										
										
											2026-04-02 11:22:17 +08:00
-												refactor: 整理项目根目录结构

整理内容:
- 删除 60+ 临时测试输出文件 (*.txt)
- 移动二进制文件到 bin/ 目录
- 移动 Shell 脚本到 scripts/ 目录
  - scripts/dev/: check_gitea.sh, check_sub2api.sh, run_tests.sh
  - scripts/deploy/: deploy_*.sh, simple_deploy.sh
  - scripts/ops/: fix_nginx.sh, fix_ssl.sh, install_docker.sh
  - scripts/test/: test_*.sh, test_*.bat
- 移动批处理文件到 scripts/
- 移动 Python 脚本到 tools/
- 清理临时日志文件

保留根目录必要文件:
- go.mod, go.sum, go.work
- Makefile, docker-compose.yml
- .env.example, .gitignore
- README.md, AGENTS.md, DEPLOY_GUIDE.md

验证: go build ./... && go test ./... 通过

											
										
										
											2026-04-07 18:10:36 +08:00
+								  # =========================================================================
 								  # 安全事件告警
 								  # =========================================================================
 								  - name: ums-security
 								    interval: 30s
 								    rules:
 								      # 暴力破解检测
 								      - alert: BruteForceAttackDetected
-												docs: project docs, scripts, deployment configs, and evidence

											
										
										
											2026-04-02 11:22:17 +08:00
+								        expr: |
 								          (
-												refactor: 整理项目根目录结构

整理内容:
- 删除 60+ 临时测试输出文件 (*.txt)
- 移动二进制文件到 bin/ 目录
- 移动 Shell 脚本到 scripts/ 目录
  - scripts/dev/: check_gitea.sh, check_sub2api.sh, run_tests.sh
  - scripts/deploy/: deploy_*.sh, simple_deploy.sh
  - scripts/ops/: fix_nginx.sh, fix_ssl.sh, install_docker.sh
  - scripts/test/: test_*.sh, test_*.bat
- 移动批处理文件到 scripts/
- 移动 Python 脚本到 tools/
- 清理临时日志文件

保留根目录必要文件:
- go.mod, go.sum, go.work
- Makefile, docker-compose.yml
- .env.example, .gitignore
- README.md, AGENTS.md, DEPLOY_GUIDE.md

验证: go build ./... && go test ./... 通过

											
										
										
											2026-04-07 18:10:36 +08:00
+								            sum(rate(user_logins_total{status="failed"}[5m]))
 								            /
 								            sum(rate(user_logins_total[5m]))
 								          ) > 0.5
 								          AND
 								          sum(rate(user_logins_total[5m])) > 1
-												docs: project docs, scripts, deployment configs, and evidence

											
										
										
											2026-04-02 11:22:17 +08:00
+								        for: 3m
 								        labels:
 								          severity: critical
-												refactor: 整理项目根目录结构

整理内容:
- 删除 60+ 临时测试输出文件 (*.txt)
- 移动二进制文件到 bin/ 目录
- 移动 Shell 脚本到 scripts/ 目录
  - scripts/dev/: check_gitea.sh, check_sub2api.sh, run_tests.sh
  - scripts/deploy/: deploy_*.sh, simple_deploy.sh
  - scripts/ops/: fix_nginx.sh, fix_ssl.sh, install_docker.sh
  - scripts/test/: test_*.sh, test_*.bat
- 移动批处理文件到 scripts/
- 移动 Python 脚本到 tools/
- 清理临时日志文件

保留根目录必要文件:
- go.mod, go.sum, go.work
- Makefile, docker-compose.yml
- .env.example, .gitignore
- README.md, AGENTS.md, DEPLOY_GUIDE.md

验证: go build ./... && go test ./... 通过

											
										
										
											2026-04-07 18:10:36 +08:00
+								          category: security
 								          page: "true"
-												docs: project docs, scripts, deployment configs, and evidence

											
										
										
											2026-04-02 11:22:17 +08:00
+								          service: user-management
 								        annotations:
-												refactor: 整理项目根目录结构

整理内容:
- 删除 60+ 临时测试输出文件 (*.txt)
- 移动二进制文件到 bin/ 目录
- 移动 Shell 脚本到 scripts/ 目录
  - scripts/dev/: check_gitea.sh, check_sub2api.sh, run_tests.sh
  - scripts/deploy/: deploy_*.sh, simple_deploy.sh
  - scripts/ops/: fix_nginx.sh, fix_ssl.sh, install_docker.sh
  - scripts/test/: test_*.sh, test_*.bat
- 移动批处理文件到 scripts/
- 移动 Python 脚本到 tools/
- 清理临时日志文件

保留根目录必要文件:
- go.mod, go.sum, go.work
- Makefile, docker-compose.yml
- .env.example, .gitignore
- README.md, AGENTS.md, DEPLOY_GUIDE.md

验证: go build ./... && go test ./... 通过

											
										
										
											2026-04-07 18:10:36 +08:00
+								          summary: "🔐 [P0-SEC] 疑似暴力破解攻击"
 								          description: |
 								            登录失败率: {{ $value | humanizePercentage }}，超过 50%
 								            请立即检查来源 IP 并确认封禁是否生效
 								            运维手册: docs/sre/runbooks/brute-force.md
-												docs: project docs, scripts, deployment configs, and evidence

											
										
										
											2026-04-02 11:22:17 +08:00
-												refactor: 整理项目根目录结构

整理内容:
- 删除 60+ 临时测试输出文件 (*.txt)
- 移动二进制文件到 bin/ 目录
- 移动 Shell 脚本到 scripts/ 目录
  - scripts/dev/: check_gitea.sh, check_sub2api.sh, run_tests.sh
  - scripts/deploy/: deploy_*.sh, simple_deploy.sh
  - scripts/ops/: fix_nginx.sh, fix_ssl.sh, install_docker.sh
  - scripts/test/: test_*.sh, test_*.bat
- 移动批处理文件到 scripts/
- 移动 Python 脚本到 tools/
- 清理临时日志文件

保留根目录必要文件:
- go.mod, go.sum, go.work
- Makefile, docker-compose.yml
- .env.example, .gitignore
- README.md, AGENTS.md, DEPLOY_GUIDE.md

验证: go build ./... && go test ./... 通过

											
										
										
											2026-04-07 18:10:36 +08:00
+								      # 异常检测激增
 								      - alert: AnomalyDetectionSpike
 								        expr: |
 								          sum(rate(anomaly_detected_total[5m])) > 5
 								        for: 2m
-												docs: project docs, scripts, deployment configs, and evidence

											
										
										
											2026-04-02 11:22:17 +08:00
+								        labels:
-												refactor: 整理项目根目录结构

整理内容:
- 删除 60+ 临时测试输出文件 (*.txt)
- 移动二进制文件到 bin/ 目录
- 移动 Shell 脚本到 scripts/ 目录
  - scripts/dev/: check_gitea.sh, check_sub2api.sh, run_tests.sh
  - scripts/deploy/: deploy_*.sh, simple_deploy.sh
  - scripts/ops/: fix_nginx.sh, fix_ssl.sh, install_docker.sh
  - scripts/test/: test_*.sh, test_*.bat
- 移动批处理文件到 scripts/
- 移动 Python 脚本到 tools/
- 清理临时日志文件

保留根目录必要文件:
- go.mod, go.sum, go.work
- Makefile, docker-compose.yml
- .env.example, .gitignore
- README.md, AGENTS.md, DEPLOY_GUIDE.md

验证: go build ./... && go test ./... 通过

											
										
										
											2026-04-07 18:10:36 +08:00
+								          severity: warning
 								          category: security
-												docs: project docs, scripts, deployment configs, and evidence

											
										
										
											2026-04-02 11:22:17 +08:00
+								          service: user-management
 								        annotations:
-												refactor: 整理项目根目录结构

整理内容:
- 删除 60+ 临时测试输出文件 (*.txt)
- 移动二进制文件到 bin/ 目录
- 移动 Shell 脚本到 scripts/ 目录
  - scripts/dev/: check_gitea.sh, check_sub2api.sh, run_tests.sh
  - scripts/deploy/: deploy_*.sh, simple_deploy.sh
  - scripts/ops/: fix_nginx.sh, fix_ssl.sh, install_docker.sh
  - scripts/test/: test_*.sh, test_*.bat
- 移动批处理文件到 scripts/
- 移动 Python 脚本到 tools/
- 清理临时日志文件

保留根目录必要文件:
- go.mod, go.sum, go.work
- Makefile, docker-compose.yml
- .env.example, .gitignore
- README.md, AGENTS.md, DEPLOY_GUIDE.md

验证: go build ./... && go test ./... 通过

											
										
										
											2026-04-07 18:10:36 +08:00
+								          summary: "🔐 [P2-SEC] 异常登录检测激增"
 								          description: |
 								            每秒检测到 {{ $value | humanize }} 个异常事件
 								            可能存在地理位置异常、未知设备或账号泄露
-												docs: project docs, scripts, deployment configs, and evidence

											
										
										
											2026-04-02 11:22:17 +08:00
-												refactor: 整理项目根目录结构

整理内容:
- 删除 60+ 临时测试输出文件 (*.txt)
- 移动二进制文件到 bin/ 目录
- 移动 Shell 脚本到 scripts/ 目录
  - scripts/dev/: check_gitea.sh, check_sub2api.sh, run_tests.sh
  - scripts/deploy/: deploy_*.sh, simple_deploy.sh
  - scripts/ops/: fix_nginx.sh, fix_ssl.sh, install_docker.sh
  - scripts/test/: test_*.sh, test_*.bat
- 移动批处理文件到 scripts/
- 移动 Python 脚本到 tools/
- 清理临时日志文件

保留根目录必要文件:
- go.mod, go.sum, go.work
- Makefile, docker-compose.yml
- .env.example, .gitignore
- README.md, AGENTS.md, DEPLOY_GUIDE.md

验证: go build ./... && go test ./... 通过

											
										
										
											2026-04-07 18:10:36 +08:00
+								      # Token 刷新失败激增
 								      - alert: TokenRefreshFailureSpike
-												docs: project docs, scripts, deployment configs, and evidence

											
										
										
											2026-04-02 11:22:17 +08:00
+								        expr: |
-												refactor: 整理项目根目录结构

整理内容:
- 删除 60+ 临时测试输出文件 (*.txt)
- 移动二进制文件到 bin/ 目录
- 移动 Shell 脚本到 scripts/ 目录
  - scripts/dev/: check_gitea.sh, check_sub2api.sh, run_tests.sh
  - scripts/deploy/: deploy_*.sh, simple_deploy.sh
  - scripts/ops/: fix_nginx.sh, fix_ssl.sh, install_docker.sh
  - scripts/test/: test_*.sh, test_*.bat
- 移动批处理文件到 scripts/
- 移动 Python 脚本到 tools/
- 清理临时日志文件

保留根目录必要文件:
- go.mod, go.sum, go.work
- Makefile, docker-compose.yml
- .env.example, .gitignore
- README.md, AGENTS.md, DEPLOY_GUIDE.md

验证: go build ./... && go test ./... 通过

											
										
										
											2026-04-07 18:10:36 +08:00
+								          sum(rate(token_refresh_total{status="failure"}[5m])) > 10
 								        for: 2m
 								        labels:
 								          severity: warning
 								          category: auth
 								          service: user-management
 								        annotations:
 								          summary: "🟡 Token 刷新失败激增"
 								          # The rule's expr is built on rate(), which is a per-second rate, so $value is
 								          # failures per second (the text previously claimed "per minute").
 								          description: |
 								            每秒 Token 刷新失败: {{ $value | humanize }}
 								            可能原因：JWT Secret 轮换、时钟偏差、Redis 不可用
 								      # 账号锁定激增
 								      - alert: AccountLockoutSpike
 								        expr: |
 								          rate(account_lock_total[10m]) > 0.5
-												docs: project docs, scripts, deployment configs, and evidence

											
										
										
											2026-04-02 11:22:17 +08:00
+								        for: 5m
 								        labels:
 								          severity: warning
-												refactor: 整理项目根目录结构

整理内容:
- 删除 60+ 临时测试输出文件 (*.txt)
- 移动二进制文件到 bin/ 目录
- 移动 Shell 脚本到 scripts/ 目录
  - scripts/dev/: check_gitea.sh, check_sub2api.sh, run_tests.sh
  - scripts/deploy/: deploy_*.sh, simple_deploy.sh
  - scripts/ops/: fix_nginx.sh, fix_ssl.sh, install_docker.sh
  - scripts/test/: test_*.sh, test_*.bat
- 移动批处理文件到 scripts/
- 移动 Python 脚本到 tools/
- 清理临时日志文件

保留根目录必要文件:
- go.mod, go.sum, go.work
- Makefile, docker-compose.yml
- .env.example, .gitignore
- README.md, AGENTS.md, DEPLOY_GUIDE.md

验证: go build ./... && go test ./... 通过

											
										
										
											2026-04-07 18:10:36 +08:00
+								          category: security
-												docs: project docs, scripts, deployment configs, and evidence

											
										
										
											2026-04-02 11:22:17 +08:00
+								          service: user-management
 								        annotations:
-												refactor: 整理项目根目录结构

整理内容:
- 删除 60+ 临时测试输出文件 (*.txt)
- 移动二进制文件到 bin/ 目录
- 移动 Shell 脚本到 scripts/ 目录
  - scripts/dev/: check_gitea.sh, check_sub2api.sh, run_tests.sh
  - scripts/deploy/: deploy_*.sh, simple_deploy.sh
  - scripts/ops/: fix_nginx.sh, fix_ssl.sh, install_docker.sh
  - scripts/test/: test_*.sh, test_*.bat
- 移动批处理文件到 scripts/
- 移动 Python 脚本到 tools/
- 清理临时日志文件

保留根目录必要文件:
- go.mod, go.sum, go.work
- Makefile, docker-compose.yml
- .env.example, .gitignore
- README.md, AGENTS.md, DEPLOY_GUIDE.md

验证: go build ./... && go test ./... 通过

											
										
										
											2026-04-07 18:10:36 +08:00
+								          summary: "🔐 账号锁定事件激增"
 								          # The rule's expr is rate(account_lock_total[10m]), a per-second rate, so $value is
 								          # lockouts per second (the text previously claimed "per minute").
 								          description: "每秒账号锁定: {{ $value | humanize }}，可能存在针对性攻击"
-												docs: project docs, scripts, deployment configs, and evidence

											
										
										
											2026-04-02 11:22:17 +08:00
-												refactor: 整理项目根目录结构

整理内容:
- 删除 60+ 临时测试输出文件 (*.txt)
- 移动二进制文件到 bin/ 目录
- 移动 Shell 脚本到 scripts/ 目录
  - scripts/dev/: check_gitea.sh, check_sub2api.sh, run_tests.sh
  - scripts/deploy/: deploy_*.sh, simple_deploy.sh
  - scripts/ops/: fix_nginx.sh, fix_ssl.sh, install_docker.sh
  - scripts/test/: test_*.sh, test_*.bat
- 移动批处理文件到 scripts/
- 移动 Python 脚本到 tools/
- 清理临时日志文件

保留根目录必要文件:
- go.mod, go.sum, go.work
- Makefile, docker-compose.yml
- .env.example, .gitignore
- README.md, AGENTS.md, DEPLOY_GUIDE.md

验证: go build ./... && go test ./... 通过

											
										
										
											2026-04-07 18:10:36 +08:00
+								  # =========================================================================
 								  # 缓存健康告警
 								  # =========================================================================
 								  - name: ums-cache
 								    interval: 60s
 								    rules:
 								      # 缓存命中率低
 								      - alert: LowCacheHitRate
-												docs: project docs, scripts, deployment configs, and evidence

											
										
										
											2026-04-02 11:22:17 +08:00
+								        expr: |
-												refactor: 整理项目根目录结构

整理内容:
- 删除 60+ 临时测试输出文件 (*.txt)
- 移动二进制文件到 bin/ 目录
- 移动 Shell 脚本到 scripts/ 目录
  - scripts/dev/: check_gitea.sh, check_sub2api.sh, run_tests.sh
  - scripts/deploy/: deploy_*.sh, simple_deploy.sh
  - scripts/ops/: fix_nginx.sh, fix_ssl.sh, install_docker.sh
  - scripts/test/: test_*.sh, test_*.bat
- 移动批处理文件到 scripts/
- 移动 Python 脚本到 tools/
- 清理临时日志文件

保留根目录必要文件:
- go.mod, go.sum, go.work
- Makefile, docker-compose.yml
- .env.example, .gitignore
- README.md, AGENTS.md, DEPLOY_GUIDE.md

验证: go build ./... && go test ./... 通过

											
										
										
											2026-04-07 18:10:36 +08:00
+								          (
 								            sum(rate(cache_hits_total[10m]))
 								            /
 								            sum(rate(cache_operations_total[10m]))
 								          ) < 0.6
 								          AND
 								          sum(rate(cache_operations_total[10m])) > 1
 								        for: 15m
 								        labels:
 								          severity: warning
 								          service: user-management
 								        annotations:
 								          summary: "🟡 缓存命中率低于 60%"
 								          description: |
 								            当前命中率: {{ $value | humanizePercentage }}
 								            可能导致数据库压力增大
 								            请检查缓存 TTL 配置和热点 Key 分布
 								  # =========================================================================
 								  # 业务异常告警（信息类）
 								  # =========================================================================
 								  - name: ums-business
 								    interval: 60s
 								    rules:
 								      # API request volume anomaly (uses relative deviation rather than an absolute value).
 								      # Divides the current 5m request rate by its average over the past hour
 								      # (subquery [1h:5m]); matches when the ratio is above 3 or below 0.1.
 								      - alert: APIRequestVolumeAnomaly
 								        expr: |
 								          (
 								            sum(rate(http_requests_total[5m]))
 								            /
 								            avg_over_time(sum(rate(http_requests_total[5m]))[1h:5m])
 								          ) > 3
 								          OR
 								          (
 								            sum(rate(http_requests_total[5m]))
 								            /
 								            avg_over_time(sum(rate(http_requests_total[5m]))[1h:5m])
 								          ) < 0.1
-												docs: project docs, scripts, deployment configs, and evidence

											
										
										
											2026-04-02 11:22:17 +08:00
+								        for: 5m
 								        labels:
 								          severity: info
 								          service: user-management
 								        annotations:
-												refactor: 整理项目根目录结构

整理内容:
- 删除 60+ 临时测试输出文件 (*.txt)
- 移动二进制文件到 bin/ 目录
- 移动 Shell 脚本到 scripts/ 目录
  - scripts/dev/: check_gitea.sh, check_sub2api.sh, run_tests.sh
  - scripts/deploy/: deploy_*.sh, simple_deploy.sh
  - scripts/ops/: fix_nginx.sh, fix_ssl.sh, install_docker.sh
  - scripts/test/: test_*.sh, test_*.bat
- 移动批处理文件到 scripts/
- 移动 Python 脚本到 tools/
- 清理临时日志文件

保留根目录必要文件:
- go.mod, go.sum, go.work
- Makefile, docker-compose.yml
- .env.example, .gitignore
- README.md, AGENTS.md, DEPLOY_GUIDE.md

验证: go build ./... && go test ./... 通过

											
										
										
											2026-04-07 18:10:36 +08:00
+								          summary: "📊 API 请求量异常偏离基线"
 								          description: |
 								            当前请求量是过去1小时均值的 {{ $value | humanize }} 倍
 								            可能是流量突增（>3x）或流量断崖（<0.1x）