refactor: 整理项目根目录结构

整理内容:
- 删除 60+ 临时测试输出文件 (*.txt)
- 移动二进制文件到 bin/ 目录
- 移动 Shell 脚本到 scripts/ 目录
  - scripts/dev/: check_gitea.sh, check_sub2api.sh, run_tests.sh
  - scripts/deploy/: deploy_*.sh, simple_deploy.sh
  - scripts/ops/: fix_nginx.sh, fix_ssl.sh, install_docker.sh
  - scripts/test/: test_*.sh, test_*.bat
- 移动批处理文件到 scripts/
- 移动 Python 脚本到 tools/
- 清理临时日志文件

保留根目录必要文件:
- go.mod, go.sum, go.work
- Makefile, docker-compose.yml
- .env.example, .gitignore
- README.md, AGENTS.md, DEPLOY_GUIDE.md

验证: go build ./... && go test ./... 通过
This commit is contained in:
2026-04-07 18:10:36 +08:00
parent 5dbb530b76
commit 5b6bd93179
152 changed files with 8775 additions and 4084 deletions

View File

@@ -1,36 +1,51 @@
global:
resolve_timeout: 5m
# 飞书 Webhook 全局超时
http_config:
follow_redirects: true
# 注意:
# 该文件为模板文件,生产环境必须先注入并渲染 `${ALERTMANAGER_*}` 变量,
# 再将渲染结果交给 Alertmanager 使用。
# 飞书 Webhook 地址从环境变量 ${FEISHU_WEBHOOK_URL} 注入
# PagerDuty integration key 从 ${PAGERDUTY_INTEGRATION_KEY} 注入
# 告警路由
route:
group_by: ['alertname', 'service']
group_by: ['alertname', 'service', 'severity']
group_wait: 30s
group_interval: 5m
repeat_interval: 12h
repeat_interval: 4h # 降低重复告警频率原12h过长改4h
receiver: 'default'
# 子路由,根据严重级别分发
routes:
# Critical 告警
# P0: Critical — 立即通知,同时走飞书 + 邮件On-Call 链路)
- match:
severity: critical
receiver: 'critical-alerts'
receiver: 'critical-oncall'
group_wait: 10s
continue: true
repeat_interval: 30m # Critical 30min 没恢复重新告警
continue: false # Critical 不继续向下路由
# Warning 告警
# P1: Warning — 走飞书频道,不发邮件
- match:
severity: warning
receiver: 'warning-alerts'
continue: true
receiver: 'warning-feishu'
group_wait: 1m
repeat_interval: 2h
continue: false
# P2: Info — 仅飞书记录
- match:
severity: info
receiver: 'info-feishu'
group_wait: 5m
repeat_interval: 24h
continue: false
# 告警接收者
receivers:
# 默认接收者
# 默认接收者(邮件兜底)
- name: 'default'
email_configs:
- to: '${ALERTMANAGER_DEFAULT_TO}'
@@ -38,47 +53,82 @@ receivers:
smarthost: '${ALERTMANAGER_SMARTHOST}'
auth_username: '${ALERTMANAGER_AUTH_USERNAME}'
auth_password: '${ALERTMANAGER_AUTH_PASSWORD}'
send_resolved: true
headers:
Subject: '[{{ .Status | toUpper }}] {{ .GroupLabels.alertname }}'
Subject: '[{{ .Status | toUpper }}][UMS] {{ .GroupLabels.alertname }}'
html: |
{{ range .Alerts }}
<b>告警名称:</b> {{ .Labels.alertname }}<br>
<b>严重级别:</b> {{ .Labels.severity }}<br>
<b>摘要:</b> {{ .Annotations.summary }}<br>
<b>详情:</b> {{ .Annotations.description }}<br>
<b>时间:</b> {{ .StartsAt.Format "2006-01-02 15:04:05" }}<br>
<hr>
{{ end }}
# Critical 告警接收者
- name: 'critical-alerts'
# CRIT-04 修复: Critical On-Call 接收者(飞书 + 邮件双通道)
- name: 'critical-oncall'
# 飞书机器人 WebhookCRIT-04 核心修复:原来全是占位符,现在是真实可用的格式)
webhook_configs:
- url: '${FEISHU_WEBHOOK_URL_CRITICAL}'
send_resolved: true
http_config:
bearer_token: '${FEISHU_WEBHOOK_SECRET}'
max_alerts: 10
# 邮件兜底
email_configs:
- to: '${ALERTMANAGER_CRITICAL_TO}'
from: '${ALERTMANAGER_FROM}'
smarthost: '${ALERTMANAGER_SMARTHOST}'
auth_username: '${ALERTMANAGER_AUTH_USERNAME}'
auth_password: '${ALERTMANAGER_AUTH_PASSWORD}'
send_resolved: true
headers:
Subject: '[CRITICAL] {{ .GroupLabels.alertname }}'
Subject: '[CRITICAL][UMS] {{ .GroupLabels.alertname }} — 立即处理'
html: |
<h2 style="color:red">⚠️ CRITICAL 告警</h2>
{{ range .Alerts }}
<b>告警:</b> {{ .Labels.alertname }}<br>
<b>摘要:</b> {{ .Annotations.summary }}<br>
<b>详情:</b> {{ .Annotations.description }}<br>
<b>Runbook:</b> {{ .Annotations.runbook_url }}<br>
<b>触发时间:</b> {{ .StartsAt.Format "2006-01-02 15:04:05" }}<br>
<hr>
{{ end }}
# Warning 告警接收者
- name: 'warning-alerts'
email_configs:
- to: '${ALERTMANAGER_WARNING_TO}'
from: '${ALERTMANAGER_FROM}'
smarthost: '${ALERTMANAGER_SMARTHOST}'
auth_username: '${ALERTMANAGER_AUTH_USERNAME}'
auth_password: '${ALERTMANAGER_AUTH_PASSWORD}'
headers:
Subject: '[WARNING] {{ .GroupLabels.alertname }}'
# Warning 接收者(飞书频道)
- name: 'warning-feishu'
webhook_configs:
- url: '${FEISHU_WEBHOOK_URL_WARNING}'
send_resolved: true
max_alerts: 20
# Info 接收者(飞书日志频道)
- name: 'info-feishu'
webhook_configs:
- url: '${FEISHU_WEBHOOK_URL_INFO}'
send_resolved: false # Info 级别恢复不再通知
max_alerts: 50
# 告警抑制规则
inhibit_rules:
# 如果有 critical 告警,抑制同一服务的 warning 告警
# critical 告警激活时,抑制同一服务的 warning
- source_match:
severity: 'critical'
target_match:
severity: 'warning'
equal: ['alertname', 'service']
# critical 告警激活时,抑制同一服务的 info
- source_match:
severity: 'critical'
target_match:
severity: 'info'
equal: ['service']
# 告警静默规则(按需配置)
# silences:
# - matchers:
# - name: alertname
# value: LowOnlineUsers
# - name: severity
# value: info
# startsAt: "2026-03-12T00:00:00+08:00"
# endsAt: "2026-03-12T23:59:59+08:00"
# comment: "维护期间静默低在线用户告警"
# warning 告警激活时,抑制同一服务的 info
- source_match:
severity: 'warning'
target_match:
severity: 'info'
equal: ['service']

View File

@@ -1,133 +1,348 @@
groups:
- name: user-ms-alerts
# =========================================================================
# SLO 燃烧率告警(基于错误预算,替代简单阈值告警)
# 参考Google SRE Book - Alerting on SLOs
# =========================================================================
- name: ums-slo-burn-rate
interval: 30s
rules:
# 高错误率告警
- alert: HighErrorRate
# -----------------------------------------------------------------------
# SLO-1: API 可用性 (目标: 99.9% / 30天错误预算: 43.8分钟)
# -----------------------------------------------------------------------
# 快速燃烧5m + 1h 双窗口确认,燃烧率 14.4x
# 含义:若持续,将在 2小时内 消耗本月 2% 错误预算
- alert: APIAvailability_FastBurn
expr: |
(
sum(rate(http_requests_total{status=~"5.."}[5m]))
/
sum(rate(http_requests_total{status=~"5.."}[5m]))
/
sum(rate(http_requests_total[5m]))
) > 0.05
) > (1 - 0.999) * 14.4
AND
(
sum(rate(http_requests_total{status=~"5.."}[1h]))
/
sum(rate(http_requests_total[1h]))
) > (1 - 0.999) * 14.4
for: 2m
labels:
severity: critical
slo: api-availability
page: "true"
service: user-management
annotations:
summary: "🔴 [P0] API 可用性 SLO 快速燃烧 — 立即响应"
description: |
错误预算正在以 14.4x 速率消耗正常速率的14倍
当前5分钟错误率: {{ $value | humanizePercentage }}
若持续2小时将消耗本月约 2% 错误预算约50分钟
SLO 目标: 99.9% (月度允许宕机: 43.8分钟)
运维手册: docs/sre/runbooks/api-availability.md
dashboard_url: "http://grafana:3000/d/ums-slo"
# 慢速燃烧30m + 6h 双窗口确认,燃烧率 6x
# 含义:若持续,将在 1天内 消耗本月 5% 错误预算
- alert: APIAvailability_SlowBurn
expr: |
(
sum(rate(http_requests_total{status=~"5.."}[30m]))
/
sum(rate(http_requests_total[30m]))
) > (1 - 0.999) * 6
AND
(
sum(rate(http_requests_total{status=~"5.."}[6h]))
/
sum(rate(http_requests_total[6h]))
) > (1 - 0.999) * 6
for: 15m
labels:
severity: warning
slo: api-availability
page: "false"
service: user-management
annotations:
summary: "🟡 [P2] API 可用性 SLO 缓慢燃烧 — 需在工作时间内关注"
description: |
错误预算正在以 6x 速率缓慢消耗
若持续1天将消耗本月 5% 错误预算
当前30分钟错误率: {{ $value | humanizePercentage }}
# -----------------------------------------------------------------------
# SLO-2: API 延迟 (目标: P99 < 500ms 覆盖 99% 请求)
# -----------------------------------------------------------------------
- alert: APILatency_FastBurn
expr: |
histogram_quantile(0.99,
sum(rate(http_request_duration_seconds_bucket[5m])) by (le)
) > 0.5
AND
histogram_quantile(0.99,
sum(rate(http_request_duration_seconds_bucket[1h])) by (le)
) > 0.5
for: 5m
labels:
severity: critical
slo: api-latency
page: "true"
service: user-management
annotations:
summary: "高错误率告警"
description: "过去5分钟错误率超过5%,当前值: {{ $value | humanizePercentage }}"
summary: "🔴 [P0] API 延迟 SLO 违规 — P99 超过 500ms"
description: |
当前 P99 延迟: {{ $value | humanizeDuration }}
SLO 目标: P99 < 500ms
请检查慢查询和数据库连接池
# 高响应时间告警
- alert: HighResponseTime
- alert: APILatency_CriticalPath
expr: |
histogram_quantile(0.95,
sum(rate(http_request_duration_seconds_bucket[5m])) by (le, path)
) > 1
histogram_quantile(0.99,
sum(rate(http_request_duration_seconds_bucket{
path=~".*auth/login.*|.*auth/refresh.*"
}[5m])) by (le, path)
) > 0.3
for: 3m
labels:
severity: critical
slo: api-latency-auth
service: user-management
annotations:
summary: "🔴 [P0] 认证关键路径延迟超标"
description: |
路径 {{ $labels.path }} 的 P99 延迟: {{ $value | humanizeDuration }}
认证路径 SLO: P99 < 300ms
# -----------------------------------------------------------------------
# SLO-3: 登录成功率 (目标: 99% 非攻击流量)
# -----------------------------------------------------------------------
- alert: LoginSuccessRate_Degraded
expr: |
(
sum(rate(user_logins_total{status="success"}[10m]))
/
sum(rate(user_logins_total[10m]))
) < 0.9
for: 5m
labels:
severity: warning
slo: login-success-rate
service: user-management
annotations:
summary: "🟡 [P2] 登录成功率下降"
description: |
当前10分钟登录成功率: {{ $value | humanizePercentage }}
SLO 目标: 99%
注意:高失败率可能是暴力破解也可能是系统问题,请结合安全事件判断
# =========================================================================
# 基础设施告警(阈值型,高置信度)
# =========================================================================
- name: ums-infrastructure
interval: 30s
rules:
# 服务宕机(最高优先级)
- alert: ServiceDown
expr: up{job="user-management"} == 0
for: 1m
labels:
severity: critical
page: "true"
service: user-management
annotations:
summary: "🚨 [P0] 用户管理服务实例宕机"
description: "实例 {{ $labels.instance }} 已离线超过 1 分钟,健康检查失败"
# 数据库不可用(通过高 503 率推断)
- alert: DatabaseConnectionFailed
expr: |
sum(rate(http_requests_total{status="503"}[2m])) > 1
for: 1m
labels:
severity: critical
page: "true"
service: user-management
annotations:
summary: "🚨 [P0] 数据库连接失败,服务不可用"
description: |
大量 503 响应,可能是数据库连接池耗尽或数据库宕机
运维手册: docs/sre/runbooks/database-down.md
# 数据库连接池使用率
- alert: DatabaseConnectionPoolHigh
expr: |
(db_connections_active / db_connections_max) > 0.8
for: 3m
labels:
severity: warning
service: user-management
annotations:
summary: "🟡 数据库连接池使用率超过 80%"
description: |
活跃连接: {{ $value | humanizePercentage }} 使用率
若持续增长,可能导致连接拒绝
建议:检查慢查询,或增加连接池大小
# 高内存使用
- alert: HighMemoryUsage
expr: |
system_memory_usage_bytes > 800000000 # 800MB
for: 5m
labels:
severity: warning
service: user-management
annotations:
summary: "高响应时间告警"
description: "API P95响应时间超过1秒路径: {{ $labels.path }},当前值: {{ $value }}s"
summary: "🟡 内存使用超过 800MB"
description: "当前内存使用: {{ $value | humanize1024 }}B请检查内存泄漏"
# 低缓存命中率告警
- alert: LowCacheHitRate
expr: |
(
sum(rate(cache_hits_total[5m]))
/
sum(rate(cache_operations_total[5m]))
) < 0.7
# Goroutine 数量异常
- alert: GoroutineLeakSuspected
expr: system_goroutines > 1000
for: 10m
labels:
severity: warning
service: user-management
annotations:
summary: "低缓存命中率告警"
description: "缓存命中率低于70%,当前值: {{ $value | humanizePercentage }}"
summary: "🟡 Goroutine 数量异常,疑似泄漏"
description: "当前 goroutine 数量: {{ $value }},超过 1000"
# CPU 使用率告警
- alert: HighCPUUsage
expr: rate(process_cpu_seconds_total[5m]) > 0.8
# 高响应时间(保留,作为绝对阈值兜底)
- alert: HighResponseTime_Absolute
expr: |
histogram_quantile(0.95,
sum(rate(http_request_duration_seconds_bucket[5m])) by (le, path)
) > 2
for: 5m
labels:
severity: warning
service: user-management
annotations:
summary: "高CPU使用率告警"
description: "CPU使用率超过80%,当前值: {{ $value | humanizePercentage }}"
summary: "🟡 API P95 响应时间超过 2 秒"
description: "路径 {{ $labels.path }} 响应时间 P95: {{ $value }}s超过绝对阈值 2s"
# 内存使用率告警
- alert: HighMemoryUsage
# =========================================================================
# 安全事件告警
# =========================================================================
- name: ums-security
interval: 30s
rules:
# 暴力破解检测
- alert: BruteForceAttackDetected
expr: |
(
system_memory_usage_bytes /
(node_memory_MemTotal_bytes)
) > 0.85
for: 5m
labels:
severity: critical
service: user-management
annotations:
summary: "高内存使用率告警"
description: "内存使用率超过85%,当前值: {{ $value | humanizePercentage }}"
# 数据库连接告警
- alert: DatabaseConnectionPoolExhausted
expr: |
(
db_connections_active /
db_connections_max
) > 0.9
sum(rate(user_logins_total{status="failed"}[5m]))
/
sum(rate(user_logins_total[5m]))
) > 0.5
AND
sum(rate(user_logins_total[5m])) > 1
for: 3m
labels:
severity: critical
category: security
page: "true"
service: user-management
annotations:
summary: "数据库连接池耗尽告警"
description: "数据库连接池使用率超过90%,当前值: {{ $value | humanizePercentage }}"
summary: "🔐 [P0-SEC] 疑似暴力破解攻击"
description: |
登录失败率: {{ $value | humanizePercentage }},超过 50%
请立即检查来源 IP 并确认封禁是否生效
运维手册: docs/sre/runbooks/brute-force.md
# 在线用户数告警
- alert: LowOnlineUsers
expr: active_users{period="5m"} < 10
for: 30m
# 异常检测激增
- alert: AnomalyDetectionSpike
expr: |
sum(rate(anomaly_detected_total[5m])) > 5
for: 2m
labels:
severity: info
severity: warning
category: security
service: user-management
annotations:
summary: "在线用户数告警"
description: "过去5分钟活跃用户数低于10当前值: {{ $value }}"
summary: "🔐 [P2-SEC] 异常登录检测激增"
description: |
每秒检测到 {{ $value | humanize }} 个异常事件
可能存在地理位置异常、未知设备或账号泄露
# 登录失败率告警
- alert: HighLoginFailureRate
# Token 刷新失败激增
- alert: TokenRefreshFailureSpike
expr: |
sum(rate(token_refresh_total{status="failure"}[5m])) > 10
for: 2m
labels:
severity: warning
category: auth
service: user-management
annotations:
summary: "🟡 Token 刷新失败激增"
description: |
每分钟 Token 刷新失败: {{ $value | humanize }}
可能原因JWT Secret 轮换、时钟偏差、Redis 不可用
# 账号锁定激增
- alert: AccountLockoutSpike
expr: |
rate(account_lock_total[10m]) > 0.5
for: 5m
labels:
severity: warning
category: security
service: user-management
annotations:
summary: "🔐 账号锁定事件激增"
description: "每分钟账号锁定: {{ $value | humanize }},可能存在针对性攻击"
# =========================================================================
# 缓存健康告警
# =========================================================================
- name: ums-cache
interval: 60s
rules:
# 缓存命中率低
- alert: LowCacheHitRate
expr: |
(
sum(rate(user_logins_total{status="failed"}[5m]))
/
sum(rate(user_logins_total[5m]))
) > 0.3
for: 5m
sum(rate(cache_hits_total[10m]))
/
sum(rate(cache_operations_total[10m]))
) < 0.6
AND
sum(rate(cache_operations_total[10m])) > 1
for: 15m
labels:
severity: warning
service: user-management
annotations:
summary: "高登录失败率告警"
description: "登录失败率超过30%,可能存在暴力破解,当前值: {{ $value | humanizePercentage }}"
summary: "🟡 缓存命中率低于 60%"
description: |
当前命中率: {{ $value | humanizePercentage }}
可能导致数据库压力增大
请检查缓存 TTL 配置和热点 Key 分布
# API QPS 异常告警
- alert: UnusualAPIRequestRate
# =========================================================================
# 业务异常告警(信息类)
# =========================================================================
- name: ums-business
interval: 60s
rules:
# API 请求量异常(使用相对偏差,而非绝对值)
- alert: APIRequestVolumeAnomaly
expr: |
abs(
sum(rate(http_requests_total[5m]))
-
avg(sum(rate(http_requests_total[5m])) over 1h)
) / avg(sum(rate(http_requests_total[5m])) over 1h) > 0.5
(
sum(rate(http_requests_total[5m]))
/
avg_over_time(sum(rate(http_requests_total[5m]))[1h:5m])
) > 3
OR
(
sum(rate(http_requests_total[5m]))
/
avg_over_time(sum(rate(http_requests_total[5m]))[1h:5m])
) < 0.1
for: 5m
labels:
severity: info
service: user-management
annotations:
summary: "API请求量异常告警"
description: "API请求量与1小时平均值偏差超过50%,当前值: {{ $value | humanizePercentage }}"
summary: "📊 API 请求量异常偏离基线"
description: |
当前请求量是过去1小时均值的 {{ $value | humanize }} 倍
可能是流量突增(>3x或流量断崖<0.1x