docs: project docs, scripts, deployment configs, and evidence

This commit is contained in:
2026-04-02 11:22:17 +08:00
parent 4718980ab5
commit bbeeb63dfa
396 changed files with 165018 additions and 0 deletions

View File

@@ -0,0 +1,11 @@
# Example values for the ALERTMANAGER_* variables used to inject the
# Alertmanager notification channel settings.
# These are placeholders only. In production, supply the real values from a
# secrets manager, the CI/CD secret store, or another environment-specific
# secure deployment mechanism.

# SMTP relay and sender identity
ALERTMANAGER_SMARTHOST=smtp.example.com:587
ALERTMANAGER_FROM=alertmanager@example.com
ALERTMANAGER_AUTH_USERNAME=alertmanager@example.com
ALERTMANAGER_AUTH_PASSWORD=replace-with-secret

# Recipient address for each receiver
ALERTMANAGER_DEFAULT_TO=ops@example.com
ALERTMANAGER_CRITICAL_TO=oncall-critical@example.com
ALERTMANAGER_WARNING_TO=oncall-warning@example.com

View File

@@ -0,0 +1,84 @@
# NOTE:
# This file is a template. In production the `${ALERTMANAGER_*}` variables must
# be injected and rendered first; only the rendered result is handed to
# Alertmanager.

# Settings shared by the whole configuration.
global:
  resolve_timeout: 5m
# Alert routing tree.
# Alerts are grouped by alert name and service. Anything that matches none of
# the child routes below is delivered to the `default` receiver.
route:
  receiver: 'default'
  group_by: ['alertname', 'service']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 12h
  # Child routes dispatch on the `severity` label. They use `matchers`, the
  # replacement for the deprecated `match` field.
  routes:
    # Critical alerts: first notification after 10s instead of the 30s set above.
    - matchers:
        - 'severity="critical"'
      receiver: 'critical-alerts'
      group_wait: 10s
      # `continue: true` keeps evaluating the sibling routes after a match.
      # The two severity matchers are mutually exclusive, so it does not add
      # any receiver here.
      continue: true
    # Warning alerts.
    - matchers:
        - 'severity="warning"'
      receiver: 'warning-alerts'
      continue: true
# Notification receivers. Every receiver sends e-mail through the same SMTP
# relay and credentials; only the recipient and the Subject header differ.
# All `${ALERTMANAGER_*}` placeholders are filled in when the template is
# rendered.
receivers:
  # Default receiver: target of the top-level route.
  - name: 'default'
    email_configs:
      - to: '${ALERTMANAGER_DEFAULT_TO}'
        from: '${ALERTMANAGER_FROM}'
        smarthost: '${ALERTMANAGER_SMARTHOST}'
        auth_username: '${ALERTMANAGER_AUTH_USERNAME}'
        auth_password: '${ALERTMANAGER_AUTH_PASSWORD}'
        # E-mail receivers do not notify on resolution unless asked to. The
        # Subject below renders the notification status, so resolved
        # notifications are enabled to make that status meaningful.
        send_resolved: true
        headers:
          Subject: '[{{ .Status | toUpper }}] {{ .GroupLabels.alertname }}'
  # Receiver for critical alerts.
  - name: 'critical-alerts'
    email_configs:
      - to: '${ALERTMANAGER_CRITICAL_TO}'
        from: '${ALERTMANAGER_FROM}'
        smarthost: '${ALERTMANAGER_SMARTHOST}'
        auth_username: '${ALERTMANAGER_AUTH_USERNAME}'
        auth_password: '${ALERTMANAGER_AUTH_PASSWORD}'
        headers:
          Subject: '[CRITICAL] {{ .GroupLabels.alertname }}'
  # Receiver for warning alerts.
  - name: 'warning-alerts'
    email_configs:
      - to: '${ALERTMANAGER_WARNING_TO}'
        from: '${ALERTMANAGER_FROM}'
        smarthost: '${ALERTMANAGER_SMARTHOST}'
        auth_username: '${ALERTMANAGER_AUTH_USERNAME}'
        auth_password: '${ALERTMANAGER_AUTH_PASSWORD}'
        headers:
          Subject: '[WARNING] {{ .GroupLabels.alertname }}'
# Inhibition rules.
inhibit_rules:
  # While a critical alert is firing, mute warning alerts that carry the same
  # `service` label value. Uses `source_matchers` / `target_matchers`, the
  # replacements for the deprecated `source_match` / `target_match` fields.
  - source_matchers:
      - 'severity="critical"'
    target_matchers:
      - 'severity="warning"'
    equal: ['service']
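# Silences (create as needed).
# NOTE: silences are not part of this configuration file; they are created at
# runtime (Alertmanager UI, amtool, or the HTTP API). The block below is only a
# reference for the fields of such a silence and must stay commented out.
# silences:
# - matchers:
# - name: alertname
# value: LowOnlineUsers
# - name: severity
# value: info
# startsAt: "2026-03-12T00:00:00+08:00"
# endsAt: "2026-03-12T23:59:59+08:00"
# comment: "Silence the low-online-users alert during maintenance"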

View File

@@ -0,0 +1,133 @@
# Prometheus alerting rules for the user-management service.
# Every rule carries a `severity` label (critical / warning / info) and
# `service: user-management`.
groups:
  - name: user-ms-alerts
    # Evaluation interval for the rules in this group.
    interval: 30s
    rules:
      # High error rate: share of HTTP requests answered with a 5xx status
      # over the last 5 minutes is above 5%.
      - alert: HighErrorRate
        expr: |
          (
            sum(rate(http_requests_total{status=~"5.."}[5m]))
            /
            sum(rate(http_requests_total[5m]))
          ) > 0.05
        for: 5m
        labels:
          severity: critical
          service: user-management
        annotations:
          summary: "高错误率告警"
          description: "过去5分钟错误率超过5%,当前值: {{ $value | humanizePercentage }}"

      # High response time: per-path P95 of the request-duration histogram is
      # above 1 second.
      - alert: HighResponseTime
        expr: |
          histogram_quantile(0.95,
            sum(rate(http_request_duration_seconds_bucket[5m])) by (le, path)
          ) > 1
        for: 5m
        labels:
          severity: warning
          service: user-management
        annotations:
          summary: "高响应时间告警"
          description: "API P95响应时间超过1秒,路径: {{ $labels.path }},当前值: {{ $value }}s"

      # Low cache hit rate: hits divided by all cache operations is below 70%.
      - alert: LowCacheHitRate
        expr: |
          (
            sum(rate(cache_hits_total[5m]))
            /
            sum(rate(cache_operations_total[5m]))
          ) < 0.7
        for: 10m
        labels:
          severity: warning
          service: user-management
        annotations:
          summary: "低缓存命中率告警"
          description: "缓存命中率低于70%,当前值: {{ $value | humanizePercentage }}"

      # High CPU usage: a process consumes more than 0.8 CPU-seconds per second.
      # NOTE(review): this is measured in cores per process, not as a share of
      # the host's total CPU - confirm that 0.8 is the intended threshold.
      - alert: HighCPUUsage
        expr: rate(process_cpu_seconds_total[5m]) > 0.8
        for: 5m
        labels:
          severity: warning
          service: user-management
        annotations:
          summary: "高CPU使用率告警"
          description: "CPU使用率超过80%,当前值: {{ $value | humanizePercentage }}"

      # High memory usage: used memory is above 85% of total memory.
      # NOTE(review): the division only yields samples where both metrics carry
      # identical label sets. If they come from different exporters (e.g. other
      # `job` / `instance` values) the result is empty and the alert can never
      # fire - verify against live data and add `on(...)` / `ignoring(...)`
      # as needed.
      - alert: HighMemoryUsage
        expr: |
          (
            system_memory_usage_bytes /
            (node_memory_MemTotal_bytes)
          ) > 0.85
        for: 5m
        labels:
          severity: critical
          service: user-management
        annotations:
          summary: "高内存使用率告警"
          description: "内存使用率超过85%,当前值: {{ $value | humanizePercentage }}"

      # Database connection pool: active connections exceed 90% of the maximum.
      - alert: DatabaseConnectionPoolExhausted
        expr: |
          (
            db_connections_active /
            db_connections_max
          ) > 0.9
        for: 3m
        labels:
          severity: critical
          service: user-management
        annotations:
          summary: "数据库连接池耗尽告警"
          description: "数据库连接池使用率超过90%,当前值: {{ $value | humanizePercentage }}"

      # Low online users: the 5-minute active-user gauge stays below 10 for
      # 30 minutes.
      - alert: LowOnlineUsers
        expr: active_users{period="5m"} < 10
        for: 30m
        labels:
          severity: info
          service: user-management
        annotations:
          summary: "在线用户数告警"
          description: "过去5分钟活跃用户数低于10,当前值: {{ $value }}"

      # High login failure rate: more than 30% of logins fail.
      - alert: HighLoginFailureRate
        expr: |
          (
            sum(rate(user_logins_total{status="failed"}[5m]))
            /
            sum(rate(user_logins_total[5m]))
          ) > 0.3
        for: 5m
        labels:
          severity: warning
          service: user-management
        annotations:
          summary: "高登录失败率告警"
          description: "登录失败率超过30%,可能存在暴力破解,当前值: {{ $value | humanizePercentage }}"

      # Unusual API request rate: the current request rate deviates by more
      # than 50% from its average over the last hour.
      # The one-hour average is computed with a subquery (`[1h:]`) wrapped in
      # avg_over_time; PromQL has no `avg(... over 1h)` form.
      - alert: UnusualAPIRequestRate
        expr: |
          abs(
            sum(rate(http_requests_total[5m]))
            -
            avg_over_time(sum(rate(http_requests_total[5m]))[1h:])
          )
          /
          avg_over_time(sum(rate(http_requests_total[5m]))[1h:])
          > 0.5
        for: 5m
        labels:
          severity: info
          service: user-management
        annotations:
          summary: "API请求量异常告警"
          description: "API请求量与1小时平均值偏差超过50%,当前值: {{ $value | humanizePercentage }}"