Files
user-system/internal/monitoring/health.go

208 lines
5.0 KiB
Go
Raw Normal View History

package monitoring
import (
"context"
"database/sql"
"net/http"
"time"
"github.com/gin-gonic/gin"
"gorm.io/gorm"
)
// HealthStatus is the string label used to report the health of the service
// as a whole and of each individual dependency check.
type HealthStatus string

// Health status values produced by the checks in this file.
const (
	// HealthStatusUP marks a check (or the whole service) as healthy.
	HealthStatusUP = HealthStatus("UP")
	// HealthStatusDOWN marks a check (or the whole service) as failed.
	HealthStatusDOWN = HealthStatus("DOWN")
	// HealthStatusDEGRADED is the overall status used by Check when Redis is
	// DOWN while the service would otherwise be UP.
	HealthStatusDEGRADED = HealthStatus("DEGRADED")
	// HealthStatusUNKNOWN is returned by checkRedis when no Redis checker is set.
	HealthStatusUNKNOWN = HealthStatus("UNKNOWN")
)
// HealthCheck performs the service health checks: a database check and, when
// a RedisChecker has been injected through WithRedis, a Redis check.
type HealthCheck struct {
// db is the GORM handle whose underlying connection is pinged by checkDatabase.
db *gorm.DB
// redisClient is optional; it stays nil unless WithRedis is called.
redisClient RedisChecker
// startTime is set by NewHealthCheck and is used by Check to report uptime.
startTime time.Time
}
// RedisChecker is the minimal interface needed to health-check Redis. Depending
// on this interface avoids importing a Redis client package directly.
type RedisChecker interface {
// Ping is called by checkRedis with a context that times out after 2 seconds;
// a non-nil error is reported as a DOWN Redis check.
Ping(ctx context.Context) error
}
// Status is the health report serialized to JSON by the health endpoints.
type Status struct {
// Status is the aggregated health of the service.
Status HealthStatus `json:"status"`
// Checks holds the per-dependency results keyed by dependency name
// ("database" and, when configured, "redis").
Checks map[string]CheckResult `json:"checks"`
// Uptime is the time since the HealthCheck was created, rounded to seconds.
// It is filled in by Check and left empty (omitted) by LivenessCheck.
Uptime string `json:"uptime,omitempty"`
// Timestamp is the report creation time in UTC, formatted as RFC 3339.
Timestamp string `json:"timestamp"`
}
// CheckResult is the outcome of a single dependency check.
type CheckResult struct {
// Status is the health of this dependency.
Status HealthStatus `json:"status"`
// Error is the error message of a failed check; omitted when empty.
Error string `json:"error,omitempty"`
// Latency is the string produced by formatLatency (for example "< 1ms" or
// "12ms"). Despite the "latency_ms" JSON key it is a human-readable
// duration that carries its own unit, not a bare number of milliseconds.
Latency string `json:"latency_ms,omitempty"`
}
// NewHealthCheck builds a HealthCheck for the given GORM handle and records
// the current time as the start time used for uptime reporting. No Redis
// checker is configured; use WithRedis to add one.
func NewHealthCheck(db *gorm.DB) *HealthCheck {
	checker := new(HealthCheck)
	checker.db = db
	checker.startTime = time.Now()
	return checker
}
// WithRedis injects an optional Redis checker. It stores r on the receiver and
// returns the same HealthCheck so the call can be chained after NewHealthCheck.
func (h *HealthCheck) WithRedis(r RedisChecker) *HealthCheck {
h.redisClient = r
return h
}
// Check runs every configured dependency check and aggregates the results
// into a Status.
//
// The database is a hard dependency: when its check is DOWN the overall
// status is DOWN. Redis is checked only when a checker was injected and is a
// soft dependency: a DOWN Redis lowers an otherwise UP service to DEGRADED
// and never to DOWN.
//
// Check may be called on a nil receiver. In that case checkDatabase reports
// "database not configured", the overall status is DOWN, and no uptime or
// Redis result is included.
func (h *HealthCheck) Check() *Status {
	status := &Status{
		Status:    HealthStatusUP,
		Checks:    make(map[string]CheckResult, 2),
		Timestamp: time.Now().UTC().Format(time.RFC3339),
	}

	// IsZero is the supported way to detect an unset time.Time; comparing
	// Time values with == or != also compares location and monotonic data.
	if h != nil && !h.startTime.IsZero() {
		status.Uptime = time.Since(h.startTime).Round(time.Second).String()
	}

	// Database: hard dependency, DOWN makes the whole service DOWN.
	// checkDatabase handles a nil receiver itself.
	dbResult := h.checkDatabase()
	status.Checks["database"] = dbResult
	if dbResult.Status == HealthStatusDOWN {
		status.Status = HealthStatusDOWN
	}

	// Redis: soft dependency, DOWN only degrades a service that is still UP.
	if h != nil && h.redisClient != nil {
		redisResult := h.checkRedis()
		status.Checks["redis"] = redisResult
		if redisResult.Status == HealthStatusDOWN && status.Status == HealthStatusUP {
			status.Status = HealthStatusDEGRADED
		}
	}

	return status
}
// LivenessCheck reports that the process is running. It checks no
// dependencies: the returned Status is always UP, has an empty (non-nil)
// Checks map, no uptime, and the current UTC time as its timestamp.
func (h *HealthCheck) LivenessCheck() *Status {
	alive := new(Status)
	alive.Status = HealthStatusUP
	alive.Checks = map[string]CheckResult{}
	alive.Timestamp = time.Now().UTC().Format(time.RFC3339)
	return alive
}
// checkDatabase verifies database connectivity by pinging the *sql.DB behind
// the GORM handle with a 3-second timeout.
//
// It returns DOWN with "database not configured" when the receiver or its db
// handle is nil, DOWN with the error text when the *sql.DB cannot be obtained
// or the ping fails, and UP otherwise. Latency is measured from just before
// the *sql.DB is obtained and is reported for ping results only.
func (h *HealthCheck) checkDatabase() CheckResult {
	if h == nil || h.db == nil {
		return CheckResult{Status: HealthStatusDOWN, Error: "database not configured"}
	}

	begin := time.Now()

	pool, dbErr := h.db.DB()
	if dbErr != nil {
		return CheckResult{Status: HealthStatusDOWN, Error: dbErr.Error()}
	}

	pingCtx, stop := context.WithTimeout(context.Background(), 3*time.Second)
	defer stop()

	pingErr := pool.PingContext(pingCtx)
	if pingErr != nil {
		return CheckResult{
			Status:  HealthStatusDOWN,
			Error:   pingErr.Error(),
			Latency: formatLatency(time.Since(begin)),
		}
	}

	// Refresh the connection-pool metrics in the background so the update
	// does not add to the measured latency.
	// NOTE(review): this goroutine is not tracked, and a panic inside it would
	// not pass through any handler-level recovery; confirm that is acceptable.
	go h.updateDBConnectionMetrics(pool)

	return CheckResult{
		Status:  HealthStatusUP,
		Latency: formatLatency(time.Since(begin)),
	}
}
// checkRedis verifies Redis connectivity by calling Ping on the injected
// checker with a 2-second timeout.
//
// It returns UNKNOWN when no checker is configured, DOWN with the error text
// and latency when Ping fails, and UP with latency otherwise.
func (h *HealthCheck) checkRedis() CheckResult {
	client := h.redisClient
	if client == nil {
		return CheckResult{Status: HealthStatusUNKNOWN}
	}

	begin := time.Now()
	pingCtx, stop := context.WithTimeout(context.Background(), 2*time.Second)
	defer stop()

	pingErr := client.Ping(pingCtx)

	// Start from the success result and downgrade it if the ping failed.
	result := CheckResult{
		Status:  HealthStatusUP,
		Latency: formatLatency(time.Since(begin)),
	}
	if pingErr != nil {
		result.Status = HealthStatusDOWN
		result.Error = pingErr.Error()
	}
	return result
}
// updateDBConnectionMetrics reads the connection-pool statistics of sqlDB and
// passes the in-use and maximum-open connection counts to the global SLO
// metrics via SetDBConnections.
func (h *HealthCheck) updateDBConnectionMetrics(sqlDB *sql.DB) {
	poolStats := sqlDB.Stats()
	inUse := float64(poolStats.InUse)
	maxOpen := float64(poolStats.MaxOpenConnections)
	GetGlobalSLOMetrics().SetDBConnections(inUse, maxOpen)
}
// ReadinessHandler is the readiness-probe handler. It runs the full Check and
// writes the resulting Status as JSON. The HTTP status is 503 Service
// Unavailable when the overall status is DOWN and 200 OK otherwise; a
// DEGRADED service still answers 200 and is only marked in the response body.
func (h *HealthCheck) ReadinessHandler(c *gin.Context) {
	report := h.Check()

	var code int
	switch report.Status {
	case HealthStatusDOWN:
		code = http.StatusServiceUnavailable
	default:
		// UP and DEGRADED both answer 200.
		code = http.StatusOK
	}

	c.JSON(code, report)
}
// LivenessHandler is the liveness-probe handler. It checks no dependencies and
// answers 204 No Content: a running process needs no response body, which
// keeps the Kubernetes probe cheap.
func (h *HealthCheck) LivenessHandler(c *gin.Context) {
c.AbortWithStatus(http.StatusNoContent)
}
// Handler keeps the legacy /health endpoint working by delegating to
// ReadinessHandler.
func (h *HealthCheck) Handler(c *gin.Context) {
h.ReadinessHandler(c)
}
// formatLatency renders a duration for the health report. Anything shorter
// than one millisecond (including negative values) becomes "< 1ms"; longer
// durations are rounded to the nearest millisecond and formatted with
// Duration.String, so the result carries its own unit (for example "12ms" or
// "1.234s").
func formatLatency(d time.Duration) string {
	if d >= time.Millisecond {
		return d.Round(time.Millisecond).String()
	}
	return "< 1ms"
}