// File: user-system/internal/monitoring/slo.go (178 lines, 5.2 KiB, Go)

package monitoring
import (
"sync"
"github.com/prometheus/client_golang/prometheus"
)
// SLOMetrics holds the service level objective (SLO) related metrics.
// These metrics are the basis for SLO measurement and are used to compute
// the error budget burn rate.
type SLOMetrics struct {
	// Cache hit statistics (referenced by alerts.yml but previously undefined).
	CacheHitsTotal *prometheus.CounterVec
	CacheOperationsTotal *prometheus.CounterVec
	// Database connection pool state (referenced by alerts.yml but previously undefined).
	DBConnectionsActive prometheus.Gauge
	DBConnectionsMax prometheus.Gauge
	// Token operations.
	TokenRefreshTotal *prometheus.CounterVec
	// Account security events.
	AccountLockTotal prometheus.Counter
	AnomalyDetectedTotal *prometheus.CounterVec
	// Error budget burn rate (optional, intended for custom dashboards).
	ErrorBudgetBurnRate *prometheus.GaugeVec
	// registry is the private registry that NewSLOMetrics creates and
	// registers every collector above on; GetRegistry returns it.
	registry *prometheus.Registry
	// NOTE(review): once is not referenced anywhere in the visible part of
	// this file; confirm whether other code in the package relies on it.
	once sync.Once
}
// Package-level singleton state used by GetGlobalSLOMetrics.
var (
	// globalSLOMetrics is the shared instance; it is assigned inside the
	// globalSLOMetricsOnce.Do callback in GetGlobalSLOMetrics.
	globalSLOMetrics *SLOMetrics
	// globalSLOMetricsOnce guards the one-time initialization of
	// globalSLOMetrics.
	globalSLOMetricsOnce sync.Once
)
// NewSLOMetrics creates an SLOMetrics instance backed by its own, freshly
// created registry (a dedicated registry avoids conflicts between tests).
//
// Every collector is constructed here and then registered on that private
// registry with MustRegister, which panics if registration fails.
func NewSLOMetrics() *SLOMetrics {
	metrics := &SLOMetrics{
		registry: prometheus.NewRegistry(),

		// Cache counters share the same label set.
		CacheHitsTotal: prometheus.NewCounterVec(
			prometheus.CounterOpts{
				Name: "cache_hits_total",
				Help: "Total number of cache hits",
			},
			[]string{"level", "operation"}, // level: l1/l2, operation: get/set
		),
		CacheOperationsTotal: prometheus.NewCounterVec(
			prometheus.CounterOpts{
				Name: "cache_operations_total",
				Help: "Total number of cache operations",
			},
			[]string{"level", "operation"},
		),

		// Database connection pool gauges.
		DBConnectionsActive: prometheus.NewGauge(
			prometheus.GaugeOpts{
				Name: "db_connections_active",
				Help: "Number of active database connections",
			},
		),
		DBConnectionsMax: prometheus.NewGauge(
			prometheus.GaugeOpts{
				Name: "db_connections_max",
				Help: "Maximum number of database connections configured",
			},
		),

		// Token operations.
		TokenRefreshTotal: prometheus.NewCounterVec(
			prometheus.CounterOpts{
				Name: "token_refresh_total",
				Help: "Total number of token refresh attempts",
			},
			[]string{"status"}, // success/failure/rate_limited
		),

		// Account security events.
		AccountLockTotal: prometheus.NewCounter(
			prometheus.CounterOpts{
				Name: "account_lock_total",
				Help: "Total number of account lockout events due to failed login attempts",
			},
		),
		AnomalyDetectedTotal: prometheus.NewCounterVec(
			prometheus.CounterOpts{
				Name: "anomaly_detected_total",
				Help: "Total number of anomaly login detections",
			},
			[]string{"type"}, // geo_anomaly/device_anomaly/brute_force/suspicious_ip
		),

		// Error budget burn rate, keyed by SLO name.
		ErrorBudgetBurnRate: prometheus.NewGaugeVec(
			prometheus.GaugeOpts{
				Name: "error_budget_burn_rate",
				Help: "Current error budget burn rate multiplier (1.0 = nominal consumption)",
			},
			[]string{"slo"}, // api-availability/api-latency/login-success-rate
		),
	}

	metrics.registry.MustRegister(
		metrics.CacheHitsTotal,
		metrics.CacheOperationsTotal,
		metrics.DBConnectionsActive,
		metrics.DBConnectionsMax,
		metrics.TokenRefreshTotal,
		metrics.AccountLockTotal,
		metrics.AnomalyDetectedTotal,
		metrics.ErrorBudgetBurnRate,
	)
	return metrics
}
// GetGlobalSLOMetrics returns the global SLOMetrics singleton (for production
// use).
//
// The first call builds the instance with NewSLOMetrics and also offers each
// of its collectors to prometheus.DefaultRegisterer so that the /metrics
// endpoint exposes them. Errors returned by Register are intentionally
// ignored. Every call returns the same pointer.
func GetGlobalSLOMetrics() *SLOMetrics {
	globalSLOMetricsOnce.Do(func() {
		instance := NewSLOMetrics()

		// Same collectors, in the same order, as registered on the private
		// registry by NewSLOMetrics.
		collectors := []prometheus.Collector{
			instance.CacheHitsTotal,
			instance.CacheOperationsTotal,
			instance.DBConnectionsActive,
			instance.DBConnectionsMax,
			instance.TokenRefreshTotal,
			instance.AccountLockTotal,
			instance.AnomalyDetectedTotal,
			instance.ErrorBudgetBurnRate,
		}
		for _, collector := range collectors {
			// Best effort: a failed registration is deliberately not reported.
			_ = prometheus.DefaultRegisterer.Register(collector) //nolint:errcheck
		}

		globalSLOMetrics = instance
	})
	return globalSLOMetrics
}
// GetRegistry returns the private registry held by m (intended for use in
// tests).
func (m *SLOMetrics) GetRegistry() *prometheus.Registry {
	return m.registry
}
// RecordCacheHit records a cache hit for the given level and operation.
// A hit also counts as a cache operation, so both CacheHitsTotal and
// CacheOperationsTotal are incremented for the same label values.
func (m *SLOMetrics) RecordCacheHit(level, operation string) {
	hits := m.CacheHitsTotal.WithLabelValues(level, operation)
	hits.Inc()

	operations := m.CacheOperationsTotal.WithLabelValues(level, operation)
	operations.Inc()
}
// RecordCacheMiss records a cache miss for the given level and operation.
// Only CacheOperationsTotal is incremented; CacheHitsTotal is left untouched.
func (m *SLOMetrics) RecordCacheMiss(level, operation string) {
	operations := m.CacheOperationsTotal.WithLabelValues(level, operation)
	operations.Inc()
}
// RecordTokenRefresh records one token refresh operation by incrementing
// TokenRefreshTotal for the given status label value.
func (m *SLOMetrics) RecordTokenRefresh(status string) {
	refreshes := m.TokenRefreshTotal.WithLabelValues(status)
	refreshes.Inc()
}
// RecordAccountLock records one account lockout event by incrementing
// AccountLockTotal.
func (m *SLOMetrics) RecordAccountLock() {
	m.AccountLockTotal.Inc()
}
// RecordAnomaly records one anomaly detection event by incrementing
// AnomalyDetectedTotal for the given anomaly type label value.
func (m *SLOMetrics) RecordAnomaly(anomalyType string) {
	detections := m.AnomalyDetectedTotal.WithLabelValues(anomalyType)
	detections.Inc()
}
// SetDBConnections updates the database connection pool gauges: active is
// written to DBConnectionsActive and max to DBConnectionsMax.
func (m *SLOMetrics) SetDBConnections(active, max float64) {
	m.DBConnectionsActive.Set(active)
	m.DBConnectionsMax.Set(max)
}
// SetErrorBudgetBurnRate sets the error budget burn rate gauge for the given
// slo label value to burnRate.
func (m *SLOMetrics) SetErrorBudgetBurnRate(slo string, burnRate float64) {
	gauge := m.ErrorBudgetBurnRate.WithLabelValues(slo)
	gauge.Set(burnRate)
}