388 lines
13 KiB
Markdown
388 lines
13 KiB
Markdown
|
|
# AI-Ops 核心接口设计
|
|||
|
|
|
|||
|
|
> 版本:v1.0 | 状态:初稿
|
|||
|
|
|
|||
|
|
---
|
|||
|
|
|
|||
|
|
## 1. 内部模块间接口
|
|||
|
|
|
|||
|
|
### 1.1 MetricService
|
|||
|
|
|
|||
|
|
```go
|
|||
|
|
type MetricService interface {
|
|||
|
|
// 采集指标
|
|||
|
|
Collect(ctx context.Context, source string, metrics []MetricPoint) error
|
|||
|
|
// 查询时序数据
|
|||
|
|
Query(ctx context.Context, req MetricQueryRequest) (*MetricQueryResult, error)
|
|||
|
|
// 获取最新值
|
|||
|
|
GetLatest(ctx context.Context, source, metricName string) (*MetricPoint, error)
|
|||
|
|
// 存储保留期检查
|
|||
|
|
PurgeExpired(ctx context.Context, before time.Time) (int64, error)
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// MetricPoint is one metric sample: a named value with tags and a timestamp,
// attributed to a source.
type MetricPoint struct {
	Source    string
	Name      string
	Value     float64
	Tags      map[string]string
	Timestamp time.Time
}
|
|||
|
|
|
|||
|
|
// MetricQueryRequest describes a time-series query passed to
// MetricService.Query.
type MetricQueryRequest struct {
	Source    string
	Name      string
	StartTime time.Time
	EndTime   time.Time
	Interval  time.Duration // aggregation interval
	Tags      map[string]string
}
|
|||
|
|
|
|||
|
|
type MetricQueryResult struct {
|
|||
|
|
Points []MetricPoint
|
|||
|
|
}
|
|||
|
|
```
|
|||
|
|
|
|||
|
|
### 1.2 AlertService
|
|||
|
|
|
|||
|
|
```go
|
|||
|
|
type AlertService interface {
|
|||
|
|
// 规则 CRUD
|
|||
|
|
CreateRule(ctx context.Context, rule AlertRule) (*AlertRule, error)
|
|||
|
|
UpdateRule(ctx context.Context, rule AlertRule) (*AlertRule, error)
|
|||
|
|
DeleteRule(ctx context.Context, ruleID string) error
|
|||
|
|
GetRule(ctx context.Context, ruleID string) (*AlertRule, error)
|
|||
|
|
ListRules(ctx context.Context, filter RuleFilter) ([]AlertRule, error)
|
|||
|
|
|
|||
|
|
// 告警事件管理
|
|||
|
|
ListAlerts(ctx context.Context, filter AlertFilter) ([]AlertEvent, error)
|
|||
|
|
Acknowledge(ctx context.Context, alertID, actorID string) error
|
|||
|
|
Ignore(ctx context.Context, alertID, actorID string) error
|
|||
|
|
Escalate(ctx context.Context, alertID, reason string) error
|
|||
|
|
|
|||
|
|
// 实时评估
|
|||
|
|
Evaluate(ctx context.Context, ruleID string) (*AlertEvent, error)
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// AlertRule is an alert rule as managed through AlertService.
//
// NOTE(review): Version is presumably the optimistic-concurrency counter
// behind the OPS_ALT_4003 "version conflict" error — confirm.
type AlertRule struct {
	ID             string
	Name           string
	MetricSource   string
	MetricName     string
	ThresholdType  string // > < = regex
	ThresholdValue string
	DurationMin    int
	Level          string // P0 P1 P2 P3
	ChannelIDs     []string
	HealingAction  *string
	HealingConfig  map[string]any
	IsSandboxed    bool
	Enabled        bool
	Version        int
}
|
|||
|
|
|
|||
|
|
// AlertEvent is an alert event as returned by AlertService.
//
// NOTE(review): AlertService exposes Ignore, but the Status values listed
// below contain no corresponding "ignored" state — confirm which status an
// ignored alert ends up in.
type AlertEvent struct {
	ID              string
	RuleID          string
	Level           string
	ResourceType    string
	ResourceID      string
	CurrentValue    string
	ThresholdValue  string
	Status          string // triggered notified healing resolved escalated acknowledged
	IsAggregated    bool
	AggregatedCount int
	CreatedAt       time.Time
	UpdatedAt       time.Time
}
|
|||
|
|
```
|
|||
|
|
|
|||
|
|
### 1.3 HealingService
|
|||
|
|
|
|||
|
|
```go
|
|||
|
|
type HealingService interface {
|
|||
|
|
// 执行自愈动作
|
|||
|
|
Execute(ctx context.Context, action HealingAction, target ResourceTarget) (*HealingResult, error)
|
|||
|
|
// 获取可用动作列表
|
|||
|
|
ListActions(ctx context.Context) []HealingActionMeta
|
|||
|
|
// 回滚自愈动作
|
|||
|
|
Rollback(ctx context.Context, executionID string) error
|
|||
|
|
// 查询执行历史
|
|||
|
|
ListExecutions(ctx context.Context, filter ExecutionFilter) ([]HealingExecution, error)
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// HealingAction identifies a self-healing action and carries its
// configuration.
type HealingAction struct {
	Type   string // restart_instance switch_route throttle isolate_node invoke_script
	Config map[string]any
}
|
|||
|
|
|
|||
|
|
// ResourceTarget identifies the resource a healing action is executed against.
type ResourceTarget struct {
	Type string // service provider model
	ID   string
}
|
|||
|
|
|
|||
|
|
// HealingResult is the result returned by HealingService.Execute.
//
// NOTE(review): Error is presumably nil when Success is true — confirm.
type HealingResult struct {
	ExecutionID string
	Success     bool
	BeforeState map[string]any
	AfterState  map[string]any
	Error       *string
	ExecutedAt  time.Time
}
|
|||
|
|
```
|
|||
|
|
|
|||
|
|
### 1.4 AuditService
|
|||
|
|
|
|||
|
|
```go
|
|||
|
|
type AuditService interface {
|
|||
|
|
// 记录审计事件
|
|||
|
|
Record(ctx context.Context, event AuditEvent) error
|
|||
|
|
// 查询审计日志
|
|||
|
|
Query(ctx context.Context, filter AuditFilter) ([]AuditEvent, error)
|
|||
|
|
// 回滚操作
|
|||
|
|
Rollback(ctx context.Context, eventID string, actorID string) (*AuditEvent, error)
|
|||
|
|
// 影响面计算
|
|||
|
|
CalculateImpact(ctx context.Context, objectType, objectID string, proposedState map[string]any) (*ImpactReport, error)
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// AuditEvent is a single audit-log record.
type AuditEvent struct {
	EventID     string
	TenantID    string
	ObjectType  string
	ObjectID    string
	Action      string // create update delete rollback
	BeforeState map[string]any
	AfterState  map[string]any
	RequestID   string
	ResultCode  string
	SourceIP    string
	ActorID     string
	CreatedAt   time.Time
}
|
|||
|
|
|
|||
|
|
// ImpactReport is the result returned by AuditService.CalculateImpact.
type ImpactReport struct {
	RiskLevel           string  // low medium high
	EstimatedRejectRate float64 // estimated reject rate
	AffectedResources   []string
	RequiresConfirm     bool
}
|
|||
|
|
```
|
|||
|
|
|
|||
|
|
### 1.5 CapacityService
|
|||
|
|
|
|||
|
|
```go
|
|||
|
|
type CapacityService interface {
|
|||
|
|
// 获取容量视图
|
|||
|
|
GetDashboard(ctx context.Context, scope CapacityScope) (*CapacityDashboard, error)
|
|||
|
|
// 增长率预测
|
|||
|
|
PredictGrowth(ctx context.Context, metric string, horizon time.Duration) (*GrowthPrediction, error)
|
|||
|
|
// 设置容量阈值
|
|||
|
|
SetThreshold(ctx context.Context, metric string, threshold float64) error
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
type CapacityDashboard struct {
|
|||
|
|
Metrics []CapacityMetric
|
|||
|
|
Predictions []GrowthPrediction
|
|||
|
|
LastUpdated time.Time
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// CapacityMetric is one capacity metric shown on the dashboard.
//
// NOTE(review): Utilization is presumably Current/Limit — confirm whether it
// is a 0..1 ratio or a percentage.
type CapacityMetric struct {
	Name        string
	Current     float64
	Limit       float64
	Unit        string
	Utilization float64
}
|
|||
|
|
|
|||
|
|
// GrowthPrediction is the growth forecast for one metric.
type GrowthPrediction struct {
	Metric      string
	DailyGrowth float64
	DaysToLimit *int // nil means the limit will not be reached
}
|
|||
|
|
```
|
|||
|
|
|
|||
|
|
### 1.6 IntegrationPlugin
|
|||
|
|
|
|||
|
|
`IntegrationPlugin` 是 AI-Ops 与立交桥主项目(gateway/supply-api)集成运行时的核心接口。AI-Ops 各模块实现该接口并注册到全局注册表,主项目通过该接口将 AI-Ops 的能力挂载到自身进程中。
|
|||
|
|
|
|||
|
|
```go
|
|||
|
|
// IntegrationPlugin 定义了 AI-Ops 模块在集成运行时必须实现的接口契约
|
|||
|
|
// 注意:模块必须通过显式 import + init 注册到全局注册表,
|
|||
|
|
// 且主程序必须通过配置显式 Enable 才能激活模块。
|
|||
|
|
type IntegrationPlugin interface {
|
|||
|
|
// Name 返回模块唯一标识,用于配置关联和日志区分
|
|||
|
|
// 示例: "alert", "healing", "audit", "capacity"
|
|||
|
|
Name() string
|
|||
|
|
|
|||
|
|
// Init 在模块被启用时执行一次初始化
|
|||
|
|
// 负责: 连接数据库、初始化缓存、启动后台 worker 等
|
|||
|
|
// 若初始化失败,整个模块不得启动,主程序应记录错误并继续启动其他模块
|
|||
|
|
Init(ctx context.Context, cfg Config) error
|
|||
|
|
|
|||
|
|
// RegisterRoutes 将模块的 HTTP 接口注册到主程序的 ServeMux
|
|||
|
|
// 路径必须以 /internal/ai-ops/{module}/ 为前缀
|
|||
|
|
// 示例: /internal/ai-ops/alert/rules, /internal/ai-ops/healing/actions
|
|||
|
|
RegisterRoutes(mux *http.ServeMux) error
|
|||
|
|
|
|||
|
|
// HealthChecks 返回模块的健康检查函数列表
|
|||
|
|
// 主程序将聚合所有模块的健康检查到 /actuator/health 和 /actuator/health/ready
|
|||
|
|
HealthChecks() []HealthCheckFunc
|
|||
|
|
|
|||
|
|
// Shutdown 在主程序退出时按 LIFO 顺序调用
|
|||
|
|
// 负责: 关闭数据库连接、停止 worker、释放资源
|
|||
|
|
// 超时上限 30 秒,超时后强制终止
|
|||
|
|
Shutdown(ctx context.Context) error
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// HealthCheckFunc is the signature of a health-check function. It reports the
// check's name, its status, and a free-form detail string.
type HealthCheckFunc func(ctx context.Context) (name string, status string, detail string)
|
|||
|
|
|
|||
|
|
// PluginRegistry 是全局模块注册表(线程安全)
|
|||
|
|
var registry = make(map[string]IntegrationPlugin)
|
|||
|
|
var registryMu sync.RWMutex
|
|||
|
|
|
|||
|
|
// Register 在 init() 中调用,将模块注册到全局注册表
|
|||
|
|
func Register(p IntegrationPlugin) {
|
|||
|
|
registryMu.Lock()
|
|||
|
|
defer registryMu.Unlock()
|
|||
|
|
if _, exists := registry[p.Name()]; exists {
|
|||
|
|
panic("duplicate plugin registration: " + p.Name())
|
|||
|
|
}
|
|||
|
|
registry[p.Name()] = p
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// GetRegisteredPlugins 返回已注册的所有模块拷贝
|
|||
|
|
func GetRegisteredPlugins() []IntegrationPlugin {
|
|||
|
|
registryMu.RLock()
|
|||
|
|
defer registryMu.RUnlock()
|
|||
|
|
result := make([]IntegrationPlugin, 0, len(registry))
|
|||
|
|
for _, p := range registry {
|
|||
|
|
result = append(result, p)
|
|||
|
|
}
|
|||
|
|
return result
|
|||
|
|
}
|
|||
|
|
```
|
|||
|
|
|
|||
|
|
**注册与使用示例**:
|
|||
|
|
|
|||
|
|
```go
|
|||
|
|
package alert
|
|||
|
|
|
|||
|
|
import (
|
|||
|
|
"context"
|
|||
|
|
"net/http"
|
|||
|
|
aiops "github.com/company/ai-ops"
|
|||
|
|
)
|
|||
|
|
|
|||
|
|
func init() {
|
|||
|
|
// 显式注册到全局注册表
|
|||
|
|
aiops.Register(&AlertPlugin{})
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
type AlertPlugin struct{ /* ... */ }
|
|||
|
|
|
|||
|
|
func (p *AlertPlugin) Name() string { return "alert" }
|
|||
|
|
func (p *AlertPlugin) Init(ctx context.Context, cfg aiops.Config) error { /* ... */ }
|
|||
|
|
func (p *AlertPlugin) RegisterRoutes(mux *http.ServeMux) error { /* ... */ }
|
|||
|
|
func (p *AlertPlugin) HealthChecks() []aiops.HealthCheckFunc { /* ... */ }
|
|||
|
|
func (p *AlertPlugin) Shutdown(ctx context.Context) error { /* ... */ }
|
|||
|
|
```
|
|||
|
|
|
|||
|
|
**关键约束**:
|
|||
|
|
1. **显式 Enable**:主程序配置文件中必须显式开启模块,默认关闭。示例:`ai_ops.alert.enabled: true`。
|
|||
|
|
2. **路由前缀统一**:所有注册的路由必须以 `/internal/ai-ops/` 为前缀,避免与主系统路径冲突。
|
|||
|
|
3. **数据库前缀统一**:插件创建的表必须使用 `ai_ops_` 前缀,避免 schema 冲突。
|
|||
|
|
4. **健康检查注入**:插件实现的 HealthChecks 必须被主程序聚合到 /actuator/health 和 /actuator/health/ready 。
|
|||
|
|
5. **顺序关闭**:主程序关闭时必须按后进先出(LIFO)顺序调用各插件的 Shutdown 。
|
|||
|
|
|
|||
|
|
---
|
|||
|
|
|
|||
|
|
## 2. 外部系统集成接口
|
|||
|
|
|
|||
|
|
### 2.1 与 Bridge Gateway 集成
|
|||
|
|
|
|||
|
|
| 方法 | 路径 | 请求 | 响应 | 说明 |
|
|||
|
|
|------|------|------|------|------|
|
|||
|
|
| 查询服务状态 | `GET /internal/gateway/health` | - | `{"status":"up","services":{}}` | 诊断时查询各服务健康状态 |
|
|||
|
|
| 获取路由策略 | `GET /internal/gateway/routes` | - | `{"routes":[]}` | 读取当前路由配置,用于影响面分析 |
|
|||
|
|
| 修改路由策略 | `POST /internal/gateway/routes` | `{"action":"switch_route","target":"","config":{}}` | `{"success":true}` | 自愈动作调用,需审计 |
|
|||
|
|
| 获取请求量统计 | `GET /internal/gateway/metrics` | `?metric=qps&duration=5m` | `{"value":1234.5}` | 采集指标数据 |
|
|||
|
|
|
|||
|
|
> **安全约束**:`/internal/gateway/metrics` 端点仅限内网 IP 访问(如 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16),或需要携带有效的服务间 API Key。公网直接访问应返回 403 Forbidden。
|
|||
|
|
|
|||
|
|
### 2.2 与 supply-api 集成
|
|||
|
|
|
|||
|
|
| 方法 | 路径 | 请求 | 响应 | 说明 |
|
|||
|
|
|------|------|------|------|------|
|
|||
|
|
| 查询供应商状态 | `GET /internal/supply/accounts/health` | - | `{"accounts":[]}` | 诊断供应商健康状态 |
|
|||
|
|
| 获取审计日志格式 | `GET /internal/supply/audit/schema` | - | `{"schema":{}}` | 确保审计事件格式一致 |
|
|||
|
|
|
|||
|
|
### 2.3 与 platform-token-runtime 集成
|
|||
|
|
|
|||
|
|
| 方法 | 路径 | 请求 | 响应 | 说明 |
|
|||
|
|
|------|------|------|------|------|
|
|||
|
|
| 获取 Token 消耗 | `GET /internal/runtime/token-usage` | `?window=1h` | `{"total":12345,"by_model":{}}` | 采集 Token 消耗指标 |
|
|||
|
|
| 获取容量使用率 | `GET /internal/runtime/capacity` | - | `{"utilization":0.75}` | 采集容量指标 |
|
|||
|
|
|
|||
|
|
---
|
|||
|
|
|
|||
|
|
## 3. API 接口规范
|
|||
|
|
|
|||
|
|
### 3.1 REST API 基础
|
|||
|
|
|
|||
|
|
- **基础路径**: `/api/v1/ai-ops/`
|
|||
|
|
- **内部路径** (集成模式): `/internal/ai-ops/`
|
|||
|
|
- **内容类型**: `application/json`
|
|||
|
|
- **错误响应格式**:
|
|||
|
|
```json
|
|||
|
|
{
|
|||
|
|
"error_code": "OPS_{CATEGORY}_{CODE}",
|
|||
|
|
"message": "人类可读的错误信息",
|
|||
|
|
"detail": {} // 可选,包含额外的调试信息
|
|||
|
|
}
|
|||
|
|
```
|
|||
|
|
|
|||
|
|
### 3.2 错误码
|
|||
|
|
|
|||
|
|
| 错误码 | HTTP 状态 | 说明 |
|
|||
|
|
|---------|-----------|------|
|
|||
|
|
| `OPS_GEN_4001` | 400 | 请求参数错误 |
|
|||
|
|
| `OPS_GEN_4002` | 401 | 未授权 |
|
|||
|
|
| `OPS_GEN_4003` | 403 | 权限不足 |
|
|||
|
|
| `OPS_GEN_4004` | 404 | 资源不存在 |
|
|||
|
|
| `OPS_GEN_4005` | 409 | 资源冲突(如名称已存在) |
|
|||
|
|
| `OPS_GEN_4006` | 413 | 请求体过大(如日志查询时间范围过大) |
|
|||
|
|
| `OPS_GEN_5001` | 500 | 内部服务错误 |
|
|||
|
|
| `OPS_MET_4001` | 400 | 指标名称无效 |
|
|||
|
|
| `OPS_MET_4002` | 400 | 时间范围不合法 |
|
|||
|
|
| `OPS_ALT_4001` | 409 | 规则名称已存在 |
|
|||
|
|
| `OPS_ALT_4002` | 400 | 规则参数验证失败 |
|
|||
|
|
| `OPS_ALT_4003` | 409 | 规则被其他用户修改(版本冲突) |
|
|||
|
|
| `OPS_HEAL_4001` | 400 | 自愈动作参数无效 |
|
|||
|
|
| `OPS_HEAL_4002` | 409 | 自愈动作正在执行中 |
|
|||
|
|
| `OPS_HEAL_4003` | 400 | 回滚目标执行不存在 |
|
|||
|
|
| `OPS_AUD_4001` | 403 | 无权进行审计操作 |
|
|||
|
|
| `OPS_AUD_4101` | 400 | 回滚目标资源不存在 |
|
|||
|
|
| `OPS_AUD_4102` | 409 | 回滚目标已被后续修改覆盖 |
|
|||
|
|
| `OPS_CAP_4001` | 400 | 容量指标不存在 |
|
|||
|
|
|
|||
|
|
### 3.3 分页
|
|||
|
|
|
|||
|
|
- 列表接口支持分页参数:`?page=1&page_size=20`
|
|||
|
|
- 默认 `page_size=20`,最大 `page_size=100`
|
|||
|
|
- 响应体包含:`{"items":[],"total":123,"page":1,"page_size":20}`
|
|||
|
|
|
|||
|
|
### 3.4 WebSocket 接口
|
|||
|
|
|
|||
|
|
**路径**: `/ws/v1/ai-ops/alerts`
|
|||
|
|
|
|||
|
|
**鉴权机制**:
|
|||
|
|
- 连接建立时必须在查询参数中携带有效 JWT Token:`?token=<jwt>`。
|
|||
|
|
- 服务端在升级 WebSocket 连接前必须验证 token 有效性、过期时间和角色权限。
|
|||
|
|
- token 无效或已过期时,立即返回 401 Unauthorized 并关闭连接。
|
|||
|
|
- 订阅范围根据用户角色过滤:查看者只能接收 P1 及以下级别(P1、P2、P3)的告警,管理员可接收所有级别(含 P0)。
|
|||
|
|
|
|||
|
|
**功能**:
|
|||
|
|
- 客户端订阅后,实时推送新告警事件。
|
|||
|
|
- 支持按级别过滤:`?levels=P0,P1`。
|
|||
|
|
- 心跳间隔 30 秒。
|