Files
llm-intelligence/scripts/verification_executor.go

592 lines
15 KiB
Go
Raw Permalink Normal View History

// verification_executor.go
// Reads TASKS.md, runs each task's verification.command,
// matches expected_evidence, outputs pass/fail report.
//
// Usage: go run scripts/verification_executor.go [--dry-run] [--task T-Q2-1.1] [--tasks PATH] [--status all|completed|in_progress|planned|paused|unknown] [--completed-only]
//go:build llm_script
package main
import (
"bufio"
"bytes"
"context"
"flag"
"fmt"
"os"
"os/exec"
"path/filepath"
"regexp"
"runtime"
"strings"
"time"
)
// Verification holds the fields parsed from a task's **verification** block
// in TASKS.md.
type Verification struct {
	// Mode selects how the task is verified; Command is the shell command to
	// run; ExpectedEvidence is the text (or numeric range) looked for in the
	// command's standard output.
	Mode, Command, ExpectedEvidence string

	// TimeoutSeconds bounds the command's run time. Zero means "not set".
	TimeoutSeconds int

	// EvidenceGrade and TaskType are classification labels attached to the
	// task and copied into the report.
	EvidenceGrade, TaskType string
}
// TaskResult is the outcome of verifying a single task. One value is produced
// per task and rendered as an entry of the printed report.
type TaskResult struct {
	// TaskID and TaskName identify the task as written in its header.
	TaskID, TaskName string

	// Verified reports whether the task counts as passed.
	Verified bool

	// Command is the verification command, when one was configured.
	Command string

	// ExitCode is 0 when the command ran successfully and non-zero otherwise.
	ExitCode int

	// Stdout and Stderr hold the captured command output; the *Summary
	// fields hold their shortened single-line form used in the report.
	Stdout, Stderr               string
	StdoutSummary, StderrSummary string

	// Error describes a failure to run the command, Reason explains the
	// verdict, and FailureClass categorises the failure.
	Error, Reason, FailureClass string

	// EvidenceGrade and TaskType are copied from the normalized verification
	// settings of the task.
	EvidenceGrade, TaskType string
}
// main parses the command-line flags, loads the tasks from TASKS.md, applies
// the --task and --status filters, verifies every remaining task and prints a
// per-task report followed by a summary line.
//
// The process exits with 1 when TASKS.md cannot be opened or the status
// filter is invalid; otherwise the exit code is whatever
// determineProcessExitCode derives from the results.
func main() {
	var (
		dryRun        = flag.Bool("dry-run", false, "print commands without executing")
		taskFilter    = flag.String("task", "", "filter by task ID (e.g. T-Q2-1.1)")
		tasksPathFlag = flag.String("tasks", "", "path to TASKS.md")
		statusFilter  = flag.String("status", "all", "filter by normalized status: all|completed|in_progress|planned|paused|unknown")
		completedOnly = flag.Bool("completed-only", false, "shortcut for --status completed")
	)
	flag.Parse()

	tasksPath := resolveTasksPath(*tasksPathFlag)
	file, err := os.Open(tasksPath)
	if err != nil {
		fmt.Fprintf(os.Stderr, "open TASKS.md: %v\n", err)
		os.Exit(1)
	}
	defer file.Close()

	tasks := parseTasks(file)

	// Optional exact-match filter on the task ID.
	if wanted := *taskFilter; wanted != "" {
		var selected []taskEntry
		for i := range tasks {
			if tasks[i].ID == wanted {
				selected = append(selected, tasks[i])
			}
		}
		tasks = selected
	}

	// --completed-only overrides whatever --status was given.
	effectiveStatus := *statusFilter
	if *completedOnly {
		effectiveStatus = "completed"
	}
	if tasks, err = filterTasksByStatus(tasks, effectiveStatus); err != nil {
		fmt.Fprintf(os.Stderr, "filter tasks: %v\n", err)
		os.Exit(1)
	}

	fmt.Printf("=== Verification Report (%s) ===\n", time.Now().Format("2006-01-02 15:04"))
	fmt.Printf("Tasks checked: %d | Dry-run: %v | Status: %s | TASKS: %s\n\n", len(tasks), *dryRun, effectiveStatus, tasksPath)

	// Run every verification first, then print, so the report is contiguous.
	results := make([]TaskResult, 0, len(tasks))
	passed := 0
	for _, task := range tasks {
		res := verifyTask(task, *dryRun)
		if res.Verified {
			passed++
		}
		results = append(results, res)
	}
	failed := len(results) - passed

	for i := range results {
		res := &results[i]

		icon := "❌"
		if res.Verified {
			icon = "✅"
		}
		fmt.Printf("%s [%s] %s\n", icon, res.TaskID, res.TaskName)

		if res.Command != "" {
			fmt.Printf(" cmd: %s\n", res.Command)
		}
		if res.EvidenceGrade != "" || res.TaskType != "" {
			fmt.Printf(" grade: %s | type: %s\n", res.EvidenceGrade, res.TaskType)
		}
		if res.StderrSummary != "" {
			fmt.Printf(" stderr: %s\n", res.StderrSummary)
		}
		// Stdout is only interesting when something needs explaining.
		needsExplanation := !res.Verified || res.Reason != "" || res.Error != ""
		if needsExplanation && res.StdoutSummary != "" {
			fmt.Printf(" stdout: %s\n", res.StdoutSummary)
		}
		if res.FailureClass != "" {
			fmt.Printf(" class: %s\n", res.FailureClass)
		}

		// At most one of the detail lines below is printed per task.
		switch {
		case res.Error != "":
			fmt.Printf(" ERROR: %s\n", res.Error)
		case res.ExitCode != 0 && res.Stdout != "":
			fmt.Printf(" output: %s\n", strings.TrimSpace(res.Stdout))
		case res.Reason != "":
			fmt.Printf(" reason: %s\n", res.Reason)
		}
	}

	fmt.Printf("\n=== Summary: %d passed, %d failed ===\n", passed, failed)
	os.Exit(determineProcessExitCode(results))
}
// resolveTasksPath picks the TASKS.md file to read. It gathers the inputs that
// depend on the process environment (the TASKS_PATH variable, the working
// directory and the directory of this source file) and hands them, together
// with the --tasks flag value, to resolveTasksPathWithContext.
//
// A failure to determine the working directory or the source location simply
// leaves the corresponding input empty.
func resolveTasksPath(flagValue string) string {
	const globalTasksPath = "/home/long/.openclaw/workspace/TASKS.md"

	var wd string
	if dir, err := os.Getwd(); err == nil {
		wd = dir
	}

	var sourceDir string
	if _, file, _, ok := runtime.Caller(0); ok {
		sourceDir = filepath.Dir(file)
	}

	return resolveTasksPathWithContext(flagValue, os.Getenv("TASKS_PATH"), wd, sourceDir, globalTasksPath)
}
// resolveTasksPathWithContext returns the path of the TASKS.md file to use.
//
// Candidates are probed in this order and the first one that exists on disk
// wins: flagValue, envValue, <wd>/TASKS.md, <wd>/../TASKS.md and
// <sourceDir>/../TASKS.md. Empty inputs are skipped and each cleaned path is
// probed at most once.
//
// When no candidate exists, the first non-empty of flagValue, envValue, the
// project default (<sourceDir>/../TASKS.md), <wd>/TASKS.md and globalTasksPath
// is returned in cleaned form, falling back to the relative name "TASKS.md".
// globalTasksPath is therefore only a last-resort fallback and is never probed.
func resolveTasksPathWithContext(flagValue, envValue, wd, sourceDir, globalTasksPath string) string {
	var projectDefault string
	if sourceDir != "" {
		projectDefault = filepath.Join(sourceDir, "..", "TASKS.md")
	}

	// Empty entries are tolerated here and skipped while probing.
	candidates := make([]string, 0, 5)
	candidates = append(candidates, flagValue, envValue)
	if wd != "" {
		candidates = append(candidates,
			filepath.Join(wd, "TASKS.md"),
			filepath.Join(wd, "..", "TASKS.md"),
		)
	}
	candidates = append(candidates, projectDefault)

	probed := make(map[string]bool, len(candidates))
	for _, candidate := range candidates {
		if candidate == "" {
			continue
		}
		path := filepath.Clean(candidate)
		if probed[path] {
			continue
		}
		probed[path] = true
		if _, err := os.Stat(path); err == nil {
			return path
		}
	}

	// Nothing exists: report the most specific path the caller asked for so
	// the subsequent open error names it.
	switch {
	case flagValue != "":
		return filepath.Clean(flagValue)
	case envValue != "":
		return filepath.Clean(envValue)
	case projectDefault != "":
		return filepath.Clean(projectDefault)
	case wd != "":
		return filepath.Clean(filepath.Join(wd, "TASKS.md"))
	case globalTasksPath != "":
		return filepath.Clean(globalTasksPath)
	}
	return "TASKS.md"
}
// taskEntry is one task parsed from TASKS.md.
type taskEntry struct {
	// ID and Name come from the "### T-…" header line; Status is the
	// normalized status derived from the header or a later status line.
	ID, Name, Status string

	// Verification holds the parsed verification settings; it is only
	// meaningful when HasVerification is true.
	Verification Verification

	// HasVerification records that a **verification** marker was seen for
	// this task.
	HasVerification bool
}
// parseTasks reads TASKS.md from f line by line and returns the tasks in the
// order they appear.
//
// A task starts at a header line of the form
//
//	### T-1.1 <status-token> <task name>
//
// Until the next header, a line containing "**verification**" opens the
// task's verification block. Inside that block the indented entries
// "- mode:", "- command:", "- expected_evidence:", "- evidence_grade:" and
// "- task_type:" (values wrapped in backticks) and "- timeout_seconds: N" are
// collected. A blank line, or a line starting with "**" that does not mention
// "verification", closes the block. Outside the block a "- **状态**…" line
// overrides the status derived from the header.
//
// A read error (for example a line longer than the scanner buffer) is
// reported on stderr; the tasks parsed up to that point are still returned.
func parseTasks(f *os.File) []taskEntry {
	// Every string-valued verification field is wrapped in backticks.
	const quotedValue = "`([^`]+)`"

	// Compile all patterns once per call instead of once per scanned line.
	var (
		taskRe    = regexp.MustCompile(`^### (T-[A-Za-z0-9.-]+)\s+[^\s]+\s+(.+)`)
		statusRe  = regexp.MustCompile(`^\s*-\s+\*\*状态\*\*(.+)$`)
		timeoutRe = regexp.MustCompile(`^\s+- timeout_seconds:\s+(\d+)`)
	)
	stringFields := []struct {
		re     *regexp.Regexp
		target func(*Verification) *string
	}{
		{regexp.MustCompile(`^\s+- mode:\s+` + quotedValue), func(v *Verification) *string { return &v.Mode }},
		{regexp.MustCompile(`^\s+- command:\s+` + quotedValue), func(v *Verification) *string { return &v.Command }},
		{regexp.MustCompile(`^\s+- expected_evidence:\s+` + quotedValue), func(v *Verification) *string { return &v.ExpectedEvidence }},
		{regexp.MustCompile(`^\s+- evidence_grade:\s+` + quotedValue), func(v *Verification) *string { return &v.EvidenceGrade }},
		{regexp.MustCompile(`^\s+- task_type:\s+` + quotedValue), func(v *Verification) *string { return &v.TaskType }},
	}

	var tasks []taskEntry
	var currentTask *taskEntry
	inVerification := false

	scanner := bufio.NewScanner(f)
	// Raise the per-line limit above the 64 KiB default so a long line does
	// not abort the scan.
	scanner.Buffer(make([]byte, 0, 64*1024), 1024*1024)

	for scanner.Scan() {
		line := scanner.Text()

		// Task header, e.g. "### T-1.1 🔶 Phase 1 scope freeze".
		if m := taskRe.FindStringSubmatch(line); m != nil {
			if currentTask != nil {
				tasks = append(tasks, *currentTask)
			}
			currentTask = &taskEntry{ID: m[1], Name: m[2], Status: normalizeStatusFromText(line)}
			inVerification = false
			continue
		}
		if currentTask == nil {
			// Preamble before the first task header.
			continue
		}

		// Start of the verification block.
		if strings.Contains(line, "**verification**") {
			inVerification = true
			currentTask.HasVerification = true
			continue
		}

		if !inVerification {
			if m := statusRe.FindStringSubmatch(line); m != nil {
				currentTask.Status = normalizeStatusFromText(m[1])
			}
			continue
		}

		// String-valued fields, e.g. "  - mode: `artifact_present`".
		matchedField := false
		for _, field := range stringFields {
			if m := field.re.FindStringSubmatch(line); m != nil {
				*field.target(&currentTask.Verification) = m[1]
				matchedField = true
				break
			}
		}
		if matchedField {
			continue
		}

		if m := timeoutRe.FindStringSubmatch(line); m != nil {
			// The pattern guarantees digits, so parsing only fails on
			// overflow; in that case the timeout is left unset.
			var seconds int
			if _, err := fmt.Sscanf(m[1], "%d", &seconds); err == nil {
				currentTask.Verification.TimeoutSeconds = seconds
			}
			continue
		}

		// A blank line or a new top-level "**field**" ends the block.
		trimmed := strings.TrimSpace(line)
		if trimmed == "" || (strings.HasPrefix(trimmed, "**") && !strings.Contains(line, "verification")) {
			inVerification = false
		}
	}
	if err := scanner.Err(); err != nil {
		// Do not silently truncate the task list on a read error.
		fmt.Fprintf(os.Stderr, "read TASKS.md: %v\n", err)
	}

	if currentTask != nil {
		tasks = append(tasks, *currentTask)
	}
	return tasks
}
var (
	// evidenceRangeRe recognises an expected_evidence value that is exactly a
	// numeric range such as "[4-9]".
	evidenceRangeRe = regexp.MustCompile(`^\[(\d+)-(\d+)\]$`)
	// integerOutputRe matches output that consists of one non-negative integer.
	integerOutputRe = regexp.MustCompile(`^\s*(\d+)\s*$`)
)

// verifyTask runs the verification configured for t and returns the outcome.
//
// The decision sequence is:
//   - a task without a verification block passes trivially;
//   - an invalid verification configuration fails without running anything;
//   - an empty command passes for mode artifact_present and fails otherwise;
//   - with dryRun set, the command is recorded but not executed and the task
//     is reported as verified;
//   - otherwise the command runs through "sh -c" with a timeout (30 seconds
//     when none is configured). In mode test_pass a failed command fails the
//     task outright. If expected_evidence is set the verdict is whether the
//     standard output matches it; without it the verdict is whether the
//     command succeeded.
//
// NOTE(review): outside test_pass mode, a failed or timed-out command whose
// output still matches expected_evidence is reported as verified while
// Error/FailureClass remain set — confirm this is intended.
func verifyTask(t taskEntry, dryRun bool) TaskResult {
	r := TaskResult{TaskID: t.ID, TaskName: t.Name}
	if !t.HasVerification {
		r.Reason = "no verification block"
		r.FailureClass = "missing_verification"
		r.Verified = true // No verification = trivially pass
		return r
	}

	// t is a copy, so normalizing its fields does not affect the caller.
	t.Verification.Mode = strings.TrimSpace(t.Verification.Mode)
	t.Verification.TaskType = normalizeTaskType(t.Verification.TaskType)
	t.Verification.EvidenceGrade = normalizeEvidenceGrade(t.Verification.Mode, t.Verification.EvidenceGrade)
	r.TaskType = t.Verification.TaskType
	r.EvidenceGrade = t.Verification.EvidenceGrade

	if validationErr := validateVerification(t.Verification); validationErr != "" {
		r.Verified = false
		r.Reason = validationErr
		r.FailureClass = "verification_config_failure"
		return r
	}

	if t.Verification.Command == "" {
		if t.Verification.Mode == "artifact_present" {
			r.Verified = true
			return r
		}
		r.Reason = "verification.command is empty"
		r.FailureClass = "verification_config_failure"
		r.Verified = false
		return r
	}
	r.Command = t.Verification.Command

	if t.Verification.TimeoutSeconds == 0 {
		t.Verification.TimeoutSeconds = 30
	}

	if dryRun {
		r.Stdout = "(dry-run, command not executed)"
		r.Verified = true
		return r
	}

	ctx, cancel := context.WithTimeout(context.Background(), time.Duration(t.Verification.TimeoutSeconds)*time.Second)
	defer cancel()

	cmd := exec.CommandContext(ctx, "sh", "-c", t.Verification.Command)
	var stdout, stderr bytes.Buffer
	cmd.Stdout = &stdout
	cmd.Stderr = &stderr

	runErr := cmd.Run()
	if runErr != nil {
		// Keep the real exit status so the report shows what the command
		// actually returned instead of a constant -1.
		r.ExitCode = commandExitCode(runErr)
		r.FailureClass = "tool_execution_failure"
		if ctx.Err() == context.DeadlineExceeded {
			r.Error = fmt.Sprintf("timeout after %ds", t.Verification.TimeoutSeconds)
		} else {
			r.Error = runErr.Error()
		}
	}

	r.Stdout = stdout.String()
	r.Stderr = stderr.String()
	r.StdoutSummary = summarizeOutput(r.Stdout)
	r.StderrSummary = summarizeOutput(r.Stderr)

	// In test_pass mode a failing command can never be verified.
	if runErr != nil && t.Verification.Mode == "test_pass" {
		r.Verified = false
		return r
	}

	evidence := t.Verification.ExpectedEvidence
	switch {
	case evidence != "":
		r.Verified = matchesExpectedEvidence(evidence, r.Stdout)
		if !r.Verified {
			r.Reason = fmt.Sprintf("expected_evidence '%s' not found in output", evidence)
			r.FailureClass = "business_assertion_failure"
		}
	case runErr == nil:
		r.Verified = true
	default:
		r.Verified = false
		r.Reason = fmt.Sprintf("exit code %d", r.ExitCode)
		r.FailureClass = "tool_execution_failure"
	}
	return r
}

// commandExitCode maps the error returned by (*exec.Cmd).Run to a non-zero
// exit code: the process's own exit status when it exited unsuccessfully, and
// -1 when it was killed by a signal or could not be run at all.
func commandExitCode(err error) int {
	if exitErr, ok := err.(*exec.ExitError); ok {
		if code := exitErr.ExitCode(); code != 0 {
			return code
		}
	}
	return -1
}

// matchesExpectedEvidence reports whether stdout satisfies evidence. Evidence
// written as a numeric range "[lo-hi]" requires stdout to be a single integer
// within that inclusive range; any other evidence, including bracketed text
// such as "[OK]", is matched as a plain substring of stdout.
func matchesExpectedEvidence(evidence, stdout string) bool {
	bounds := evidenceRangeRe.FindStringSubmatch(evidence)
	if bounds == nil {
		return strings.Contains(stdout, evidence)
	}

	var lo, hi, n int
	if _, err := fmt.Sscanf(bounds[1], "%d", &lo); err != nil {
		return false
	}
	if _, err := fmt.Sscanf(bounds[2], "%d", &hi); err != nil {
		return false
	}
	number := integerOutputRe.FindStringSubmatch(strings.TrimSpace(stdout))
	if number == nil {
		return false
	}
	if _, err := fmt.Sscanf(number[1], "%d", &n); err != nil {
		return false
	}
	return n >= lo && n <= hi
}
// classifyFailureTier maps a result to a severity tier: 0 for a verified
// task, 2 for a failed task whose evidence grade is "runtime-verified", and 3
// for every other failure.
func classifyFailureTier(r TaskResult) int {
	switch {
	case r.Verified:
		return 0
	case r.EvidenceGrade == "runtime-verified":
		return 2
	default:
		return 3
	}
}
// determineProcessExitCode derives the process exit code from all results:
// 2 if any result is a tier-2 failure, otherwise 3 if any result is a tier-3
// failure, otherwise 0.
func determineProcessExitCode(results []TaskResult) int {
	code := 0
	for i := range results {
		switch classifyFailureTier(results[i]) {
		case 2:
			// Tier 2 outranks everything else, so the answer is settled.
			return 2
		case 3:
			code = 3
		}
	}
	return code
}
// normalizeEvidenceGrade returns the trimmed explicit grade when one is given.
// Otherwise it derives a default from the (trimmed) verification mode:
// test_pass → runtime-verified, artifact_present → artifact-present,
// semantic → doc-claimed. Any other mode yields the empty string.
func normalizeEvidenceGrade(mode, explicit string) string {
	if grade := strings.TrimSpace(explicit); grade != "" {
		return grade
	}
	defaultGrades := map[string]string{
		"test_pass":        "runtime-verified",
		"artifact_present": "artifact-present",
		"semantic":         "doc-claimed",
	}
	// A missing key yields "", which is the result for unknown modes.
	return defaultGrades[strings.TrimSpace(mode)]
}
// normalizeTaskType trims surrounding whitespace from raw and substitutes
// "unspecified" when nothing remains.
func normalizeTaskType(raw string) string {
	if trimmed := strings.TrimSpace(raw); trimmed != "" {
		return trimmed
	}
	return "unspecified"
}
// normalizeStatusFromText maps free-form status text (a task header line or
// the text of a status line) to one of "completed", "in_progress", "planned",
// "paused" or "unknown".
//
// Status icons are checked before keywords. The caller passes whole header
// lines, task name included, so a keyword that merely occurs in the task name
// or in a parenthetical remark must not override the explicit icon.
func normalizeStatusFromText(raw string) string {
	// Pass 1: explicit status icons.
	switch {
	case strings.Contains(raw, "✅"):
		return "completed"
	case strings.Contains(raw, "🟡"):
		return "in_progress"
	case strings.Contains(raw, "🔶"), strings.Contains(raw, "🔴"):
		return "planned"
	case strings.Contains(raw, "\u23f8"):
		// U+23F8 is the pause sign; matching the bare code point accepts it
		// with or without the trailing U+FE0F emoji variation selector.
		return "paused"
	}

	// Pass 2: status keywords, used only when no icon is present.
	switch {
	case strings.Contains(raw, "完成"):
		return "completed"
	case strings.Contains(raw, "进行中"):
		return "in_progress"
	case strings.Contains(raw, "待启动"), strings.Contains(raw, "未开始"):
		return "planned"
	case strings.Contains(raw, "待规划"), strings.Contains(raw, "暂停"):
		return "paused"
	}
	return "unknown"
}
// filterTasksByStatus returns the tasks whose normalized status equals filter.
//
// filter is trimmed first; an empty filter and "all" both return tasks
// unchanged. A task with an empty status is treated as "unknown". Any filter
// other than all, completed, in_progress, planned, paused or unknown yields an
// error and a nil slice.
func filterTasksByStatus(tasks []taskEntry, filter string) ([]taskEntry, error) {
	wanted := strings.TrimSpace(filter)
	if wanted == "" {
		wanted = "all"
	}

	switch wanted {
	case "all":
		return tasks, nil
	case "completed", "in_progress", "planned", "paused", "unknown":
		// Supported status; fall through to the filtering below.
	default:
		return nil, fmt.Errorf("unsupported status filter: %s", wanted)
	}

	matching := make([]taskEntry, 0, len(tasks))
	for i := range tasks {
		status := tasks[i].Status
		if status == "" {
			status = "unknown"
		}
		if status == wanted {
			matching = append(matching, tasks[i])
		}
	}
	return matching, nil
}
// summarizeOutput condenses command output to a single line for the report:
// runs of whitespace (including newlines) collapse to one space, and text
// longer than 220 bytes is cut and suffixed with "...".
//
// The cut is made on a rune boundary at or before byte 220, so multi-byte
// UTF-8 characters are never split and the summary stays valid UTF-8.
func summarizeOutput(raw string) string {
	cleaned := strings.Join(strings.Fields(raw), " ")
	const limit = 220
	if len(cleaned) <= limit {
		return cleaned
	}

	// Ranging over a string yields the byte offset of each rune start; keep
	// the last one that does not exceed the limit.
	cut := 0
	for i := range cleaned {
		if i > limit {
			break
		}
		cut = i
	}
	return cleaned[:cut] + "..."
}
// validateVerification checks a verification configuration and returns a
// description of the first problem found, or "" when it is acceptable.
//
// Checks, in order:
//   - Mode must be test_pass, artifact_present or semantic;
//   - EvidenceGrade, when set, must be runtime-verified, artifact-present or
//     doc-claimed;
//   - TaskType must be one of the known task types;
//   - code and automation tasks may not use semantic mode;
//   - artifact_present may carry neither a command nor expected evidence and
//     is not allowed for code, automation, data or analysis tasks.
func validateVerification(v Verification) string {
	switch v.Mode {
	case "test_pass", "artifact_present", "semantic":
	default:
		return fmt.Sprintf("unsupported verification mode: %s", v.Mode)
	}

	switch v.EvidenceGrade {
	case "", "runtime-verified", "artifact-present", "doc-claimed":
	default:
		return fmt.Sprintf("unsupported evidence grade: %s", v.EvidenceGrade)
	}

	switch v.TaskType {
	case "unspecified", "code", "automation", "documentation", "configuration", "data", "analysis":
	default:
		return fmt.Sprintf("unsupported task type: %s", v.TaskType)
	}

	executableTask := v.TaskType == "code" || v.TaskType == "automation"
	if executableTask && v.Mode == "semantic" {
		return fmt.Sprintf("semantic-only verification is not allowed for %s tasks", v.TaskType)
	}

	if v.Mode != "artifact_present" {
		return ""
	}
	hasCommand := strings.TrimSpace(v.Command) != ""
	hasEvidence := strings.TrimSpace(v.ExpectedEvidence) != ""
	if hasCommand || hasEvidence {
		return "artifact_present does not allow command or expected_evidence; use test_pass for executable verification"
	}
	if executableTask || v.TaskType == "data" || v.TaskType == "analysis" {
		return fmt.Sprintf("artifact_present is not allowed for %s tasks", v.TaskType)
	}
	return ""
}