Files
llm-intelligence/scripts/verification_executor.go
2026-05-29 18:48:48 +08:00

592 lines
15 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
// verification_executor.go
// Reads TASKS.md, runs each task's verification.command,
// matches expected_evidence, outputs pass/fail report.
//
// Usage: go run scripts/verification_executor.go [--dry-run] [--task T-Q2-1.1] [--tasks PATH] [--status all|completed|in_progress|planned|paused|unknown] [--completed-only]
//go:build llm_script
package main
import (
"bufio"
"bytes"
"context"
"flag"
"fmt"
"os"
"os/exec"
"path/filepath"
"regexp"
"runtime"
"strings"
"time"
)
// Verification holds the fields read from a task's **verification** block in
// TASKS.md.
type Verification struct {
	// Mode is the verification mode name, Command the shell command to run,
	// and ExpectedEvidence the text (or "[lo-hi]" range) looked for in stdout.
	Mode, Command, ExpectedEvidence string
	// TimeoutSeconds bounds the command run time; verifyTask substitutes 30
	// when it is 0.
	TimeoutSeconds int
	// EvidenceGrade and TaskType are optional classification labels.
	EvidenceGrade, TaskType string
}
// TaskResult is the outcome of verifying a single task, as produced by
// verifyTask and printed by main.
type TaskResult struct {
	// TaskID and TaskName identify the task the result belongs to.
	TaskID, TaskName string
	// Verified is true when the task counts as passed.
	Verified bool
	// Command is the verification command (empty when none was run).
	Command string
	// ExitCode is 0 unless running the command returned an error.
	ExitCode int
	// Stdout and Stderr hold the full captured output of the command.
	Stdout, Stderr string
	// StdoutSummary and StderrSummary are the summarizeOutput forms of the
	// captured output.
	StdoutSummary, StderrSummary string
	// Error describes an execution error, Reason a verification failure, and
	// FailureClass labels the kind of failure.
	Error, Reason, FailureClass string
	// EvidenceGrade and TaskType are copied from the normalized verification
	// settings.
	EvidenceGrade, TaskType string
}
// main parses the command-line flags, loads the tasks from TASKS.md, applies
// the optional task-ID and status filters, verifies every remaining task, and
// prints a per-task report followed by a summary. The process exit code is
// the value returned by determineProcessExitCode (1 is used for setup errors).
func main() {
	var (
		dryRun, completedOnly              bool
		taskID, tasksPathArg, statusFilter string
	)
	flag.BoolVar(&dryRun, "dry-run", false, "print commands without executing")
	flag.StringVar(&taskID, "task", "", "filter by task ID (e.g. T-Q2-1.1)")
	flag.StringVar(&tasksPathArg, "tasks", "", "path to TASKS.md")
	flag.StringVar(&statusFilter, "status", "all", "filter by normalized status: all|completed|in_progress|planned|paused|unknown")
	flag.BoolVar(&completedOnly, "completed-only", false, "shortcut for --status completed")
	flag.Parse()

	tasksPath := resolveTasksPath(tasksPathArg)
	file, err := os.Open(tasksPath)
	if err != nil {
		fmt.Fprintf(os.Stderr, "open TASKS.md: %v\n", err)
		os.Exit(1)
	}
	defer file.Close()

	tasks := parseTasks(file)
	if taskID != "" {
		var only []taskEntry
		for i := range tasks {
			if tasks[i].ID == taskID {
				only = append(only, tasks[i])
			}
		}
		tasks = only
	}

	// --completed-only overrides whatever --status was given.
	if completedOnly {
		statusFilter = "completed"
	}
	tasks, err = filterTasksByStatus(tasks, statusFilter)
	if err != nil {
		fmt.Fprintf(os.Stderr, "filter tasks: %v\n", err)
		os.Exit(1)
	}

	fmt.Printf("=== Verification Report (%s) ===\n", time.Now().Format("2006-01-02 15:04"))
	fmt.Printf("Tasks checked: %d | Dry-run: %v | Status: %s | TASKS: %s\n\n", len(tasks), dryRun, statusFilter, tasksPath)

	// Run every verification before printing any per-task line.
	results := make([]TaskResult, 0, len(tasks))
	for _, task := range tasks {
		results = append(results, verifyTask(task, dryRun))
	}

	passed, failed := 0, 0
	for _, res := range results {
		icon := "❌"
		if res.Verified {
			icon = "✅"
			passed++
		} else {
			failed++
		}
		fmt.Printf("%s [%s] %s\n", icon, res.TaskID, res.TaskName)
		if res.Command != "" {
			fmt.Printf(" cmd: %s\n", res.Command)
		}
		if res.EvidenceGrade != "" || res.TaskType != "" {
			fmt.Printf(" grade: %s | type: %s\n", res.EvidenceGrade, res.TaskType)
		}
		if res.StderrSummary != "" {
			fmt.Printf(" stderr: %s\n", res.StderrSummary)
		}
		// The stdout summary is shown only when there is something to explain.
		needsStdout := !res.Verified || res.Reason != "" || res.Error != ""
		if needsStdout && res.StdoutSummary != "" {
			fmt.Printf(" stdout: %s\n", res.StdoutSummary)
		}
		if res.FailureClass != "" {
			fmt.Printf(" class: %s\n", res.FailureClass)
		}
		// At most one detail line: error, then raw output, then reason.
		switch {
		case res.Error != "":
			fmt.Printf(" ERROR: %s\n", res.Error)
		case res.ExitCode != 0 && res.Stdout != "":
			fmt.Printf(" output: %s\n", strings.TrimSpace(res.Stdout))
		case res.Reason != "":
			fmt.Printf(" reason: %s\n", res.Reason)
		}
	}

	fmt.Printf("\n=== Summary: %d passed, %d failed ===\n", passed, failed)
	os.Exit(determineProcessExitCode(results))
}
// resolveTasksPath picks the TASKS.md path for this run. It gathers the
// process context (the TASKS_PATH environment variable, the working
// directory, and the directory of this source file as reported by
// runtime.Caller) and delegates the decision to resolveTasksPathWithContext.
// A context value that cannot be determined is passed as an empty string.
func resolveTasksPath(flagValue string) string {
	var wd, sourceDir string
	if dir, err := os.Getwd(); err == nil {
		wd = dir
	}
	if _, file, _, ok := runtime.Caller(0); ok {
		sourceDir = filepath.Dir(file)
	}
	return resolveTasksPathWithContext(
		flagValue,
		os.Getenv("TASKS_PATH"),
		wd,
		sourceDir,
		"/home/long/.openclaw/workspace/TASKS.md",
	)
}
// resolveTasksPathWithContext returns the path of the TASKS.md file to use.
//
// It first probes, in order, flagValue, envValue, wd/TASKS.md,
// wd/../TASKS.md and sourceDir/../TASKS.md (empty inputs and duplicate
// cleaned paths are skipped) and returns the first cleaned path for which
// os.Stat succeeds. When none exists it falls back to the first non-empty of:
// flagValue, envValue, sourceDir/../TASKS.md, wd/TASKS.md, globalTasksPath,
// and finally the literal "TASKS.md". Every returned path is cleaned.
func resolveTasksPathWithContext(flagValue, envValue, wd, sourceDir, globalTasksPath string) string {
	var defaultProjectTasks string
	if sourceDir != "" {
		defaultProjectTasks = filepath.Join(sourceDir, "..", "TASKS.md")
	}

	// Empty entries are allowed here; the probe loop skips them.
	probes := make([]string, 0, 5)
	probes = append(probes, flagValue, envValue)
	if wd != "" {
		probes = append(probes, filepath.Join(wd, "TASKS.md"), filepath.Join(wd, "..", "TASKS.md"))
	}
	probes = append(probes, defaultProjectTasks)

	visited := make(map[string]bool, len(probes))
	for _, probe := range probes {
		if probe == "" {
			continue
		}
		path := filepath.Clean(probe)
		if visited[path] {
			continue
		}
		visited[path] = true
		if _, err := os.Stat(path); err == nil {
			return path
		}
	}

	// Nothing exists on disk: return the most specific configured location.
	switch {
	case flagValue != "":
		return filepath.Clean(flagValue)
	case envValue != "":
		return filepath.Clean(envValue)
	case defaultProjectTasks != "":
		return filepath.Clean(defaultProjectTasks)
	case wd != "":
		return filepath.Clean(filepath.Join(wd, "TASKS.md"))
	case globalTasksPath != "":
		return filepath.Clean(globalTasksPath)
	}
	return "TASKS.md"
}
// taskEntry is one task parsed from TASKS.md.
type taskEntry struct {
	// ID and Name come from the "### T-..." header line; Status is the
	// normalized status derived from the header or a status field.
	ID, Name, Status string
	// Verification holds the parsed verification settings.
	Verification Verification
	// HasVerification records whether a **verification** marker was seen.
	HasVerification bool
}
// parseTasks reads TASKS.md from f line by line and returns the tasks found.
//
// A task starts at a header of the form "### T-<id> <marker> <name>". Its
// status is first derived from the header line and may be overridden by a
// "- **状态**..." field. A line containing "**verification**" opens the
// verification block, whose indented "- key: `value`" fields (mode, command,
// expected_evidence, evidence_grade, task_type) and "- timeout_seconds: N"
// field are stored in the task's Verification.
//
// The verification block ends at a blank line or at the next top-level field,
// written either as "**field**" or as the list item "- **field**". A
// top-level field line that ends the block is still examined for a status
// value, so a status field placed after the verification block is honored.
//
// Lines before the first task header are ignored. If reading f fails, the
// error is reported on stderr and the tasks parsed so far are returned.
func parseTasks(f *os.File) []taskEntry {
	// All patterns are compiled once per call rather than once per line.
	const tick = "`"
	quoted := func(key string) *regexp.Regexp {
		return regexp.MustCompile(`^\s+- ` + key + `:\s+` + tick + `([^` + tick + `]+)` + tick)
	}
	// Match task header: ### T-1.1 🔶 Phase 1 范围冻结
	taskRe := regexp.MustCompile(`^### (T-[A-Za-z0-9.-]+)\s+[^\s]+\s+(.+)`)
	statusRe := regexp.MustCompile(`^\s*-\s+\*\*状态\*\*(.+)$`)
	timeoutRe := regexp.MustCompile(`^\s+- timeout_seconds:\s+(\d+)`)
	// Backtick-quoted verification fields, e.g.  - mode: `artifact_present`
	stringFields := []struct {
		re  *regexp.Regexp
		set func(v *Verification, value string)
	}{
		{quoted("mode"), func(v *Verification, value string) { v.Mode = value }},
		{quoted("command"), func(v *Verification, value string) { v.Command = value }},
		{quoted("expected_evidence"), func(v *Verification, value string) { v.ExpectedEvidence = value }},
		{quoted("evidence_grade"), func(v *Verification, value string) { v.EvidenceGrade = value }},
		{quoted("task_type"), func(v *Verification, value string) { v.TaskType = value }},
	}

	var tasks []taskEntry
	var currentTask *taskEntry
	inVerification := false

	scanner := bufio.NewScanner(f)
	for scanner.Scan() {
		line := scanner.Text()

		if m := taskRe.FindStringSubmatch(line); m != nil {
			if currentTask != nil {
				tasks = append(tasks, *currentTask)
			}
			currentTask = &taskEntry{ID: m[1], Name: m[2], Status: normalizeStatusFromText(line)}
			inVerification = false
			continue
		}
		if currentTask == nil {
			continue
		}

		if strings.Contains(line, "**verification**") {
			inVerification = true
			currentTask.HasVerification = true
			continue
		}

		if inVerification {
			matched := false
			for _, field := range stringFields {
				if m := field.re.FindStringSubmatch(line); m != nil {
					field.set(&currentTask.Verification, m[1])
					matched = true
					break
				}
			}
			if matched {
				continue
			}
			if m := timeoutRe.FindStringSubmatch(line); m != nil {
				// Only store the timeout when the digits parse cleanly (an
				// out-of-range number leaves the default in place).
				var seconds int
				if _, err := fmt.Sscanf(m[1], "%d", &seconds); err == nil {
					currentTask.Verification.TimeoutSeconds = seconds
				}
				continue
			}

			trimmed := strings.TrimSpace(line)
			if trimmed == "" {
				inVerification = false
				continue
			}
			topLevelField := strings.HasPrefix(trimmed, "**") || strings.HasPrefix(trimmed, "- **")
			if !topLevelField || strings.Contains(line, "verification") {
				// Unrecognized content inside the verification block.
				continue
			}
			// A new top-level field closes the block; fall through so the
			// line itself is still checked for a status value.
			inVerification = false
		}

		if m := statusRe.FindStringSubmatch(line); m != nil {
			currentTask.Status = normalizeStatusFromText(m[1])
		}
	}
	if err := scanner.Err(); err != nil {
		// The signature has no error return, so surface the problem instead
		// of silently returning a truncated task list.
		fmt.Fprintf(os.Stderr, "read TASKS.md: %v\n", err)
	}

	if currentTask != nil {
		tasks = append(tasks, *currentTask)
	}
	return tasks
}
// verifyTask runs the verification configured for task t and returns the
// outcome.
//
//   - A task without a verification block passes trivially (Reason and
//     FailureClass still note that the block is missing).
//   - The verification settings are normalized and validated; a validation
//     error fails the task with class "verification_config_failure".
//   - An empty command passes for mode "artifact_present" and is a
//     configuration failure otherwise.
//   - With dryRun set, the command is recorded but not executed and the task
//     passes.
//   - Otherwise the command is run through "sh -c" with a timeout of
//     TimeoutSeconds (30 when unset). A run error sets ExitCode to the
//     process exit code when one is available and to -1 otherwise (for
//     example on timeout or when the command could not be started).
//   - In mode "test_pass" a non-zero exit code fails the task immediately.
//     Otherwise, when expected_evidence is set, the task passes only if
//     stdout satisfies it (see evidenceMatches); without expected_evidence
//     the task passes exactly when the exit code is 0.
func verifyTask(t taskEntry, dryRun bool) TaskResult {
	r := TaskResult{TaskID: t.ID, TaskName: t.Name}
	if !t.HasVerification {
		r.Reason = "no verification block"
		r.FailureClass = "missing_verification"
		r.Verified = true // No verification = trivially pass
		return r
	}

	v := t.Verification
	v.Mode = strings.TrimSpace(v.Mode)
	v.TaskType = normalizeTaskType(v.TaskType)
	v.EvidenceGrade = normalizeEvidenceGrade(v.Mode, v.EvidenceGrade)
	r.TaskType = v.TaskType
	r.EvidenceGrade = v.EvidenceGrade

	if validationErr := validateVerification(v); validationErr != "" {
		r.Verified = false
		r.Reason = validationErr
		r.FailureClass = "verification_config_failure"
		return r
	}
	if v.Command == "" {
		if v.Mode == "artifact_present" {
			r.Verified = true
			return r
		}
		r.Reason = "verification.command is empty"
		r.FailureClass = "verification_config_failure"
		r.Verified = false
		return r
	}

	r.Command = v.Command
	if v.TimeoutSeconds == 0 {
		v.TimeoutSeconds = 30
	}
	if dryRun {
		r.Stdout = "(dry-run, command not executed)"
		r.Verified = true
		return r
	}

	ctx, cancel := context.WithTimeout(context.Background(), time.Duration(v.TimeoutSeconds)*time.Second)
	defer cancel()
	cmd := exec.CommandContext(ctx, "sh", "-c", v.Command)
	var stdout, stderr bytes.Buffer
	cmd.Stdout = &stdout
	cmd.Stderr = &stderr

	if err := cmd.Run(); err != nil {
		// Keep the real exit status when the process ran and exited; -1
		// covers start failures and processes that were killed.
		r.ExitCode = -1
		if exitErr, ok := err.(*exec.ExitError); ok {
			r.ExitCode = exitErr.ExitCode()
		}
		r.FailureClass = "tool_execution_failure"
		if ctx.Err() == context.DeadlineExceeded {
			r.Error = fmt.Sprintf("timeout after %ds", v.TimeoutSeconds)
		} else {
			r.Error = err.Error()
		}
	}
	r.Stdout = stdout.String()
	r.Stderr = stderr.String()
	r.StdoutSummary = summarizeOutput(r.Stdout)
	r.StderrSummary = summarizeOutput(r.Stderr)

	if r.ExitCode != 0 && v.Mode == "test_pass" {
		r.Verified = false
		return r
	}

	switch {
	case v.ExpectedEvidence != "":
		r.Verified = evidenceMatches(v.ExpectedEvidence, r.Stdout)
		if !r.Verified {
			r.Reason = fmt.Sprintf("expected_evidence '%s' not found in output", v.ExpectedEvidence)
			r.FailureClass = "business_assertion_failure"
		}
	case r.ExitCode == 0:
		r.Verified = true
	default:
		r.Verified = false
		r.Reason = fmt.Sprintf("exit code %d", r.ExitCode)
		r.FailureClass = "tool_execution_failure"
	}
	return r
}

// evidenceMatches reports whether stdout satisfies the expected evidence.
//
// Evidence that is exactly a numeric range such as "[4-9]" requires the
// whitespace-trimmed stdout to be a single non-negative integer within the
// inclusive range. Any other evidence, including bracketed text such as
// "[PASS]", is treated as a literal substring of stdout.
func evidenceMatches(evidence, stdout string) bool {
	bounds := regexp.MustCompile(`^\[(\d+)-(\d+)\]$`).FindStringSubmatch(evidence)
	if bounds == nil {
		return strings.Contains(stdout, evidence)
	}
	number := regexp.MustCompile(`^\d+$`).FindString(strings.TrimSpace(stdout))
	if number == "" {
		return false
	}
	var lo, hi, n int
	for _, item := range []struct {
		text string
		dst  *int
	}{{bounds[1], &lo}, {bounds[2], &hi}, {number, &n}} {
		// A number too large for int cannot be compared meaningfully.
		if _, err := fmt.Sscanf(item.text, "%d", item.dst); err != nil {
			return false
		}
	}
	return lo <= n && n <= hi
}
// classifyFailureTier maps a result to a failure tier: 0 for a verified
// task, 2 for a failed task whose evidence grade is "runtime-verified", and
// 3 for every other failed task.
func classifyFailureTier(r TaskResult) int {
	switch {
	case r.Verified:
		return 0
	case r.EvidenceGrade == "runtime-verified":
		return 2
	default:
		return 3
	}
}
// determineProcessExitCode derives the process exit code from all results:
// 2 if any result is a tier-2 failure, otherwise 3 if any result is a tier-3
// failure, otherwise 0.
func determineProcessExitCode(results []TaskResult) int {
	code := 0
	for i := range results {
		switch classifyFailureTier(results[i]) {
		case 2:
			// Tier 2 takes precedence over everything else.
			return 2
		case 3:
			code = 3
		}
	}
	return code
}
// normalizeEvidenceGrade returns the evidence grade to use for a task. A
// non-blank explicit grade is returned trimmed. Otherwise the grade is
// derived from the trimmed mode: "test_pass" gives "runtime-verified",
// "artifact_present" gives "artifact-present", "semantic" gives
// "doc-claimed", and any other mode gives the empty string.
func normalizeEvidenceGrade(mode, explicit string) string {
	if grade := strings.TrimSpace(explicit); grade != "" {
		return grade
	}
	defaultGrades := map[string]string{
		"test_pass":        "runtime-verified",
		"artifact_present": "artifact-present",
		"semantic":         "doc-claimed",
	}
	// A missing key yields "", which is the grade for unknown modes.
	return defaultGrades[strings.TrimSpace(mode)]
}
// normalizeTaskType trims raw and returns it, substituting "unspecified"
// when nothing remains.
func normalizeTaskType(raw string) string {
	if trimmed := strings.TrimSpace(raw); trimmed != "" {
		return trimmed
	}
	return "unspecified"
}
// normalizeStatusFromText maps free-form status text (a task header line or
// the value of a status field) to one of the normalized statuses
// "completed", "in_progress", "planned", "paused" or "unknown".
//
// The text is searched for known emoji and Chinese status words; the groups
// are tried in the order completed, in_progress, planned, paused, and the
// first group with a marker present wins. Text without any marker (including
// empty text) is "unknown".
func normalizeStatusFromText(raw string) string {
	groups := []struct {
		status  string
		markers []string
	}{
		{"completed", []string{"✅", "完成"}},
		{"in_progress", []string{"🟡", "进行中"}},
		{"planned", []string{"🔶", "🔴", "待启动", "未开始"}},
		// The pause marker is matched on its base code point U+23F8 so that
		// it is recognized both with and without the trailing emoji
		// variation selector (U+FE0F).
		{"paused", []string{"\u23f8", "待规划", "暂停"}},
	}
	for _, group := range groups {
		for _, marker := range group.markers {
			if strings.Contains(raw, marker) {
				return group.status
			}
		}
	}
	return "unknown"
}
// filterTasksByStatus returns the tasks whose normalized status equals
// filter. The filter is trimmed first; an empty filter or "all" returns
// tasks unchanged. A task with an empty Status is treated as "unknown". Any
// filter other than all, completed, in_progress, planned, paused or unknown
// yields a nil slice and an error.
func filterTasksByStatus(tasks []taskEntry, filter string) ([]taskEntry, error) {
	wanted := strings.TrimSpace(filter)
	switch wanted {
	case "", "all":
		return tasks, nil
	case "completed", "in_progress", "planned", "paused", "unknown":
		// Supported status; select the matching tasks below.
	default:
		return nil, fmt.Errorf("unsupported status filter: %s", wanted)
	}

	selected := make([]taskEntry, 0, len(tasks))
	for i := range tasks {
		status := tasks[i].Status
		if status == "" {
			status = "unknown"
		}
		if status != wanted {
			continue
		}
		selected = append(selected, tasks[i])
	}
	return selected, nil
}
// summarizeOutput condenses command output to a single line for the report.
// Runs of whitespace (including newlines) are collapsed to single spaces and
// surrounding whitespace is removed. A result longer than 220 bytes is cut
// to at most 220 bytes and suffixed with "..."; the cut is moved back to a
// character boundary so a multi-byte UTF-8 character is never split.
// Blank input yields the empty string.
func summarizeOutput(raw string) string {
	const limit = 220
	cleaned := strings.Join(strings.Fields(raw), " ")
	if len(cleaned) <= limit {
		return cleaned
	}
	// Ranging over a string yields the byte offset at which each character
	// starts; keep the last such offset that does not exceed the limit.
	cut := 0
	for offset := range cleaned {
		if offset > limit {
			break
		}
		cut = offset
	}
	return cleaned[:cut] + "..."
}
// validateVerification checks a verification configuration and returns a
// description of the first problem found, or the empty string when the
// configuration is acceptable. The checks run in this order:
//
//  1. Mode must be test_pass, artifact_present or semantic.
//  2. EvidenceGrade, when set, must be runtime-verified, artifact-present or
//     doc-claimed.
//  3. TaskType must be unspecified, code, automation, documentation,
//     configuration, data or analysis.
//  4. code and automation tasks may not use the semantic mode.
//  5. artifact_present may carry neither a command nor expected evidence, and
//     is not allowed for code, automation, data or analysis tasks.
func validateVerification(v Verification) string {
	switch v.Mode {
	case "test_pass", "artifact_present", "semantic":
	default:
		return fmt.Sprintf("unsupported verification mode: %s", v.Mode)
	}

	switch v.EvidenceGrade {
	case "", "runtime-verified", "artifact-present", "doc-claimed":
	default:
		return fmt.Sprintf("unsupported evidence grade: %s", v.EvidenceGrade)
	}

	switch v.TaskType {
	case "unspecified", "code", "automation", "documentation", "configuration", "data", "analysis":
	default:
		return fmt.Sprintf("unsupported task type: %s", v.TaskType)
	}

	executableTask := v.TaskType == "code" || v.TaskType == "automation"
	if executableTask && v.Mode == "semantic" {
		return fmt.Sprintf("semantic-only verification is not allowed for %s tasks", v.TaskType)
	}

	if v.Mode != "artifact_present" {
		return ""
	}
	hasCommand := strings.TrimSpace(v.Command) != ""
	hasEvidence := strings.TrimSpace(v.ExpectedEvidence) != ""
	if hasCommand || hasEvidence {
		return "artifact_present does not allow command or expected_evidence; use test_pass for executable verification"
	}
	if executableTask || v.TaskType == "data" || v.TaskType == "analysis" {
		return fmt.Sprintf("artifact_present is not allowed for %s tasks", v.TaskType)
	}
	return ""
}