// llm-intelligence/scripts/verification_executor.go

// verification_executor.go
// Reads TASKS.md, runs each task's verification.command,
// matches expected_evidence, outputs pass/fail report.
//
// Usage: go run scripts/verification_executor.go [--dry-run] [--task T-Q2-1.1] [--tasks PATH] [--status STATUS] [--completed-only]
//go:build llm_script

package main

import (
	"bufio"
	"bytes"
	"context"
	"flag"
	"fmt"
	"os"
	"os/exec"
	"path/filepath"
	"regexp"
	"runtime"
	"strings"
	"time"
)

// Verification holds the settings read from a task's **verification** block
// in TASKS.md.
type Verification struct {
	// Mode is the verification mode; validateVerification accepts
	// "test_pass", "artifact_present" and "semantic".
	// Command is the shell command verifyTask runs through `sh -c`.
	// ExpectedEvidence is what verifyTask looks for in the command's stdout.
	Mode, Command, ExpectedEvidence string

	// TimeoutSeconds limits how long the command may run; verifyTask uses 30
	// when it is 0.
	TimeoutSeconds int

	// EvidenceGrade is the declared strength of the evidence; when empty,
	// normalizeEvidenceGrade derives it from Mode.
	// TaskType is the kind of task; normalizeTaskType maps an empty value to
	// "unspecified".
	EvidenceGrade, TaskType string
}

// TaskResult is the outcome of verifying a single task, as produced by
// verifyTask and printed by main.
type TaskResult struct {
	// TaskID and TaskName identify the task in the report.
	TaskID, TaskName string

	// Verified reports whether the task passed verification.
	Verified bool

	// Command is the verification command, when one was configured.
	Command string

	// ExitCode is 0 when the command succeeded (or was never run) and
	// non-zero when running it failed.
	ExitCode int

	// Stdout and Stderr hold the raw command output.
	Stdout, Stderr string

	// StdoutSummary and StderrSummary are the summarizeOutput forms of the
	// raw output, used for the one-line report entries.
	StdoutSummary, StderrSummary string

	// Error describes a failure to run the command (including a timeout).
	// Reason explains a verdict that did not come from an execution error.
	// FailureClass is a coarse category such as "tool_execution_failure" or
	// "business_assertion_failure".
	Error, Reason, FailureClass string

	// EvidenceGrade and TaskType are the normalized values taken from the
	// task's verification settings.
	EvidenceGrade, TaskType string
}


// main parses the command-line flags, loads the tasks from TASKS.md, verifies
// every selected task and prints a pass/fail report to stdout.
//
// Exit status: 1 when TASKS.md cannot be opened or the status filter is
// invalid; otherwise the value returned by determineProcessExitCode for the
// collected results.
func main() {
	dryRun := flag.Bool("dry-run", false, "print commands without executing")
	taskFilter := flag.String("task", "", "filter by task ID (e.g. T-Q2-1.1)")
	tasksPathFlag := flag.String("tasks", "", "path to TASKS.md")
	statusFilter := flag.String("status", "all", "filter by normalized status: all|completed|in_progress|planned|paused|unknown")
	completedOnly := flag.Bool("completed-only", false, "shortcut for --status completed")
	flag.Parse()

	tasksPath := resolveTasksPath(*tasksPathFlag)

	f, err := os.Open(tasksPath)
	if err != nil {
		fmt.Fprintf(os.Stderr, "open TASKS.md: %v\n", err)
		os.Exit(1)
	}
	tasks := parseTasks(f)
	// Close explicitly instead of deferring: every way out of main goes
	// through os.Exit, which does not run deferred calls. The handle is
	// read-only, so a close error cannot lose data and is ignored.
	_ = f.Close()

	if *taskFilter != "" {
		var filtered []taskEntry
		for _, t := range tasks {
			if t.ID == *taskFilter {
				filtered = append(filtered, t)
			}
		}
		tasks = filtered
	}

	// --completed-only takes precedence over whatever --status says.
	effectiveStatus := *statusFilter
	if *completedOnly {
		effectiveStatus = "completed"
	}
	tasks, err = filterTasksByStatus(tasks, effectiveStatus)
	if err != nil {
		fmt.Fprintf(os.Stderr, "filter tasks: %v\n", err)
		os.Exit(1)
	}

	fmt.Printf("=== Verification Report (%s) ===\n", time.Now().Format("2006-01-02 15:04"))
	fmt.Printf("Tasks checked: %d | Dry-run: %v | Status: %s | TASKS: %s\n\n", len(tasks), *dryRun, effectiveStatus, tasksPath)

	var passed, failed int
	results := make([]TaskResult, 0, len(tasks))

	// Run every verification first so command execution is not interleaved
	// with the report lines printed below.
	for _, t := range tasks {
		r := verifyTask(t, *dryRun)
		results = append(results, r)
		if r.Verified {
			passed++
		} else {
			failed++
		}
	}

	for _, r := range results {
		icon := "✅"
		if !r.Verified {
			icon = "❌"
		}
		fmt.Printf("%s [%s] %s\n", icon, r.TaskID, r.TaskName)
		if r.Command != "" {
			fmt.Printf("   cmd: %s\n", r.Command)
		}
		if r.EvidenceGrade != "" || r.TaskType != "" {
			fmt.Printf("   grade: %s | type: %s\n", r.EvidenceGrade, r.TaskType)
		}
		if r.StderrSummary != "" {
			fmt.Printf("   stderr: %s\n", r.StderrSummary)
		}
		// The stdout summary is only shown when there is something to explain.
		if r.StdoutSummary != "" && (!r.Verified || r.Reason != "" || r.Error != "") {
			fmt.Printf("   stdout: %s\n", r.StdoutSummary)
		}
		if r.FailureClass != "" {
			fmt.Printf("   class: %s\n", r.FailureClass)
		}
		// At most one detail line: error, then raw output, then reason.
		switch {
		case r.Error != "":
			fmt.Printf("   ERROR: %s\n", r.Error)
		case r.ExitCode != 0 && r.Stdout != "":
			fmt.Printf("   output: %s\n", strings.TrimSpace(r.Stdout))
		case r.Reason != "":
			fmt.Printf("   reason: %s\n", r.Reason)
		}
	}

	fmt.Printf("\n=== Summary: %d passed, %d failed ===\n", passed, failed)
	os.Exit(determineProcessExitCode(results))
}


// resolveTasksPath picks the TASKS.md path to use. It gathers the process
// context (the TASKS_PATH environment variable, the working directory and the
// directory of this source file as reported by runtime.Caller) and hands the
// decision to resolveTasksPathWithContext.
//
// flagValue is the value of the --tasks flag; it may be empty.
func resolveTasksPath(flagValue string) string {
	const globalTasksPath = "/home/long/.openclaw/workspace/TASKS.md"

	// Both lookups are best effort: on failure the value stays empty and the
	// resolver simply skips the candidates derived from it.
	var wd, sourceDir string
	if dir, err := os.Getwd(); err == nil {
		wd = dir
	}
	if _, file, _, ok := runtime.Caller(0); ok {
		sourceDir = filepath.Dir(file)
	}

	return resolveTasksPathWithContext(flagValue, os.Getenv("TASKS_PATH"), wd, sourceDir, globalTasksPath)
}

// resolveTasksPathWithContext chooses the TASKS.md path from explicit inputs.
//
// It first probes, in order, flagValue, envValue, <wd>/TASKS.md,
// <wd>/../TASKS.md and <sourceDir>/../TASKS.md, and returns the first cleaned
// path that os.Stat can reach. Empty inputs are skipped and a path is never
// probed twice.
//
// If nothing exists it returns the first non-empty of: flagValue, envValue,
// <sourceDir>/../TASKS.md, <wd>/TASKS.md, globalTasksPath (cleaned), and
// finally the literal "TASKS.md". globalTasksPath is only used as this last
// fallback; it is never probed for existence.
func resolveTasksPathWithContext(flagValue, envValue, wd, sourceDir, globalTasksPath string) string {
	projectTasks := ""
	if sourceDir != "" {
		projectTasks = filepath.Join(sourceDir, "..", "TASKS.md")
	}

	probes := make([]string, 0, 5)
	probes = append(probes, flagValue, envValue)
	if wd != "" {
		probes = append(probes, filepath.Join(wd, "TASKS.md"), filepath.Join(wd, "..", "TASKS.md"))
	}
	probes = append(probes, projectTasks)

	visited := make(map[string]bool, len(probes))
	for _, p := range probes {
		if p == "" {
			continue
		}
		p = filepath.Clean(p)
		if visited[p] {
			continue
		}
		visited[p] = true
		if _, err := os.Stat(p); err == nil {
			return p
		}
	}

	// Nothing exists on disk: fall back by priority so the caller's error
	// message names the most specific path the user asked for.
	fallbacks := make([]string, 0, 5)
	fallbacks = append(fallbacks, flagValue, envValue, projectTasks)
	if wd != "" {
		fallbacks = append(fallbacks, filepath.Join(wd, "TASKS.md"))
	}
	fallbacks = append(fallbacks, globalTasksPath)
	for _, fb := range fallbacks {
		if fb != "" {
			return filepath.Clean(fb)
		}
	}
	return "TASKS.md"
}

// taskEntry is one task parsed from TASKS.md.
type taskEntry struct {
	// ID and Name come from the "### T-..." header line; Status is the
	// normalized status ("completed", "in_progress", "planned", "paused" or
	// "unknown") derived by normalizeStatusFromText.
	ID, Name, Status string

	// Verification holds the fields of the task's **verification** block.
	Verification Verification

	// HasVerification is true when the task contained a **verification** line.
	HasVerification bool
}

// parseTasks reads a TASKS.md document from f and returns the tasks it
// contains, in file order.
//
// A task starts at a header line of the form "### T-<id> <marker> <name>".
// Inside a task, a line containing "**verification**" opens the verification
// block; the indented "- key: `value`" lines that follow fill the task's
// Verification, and a blank line or another "**...**" field closes the block.
// Outside the verification block, a "- **状态**：<text>" line (状态 means
// "status") overrides the status derived from the header.
//
// Parsing is best effort: if reading fails part-way (including a line longer
// than the scanner limit), a message is written to stderr and the tasks parsed
// so far are returned.
func parseTasks(f *os.File) []taskEntry {
	// backtickField builds the pattern for an indented "- name: `value`" line.
	backtickField := func(name string) *regexp.Regexp {
		return regexp.MustCompile(`^\s+- ` + name + `:\s+` + "`([^`]+)`")
	}

	// Every pattern is compiled once here rather than once per input line.
	var (
		// Task header, e.g. "### T-1.1 🔶 Phase 1 scope freeze".
		taskRe    = regexp.MustCompile(`^### (T-[A-Za-z0-9.-]+)\s+[^\s]+\s+(.+)`)
		statusRe  = regexp.MustCompile(`^\s*-\s+\*\*状态\*\*：(.+)$`)
		timeoutRe = regexp.MustCompile(`^\s+- timeout_seconds:\s+(\d+)`)
	)
	stringFields := []struct {
		re     *regexp.Regexp
		assign func(v *Verification, value string)
	}{
		{backtickField("mode"), func(v *Verification, s string) { v.Mode = s }},
		{backtickField("command"), func(v *Verification, s string) { v.Command = s }},
		{backtickField("expected_evidence"), func(v *Verification, s string) { v.ExpectedEvidence = s }},
		{backtickField("evidence_grade"), func(v *Verification, s string) { v.EvidenceGrade = s }},
		{backtickField("task_type"), func(v *Verification, s string) { v.TaskType = s }},
	}

	var tasks []taskEntry
	var currentTask *taskEntry
	inVerification := false

	scanner := bufio.NewScanner(f)
	// Allow lines up to 1 MiB; the default 64 KiB limit would abort the scan
	// on a single very long markdown line.
	scanner.Buffer(make([]byte, 0, 64*1024), 1024*1024)

scan:
	for scanner.Scan() {
		line := scanner.Text()

		if m := taskRe.FindStringSubmatch(line); m != nil {
			if currentTask != nil {
				tasks = append(tasks, *currentTask)
			}
			currentTask = &taskEntry{ID: m[1], Name: m[2], Status: normalizeStatusFromText(line)}
			inVerification = false
			continue
		}

		// Ignore everything before the first task header.
		if currentTask == nil {
			continue
		}

		if strings.Contains(line, "**verification**") {
			inVerification = true
			currentTask.HasVerification = true
			continue
		}

		if !inVerification {
			if m := statusRe.FindStringSubmatch(line); m != nil {
				currentTask.Status = normalizeStatusFromText(m[1])
			}
			continue
		}

		// Fields indented under **verification**, e.g. "  - mode: `test_pass`".
		for _, fld := range stringFields {
			if m := fld.re.FindStringSubmatch(line); m != nil {
				fld.assign(&currentTask.Verification, m[1])
				continue scan
			}
		}

		if m := timeoutRe.FindStringSubmatch(line); m != nil {
			var seconds int
			if _, err := fmt.Sscanf(m[1], "%d", &seconds); err != nil {
				// Leave the timeout unset so the caller's default applies.
				fmt.Fprintf(os.Stderr, "parse timeout_seconds %q for %s: %v\n", m[1], currentTask.ID, err)
			} else {
				currentTask.Verification.TimeoutSeconds = seconds
			}
			continue
		}

		// Blank line or new top-level field ends verification block.
		trimmed := strings.TrimSpace(line)
		if trimmed == "" || (strings.HasPrefix(trimmed, "**") && !strings.Contains(line, "verification")) {
			inVerification = false
		}
	}
	if err := scanner.Err(); err != nil {
		fmt.Fprintf(os.Stderr, "read TASKS.md: %v\n", err)
	}

	if currentTask != nil {
		tasks = append(tasks, *currentTask)
	}

	return tasks
}

// evidenceRangePattern matches an expected_evidence value that consists solely
// of a numeric range such as [4-9].
var evidenceRangePattern = regexp.MustCompile(`^\[(\d+)-(\d+)\]$`)

// evidenceNumberPattern matches command output that is a single non-negative
// integer, optionally surrounded by whitespace.
var evidenceNumberPattern = regexp.MustCompile(`^\s*(\d+)\s*$`)

// evidenceMatches reports whether stdout satisfies evidence.
//
// When evidence is exactly a range "[lo-hi]", stdout must be a single integer
// n with lo <= n <= hi. Any other evidence, including bracketed text that is
// not a numeric range (e.g. "[OK]"), must occur in stdout as a substring.
func evidenceMatches(evidence, stdout string) bool {
	bounds := evidenceRangePattern.FindStringSubmatch(evidence)
	if bounds == nil {
		return strings.Contains(stdout, evidence)
	}

	var lo, hi, n int
	if _, err := fmt.Sscanf(bounds[1], "%d", &lo); err != nil {
		return false
	}
	if _, err := fmt.Sscanf(bounds[2], "%d", &hi); err != nil {
		return false
	}
	num := evidenceNumberPattern.FindStringSubmatch(strings.TrimSpace(stdout))
	if num == nil {
		return false
	}
	if _, err := fmt.Sscanf(num[1], "%d", &n); err != nil {
		return false
	}
	return n >= lo && n <= hi
}

// verifyTask runs the verification configured for t and returns its outcome.
//
// Decision order:
//   - a task without a verification block passes trivially;
//   - an invalid configuration (see validateVerification) fails without
//     running anything;
//   - an empty command passes for mode "artifact_present" and fails otherwise;
//   - with dryRun set, the command is recorded but not executed and the task
//     passes;
//   - otherwise the command runs through `sh -c` with a timeout of
//     TimeoutSeconds (30 when unset). For mode "test_pass" a failed command
//     fails the task outright. Then, if expected_evidence is set, the verdict
//     is whether stdout satisfies it; if not, the verdict is whether the
//     command succeeded.
//
// ExitCode carries the command's real exit status when it is available and -1
// when it is not (the command could not be started, or it was killed).
func verifyTask(t taskEntry, dryRun bool) TaskResult {
	r := TaskResult{TaskID: t.ID, TaskName: t.Name}

	if !t.HasVerification {
		r.Reason = "no verification block"
		r.FailureClass = "missing_verification"
		r.Verified = true // No verification = trivially pass
		return r
	}

	v := t.Verification
	v.Mode = strings.TrimSpace(v.Mode)
	v.TaskType = normalizeTaskType(v.TaskType)
	v.EvidenceGrade = normalizeEvidenceGrade(v.Mode, v.EvidenceGrade)
	r.TaskType = v.TaskType
	r.EvidenceGrade = v.EvidenceGrade

	if validationErr := validateVerification(v); validationErr != "" {
		r.Verified = false
		r.Reason = validationErr
		r.FailureClass = "verification_config_failure"
		return r
	}

	if v.Command == "" {
		if v.Mode == "artifact_present" {
			r.Verified = true
			return r
		}
		r.Reason = "verification.command is empty"
		r.FailureClass = "verification_config_failure"
		r.Verified = false
		return r
	}

	r.Command = v.Command

	timeoutSeconds := v.TimeoutSeconds
	if timeoutSeconds == 0 {
		timeoutSeconds = 30
	}

	if dryRun {
		r.Stdout = "(dry-run, command not executed)"
		r.Verified = true
		return r
	}

	ctx, cancel := context.WithTimeout(context.Background(), time.Duration(timeoutSeconds)*time.Second)
	defer cancel()

	cmd := exec.CommandContext(ctx, "sh", "-c", v.Command)
	var stdout, stderr bytes.Buffer
	cmd.Stdout = &stdout
	cmd.Stderr = &stderr

	if err := cmd.Run(); err != nil {
		r.ExitCode = -1
		// Run returns *exec.ExitError when the command started but did not
		// succeed; use its real status when it has one.
		if exitErr, ok := err.(*exec.ExitError); ok {
			if code := exitErr.ExitCode(); code != 0 {
				r.ExitCode = code
			}
		}
		r.FailureClass = "tool_execution_failure"
		if ctx.Err() == context.DeadlineExceeded {
			r.Error = fmt.Sprintf("timeout after %ds", timeoutSeconds)
		} else {
			r.Error = err.Error()
		}
	}

	r.Stdout = stdout.String()
	r.Stderr = stderr.String()
	r.StdoutSummary = summarizeOutput(r.Stdout)
	r.StderrSummary = summarizeOutput(r.Stderr)

	if r.ExitCode != 0 && v.Mode == "test_pass" {
		r.Verified = false
		return r
	}

	switch {
	case v.ExpectedEvidence != "":
		// For modes other than test_pass the evidence alone decides, even
		// when the command exited with a non-zero status.
		r.Verified = evidenceMatches(v.ExpectedEvidence, r.Stdout)
		if !r.Verified {
			r.Reason = fmt.Sprintf("expected_evidence '%s' not found in output", v.ExpectedEvidence)
			r.FailureClass = "business_assertion_failure"
		}
	case r.ExitCode == 0:
		r.Verified = true
	default:
		r.Verified = false
		r.Reason = fmt.Sprintf("exit code %d", r.ExitCode)
		r.FailureClass = "tool_execution_failure"
	}

	return r
}

// classifyFailureTier maps a result to a severity tier: 0 for a verified
// task, 2 for a failed task whose evidence grade is "runtime-verified", and 3
// for every other failure.
func classifyFailureTier(r TaskResult) int {
	switch {
	case r.Verified:
		return 0
	case r.EvidenceGrade == "runtime-verified":
		return 2
	default:
		return 3
	}
}

// determineProcessExitCode reduces all results to one process exit code:
// 2 if any result is a tier-2 failure, otherwise 3 if any result is a tier-3
// failure, otherwise 0. Tiers come from classifyFailureTier.
func determineProcessExitCode(results []TaskResult) int {
	code := 0
	for i := range results {
		switch classifyFailureTier(results[i]) {
		case 2:
			// Tier 2 outranks everything else, so the answer is settled.
			return 2
		case 3:
			code = 3
		}
	}
	return code
}

// normalizeEvidenceGrade returns the evidence grade to use for a task.
//
// A non-blank explicit grade wins and is returned trimmed. Otherwise the grade
// is derived from the trimmed mode: "test_pass" -> "runtime-verified",
// "artifact_present" -> "artifact-present", "semantic" -> "doc-claimed".
// Any other mode yields "".
func normalizeEvidenceGrade(mode, explicit string) string {
	if grade := strings.TrimSpace(explicit); grade != "" {
		return grade
	}

	defaultGrades := map[string]string{
		"test_pass":        "runtime-verified",
		"artifact_present": "artifact-present",
		"semantic":         "doc-claimed",
	}
	// A missing key yields the zero value "", which is the "no grade" result.
	return defaultGrades[strings.TrimSpace(mode)]
}

// normalizeTaskType trims raw and returns it, substituting "unspecified" when
// nothing but whitespace was given.
func normalizeTaskType(raw string) string {
	if trimmed := strings.TrimSpace(raw); trimmed != "" {
		return trimmed
	}
	return "unspecified"
}

// normalizeStatusFromText derives a normalized status from free text such as
// a task header or the value of a status line.
//
// The text is searched for known emoji markers and Chinese status words; the
// first group that matches, in the order below, decides the result:
// "completed", "in_progress", "planned", "paused". Text without any known
// marker (including empty text) yields "unknown".
func normalizeStatusFromText(raw string) string {
	containsAny := func(markers ...string) bool {
		for _, marker := range markers {
			if strings.Contains(raw, marker) {
				return true
			}
		}
		return false
	}

	switch {
	case containsAny("✅", "完成"):
		return "completed"
	case containsAny("🟡", "进行中"):
		return "in_progress"
	case containsAny("🔶", "🔴", "待启动", "未开始"):
		return "planned"
	// The pause symbol is matched by its base code point U+23F8 so that it is
	// recognised both with and without the trailing U+FE0F variation selector.
	case containsAny("\u23f8", "待规划", "暂停"):
		return "paused"
	default:
		return "unknown"
	}
}

// filterTasksByStatus returns the tasks whose normalized status equals filter.
//
// filter is trimmed first. An empty filter or "all" returns tasks unchanged.
// The other accepted values are "completed", "in_progress", "planned",
// "paused" and "unknown"; anything else produces an error. A task with an
// empty Status is treated as "unknown".
func filterTasksByStatus(tasks []taskEntry, filter string) ([]taskEntry, error) {
	wanted := strings.TrimSpace(filter)
	switch wanted {
	case "", "all":
		return tasks, nil
	case "completed", "in_progress", "planned", "paused", "unknown":
		// Recognised status; filter below.
	default:
		return nil, fmt.Errorf("unsupported status filter: %s", wanted)
	}

	kept := make([]taskEntry, 0, len(tasks))
	for _, task := range tasks {
		if task.Status == wanted || (task.Status == "" && wanted == "unknown") {
			kept = append(kept, task)
		}
	}
	return kept, nil
}

// summarizeOutput condenses command output into a single line for the report.
//
// Leading and trailing whitespace is removed and every run of whitespace is
// collapsed to one space. A result longer than 220 bytes is cut to at most
// 220 bytes and suffixed with "...". The cut is made on a rune boundary, so a
// multi-byte UTF-8 character is never split. Blank input yields "".
func summarizeOutput(raw string) string {
	cleaned := strings.Join(strings.Fields(raw), " ")
	const limit = 220
	if len(cleaned) <= limit {
		return cleaned
	}

	// Ranging over a string yields the byte index at which each rune starts;
	// keep the last such index that does not exceed the limit.
	cut := 0
	for i := range cleaned {
		if i > limit {
			break
		}
		cut = i
	}
	return cleaned[:cut] + "..."
}

// validateVerification checks a verification configuration and returns a
// description of the first problem found, or "" when the configuration is
// acceptable.
//
// Checks, in order:
//  1. Mode is one of "test_pass", "artifact_present", "semantic".
//  2. EvidenceGrade is empty or one of "runtime-verified",
//     "artifact-present", "doc-claimed".
//  3. TaskType is one of "unspecified", "code", "automation",
//     "documentation", "configuration", "data", "analysis".
//  4. "code" and "automation" tasks may not use mode "semantic".
//  5. Mode "artifact_present" may carry neither a command nor expected
//     evidence, and is not allowed for "code", "automation", "data" or
//     "analysis" tasks.
//
// Mode, EvidenceGrade and TaskType are compared as given, without trimming.
func validateVerification(v Verification) string {
	switch v.Mode {
	case "test_pass", "artifact_present", "semantic":
		// Supported mode.
	default:
		return fmt.Sprintf("unsupported verification mode: %s", v.Mode)
	}

	switch v.EvidenceGrade {
	case "", "runtime-verified", "artifact-present", "doc-claimed":
		// Supported grade, or none given.
	default:
		return fmt.Sprintf("unsupported evidence grade: %s", v.EvidenceGrade)
	}

	switch v.TaskType {
	case "unspecified", "code", "automation", "documentation", "configuration", "data", "analysis":
		// Supported task type.
	default:
		return fmt.Sprintf("unsupported task type: %s", v.TaskType)
	}

	executable := v.TaskType == "code" || v.TaskType == "automation"
	if executable && v.Mode == "semantic" {
		return fmt.Sprintf("semantic-only verification is not allowed for %s tasks", v.TaskType)
	}

	// The remaining rules only concern artifact_present.
	if v.Mode != "artifact_present" {
		return ""
	}

	hasCommand := strings.TrimSpace(v.Command) != ""
	hasEvidence := strings.TrimSpace(v.ExpectedEvidence) != ""
	if hasCommand || hasEvidence {
		return "artifact_present does not allow command or expected_evidence; use test_pass for executable verification"
	}

	switch v.TaskType {
	case "code", "automation", "data", "analysis":
		return fmt.Sprintf("artifact_present is not allowed for %s tasks", v.TaskType)
	}
	return ""
}