Files
supply-intelligence/scripts/gateway_closure_inspect.sh
2026-05-12 18:49:52 +08:00

117 lines
3.4 KiB
Bash

#!/usr/bin/env bash
#
# Gateway closure inspection.
#
# Fetches /healthz, /metrics and the gateway runtime-status endpoint from
# BASE_URL, prints them, and evaluates a continue / pause / rollback decision.
#
# Tunables (environment variables; the defaults below apply when a variable is
# unset or empty):
#   BASE_URL                 base URL of the service to inspect
#   CONSUMER                 value of the `consumer` label to look for
#   APPLIED_RATIO_THRESHOLD  minimum applied / (applied + failed) ratio
#   FAILED_BURST_THRESHOLD   failed-events value that triggers a rollback
#   PENDING_RETRY_THRESHOLD  pending-retry value above which to pause
set -o errexit -o nounset -o pipefail

: "${BASE_URL:=http://127.0.0.1:8080}"
: "${CONSUMER:=gateway}"
: "${APPLIED_RATIO_THRESHOLD:=0.95}"
: "${FAILED_BURST_THRESHOLD:=3}"
: "${PENDING_RETRY_THRESHOLD:=10}"
#######################################
# Ensure an external command is available.
# Arguments: $1 - name of the command to look up with `command -v`
# Outputs:   a "missing required command" message on stderr when not found
# Returns:   0 when the command resolves; otherwise exits the shell with 1
#######################################
need() {
  local tool=$1
  if ! command -v "$tool" >/dev/null 2>&1; then
    printf 'missing required command: %s\n' "$tool" >&2
    exit 1
  fi
}
# Both tools are used below: curl for the HTTP fetches, python3 for the
# decision evaluation that follows.
need curl
need python3

# Cap the total time of each request so an unresponsive endpoint cannot hang
# the inspection forever. Override with CURL_MAX_TIME (seconds).
curl_max_time="${CURL_MAX_TIME:-30}"

# -f makes HTTP error statuses fail the command (and, under `set -e`, the
# script); -sS hides the progress meter but still reports errors.
health=$(curl -fsS --max-time "$curl_max_time" "$BASE_URL/healthz")
metrics=$(curl -fsS --max-time "$curl_max_time" "$BASE_URL/metrics")
status=$(curl -fsS --max-time "$curl_max_time" \
  "$BASE_URL/internal/supply-intelligence/gateway/runtime-status")

printf '%s\n' "=== healthz ===" "$health"
printf '%s\n' "=== runtime status ===" "$status"
printf '%s\n' "=== metrics excerpt ==="
# grep exits 1 when nothing matches; an empty excerpt is not an error here.
grep 'supply_intelligence_gateway_' <<<"$metrics" || true

# Hand the inputs to the evaluator through the environment. Only sample lines
# that start with the gateway metric prefix are exported: those are the only
# lines the evaluator inspects, and a full /metrics body can exceed the
# kernel's limit on the size of a single environment string (128 KiB on Linux
# with 4 KiB pages), which would make the python3 exec fail.
METRICS_TEXT=$(grep '^supply_intelligence_gateway_' <<<"$metrics" || true)
export METRICS_TEXT
export RUNTIME_STATUS_JSON="$status"
export CONSUMER APPLIED_RATIO_THRESHOLD FAILED_BURST_THRESHOLD PENDING_RETRY_THRESHOLD
#######################################
# Evaluate the closure decision from the collected metrics and runtime status.
# Globals (read from the environment by the embedded Python):
#   METRICS_TEXT             Prometheus text-format metrics
#   RUNTIME_STATUS_JSON      JSON object; `started` and `last_error` are read
#   CONSUMER                 `consumer` label value to select series by
#   APPLIED_RATIO_THRESHOLD  float
#   FAILED_BURST_THRESHOLD   int
#   PENDING_RETRY_THRESHOLD  int
# Outputs:  a JSON report (decision, reasons, inputs) on stdout
# Returns:  0 = continue, 1 = pause, 2 = rollback
#######################################
evaluate_closure_decision() {
  python3 <<'PY'
import json
import os
import re
import sys

PROCESSED_METRIC = 'supply_intelligence_gateway_events_processed_total'
PENDING_RETRY_METRIC = 'supply_intelligence_gateway_pending_retry_events'
FAILED_METRIC = 'supply_intelligence_gateway_failed_events'

# One text-format sample: metric name, optional {labels}, value, and an
# optional integer timestamp. The value is the first token after the labels,
# so a trailing timestamp is never mistaken for it.
SAMPLE_RE = re.compile(
    r'^(?P<name>[A-Za-z_:][A-Za-z0-9_:]*)'
    r'(?:\s*\{(?P<labels>.*)\})?'
    r'\s+(?P<value>\S+)'
    r'(?:\s+-?\d+)?\s*$'
)
# name="value" pairs; the value may contain commas, braces and escaped quotes.
LABEL_RE = re.compile(r'([A-Za-z_][A-Za-z0-9_]*)\s*=\s*"((?:[^"\\]|\\.)*)"')


def unescape(raw):
    """Undo the label-value escapes (backslash, double quote, newline)."""
    return re.sub(
        r'\\(.)',
        lambda m: '\n' if m.group(1) == 'n' else m.group(1),
        raw,
    )


def iter_samples(text):
    """Yield (name, labels, value) for every parseable sample line."""
    for raw in text.splitlines():
        line = raw.strip()
        if not line or line.startswith('#'):
            continue  # blank line or HELP/TYPE comment
        match = SAMPLE_RE.match(line)
        if match is None:
            continue
        try:
            value = float(match.group('value'))
        except ValueError:
            continue  # not a number: ignore the line instead of crashing
        labels = {
            key: unescape(val)
            for key, val in LABEL_RE.findall(match.group('labels') or '')
        }
        yield match.group('name'), labels, value


metrics = os.environ['METRICS_TEXT']
status = json.loads(os.environ['RUNTIME_STATUS_JSON'])
consumer = os.environ['CONSUMER']
ratio_threshold = float(os.environ['APPLIED_RATIO_THRESHOLD'])
failed_threshold = int(os.environ['FAILED_BURST_THRESHOLD'])
pending_threshold = int(os.environ['PENDING_RETRY_THRESHOLD'])

processed = {}
pending_retry = 0.0
failed_events = 0.0
for name, labels, value in iter_samples(metrics):
    # Names are compared exactly, so a sibling series that merely shares a
    # prefix with one of the metrics below is not picked up by accident.
    series_consumer = labels.get('consumer')
    if name == PROCESSED_METRIC:
        result_label = labels.get('result')
        if not result_label:
            continue
        # Series without a consumer label are counted as before; series that
        # carry one are held to the same consumer as the two gauges below.
        if series_consumer is not None and series_consumer != consumer:
            continue
        processed[result_label] = processed.get(result_label, 0.0) + value
    elif name == PENDING_RETRY_METRIC and series_consumer == consumer:
        pending_retry = value
    elif name == FAILED_METRIC and series_consumer == consumer:
        failed_events = value

applied = processed.get('applied', 0.0)
total_terminal = applied + processed.get('failed', 0.0)
# With no terminal events yet there is nothing to judge; treat as healthy.
applied_ratio = (applied / total_terminal) if total_terminal > 0 else 1.0

decision = 'continue'
reasons = []
if not status.get('started', False):
    decision = 'pause'
    reasons.append('runtime_not_started')
if status.get('last_error'):
    decision = 'pause'
    reasons.append('runtime_last_error')
if pending_retry > pending_threshold:
    decision = 'pause'
    reasons.append('pending_retry_threshold_exceeded')
if applied_ratio < ratio_threshold:
    decision = 'pause'
    reasons.append('applied_ratio_below_threshold')
# Checked last so that a rollback overrides any pause decided above.
if failed_events >= failed_threshold:
    decision = 'rollback'
    reasons.append('failed_events_threshold_exceeded')

print(json.dumps({
    'decision': decision,
    'reasons': reasons,
    'applied_ratio': applied_ratio,
    'processed': processed,
    'pending_retry_events': pending_retry,
    'failed_events': failed_events,
    'runtime': status,
}, ensure_ascii=False, indent=2))

if decision == 'rollback':
    sys.exit(2)
if decision == 'pause':
    sys.exit(1)
PY
}

evaluate_closure_decision