356 lines
15 KiB
Bash
356 lines
15 KiB
Bash
#!/bin/bash
|
|
#
|
|
# BMAD Epic Execute - Observability Module
|
|
#
|
|
# Captures the telemetry the `claude` CLI already emits (session id, token
|
|
# usage, cost, latency, context window) as OTel-shaped trace spans, and rolls
|
|
# them up into deterministic (non-fabricated) metrics.
|
|
#
|
|
# Source: docs/improvements/observability-implementation-plan.md
|
|
#
|
|
# Usage: Sourced by epic-execute.sh. Relies on jq (hard prerequisite, enforced
|
|
# by require_observability_deps below) and the following globals from the
|
|
# parent script: SPRINT_ARTIFACTS_DIR, EPIC_ID, LOG_FILE, VERBOSE.
|
|
#
|
|
# Gated behind BMAD_TRACE: tracing is only active when BMAD_TRACE=1 (default
|
|
# off during initial rollout). The invocation helper in epic-execute.sh falls
|
|
# back to the legacy text path when tracing is disabled.
|
|
#
|
|
|
|
# =============================================================================
|
|
# Observability State
|
|
# =============================================================================
|
|
|
|
# Epic-level trace id (one per run); each phase's claude session_id is a span id.
|
|
TRACE_ID="${TRACE_ID:-}"
|
|
|
|
# Path to the append-only span log for this epic (set by init_observability).
|
|
TRACE_FILE=""
|
|
|
|
# Ambient context for the next span (set by the phase functions before each
|
|
# claude invocation — see Open Decision 1 in the implementation plan).
|
|
CURRENT_PHASE=""
|
|
CURRENT_STORY_ID=""
|
|
|
|
# Whether tracing is active for this run.
|
|
TRACE_ENABLED=false
|
|
|
|
# Intra-phase heartbeat: appends a liveness beat to a .live.jsonl every N
|
|
# seconds while a phase is running, so a hard kill (SIGKILL) mid-phase still
|
|
# leaves a forensic trail of which phase was in flight and how far it got.
|
|
HEARTBEAT_INTERVAL="${BMAD_TRACE_HEARTBEAT_INTERVAL:-10}"
|
|
LIVE_TRACE_FILE=""
|
|
HEARTBEAT_PID=""
|
|
PHASE_START_SECONDS=""
|
|
# Main script PID ($$ is the sourcing shell = the epic-execute process).
|
|
MAIN_PID="$$"
|
|
|
|
# =============================================================================
|
|
# Dependency Enforcement (Open Decision 4: hard fail)
|
|
# =============================================================================
|
|
|
|
# jq is load-bearing for observability in three places: the live renderer,
|
|
# span extraction, and clean .result extraction. When tracing is enabled it is
|
|
# a hard prerequisite — too much rides on it to silently degrade.
|
|
require_observability_deps() {
|
|
if [ "${BMAD_TRACE:-}" != "1" ]; then
|
|
return 0
|
|
fi
|
|
if ! command -v jq >/dev/null 2>&1; then
|
|
log_error "BMAD_TRACE=1 requires 'jq' but it was not found on PATH."
|
|
log_error "Install jq (https://jqlang.github.io/jq/) or unset BMAD_TRACE."
|
|
exit 1
|
|
fi
|
|
return 0
|
|
}
|
|
|
|
# =============================================================================
|
|
# Initialization
|
|
# =============================================================================
|
|
|
|
# Initialize the trace for this epic. Mints a trace id (if not already set) and
|
|
# creates the span file. No-op unless BMAD_TRACE=1.
|
|
init_observability() {
|
|
if [ "${BMAD_TRACE:-}" != "1" ]; then
|
|
TRACE_ENABLED=false
|
|
return 0
|
|
fi
|
|
|
|
if [ -z "$SPRINT_ARTIFACTS_DIR" ] || [ -z "$EPIC_ID" ]; then
|
|
log_warn "Cannot initialize tracing: SPRINT_ARTIFACTS_DIR or EPIC_ID not set"
|
|
TRACE_ENABLED=false
|
|
return 1
|
|
fi
|
|
|
|
if [ -z "$TRACE_ID" ]; then
|
|
if command -v uuidgen >/dev/null 2>&1; then
|
|
TRACE_ID=$(uuidgen | tr '[:upper:]' '[:lower:]')
|
|
else
|
|
# Fallback id: epic + pid + start seconds (still unique per run)
|
|
TRACE_ID="epic-${EPIC_ID}-$$-$(date +%s)"
|
|
fi
|
|
fi
|
|
|
|
TRACES_DIR="${TRACES_DIR:-$SPRINT_ARTIFACTS_DIR/traces}"
|
|
mkdir -p "$TRACES_DIR" 2>/dev/null || true
|
|
TRACE_FILE="$TRACES_DIR/epic-${EPIC_ID}-trace.jsonl"
|
|
LIVE_TRACE_FILE="$TRACES_DIR/epic-${EPIC_ID}-trace.live.jsonl"
|
|
|
|
TRACE_ENABLED=true
|
|
log "Tracing enabled (trace_id=$TRACE_ID) -> $TRACE_FILE"
|
|
return 0
|
|
}
|
|
|
|
# Set the ambient context for the next span. Called by phase functions before
|
|
# invoking claude (ambient-vars approach, Open Decision 1).
|
|
set_span_context() {
|
|
CURRENT_PHASE="$1"
|
|
CURRENT_STORY_ID="$2"
|
|
}
|
|
|
|
# =============================================================================
|
|
# Intra-phase Heartbeat (liveness for debugging hangs / hard kills)
|
|
# =============================================================================
|
|
|
|
# Append one heartbeat record describing the in-flight phase. Reads only the
|
|
# latest assistant event from the stream (bounded I/O — no slurp), so it stays
|
|
# cheap even for long phases with large tool output. Best-effort: a transient
|
|
# partial line just skips this tick.
|
|
emit_heartbeat() {
|
|
local stream_file="$1"
|
|
{ [ "$TRACE_ENABLED" = true ] && [ -n "$LIVE_TRACE_FILE" ] && [ -f "$stream_file" ]; } || return 0
|
|
|
|
local now elapsed ts events last_asst
|
|
now=$(date +%s)
|
|
elapsed=$(( now - ${PHASE_START_SECONDS:-$now} ))
|
|
ts=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
|
|
events=$(wc -l < "$stream_file" 2>/dev/null | tr -d ' '); events="${events:-0}"
|
|
last_asst=$(grep '"type":"assistant"' "$stream_file" 2>/dev/null | tail -n 1)
|
|
|
|
if [ -n "$last_asst" ]; then
|
|
printf '%s' "$last_asst" | jq -c \
|
|
--arg trace_id "$TRACE_ID" --arg phase "$CURRENT_PHASE" --arg story "$CURRENT_STORY_ID" \
|
|
--arg ts "$ts" --argjson elapsed "$elapsed" --argjson events "$events" '
|
|
(.message.usage // {}) as $u
|
|
| { trace_id:$trace_id, kind:"heartbeat", phase:$phase, story_id:$story,
|
|
elapsed_s:$elapsed, ts:$ts, events:$events,
|
|
ctx_input_tokens: ($u.input_tokens // 0),
|
|
cache_read: ($u.cache_read_input_tokens // 0),
|
|
out_tokens: ($u.output_tokens // 0),
|
|
last_text: ((([ .message.content[]? | select(.type=="text") | .text ] | last) // "")[0:160]) }' \
|
|
>> "$LIVE_TRACE_FILE" 2>/dev/null || true
|
|
else
|
|
# No assistant output yet — still emit a beat so you can see it started.
|
|
jq -nc \
|
|
--arg trace_id "$TRACE_ID" --arg phase "$CURRENT_PHASE" --arg story "$CURRENT_STORY_ID" \
|
|
--arg ts "$ts" --argjson elapsed "$elapsed" --argjson events "$events" \
|
|
'{trace_id:$trace_id, kind:"heartbeat", phase:$phase, story_id:$story,
|
|
elapsed_s:$elapsed, ts:$ts, events:$events, ctx_input_tokens:0, out_tokens:0, last_text:""}' \
|
|
>> "$LIVE_TRACE_FILE" 2>/dev/null || true
|
|
fi
|
|
}
|
|
|
|
# Start a background heartbeat for the current phase. Emits an immediate beat,
|
|
# then one every HEARTBEAT_INTERVAL seconds until the phase ends or the main
|
|
# process dies (self-terminates within one interval on SIGKILL).
|
|
start_phase_heartbeat() {
|
|
local stream_file="$1"
|
|
{ [ "$TRACE_ENABLED" = true ] && [ "${HEARTBEAT_INTERVAL:-0}" -gt 0 ] 2>/dev/null; } || return 0
|
|
|
|
PHASE_START_SECONDS=$(date +%s)
|
|
emit_heartbeat "$stream_file"
|
|
|
|
(
|
|
while kill -0 "$MAIN_PID" 2>/dev/null; do
|
|
sleep "$HEARTBEAT_INTERVAL"
|
|
emit_heartbeat "$stream_file"
|
|
done
|
|
) &
|
|
HEARTBEAT_PID=$!
|
|
}
|
|
|
|
# Stop the background heartbeat (called when the phase's claude call returns,
|
|
# and defensively from cleanup()).
|
|
stop_phase_heartbeat() {
|
|
[ -n "${HEARTBEAT_PID:-}" ] || return 0
|
|
kill "$HEARTBEAT_PID" 2>/dev/null || true
|
|
wait "$HEARTBEAT_PID" 2>/dev/null || true
|
|
HEARTBEAT_PID=""
|
|
}
|
|
|
|
# =============================================================================
|
|
# Span Recording
|
|
# =============================================================================
|
|
|
|
# Append one OTel-shaped span for the just-completed claude phase.
|
|
# Reads the final `result` envelope from the raw stream file (always the last
|
|
# JSONL line) and derives the telemetry fields.
|
|
#
|
|
# Arguments:
|
|
# $1 - raw stream file (JSONL emitted by claude --output-format stream-json)
|
|
# $2 - phase status (optional; the downstream completion status, e.g. COMPLETE)
|
|
record_span() {
|
|
local stream_file="$1"
|
|
local status="${2:-}"
|
|
|
|
[ "$TRACE_ENABLED" = true ] || return 0
|
|
[ -n "$TRACE_FILE" ] || return 0
|
|
[ -f "$stream_file" ] || return 0
|
|
|
|
# The result envelope is always the last line. A single JSONL line, so this
|
|
# is memory-cheap regardless of phase length.
|
|
local envelope
|
|
envelope=$(tail -n 1 "$stream_file" 2>/dev/null)
|
|
|
|
# Validate it is actually the result event; if the call crashed mid-stream
|
|
# the last line won't be a result — record a degraded error span instead.
|
|
local etype
|
|
etype=$(printf '%s' "$envelope" | jq -r '.type // empty' 2>/dev/null) || true
|
|
local ts
|
|
ts=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
|
|
|
|
if [ "$etype" != "result" ]; then
|
|
jq -nc \
|
|
--arg trace_id "$TRACE_ID" \
|
|
--arg parent "$CURRENT_STORY_ID" \
|
|
--arg name "$CURRENT_PHASE" \
|
|
--arg story_id "$CURRENT_STORY_ID" \
|
|
--arg status "${status:-UNKNOWN}" \
|
|
--arg ts "$ts" \
|
|
'{trace_id:$trace_id, span_id:null, parent:$parent, name:$name,
|
|
story_id:$story_id, model:null, input_tokens:0, output_tokens:0,
|
|
cache_read:0, cost_usd:0, duration_ms:0, ttft_ms:0, num_turns:0,
|
|
is_error:true, api_error_status:"no_result_envelope",
|
|
ctx_util_pct:0, status:$status, ts:$ts}' \
|
|
>> "$TRACE_FILE" 2>/dev/null || true
|
|
[ "$VERBOSE" = true ] && log_warn "record_span: no result envelope in stream (degraded span)"
|
|
return 0
|
|
fi
|
|
|
|
# Pull the primary (largest-context) model for ctx utilization. Sub-agent
|
|
# usage is preserved verbatim under model/cost via the envelope, but for the
|
|
# span's headline model we take the model with the largest context window.
|
|
printf '%s' "$envelope" | jq -c \
|
|
--arg trace_id "$TRACE_ID" \
|
|
--arg parent "$CURRENT_STORY_ID" \
|
|
--arg name "$CURRENT_PHASE" \
|
|
--arg story_id "$CURRENT_STORY_ID" \
|
|
--arg status "$status" \
|
|
--arg ts "$ts" '
|
|
# Choose headline model = the one with the largest contextWindow
|
|
(.modelUsage // {}) as $mu
|
|
| ($mu | to_entries | sort_by(.value.contextWindow // 0) | last) as $primary
|
|
| ($primary.key // .usage.service_tier // null) as $model
|
|
| (($primary.value.contextWindow) // 0) as $ctxwin
|
|
| ((.usage.input_tokens // 0)
|
|
+ (.usage.cache_read_input_tokens // 0)
|
|
+ (.usage.cache_creation_input_tokens // 0)) as $ctx_used
|
|
| (if $ctxwin > 0 then (($ctx_used / $ctxwin) * 100 * 10 | round / 10) else 0 end) as $ctx_pct
|
|
| {
|
|
trace_id: $trace_id,
|
|
span_id: (.session_id // null),
|
|
parent: $parent,
|
|
name: $name,
|
|
story_id: $story_id,
|
|
model: $model,
|
|
input_tokens: (.usage.input_tokens // 0),
|
|
output_tokens: (.usage.output_tokens // 0),
|
|
cache_read: (.usage.cache_read_input_tokens // 0),
|
|
cost_usd: (.total_cost_usd // 0),
|
|
duration_ms: (.duration_ms // 0),
|
|
ttft_ms: (.ttft_ms // 0),
|
|
num_turns: (.num_turns // 0),
|
|
is_error: (.is_error // false),
|
|
api_error_status: (.api_error_status // null),
|
|
ctx_util_pct: $ctx_pct,
|
|
status: $status,
|
|
ts: $ts
|
|
}' >> "$TRACE_FILE" 2>/dev/null || true
|
|
|
|
# Context-utilization warning feeds the Context Strategy gap (#7).
|
|
local ctx_pct
|
|
ctx_pct=$(printf '%s' "$envelope" | jq -r '
|
|
(.modelUsage // {} | to_entries | sort_by(.value.contextWindow // 0) | last) as $p
|
|
| (($p.value.contextWindow) // 0) as $w
|
|
| ((.usage.input_tokens // 0) + (.usage.cache_read_input_tokens // 0) + (.usage.cache_creation_input_tokens // 0)) as $u
|
|
| if $w > 0 then (($u / $w) * 100 | floor) else 0 end' 2>/dev/null)
|
|
if [ -n "$ctx_pct" ] && [ "$ctx_pct" -ge 80 ] 2>/dev/null; then
|
|
log_warn "Context utilization high: ${ctx_pct}% (${CURRENT_PHASE} / ${CURRENT_STORY_ID})"
|
|
fi
|
|
|
|
return 0
|
|
}
|
|
|
|
# =============================================================================
|
|
# Rollup (deterministic telemetry into metrics.yaml)
|
|
# =============================================================================
|
|
|
|
# Sum all spans for this epic and write a telemetry block into metrics.yaml.
|
|
# Deterministic: no model involved, no fabrication. No-op without yq.
|
|
rollup_telemetry() {
|
|
[ "$TRACE_ENABLED" = true ] || return 0
|
|
[ -n "$TRACE_FILE" ] && [ -f "$TRACE_FILE" ] || return 0
|
|
[ -n "$METRICS_FILE" ] && [ -f "$METRICS_FILE" ] || return 0
|
|
command -v yq >/dev/null 2>&1 || { log_warn "yq not found - skipping telemetry rollup"; return 0; }
|
|
|
|
# Aggregate totals + per-phase breakdown from the JSONL spans.
|
|
local totals_json
|
|
totals_json=$(jq -s '
|
|
{
|
|
total_cost_usd: (map(.cost_usd) | add // 0),
|
|
total_input_tokens: (map(.input_tokens) | add // 0),
|
|
total_output_tokens: (map(.output_tokens) | add // 0),
|
|
cache_read_tokens: (map(.cache_read) | add // 0),
|
|
phases_total: length,
|
|
phases_errored: (map(select(.is_error == true)) | length),
|
|
by_phase: (group_by(.name) | map({
|
|
key: (.[0].name // "unknown"),
|
|
value: {
|
|
calls: length,
|
|
cost_usd: (map(.cost_usd) | add // 0),
|
|
input_tokens: (map(.input_tokens) | add // 0),
|
|
output_tokens: (map(.output_tokens) | add // 0)
|
|
}
|
|
}) | from_entries)
|
|
}' "$TRACE_FILE" 2>/dev/null)
|
|
|
|
[ -z "$totals_json" ] && return 0
|
|
|
|
# Tool Call Success Rate (phase-level proxy): non-errored phases / total.
|
|
local rate
|
|
rate=$(printf '%s' "$totals_json" | jq -r '
|
|
if .phases_total > 0
|
|
then (((.phases_total - .phases_errored) / .phases_total) * 100 * 10 | round / 10)
|
|
else 0 end' 2>/dev/null)
|
|
|
|
# Write the telemetry block (yq reads the JSON via env to avoid quoting hell).
|
|
TELEMETRY_JSON="$totals_json" yq -i '.telemetry = (strenv(TELEMETRY_JSON) | from_json) | (.telemetry | ..) style=""' "$METRICS_FILE" 2>/dev/null || {
|
|
log_warn "Failed to write telemetry block to metrics"
|
|
return 0
|
|
}
|
|
yq -i ".telemetry.trace_id = \"$TRACE_ID\"" "$METRICS_FILE" 2>/dev/null || true
|
|
|
|
# 2026 business-outcome rates (gap #9). Derived deterministically from the
|
|
# counters already in metrics.yaml — no model, no estimation.
|
|
# - Tool Call Success Rate (phase-level proxy): non-errored phases / total
|
|
# - Task Completion Rate: completed stories / total stories
|
|
# - Escalation Rate: stories that exhausted retries / total stories
|
|
local total completed max_retries
|
|
total=$(yq '.stories.total // 0' "$METRICS_FILE" 2>/dev/null); total="${total:-0}"
|
|
completed=$(yq '.stories.completed // 0' "$METRICS_FILE" 2>/dev/null); completed="${completed:-0}"
|
|
max_retries=$(yq '.fix_loop.max_retries_hit // 0' "$METRICS_FILE" 2>/dev/null); max_retries="${max_retries:-0}"
|
|
|
|
local completion_rate=0 escalation_rate=0
|
|
if [ "$total" -gt 0 ] 2>/dev/null; then
|
|
completion_rate=$(awk "BEGIN { printf \"%.1f\", ($completed / $total) * 100 }")
|
|
escalation_rate=$(awk "BEGIN { printf \"%.1f\", ($max_retries / $total) * 100 }")
|
|
fi
|
|
|
|
yq -i ".telemetry.tool_call_success_rate = $rate" "$METRICS_FILE" 2>/dev/null || true
|
|
yq -i ".telemetry.task_completion_rate = $completion_rate" "$METRICS_FILE" 2>/dev/null || true
|
|
yq -i ".telemetry.escalation_rate = $escalation_rate" "$METRICS_FILE" 2>/dev/null || true
|
|
|
|
log "Telemetry rolled up into metrics: $METRICS_FILE"
|
|
return 0
|
|
}
|