#!/bin/bash
#
# BMAD Epic Execute - Observability Module
#
# Turns the telemetry that the `claude` CLI already reports (session id, token
# usage, cost, latency, context window) into OTel-shaped trace spans, and then
# aggregates those spans into deterministic (non-fabricated) metrics.
#
# Source: docs/improvements/observability-implementation-plan.md
#
# Usage: Sourced by epic-execute.sh. jq is a hard prerequisite (enforced by
# require_observability_deps below). The parent script must provide these
# globals: SPRINT_ARTIFACTS_DIR, EPIC_ID, LOG_FILE, VERBOSE.
#
# Gated behind BMAD_TRACE: tracing only runs when BMAD_TRACE=1 (off by default
# during the initial rollout). When tracing is disabled, the invocation helper
# in epic-execute.sh falls back to the legacy text path.
#

# =============================================================================
# Observability State
# =============================================================================

# One trace id per epic run; each phase's claude session_id acts as a span id.
# An id supplied by the environment is kept as-is.
TRACE_ID="${TRACE_ID:-}"

# Tracing stays off until init_observability turns it on for this run.
TRACE_ENABLED=false

# Append-only span log for this epic (path filled in by init_observability).
TRACE_FILE=""

# Ambient context attached to the next span. The phase functions set these
# right before each claude invocation (see Open Decision 1 in the
# implementation plan).
CURRENT_PHASE=""
CURRENT_STORY_ID=""

# Intra-phase heartbeat: while a phase runs, a liveness beat is appended to a
# .live.jsonl file every HEARTBEAT_INTERVAL seconds. Even after a hard kill
# (SIGKILL) mid-phase this leaves a forensic trail showing which phase was in
# flight and how far it got.
HEARTBEAT_INTERVAL="${BMAD_TRACE_HEARTBEAT_INTERVAL:-10}"
LIVE_TRACE_FILE=""
HEARTBEAT_PID=""
PHASE_START_SECONDS=""

# Main script PID ($$ is the sourcing shell, i.e. the epic-execute process).
MAIN_PID="$$"

# =============================================================================
# Dependency Enforcement (Open Decision 4: hard fail)
# =============================================================================

#######################################
# Enforce the jq prerequisite when tracing is requested.
# jq is load-bearing for observability in three places: the live renderer,
# span extraction, and clean .result extraction. When tracing is enabled it is
# a hard prerequisite — too much rides on it to silently degrade.
# Globals:   BMAD_TRACE (read)
# Arguments: none
# Returns:   0 when tracing is off or jq is available. Calls `exit 1` (ending
#            the sourcing process) when BMAD_TRACE=1 and jq is missing.
#######################################
require_observability_deps() {
  if [ "${BMAD_TRACE:-}" != "1" ]; then
    return 0
  fi

  if ! command -v jq >/dev/null 2>&1; then
    log_error "BMAD_TRACE=1 requires 'jq' but it was not found on PATH."
    log_error "Install jq (https://jqlang.github.io/jq/) or unset BMAD_TRACE."
    exit 1
  fi

  return 0
}

# =============================================================================
# Initialization
# =============================================================================

#######################################
# Initialize the trace for this epic. Mints a trace id (if not already set) and
# prepares the traces directory and span file paths. No-op unless BMAD_TRACE=1.
# Globals:   BMAD_TRACE, SPRINT_ARTIFACTS_DIR, EPIC_ID (read)
#            TRACE_ID, TRACES_DIR (read; written when empty)
#            TRACE_FILE, LIVE_TRACE_FILE, TRACE_ENABLED (written)
# Arguments: none
# Returns:   0 when tracing is disabled or was initialized;
#            1 when tracing was requested but cannot be set up (missing
#            globals, or the traces directory cannot be created). In that case
#            TRACE_ENABLED is left false so later span writes become no-ops.
#######################################
init_observability() {
  if [ "${BMAD_TRACE:-}" != "1" ]; then
    TRACE_ENABLED=false
    return 0
  fi

  # ${VAR:-} keeps these reads safe when the parent script runs under `set -u`.
  if [ -z "${SPRINT_ARTIFACTS_DIR:-}" ] || [ -z "${EPIC_ID:-}" ]; then
    log_warn "Cannot initialize tracing: SPRINT_ARTIFACTS_DIR or EPIC_ID not set"
    TRACE_ENABLED=false
    return 1
  fi

  if [ -z "${TRACE_ID:-}" ]; then
    if command -v uuidgen >/dev/null 2>&1; then
      TRACE_ID=$(uuidgen 2>/dev/null | tr '[:upper:]' '[:lower:]')
    fi
    # Fallback id: epic + pid + start seconds (still unique per run). Also
    # covers a uuidgen that exists but printed nothing.
    if [ -z "${TRACE_ID:-}" ]; then
      TRACE_ID="epic-${EPIC_ID}-$$-$(date +%s)"
    fi
  fi

  TRACES_DIR="${TRACES_DIR:-$SPRINT_ARTIFACTS_DIR/traces}"

  # Span writes are best-effort and silent, so an unusable directory must be
  # reported here; otherwise tracing would look enabled while recording nothing.
  if ! mkdir -p -- "$TRACES_DIR" 2>/dev/null; then
    log_warn "Cannot initialize tracing: unable to create traces dir: $TRACES_DIR"
    TRACE_ENABLED=false
    return 1
  fi

  TRACE_FILE="$TRACES_DIR/epic-${EPIC_ID}-trace.jsonl"
  LIVE_TRACE_FILE="$TRACES_DIR/epic-${EPIC_ID}-trace.live.jsonl"
  TRACE_ENABLED=true

  log "Tracing enabled (trace_id=$TRACE_ID) -> $TRACE_FILE"
  return 0
}

# Set the ambient context for the next span. Called by phase functions before
# invoking claude (ambient-vars approach, Open Decision 1).
set_span_context() {
  # $1 - phase name, $2 - story id; both default to empty when omitted.
  CURRENT_PHASE="${1:-}"
  CURRENT_STORY_ID="${2:-}"
}

# =============================================================================
# Intra-phase Heartbeat (liveness for debugging hangs / hard kills)
# =============================================================================

#######################################
# Append one heartbeat record describing the in-flight phase.
# The stream is scanned with grep on every tick, but only the latest assistant
# event is kept and handed to jq (no slurp of the whole stream into jq).
# Best-effort: a transient partial line, a missing jq, or an unwritable live
# file just skips this tick.
# Globals:   TRACE_ENABLED, LIVE_TRACE_FILE, TRACE_ID, CURRENT_PHASE,
#            CURRENT_STORY_ID, PHASE_START_SECONDS (read)
# Arguments: $1 - raw stream file (JSONL) of the running phase
# Outputs:   appends one JSON line to LIVE_TRACE_FILE
# Returns:   0 always
#######################################
emit_heartbeat() {
  local stream_file="${1:-}"
  {
    [ "${TRACE_ENABLED:-false}" = true ] \
      && [ -n "${LIVE_TRACE_FILE:-}" ] \
      && [ -f "$stream_file" ]
  } || return 0

  local now elapsed ts events last_asst
  now=$(date +%s)
  # Without a recorded phase start the elapsed time is reported as 0.
  elapsed=$(( now - ${PHASE_START_SECONDS:-$now} ))
  ts=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
  # Some wc implementations pad the count with spaces; strip them for --argjson.
  events=$(wc -l < "$stream_file" 2>/dev/null | tr -d ' ')
  events="${events:-0}"
  last_asst=$(grep '"type":"assistant"' "$stream_file" 2>/dev/null | tail -n 1)

  if [ -n "$last_asst" ]; then
    printf '%s' "$last_asst" | jq -c \
      --arg trace_id "${TRACE_ID:-}" \
      --arg phase "${CURRENT_PHASE:-}" \
      --arg story "${CURRENT_STORY_ID:-}" \
      --arg ts "$ts" \
      --argjson elapsed "$elapsed" \
      --argjson events "$events" '
      (.message.usage // {}) as $u
      | {
          trace_id: $trace_id,
          kind: "heartbeat",
          phase: $phase,
          story_id: $story,
          elapsed_s: $elapsed,
          ts: $ts,
          events: $events,
          ctx_input_tokens: ($u.input_tokens // 0),
          cache_read: ($u.cache_read_input_tokens // 0),
          out_tokens: ($u.output_tokens // 0),
          last_text: ((([ .message.content[]? | select(.type=="text") | .text ] | last) // "")[0:160])
        }' \
      >> "$LIVE_TRACE_FILE" 2>/dev/null || true
  else
    # No assistant output yet — still emit a beat so you can see it started.
    # Carries the same keys as the branch above (cache_read included) so every
    # heartbeat record has one schema.
    jq -nc \
      --arg trace_id "${TRACE_ID:-}" \
      --arg phase "${CURRENT_PHASE:-}" \
      --arg story "${CURRENT_STORY_ID:-}" \
      --arg ts "$ts" \
      --argjson elapsed "$elapsed" \
      --argjson events "$events" \
      '{trace_id:$trace_id, kind:"heartbeat", phase:$phase, story_id:$story, elapsed_s:$elapsed, ts:$ts, events:$events, ctx_input_tokens:0, cache_read:0, out_tokens:0, last_text:""}' \
      >> "$LIVE_TRACE_FILE" 2>/dev/null || true
  fi
}

#######################################
# Start a background heartbeat for the current phase. Emits an immediate beat,
# then one every HEARTBEAT_INTERVAL seconds until the phase ends or the main
# process dies (self-terminates within one interval on SIGKILL).
# Any heartbeat still running from an earlier call is stopped first, so a
# missed stop_phase_heartbeat cannot leak a background loop.
# Globals:   TRACE_ENABLED, HEARTBEAT_INTERVAL, MAIN_PID (read)
#            PHASE_START_SECONDS, HEARTBEAT_PID (written)
# Arguments: $1 - raw stream file (JSONL) of the running phase
# Returns:   0 (no-op when tracing is off or the interval is not a positive
#            integer)
#######################################
start_phase_heartbeat() {
  local stream_file="${1:-}"
  {
    [ "${TRACE_ENABLED:-false}" = true ] \
      && [ "${HEARTBEAT_INTERVAL:-0}" -gt 0 ] 2>/dev/null
  } || return 0

  stop_phase_heartbeat

  PHASE_START_SECONDS=$(date +%s)
  emit_heartbeat "$stream_file"

  (
    sleep_pid=""
    # On TERM take the in-flight sleep down too; an orphaned sleep would keep
    # inherited descriptors open for up to one full interval.
    trap '[ -z "$sleep_pid" ] || kill "$sleep_pid" 2>/dev/null; exit 0' TERM
    while kill -0 "${MAIN_PID:-$$}" 2>/dev/null; do
      # Sleep in the background and wait on it: the TERM trap can only run
      # promptly while the shell sits in `wait`, not in a foreground sleep.
      sleep "$HEARTBEAT_INTERVAL" &
      sleep_pid=$!
      # A failing sleep ends the loop instead of spinning.
      wait "$sleep_pid" 2>/dev/null || break
      emit_heartbeat "$stream_file"
    done
  ) >/dev/null 2>&1 &
  # The loop runs detached from stdout/stderr so it can never hold open a pipe
  # of a caller that captures output with $(...). Beats go to the live file via
  # their own redirection.
  HEARTBEAT_PID=$!
}

#######################################
# Stop the background heartbeat (called when the phase's claude call returns,
# and defensively from cleanup()). Safe to call when none is running.
# Globals:   HEARTBEAT_PID (read, cleared)
# Arguments: none
# Returns:   0 always
#######################################
stop_phase_heartbeat() {
  [ -n "${HEARTBEAT_PID:-}" ] || return 0
  kill "$HEARTBEAT_PID" 2>/dev/null || true
  # Reap the job so a later kill -0 on this pid cannot see a zombie.
  wait "$HEARTBEAT_PID" 2>/dev/null || true
  HEARTBEAT_PID=""
}

# =============================================================================
# Span Recording
# =============================================================================

#######################################
# Append one OTel-shaped span for the just-completed claude phase.
# Reads the final `result` envelope from the raw stream file (expected to be
# the last JSONL line) and derives the telemetry fields. When the last line is
# not a result event, a degraded error span is written instead.
# Globals:   TRACE_ENABLED, TRACE_FILE, TRACE_ID, CURRENT_PHASE,
#            CURRENT_STORY_ID, VERBOSE (read)
# Arguments:
#   $1 - raw stream file (JSONL emitted by claude --output-format stream-json)
#   $2 - phase status (optional; the downstream completion status, e.g. COMPLETE)
# Outputs:   appends one JSON line to TRACE_FILE; may log a warning
# Returns:   0 always (span recording is best-effort)
#######################################
record_span() {
  local stream_file="${1:-}"
  local status="${2:-}"

  [ "${TRACE_ENABLED:-false}" = true ] || return 0
  [ -n "${TRACE_FILE:-}" ] || return 0
  [ -f "$stream_file" ] || return 0

  # The result envelope is always the last line. A single JSONL line, so this
  # is memory-cheap regardless of phase length.
  local envelope
  envelope=$(tail -n 1 "$stream_file" 2>/dev/null)

  # Validate it is actually the result event; if the call crashed mid-stream
  # the last line won't be a result — record a degraded error span instead.
  local etype
  etype=$(printf '%s' "$envelope" | jq -r '.type // empty' 2>/dev/null) || true

  local ts
  ts=$(date -u +"%Y-%m-%dT%H:%M:%SZ")

  if [ "$etype" != "result" ]; then
    jq -nc \
      --arg trace_id "${TRACE_ID:-}" \
      --arg parent "${CURRENT_STORY_ID:-}" \
      --arg name "${CURRENT_PHASE:-}" \
      --arg story_id "${CURRENT_STORY_ID:-}" \
      --arg status "${status:-UNKNOWN}" \
      --arg ts "$ts" \
      '{trace_id:$trace_id, span_id:null, parent:$parent, name:$name, story_id:$story_id, model:null, input_tokens:0, output_tokens:0, cache_read:0, cost_usd:0, duration_ms:0, ttft_ms:0, num_turns:0, is_error:true, api_error_status:"no_result_envelope", ctx_util_pct:0, status:$status, ts:$ts}' \
      >> "$TRACE_FILE" 2>/dev/null || true
    if [ "${VERBOSE:-}" = true ]; then
      log_warn "record_span: no result envelope in stream (degraded span)"
    fi
    return 0
  fi

  # Pull the primary (largest-context) model for ctx utilization. Sub-agent
  # usage is preserved verbatim under model/cost via the envelope, but for the
  # span's headline model we take the model with the largest context window.
  # NOTE(review): when modelUsage is empty the model falls back to
  # .usage.service_tier, which looks like a tier label rather than a model
  # name — confirm this fallback is intended.
  printf '%s' "$envelope" | jq -c \
    --arg trace_id "${TRACE_ID:-}" \
    --arg parent "${CURRENT_STORY_ID:-}" \
    --arg name "${CURRENT_PHASE:-}" \
    --arg story_id "${CURRENT_STORY_ID:-}" \
    --arg status "$status" \
    --arg ts "$ts" '
    # Choose headline model = the one with the largest contextWindow
    (.modelUsage // {}) as $mu
    | ($mu | to_entries | sort_by(.value.contextWindow // 0) | last) as $primary
    | ($primary.key // .usage.service_tier // null) as $model
    | (($primary.value.contextWindow) // 0) as $ctxwin
    | ((.usage.input_tokens // 0)
        + (.usage.cache_read_input_tokens // 0)
        + (.usage.cache_creation_input_tokens // 0)) as $ctx_used
    | (if $ctxwin > 0
        then (($ctx_used / $ctxwin) * 100 * 10 | round / 10)
        else 0 end) as $ctx_pct
    | {
        trace_id: $trace_id,
        span_id: (.session_id // null),
        parent: $parent,
        name: $name,
        story_id: $story_id,
        model: $model,
        input_tokens: (.usage.input_tokens // 0),
        output_tokens: (.usage.output_tokens // 0),
        cache_read: (.usage.cache_read_input_tokens // 0),
        cost_usd: (.total_cost_usd // 0),
        duration_ms: (.duration_ms // 0),
        ttft_ms: (.ttft_ms // 0),
        num_turns: (.num_turns // 0),
        is_error: (.is_error // false),
        api_error_status: (.api_error_status // null),
        ctx_util_pct: $ctx_pct,
        status: $status,
        ts: $ts
      }' >> "$TRACE_FILE" 2>/dev/null || true

  # Context-utilization warning feeds the Context Strategy gap (#7).
  # Integer (floored) percentage so it can be compared with [ -ge ].
  local ctx_pct
  ctx_pct=$(printf '%s' "$envelope" | jq -r '
    (.modelUsage // {} | to_entries | sort_by(.value.contextWindow // 0) | last) as $p
    | (($p.value.contextWindow) // 0) as $w
    | ((.usage.input_tokens // 0)
        + (.usage.cache_read_input_tokens // 0)
        + (.usage.cache_creation_input_tokens // 0)) as $u
    | if $w > 0 then (($u / $w) * 100 | floor) else 0 end' 2>/dev/null)

  if [ -n "$ctx_pct" ] && [ "$ctx_pct" -ge 80 ] 2>/dev/null; then
    log_warn "Context utilization high: ${ctx_pct}% (${CURRENT_PHASE:-} / ${CURRENT_STORY_ID:-})"
  fi

  return 0
}

# =============================================================================
# Rollup (deterministic telemetry into metrics.yaml)
# =============================================================================

#######################################
# Sum all spans for this epic and write a telemetry block into metrics.yaml.
# Deterministic: no model involved, no fabrication. No-op without yq.
# Globals:   TRACE_ENABLED, TRACE_FILE, METRICS_FILE, TRACE_ID (read)
# Arguments: none
# Outputs:   rewrites METRICS_FILE in place (adds/replaces `.telemetry`)
# Returns:   0 always (rollup is best-effort)
#######################################
rollup_telemetry() {
  [ "${TRACE_ENABLED:-false}" = true ] || return 0
  { [ -n "${TRACE_FILE:-}" ] && [ -f "$TRACE_FILE" ]; } || return 0
  { [ -n "${METRICS_FILE:-}" ] && [ -f "$METRICS_FILE" ]; } || return 0
  if ! command -v yq >/dev/null 2>&1; then
    log_warn "yq not found - skipping telemetry rollup"
    return 0
  fi

  # Aggregate totals + per-phase breakdown from the JSONL spans.
  local totals_json
  totals_json=$(jq -s '
    {
      total_cost_usd: (map(.cost_usd) | add // 0),
      total_input_tokens: (map(.input_tokens) | add // 0),
      total_output_tokens: (map(.output_tokens) | add // 0),
      cache_read_tokens: (map(.cache_read) | add // 0),
      phases_total: length,
      phases_errored: (map(select(.is_error == true)) | length),
      by_phase: (group_by(.name) | map({
        key: (.[0].name // "unknown"),
        value: {
          calls: length,
          cost_usd: (map(.cost_usd) | add // 0),
          input_tokens: (map(.input_tokens) | add // 0),
          output_tokens: (map(.output_tokens) | add // 0)
        }
      }) | from_entries)
    }' "$TRACE_FILE" 2>/dev/null)

  [ -n "$totals_json" ] || return 0

  # Tool Call Success Rate (phase-level proxy): non-errored phases / total.
  local rate
  rate=$(printf '%s' "$totals_json" | jq -r '
    if .phases_total > 0
    then (((.phases_total - .phases_errored) / .phases_total) * 100 * 10 | round / 10)
    else 0 end' 2>/dev/null)
  # rate is spliced into a yq expression below; anything that is not a plain
  # decimal (including an empty result from a failed jq) becomes 0.
  case "$rate" in
    '' | *[!0-9.]*) rate=0 ;;
  esac

  # Write the telemetry block (yq reads the JSON via env to avoid quoting hell).
  if ! TELEMETRY_JSON="$totals_json" yq -i '.telemetry = (strenv(TELEMETRY_JSON) | from_json) | (.telemetry | ..) style=""' "$METRICS_FILE" 2>/dev/null; then
    log_warn "Failed to write telemetry block to metrics"
    return 0
  fi

  # The trace id may come from the environment, so it is passed as data via
  # strenv() instead of being interpolated into the yq expression.
  BMAD_ROLLUP_TRACE_ID="${TRACE_ID:-}" \
    yq -i '.telemetry.trace_id = strenv(BMAD_ROLLUP_TRACE_ID)' "$METRICS_FILE" 2>/dev/null || true

  # 2026 business-outcome rates (gap #9). Derived deterministically from the
  # counters already in metrics.yaml — no model, no estimation.
  #   - Tool Call Success Rate (phase-level proxy): non-errored phases / total
  #   - Task Completion Rate: completed stories / total stories
  #   - Escalation Rate: stories that exhausted retries / total stories
  local total completed max_retries
  total=$(yq '.stories.total // 0' "$METRICS_FILE" 2>/dev/null)
  total="${total:-0}"
  completed=$(yq '.stories.completed // 0' "$METRICS_FILE" 2>/dev/null)
  completed="${completed:-0}"
  max_retries=$(yq '.fix_loop.max_retries_hit // 0' "$METRICS_FILE" 2>/dev/null)
  max_retries="${max_retries:-0}"

  local completion_rate=0 escalation_rate=0
  if [ "$total" -gt 0 ] 2>/dev/null; then
    # Values go in through -v so they are treated as data, never as awk code;
    # LC_ALL=C pins "." as the decimal separator.
    completion_rate=$(LC_ALL=C awk -v num="$completed" -v den="$total" \
      'BEGIN { printf "%.1f", (num / den) * 100 }' 2>/dev/null)
    escalation_rate=$(LC_ALL=C awk -v num="$max_retries" -v den="$total" \
      'BEGIN { printf "%.1f", (num / den) * 100 }' 2>/dev/null)
    completion_rate="${completion_rate:-0}"
    escalation_rate="${escalation_rate:-0}"
  fi

  yq -i ".telemetry.tool_call_success_rate = $rate" "$METRICS_FILE" 2>/dev/null || true
  yq -i ".telemetry.task_completion_rate = $completion_rate" "$METRICS_FILE" 2>/dev/null || true
  yq -i ".telemetry.escalation_rate = $escalation_rate" "$METRICS_FILE" 2>/dev/null || true

  log "Telemetry rolled up into metrics: $METRICS_FILE"
  return 0
}