feat(scripts): port memory-safe execution and reliability improvements from revive-dev

Sync functional improvements developed in revive-dev into the BMAD-METHOD fork while preserving repo-specific paths:

- Add memory-safe Claude helpers (run_claude_to_file, read_phase_tail) that pipe output to temp files instead of bash variables, preventing GB-scale RAM usage during long epic executions
- Add kill_orphaned_test_processes() to clean up zombie jest/vitest/playwright/pytest processes between stories and on exit
- Replace per-call `env -u CLAUDECODE` with global `unset CLAUDECODE` at script start for cleaner nested session support
- Port metrics resume/accumulation logic that restores counters from existing YAML on resumed runs and accumulates duration
- Add log truncation between stories (64KB cap) to prevent unbounded log growth across multi-story runs
- Add log persistence and cleanup trap to epic-chain.sh
- Revert regression-gate.sh test commands to direct execution (matching revive-dev pattern)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-12 11:52:39 -05:00 · 2026-04-12 11:52:39 -05:00 · cdc92d0d90
parent 1eedebaca8
commit cdc92d0d90
7 changed files with 285 additions and 111 deletions
--- a/scripts/epic-chain.sh
+++ b/scripts/epic-chain.sh
@ -32,6 +32,9 @@

 set -e

+# Allow nested Claude Code sessions (when launched from within Claude Code)
+unset CLAUDECODE 2>/dev/null || true
+
 # =============================================================================
 # Configuration
 # =============================================================================
@ -47,6 +50,8 @@ UAT_DIR="$PROJECT_ROOT/docs/uat"
 HANDOFF_DIR="$PROJECT_ROOT/docs/handoffs"

 LOG_FILE="/tmp/bmad-epic-chain-$$.log"
+LOGS_DIR="$SPRINT_ARTIFACTS_DIR/logs"
+FINAL_LOG_FILE=""
 CHAIN_PLAN_FILE="$SPRINT_ARTIFACTS_DIR/chain-plan.yaml"

 # Colors for output
@ -110,6 +115,76 @@ log_section() {
    echo -e "${BOLD}───────────────────────────────────────────────────────────${NC}"
 }

+# =============================================================================
+# Orphaned Process Cleanup
+# =============================================================================
+
+kill_orphaned_test_processes() {
+    # Best-effort: send SIGTERM (plain `kill`) to every live process whose
+    # command line matches a known test-runner/dev-server pattern, then log
+    # once if anything was signalled.
+    # NOTE: `pgrep -f` matches system-wide, not only children of this script.
+    # Globals:   calls log(); writes no globals
+    # Arguments: none
+    # Returns:   0; pgrep and kill failures are ignored
+    local killed=0
+    local pattern pid pids
+
+    for pattern in "node.*jest" "node.*vitest" "node.*playwright" \
+                   "node.*next.*dev" "node.*tsx.*watch" "python.*pytest"; do
+        pids=$(pgrep -f "$pattern" 2>/dev/null || true)
+        [ -n "$pids" ] || continue
+
+        # Read PIDs from a here-string, not a pipe: a piped `while` runs in a
+        # subshell, so the increments to $killed were lost and the summary
+        # message below could never be logged.
+        while read -r pid; do
+            if [ -n "$pid" ] && kill -0 "$pid" 2>/dev/null; then
+                kill "$pid" 2>/dev/null || true
+                killed=$((killed + 1))
+            fi
+        done <<< "$pids"
+    done
+
+    if [ "$killed" -gt 0 ]; then
+        log "Killed orphaned test processes"
+    fi
+    return 0
+}
+
+# =============================================================================
+# Log Persistence
+# =============================================================================
+
+save_log_to_repo() {
+    # Copy the temp log into $LOGS_DIR under a timestamped name. Sets
+    # FINAL_LOG_FILE to the destination path; no-op if the log is missing/empty.
+    [ -f "$LOG_FILE" ] && [ -s "$LOG_FILE" ] || return 0
+    mkdir -p "$LOGS_DIR" 2>/dev/null || true
+
+    local stamp
+    stamp=$(date '+%Y%m%d-%H%M%S')
+    FINAL_LOG_FILE="$LOGS_DIR/epic-chain-${stamp}.log"
+    # Append the "saved" marker to the copy only when the copy succeeded.
+    if ! cp "$LOG_FILE" "$FINAL_LOG_FILE" 2>/dev/null; then
+        return 0
+    fi
+    echo "[$(date '+%Y-%m-%d %H:%M:%S')] Log saved to: $FINAL_LOG_FILE" >> "$FINAL_LOG_FILE"
+}
+
+cleanup_chain() {
+    # Exit/signal handler: capture the triggering status first, reset the
+    # traps so the final `exit` does not re-enter this handler, kill leftover
+    # test processes, persist the log, then exit with the captured status.
+    local rc=$?
+    trap - EXIT INT TERM
+    kill_orphaned_test_processes
+    save_log_to_repo
+    [ -z "$FINAL_LOG_FILE" ] || [ ! -f "$FINAL_LOG_FILE" ] || echo "    - Log saved:     $FINAL_LOG_FILE"
+    exit $rc
+}
+
+trap cleanup_chain EXIT INT TERM
+
 # Helper function to create basic report if Claude fails
 create_basic_report() {
    local end_time_iso=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
@ -578,6 +653,9 @@ for current_idx in "${!EXECUTION_ORDER[@]}"; do

            ((COMPLETED_EPICS++))

+            # Kill orphaned node/test processes between epics
+            kill_orphaned_test_processes
+
            # Generate handoff for next epic
            if [ "$NO_HANDOFF" = false ]; then
                next_idx=$((current_idx + 1))
@ -803,7 +881,7 @@ REPORT_GENERATED: $CHAIN_REPORT_FILE"
        log "Invoking report generator..."

        # Execute report generation
-        report_result=$(env -u CLAUDECODE claude --dangerously-skip-permissions -p "$report_prompt" 2>&1) || true
+        report_result=$(claude --dangerously-skip-permissions -p "$report_prompt" 2>&1) || true

        echo "$report_result" >> "$LOG_FILE"

@ -847,7 +925,7 @@ echo "    - Metrics:       $METRICS_DIR/"
 if [ -f "$CHAIN_REPORT_FILE" ]; then
 echo "    - Report:        $CHAIN_REPORT_FILE"
 fi
-echo "    - Log:           $LOG_FILE"
+echo "    - Log:           (saved on exit to $LOGS_DIR/)"
 echo ""

 if [ $FAILED_EPICS -gt 0 ]; then
--- a/scripts/epic-execute-lib/design-phase.sh
+++ b/scripts/epic-execute-lib/design-phase.sh
@ -142,10 +142,10 @@ DESIGN COMPLETE: $story_id"
        return 0
    fi

+    # Pipe to file to avoid memory bloat
+    run_claude_to_file "$design_prompt"
    local result
-    result=$(env -u CLAUDECODE claude --dangerously-skip-permissions -p "$design_prompt" 2>&1) || true
-
-    echo "$result" >> "$LOG_FILE"
+    result=$(read_phase_tail)

    # Extract design block
    LAST_DESIGN=$(echo "$result" | sed -n '/DESIGN START/,/DESIGN END/p')
--- a/scripts/epic-execute-lib/regression-gate.sh
+++ b/scripts/epic-execute-lib/regression-gate.sh
@ -126,9 +126,9 @@ init_regression_baseline() {

            # Check if there's a test:json script for better parsing
            if grep -q '"test:json"' "$PROJECT_ROOT/package.json" 2>/dev/null; then
-                test_output=$(cd "$PROJECT_ROOT" && run_with_timeout "${REGRESSION_TEST_TIMEOUT:-120}" npm run test:json) || true
+                test_output=$(cd "$PROJECT_ROOT" && npm run test:json 2>&1) || true
            else
-                test_output=$(cd "$PROJECT_ROOT" && run_with_timeout "${REGRESSION_TEST_TIMEOUT:-120}" npm test) || true
+                test_output=$(cd "$PROJECT_ROOT" && npm test 2>&1) || true
            fi

            BASELINE_PASSING_TESTS=$(extract_test_count "$test_output")
@ -152,14 +152,14 @@ init_regression_baseline() {
    elif [ -f "$PROJECT_ROOT/Cargo.toml" ]; then
        # Rust project
        log "Capturing baseline test count (Rust)..."
-        test_output=$(cd "$PROJECT_ROOT" && run_with_timeout "${REGRESSION_TEST_TIMEOUT:-120}" cargo test) || true
+        test_output=$(cd "$PROJECT_ROOT" && cargo test 2>&1) || true
        BASELINE_PASSING_TESTS=$(extract_test_count "$test_output")
        log "Baseline passing tests: $BASELINE_PASSING_TESTS"

    elif [ -f "$PROJECT_ROOT/go.mod" ]; then
        # Go project
        log "Capturing baseline test count (Go)..."
-        test_output=$(cd "$PROJECT_ROOT" && run_with_timeout "${REGRESSION_TEST_TIMEOUT:-120}" go test ./... -v) || true
+        test_output=$(cd "$PROJECT_ROOT" && go test ./... -v 2>&1) || true
        BASELINE_PASSING_TESTS=$(extract_test_count "$test_output")
        log "Baseline passing tests: $BASELINE_PASSING_TESTS"

@ -167,7 +167,7 @@ init_regression_baseline() {
        # Python project
        if command -v pytest >/dev/null 2>&1; then
            log "Capturing baseline test count (Python)..."
-            test_output=$(cd "$PROJECT_ROOT" && run_with_timeout "${REGRESSION_TEST_TIMEOUT:-120}" pytest -v) || true
+            test_output=$(cd "$PROJECT_ROOT" && pytest -v 2>&1) || true
            BASELINE_PASSING_TESTS=$(extract_test_count "$test_output")
            log "Baseline passing tests: $BASELINE_PASSING_TESTS"
        fi
@ -199,23 +199,23 @@ execute_regression_gate() {
    if [ -f "$PROJECT_ROOT/package.json" ]; then
        # Check if there's a test:json script for better parsing
        if grep -q '"test:json"' "$PROJECT_ROOT/package.json" 2>/dev/null; then
-            test_output=$(cd "$PROJECT_ROOT" && run_with_timeout "${REGRESSION_TEST_TIMEOUT:-120}" npm run test:json) || true
+            test_output=$(cd "$PROJECT_ROOT" && npm run test:json 2>&1) || true
        else
-            test_output=$(cd "$PROJECT_ROOT" && run_with_timeout "${REGRESSION_TEST_TIMEOUT:-120}" npm test) || true
+            test_output=$(cd "$PROJECT_ROOT" && npm test 2>&1) || true
        fi
        current_tests=$(extract_test_count "$test_output")

    elif [ -f "$PROJECT_ROOT/Cargo.toml" ]; then
-        test_output=$(cd "$PROJECT_ROOT" && run_with_timeout "${REGRESSION_TEST_TIMEOUT:-120}" cargo test) || true
+        test_output=$(cd "$PROJECT_ROOT" && cargo test 2>&1) || true
        current_tests=$(extract_test_count "$test_output")

    elif [ -f "$PROJECT_ROOT/go.mod" ]; then
-        test_output=$(cd "$PROJECT_ROOT" && run_with_timeout "${REGRESSION_TEST_TIMEOUT:-120}" go test ./... -v) || true
+        test_output=$(cd "$PROJECT_ROOT" && go test ./... -v 2>&1) || true
        current_tests=$(extract_test_count "$test_output")

    elif [ -f "$PROJECT_ROOT/requirements.txt" ] || [ -f "$PROJECT_ROOT/pyproject.toml" ]; then
        if command -v pytest >/dev/null 2>&1; then
-            test_output=$(cd "$PROJECT_ROOT" && run_with_timeout "${REGRESSION_TEST_TIMEOUT:-120}" pytest -v) || true
+            test_output=$(cd "$PROJECT_ROOT" && pytest -v 2>&1) || true
            current_tests=$(extract_test_count "$test_output")
        fi
    fi
--- a/scripts/epic-execute-lib/tdd-flow.sh
+++ b/scripts/epic-execute-lib/tdd-flow.sh
@ -168,10 +168,10 @@ After outputting the spec block:
        return 0
    fi

+    # Pipe to file to avoid memory bloat
+    run_claude_to_file "$spec_prompt"
    local result
-    result=$(env -u CLAUDECODE claude --dangerously-skip-permissions -p "$spec_prompt" 2>&1) || true
-
-    echo "$result" >> "$LOG_FILE"
+    result=$(read_phase_tail)

    # Extract test spec block
    LAST_TEST_SPEC=$(echo "$result" | sed -n '/TEST SPEC START/,/TEST SPEC END/p')
@ -314,10 +314,10 @@ After implementing the tests:
        return 0
    fi

+    # Pipe to file to avoid memory bloat
+    run_claude_to_file "$impl_prompt"
    local result
-    result=$(env -u CLAUDECODE claude --dangerously-skip-permissions -p "$impl_prompt" 2>&1) || true
-
-    echo "$result" >> "$LOG_FILE"
+    result=$(read_phase_tail)

    # Check completion
    local completion_status
--- a/scripts/epic-execute-lib/utils.sh
+++ b/scripts/epic-execute-lib/utils.sh
@ -137,7 +137,7 @@ execute_claude_with_retry() {

    # Wrapper function for retry
    _claude_invoke() {
-        timeout "$timeout" env -u CLAUDECODE claude --dangerously-skip-permissions -p "$1" 2>&1
+        timeout "$timeout" claude --dangerously-skip-permissions -p "$1" 2>&1
        local code=$?
        if [ $code -eq 124 ]; then
            echo "TIMEOUT: Claude invocation timed out after ${timeout}s"
@ -626,7 +626,7 @@ execute_claude_verbose() {

        # Execute with output tee'd to both terminal and log file
        local result
-        result=$(timeout "$timeout" env -u CLAUDECODE claude --dangerously-skip-permissions -p "$prompt" 2>&1 | tee -a "$LOG_FILE")
+        result=$(timeout "$timeout" claude --dangerously-skip-permissions -p "$prompt" 2>&1 | tee -a "$LOG_FILE")
        local exit_code=$?

        if [ $exit_code -eq 124 ]; then
@ -640,7 +640,7 @@ execute_claude_verbose() {
    else
        # Non-verbose mode: capture output silently
        local result
-        result=$(timeout "$timeout" env -u CLAUDECODE claude --dangerously-skip-permissions -p "$prompt" 2>&1)
+        result=$(timeout "$timeout" claude --dangerously-skip-permissions -p "$prompt" 2>&1)
        local exit_code=$?

        # Log to file only
--- a/scripts/epic-execute.sh
+++ b/scripts/epic-execute.sh
@ -20,6 +20,9 @@

 set -e

+# Allow nested Claude Code sessions (when launched from within Claude Code)
+unset CLAUDECODE 2>/dev/null || true
+
 # =============================================================================
 # Cleanup and Signal Handling
 # =============================================================================
@ -91,6 +94,12 @@ cleanup() {
        fi
    fi

+    # Kill orphaned node/test processes
+    kill_orphaned_test_processes
+
+    # Clean up phase output temp file
+    rm -f "$PHASE_OUTPUT_FILE" 2>/dev/null
+
    # Save log to repo before exiting
    save_log_to_repo
    if [ -n "$FINAL_LOG_FILE" ] && [ -f "$FINAL_LOG_FILE" ]; then
@ -275,6 +284,46 @@ flush_log_to_repo() {
    cp "$LOG_FILE" "$flush_file" 2>/dev/null || true
 }

+# =============================================================================
+# Orphaned Process Cleanup
+# =============================================================================
+
+kill_orphaned_test_processes() {
+    # Best-effort cleanup of test runners and dev servers left behind by story
+    # execution (jest, vitest, playwright, next dev, tsx watch, pytest), which
+    # can otherwise accumulate and consume memory across stories.
+    # Sends SIGTERM (plain `kill`) to each live match and logs once if any
+    # process was signalled.
+    # NOTE: `pgrep -f` matches command lines system-wide, so processes not
+    # started by this script are also signalled when they match a pattern.
+    # Globals:   calls log(); writes no globals
+    # Arguments: none
+    # Returns:   0; pgrep and kill failures are ignored
+    local killed=0
+    local pattern pid pids
+
+    for pattern in "node.*jest" "node.*vitest" "node.*playwright" \
+                   "node.*next.*dev" "node.*tsx.*watch" "python.*pytest"; do
+        pids=$(pgrep -f "$pattern" 2>/dev/null || true)
+        [ -n "$pids" ] || continue
+
+        # Read PIDs from a here-string, not a pipe: a piped `while` runs in a
+        # subshell, so the increments to $killed were lost and the summary
+        # message below could never be logged.
+        while read -r pid; do
+            if [ -n "$pid" ] && kill -0 "$pid" 2>/dev/null; then
+                kill "$pid" 2>/dev/null || true
+                killed=$((killed + 1))
+            fi
+        done <<< "$pids"
+    done
+
+    if [ "$killed" -gt 0 ]; then
+        log "Killed orphaned test processes"
+    fi
+    return 0
+}
+
 # =============================================================================
 # Git Safety Functions
 # =============================================================================
@ -449,6 +498,50 @@ log_prompt_size() {
    fi
 }

+# =============================================================================
+# Memory-safe Claude execution helpers
+# =============================================================================
+# Instead of capturing claude output into a bash variable (which can consume
+# gigabytes of RAM), pipe output directly to a temp file and read only the
+# tail for completion signal parsing.
+
+# Temp file for current phase output (reused across phases, cleaned up on exit)
+PHASE_OUTPUT_FILE="/tmp/bmad-phase-output-$$.txt"
+
+# Invoke claude non-interactively, appending its combined stdout/stderr to
+# LOG_FILE and writing the same stream to PHASE_OUTPUT_FILE (emptied at the
+# start of every call), so the output never passes through a shell variable.
+# Arguments:
+#   $1 - the prompt text, or the literal "-f" to pass a prompt file instead
+#   $2 - prompt file path (consulted only when $1 is "-f")
+# Returns: 0 always; a failing pipeline is swallowed by `|| true`
+run_claude_to_file() {
+    local -a claude_args=(--dangerously-skip-permissions)
+    if [ "$1" = "-f" ]; then
+        claude_args+=(-f "$2")
+    else
+        claude_args+=(-p "$1")
+    fi
+    : > "$PHASE_OUTPUT_FILE"
+    claude "${claude_args[@]}" 2>&1 | tee -a "$LOG_FILE" > "$PHASE_OUTPUT_FILE" || true
+}
+
+# Print at most the final 32768 bytes (32KB) of PHASE_OUTPUT_FILE on stdout so
+# callers can scan for completion signals without loading the whole phase
+# output into a variable. Prints an empty line instead if tail fails.
+read_phase_tail() {
+    if ! tail -c 32768 "$PHASE_OUTPUT_FILE" 2>/dev/null; then
+        echo ""
+    fi
+}
+
+# Dump the complete contents of PHASE_OUTPUT_FILE to stdout, or an empty line
+# if cat fails. Prefer read_phase_tail unless the full output must be searched.
+read_phase_output() {
+    cat "$PHASE_OUTPUT_FILE" 2>/dev/null && return 0
+    echo ""
+}
+
 # =============================================================================
 # Shared Automated Prompt Builder
 # =============================================================================
@ -538,32 +631,38 @@ PROMPT_EOF

 METRICS_DIR=""
 METRICS_FILE=""
+METRICS_RESUMED=false

 init_metrics() {
    METRICS_DIR="$SPRINT_ARTIFACTS_DIR/metrics"
    METRICS_FILE="$METRICS_DIR/epic-${EPIC_ID}-metrics.yaml"
    mkdir -p "$METRICS_DIR"

-    # L4: Archive existing metrics file to prevent unbounded growth
-    if [ -f "$METRICS_FILE" ]; then
-        local archive_name="epic-${EPIC_ID}-metrics.$(date +%Y%m%d%H%M%S).yaml"
-        local archive_dir="$METRICS_DIR/archive"
-        mkdir -p "$archive_dir"
-        mv "$METRICS_FILE" "$archive_dir/$archive_name"
-        log "Archived previous metrics to: archive/$archive_name"
-
-        # Clean up old archives (keep last 10)
-        local archive_count
-        archive_count=$(find "$archive_dir" -name "epic-${EPIC_ID}-metrics.*.yaml" 2>/dev/null | wc -l | tr -d ' ')
-        if [ "$archive_count" -gt 10 ]; then
-            log "Cleaning up old metrics archives (keeping last 10)..."
-            find "$archive_dir" -name "epic-${EPIC_ID}-metrics.*.yaml" -type f | \
-                sort | head -n -10 | xargs rm -f 2>/dev/null || true
-        fi
-    fi
-
    local start_time=$(date -u +"%Y-%m-%dT%H:%M:%SZ")

+    # If metrics file already exists, preserve it and seed in-memory counters
+    if [ -f "$METRICS_FILE" ]; then
+        METRICS_RESUMED=true
+        log "Resuming with existing metrics: $METRICS_FILE"
+
+        # Seed in-memory counters from existing YAML so they accumulate
+        if command -v yq >/dev/null 2>&1; then
+            COMPLETED=$(yq '.stories.completed // 0' "$METRICS_FILE")
+            FAILED=$(yq '.stories.failed // 0' "$METRICS_FILE")
+            SKIPPED=$(yq '.stories.skipped // 0' "$METRICS_FILE")
+
+            log "Restored counters: completed=$COMPLETED failed=$FAILED skipped=$SKIPPED"
+
+            # Record resume event
+            yq -i ".execution.resumed_at = \"$start_time\"" "$METRICS_FILE"
+        else
+            log_warn "yq not found - cannot restore counters from existing metrics"
+        fi
+
+        return
+    fi
+
+    # No existing file - create fresh metrics
    cat > "$METRICS_FILE" << EOF
 epic_id: "$EPIC_ID"
 execution:
@ -675,31 +774,21 @@ finalize_metrics() {
    local end_time=$(date -u +"%Y-%m-%dT%H:%M:%SZ")

    if command -v yq >/dev/null 2>&1; then
+        # Add current session duration to any prior duration (for resumed runs)
+        local prior_duration
+        prior_duration=$(yq '.execution.duration_seconds // 0' "$METRICS_FILE")
+        local total_duration=$((prior_duration + duration))
+
        yq -i ".execution.end_time = \"$end_time\"" "$METRICS_FILE"
-        yq -i ".execution.duration_seconds = $duration" "$METRICS_FILE"
+        yq -i ".execution.duration_seconds = $total_duration" "$METRICS_FILE"
        yq -i ".stories.total = $total_stories" "$METRICS_FILE"
        yq -i ".stories.completed = $completed" "$METRICS_FILE"
        yq -i ".stories.failed = $failed" "$METRICS_FILE"
        yq -i ".stories.skipped = $skipped" "$METRICS_FILE"
    else
-        # Fallback: rewrite the file with final values
-        cat > "$METRICS_FILE" << EOF
-epic_id: "$EPIC_ID"
-execution:
-  start_time: "$EPIC_START_TIME"
-  end_time: "$end_time"
-  duration_seconds: $duration
-stories:
-  total: $total_stories
-  completed: $completed
-  failed: $failed
-  skipped: $skipped
-validation:
-  gate_executed: false
-  gate_status: "PENDING"
-  fix_attempts: 0
-issues: []
-EOF
+        # Fallback without yq: only update counters, don't overwrite the file
+        # This preserves issues, story_details, and fix_loop data
+        log_warn "yq not found - metrics finalization limited (counters may be stale)"
    fi

    log "Metrics finalized: $METRICS_FILE"
@ -744,7 +833,7 @@ update_sprint_status() {

    # Find sprint-status.yaml file
    local sprint_file=""
-    for search_dir in "$SPRINT_ARTIFACTS_DIR" "$SPRINTS_DIR" "$PROJECT_ROOT/docs"; do
+    for search_dir in "$SPRINT_ARTIFACTS_DIR" "$SPRINTS_DIR" "$PROJECT_ROOT/_bmad-output" "$PROJECT_ROOT/docs"; do
        if [ -f "$search_dir/sprint-status.yaml" ]; then
            sprint_file="$search_dir/sprint-status.yaml"
            break
@ -1418,11 +1507,10 @@ Do NOT use 'git add -A' or 'git add .' - only stage files you created or modifie
        return 0
    fi

-    # Execute in isolated context
+    # Execute in isolated context — pipe to file to avoid memory bloat
+    run_claude_to_file "$dev_prompt"
    local result
-    result=$(env -u CLAUDECODE claude --dangerously-skip-permissions -p "$dev_prompt" 2>&1) || true
-
-    echo "$result" >> "$LOG_FILE"
+    result=$(read_phase_tail)

    # Check completion using JSON parsing with text fallback
    local completion_status
@ -1558,11 +1646,10 @@ Stage any fixes with explicit file paths: git add <file1> <file2> ..."
        return 0
    fi

-    # Execute in isolated context
+    # Execute in isolated context — pipe to file to avoid memory bloat
+    run_claude_to_file "$review_prompt"
    local result
-    result=$(env -u CLAUDECODE claude --dangerously-skip-permissions -p "$review_prompt" 2>&1) || true
-
-    echo "$result" >> "$LOG_FILE"
+    result=$(read_phase_tail)

    # Check completion using JSON parsing with text fallback
    local completion_status
@ -1792,9 +1879,6 @@ Address all review findings now. This is attempt $attempt_num of 3."
        log "Truncated prompt size: ${prompt_size}B"
    fi

-    # Declare result variable outside the conditional blocks
-    local result=""
-
    # Final safety check - if still too large, write to temp file and use -f flag
    if [ "$prompt_size" -gt "$MAX_PROMPT_SIZE" ]; then
        log_warn "Prompt still too large after truncation - using file-based prompt"
@ -1808,7 +1892,8 @@ Address all review findings now. This is attempt $attempt_num of 3."
            return 0
        fi

-        result=$(env -u CLAUDECODE claude --dangerously-skip-permissions -f "$temp_prompt_file" 2>&1) || true
+        # Pipe to file to avoid memory bloat
+        run_claude_to_file "-f" "$temp_prompt_file"
        rm -f "$temp_prompt_file"
    else
        if [ "$DRY_RUN" = true ]; then
@ -1816,11 +1901,12 @@ Address all review findings now. This is attempt $attempt_num of 3."
            return 0
        fi

-        # Execute in isolated context
-        result=$(env -u CLAUDECODE claude --dangerously-skip-permissions -p "$fix_prompt" 2>&1) || true
+        # Execute in isolated context — pipe to file to avoid memory bloat
+        run_claude_to_file "$fix_prompt"
    fi

-    echo "$result" >> "$LOG_FILE"
+    local result
+    result=$(read_phase_tail)

    # Check completion using JSON parsing with text fallback
    local completion_status
@ -1956,7 +2042,7 @@ $build_output
        if grep -q '"test"' "$PROJECT_ROOT/package.json" 2>/dev/null; then
            log "Running tests..."
            local test_output
-            test_output=$(cd "$PROJECT_ROOT" && run_with_timeout "${REGRESSION_TEST_TIMEOUT:-120}" npm test) || {
+            test_output=$(cd "$PROJECT_ROOT" && npm test 2>&1) || {
                local exit_code=$?

                # Check if there are NEW failures (not just pre-existing baseline failures)
@ -2273,10 +2359,10 @@ Stage any fixes with: git add <file1> <file2> ..."
        return 0
    fi

+    # Pipe to file to avoid memory bloat
+    run_claude_to_file "$arch_prompt"
    local result
-    result=$(env -u CLAUDECODE claude --dangerously-skip-permissions -p "$arch_prompt" 2>&1) || true
-
-    echo "$result" >> "$LOG_FILE"
+    result=$(read_phase_tail)

    if echo "$result" | grep -q "ARCH COMPLIANT"; then
        log_success "Architecture compliant: $story_id"
@ -2350,10 +2436,10 @@ Stage any fixes with: git add <file1> <file2> ..."
        return 0
    fi

+    # Pipe to file to avoid memory bloat
+    run_claude_to_file "$quality_prompt"
    local result
-    result=$(env -u CLAUDECODE claude --dangerously-skip-permissions -p "$quality_prompt" 2>&1) || true
-
-    echo "$result" >> "$LOG_FILE"
+    result=$(read_phase_tail)

    if echo "$result" | grep -q "TEST QUALITY APPROVED"; then
        log_success "Test quality approved: $story_id"
@ -2485,10 +2571,10 @@ Analyze traceability now. Read story files on-demand as needed."
        return 0
    fi

+    # Pipe to file to avoid memory bloat
+    run_claude_to_file "$trace_prompt"
    local result
-    result=$(env -u CLAUDECODE claude --dangerously-skip-permissions -p "$trace_prompt" 2>&1) || true
-
-    echo "$result" >> "$LOG_FILE"
+    result=$(read_phase_tail)

    if echo "$result" | grep -q "TRACEABILITY PASS"; then
        log_success "Traceability passed: Epic $EPIC_ID"
@ -2563,10 +2649,10 @@ Generate missing tests now."
        return 0
    fi

+    # Pipe to file to avoid memory bloat
+    run_claude_to_file "$fix_prompt"
    local result
-    result=$(env -u CLAUDECODE claude --dangerously-skip-permissions -p "$fix_prompt" 2>&1) || true
-
-    echo "$result" >> "$LOG_FILE"
+    result=$(read_phase_tail)

    if echo "$result" | grep -q "TEST GENERATION COMPLETE"; then
        log_success "Test generation complete for Epic $EPIC_ID"
@ -2917,10 +3003,10 @@ Generate the UAT document now. Read story files on-demand as needed."
        return 0
    fi

+    # Pipe to file to avoid memory bloat
+    run_claude_to_file "$uat_prompt"
    local result
-    result=$(env -u CLAUDECODE claude --dangerously-skip-permissions -p "$uat_prompt" 2>&1) || true
-
-    echo "$result" >> "$LOG_FILE"
+    result=$(read_phase_tail)

    if echo "$result" | grep -q "UAT GENERATED"; then
        log_success "UAT document generated"
@ -2943,16 +3029,10 @@ log "=========================================="
 log "Starting execution of ${#STORIES[@]} stories"
 log "=========================================="

-# Initialize counters (may be restored from checkpoint)
-if [ -z "$COMPLETED" ] || [ "$COMPLETED" = "0" ]; then
-    COMPLETED=0
-fi
-if [ -z "$FAILED" ] || [ "$FAILED" = "0" ]; then
-    FAILED=0
-fi
-if [ -z "$SKIPPED" ] || [ "$SKIPPED" = "0" ]; then
-    SKIPPED=0
-fi
+# Initialize counters (may already be restored from metrics or checkpoint)
+: "${COMPLETED:=0}"
+: "${FAILED:=0}"
+: "${SKIPPED:=0}"
 START_TIME=$(date +%s)
 STARTED=false

@ -2976,10 +3056,13 @@ for story_file in "${STORIES[@]}"; do
            STARTED=true
        else
            log_warn "Skipping $story_id (waiting for $START_FROM)"
-            ((SKIPPED++))
+            # Only count as skipped if this is a fresh run (no prior metrics)
+            if [ "${METRICS_RESUMED:-false}" = false ]; then
+                ((SKIPPED++))
+                update_story_metrics "skipped"
+            fi
            ((STORY_INDEX++))
            CURRENT_STORY_INDEX=$STORY_INDEX
-            update_story_metrics "skipped"
            continue
        fi
    fi
@ -2988,10 +3071,13 @@ for story_file in "${STORIES[@]}"; do
    if [ "$SKIP_DONE" = true ]; then
        if grep -qi "^Status:.*done" "$story_file" 2>/dev/null; then
            log_warn "Skipping $story_id (Status: Done)"
-            ((SKIPPED++))
+            # Only count as skipped if this is a fresh run (no prior metrics)
+            if [ "${METRICS_RESUMED:-false}" = false ]; then
+                ((SKIPPED++))
+                update_story_metrics "skipped"
+            fi
            ((STORY_INDEX++))
            CURRENT_STORY_INDEX=$STORY_INDEX
-            update_story_metrics "skipped"
            continue
        fi
    fi
@ -3052,6 +3138,16 @@ for story_file in "${STORIES[@]}"; do
    update_story_metrics "completed"
    log_success "Story complete: $story_id ($COMPLETED/${#STORIES[@]})"

+    # Kill orphaned node/test processes between stories
+    kill_orphaned_test_processes
+
+    # Truncate log file between stories to prevent unbounded growth.
+    # Each Claude phase appends via tee -a, so across 6-7 phases per story
+    # the log can grow to hundreds of MB. Keep only the last 64KB as context.
+    if [ -f "$LOG_FILE" ]; then
+        tail -c 65536 "$LOG_FILE" > "${LOG_FILE}.tmp" 2>/dev/null && mv "${LOG_FILE}.tmp" "$LOG_FILE" 2>/dev/null || true
+    fi
+
    # Track progress for checkpoint/resume
    ((STORY_INDEX++))
    CURRENT_STORY_INDEX=$STORY_INDEX
@ -3060,9 +3156,6 @@ for story_file in "${STORIES[@]}"; do
    if type save_checkpoint >/dev/null 2>&1; then
        save_checkpoint "$STORY_INDEX" "$story_id" "$COMPLETED" "$FAILED" "$SKIPPED"
    fi
-
-    # Flush log to repo after each completed story
-    flush_log_to_repo
 done

 # =============================================================================
--- a/scripts/uat-validate.sh
+++ b/scripts/uat-validate.sh
@ -20,6 +20,9 @@

 set -e

+# Allow nested Claude Code sessions (when launched from within Claude Code)
+unset CLAUDECODE 2>/dev/null || true
+
 # =============================================================================
 # Section 1: Configuration
 # =============================================================================
@ -859,7 +862,7 @@ HUMAN_ACTION_NEEDED: {yes/no}"

    # Execute in isolated context
    local result
-    result=$(env -u CLAUDECODE claude --dangerously-skip-permissions -p "$fix_prompt" 2>&1) || true
+    result=$(claude --dangerously-skip-permissions -p "$fix_prompt" 2>&1) || true

    echo "$result" >> "$LOG_FILE"