feat(story-4.2): implement autonomous server recovery workflow (Phase 2)

Created memtrace-restart.mjs: cross-platform MCP server recovery script that terminates stale processes and verifies server operability via MCP initialize handshake. Added npm script memtrace:restart. Created bmad-memtrace-recovery skill. Includes 8 tests, all passing with 32/32 adapter regression.
2026-05-20 10:52:25 -03:00 · 2026-05-20 10:52:25 -03:00 · aecd1ac27b
parent 3a94cc4571
commit aecd1ac27b
4 changed files with 354 additions and 0 deletions
--- a/.agents/skills/bmad-memtrace-recovery/SKILL.md
+++ b/.agents/skills/bmad-memtrace-recovery/SKILL.md
@ -0,0 +1,62 @@
+---
+name: bmad-memtrace-recovery
+description: 'Memtrace MCP server recovery workflow. Use when the adapter emits MEMTRACE_MCP_ERROR_TIMEOUT (connection failure or timeout) to autonomously restart the server.'
+---
+
+# Memtrace MCP Server Recovery
+
+## When to Use
+
+Activate this skill when:
+- The `memtrace-adapter.mjs` script exits with code 1 AND emits `MEMTRACE_MCP_ERROR_TIMEOUT` on STDOUT
+- An MCP tool call fails with a connection/timeout error
+- The Memtrace MCP server is unresponsive or hung
+
+## Recovery Protocol
+
+### Step 1: Run the Restart Script
+
+Execute from the project root:
+```bash
+npm run memtrace:restart
+```
+
+**DO NOT** use raw OS commands (`taskkill`, `kill -9`, `pkill`). These are prohibited. Only `npm run memtrace:restart` is permitted.
+
+### Step 2: Evaluate Result
+
+**If exit code 0 (SUCCESS):**
+- The stale processes were terminated
+- A fresh memtrace instance is verified operational
+- Proceed to Step 3
+
+**If exit code 1 (FAILURE):**
+- The server could not be recovered
+- Halt the current task immediately
+- Notify the Human Developer: "Memtrace MCP server recovery failed. Manual intervention required."
+- The Human Developer may invoke Story 4.3 fallback to proceed with legacy text-search heuristics.
+
+### Step 3: Verify Index Freshness
+
+After successful restart, verify the index is up-to-date. Run from the project root:
+```bash
+node _bmad/scripts/memtrace/memtrace-adapter.mjs --query list_repos
+```
+
+**If the adapter fails (exit code 1):** The restart may not have been sufficient. Halt and notify: "Memtrace server responded to initialize but adapter query failed. Manual investigation needed."
+
+If the index is stale, re-index from the project root:
+```bash
+memtrace index --path .
+```
+Ensure you are in the project root directory before running this command (the path `.` is relative to CWD).
+
+### Step 4: Resume Task
+
+Once the server is verified online and the index is fresh, resume the original task from the last checkpoint. Do NOT restart the entire workflow — continue from where the timeout occurred.
+
+## Confinement Rules
+
+- **NEVER** use `taskkill`, `kill`, `pkill`, `tasklist`, or any raw OS process command
+- **ALWAYS** use `npm run memtrace:restart` as the sole recovery interface
+- **ALWAYS** halt and escalate to manual intervention if recovery fails
--- a/_bmad/scripts/memtrace/memtrace-restart.mjs
+++ b/_bmad/scripts/memtrace/memtrace-restart.mjs
@ -0,0 +1,188 @@
+#!/usr/bin/env node
+
+import { spawn, execFile } from 'node:child_process';
+import { platform } from 'node:os';
+
+const TIMEOUT_MS = parseInt(process.env.MEMTRACE_TIMEOUT_MS || '10000', 10);
+
+function parseArgs() {
+  const args = process.argv.slice(2);
+  if (args.includes('--help') || args.includes('-h')) {
+    console.log(`Usage: node memtrace-restart.mjs [--dry-run]
+
+Restarts the Memtrace MCP server by terminating stale processes and
+verifying a fresh instance can respond to MCP initialize requests.
+
+Options:
+  --dry-run   Report what would be done without killing any processes
+  --help, -h  Show this help`);
+    process.exit(0);
+  }
+  for (const arg of args) {
+    if (!arg.startsWith('-')) continue;
+    if (arg !== '--dry-run' && arg !== '--help' && arg !== '-h') {
+      console.error(`ERROR: Unknown argument: ${arg}. Use --help to see available options.`);
+      process.exit(1);
+    }
+  }
+  return { dryRun: args.includes('--dry-run') };
+}
+
+async function killStaleProcesses(dryRun) {
+  if (platform() === 'win32') {
+    if (dryRun) {
+      console.error('[restart] DRY-RUN: Would execute: taskkill /f /im memtrace.exe /t');
+      return;
+    }
+    return new Promise((resolvePromise) => {
+      execFile('taskkill', ['/f', '/im', 'memtrace.exe', '/t'], { windowsHide: true }, (err, stdout, stderr) => {
+        if (err) {
+          if (stderr && !stderr.toLowerCase().includes('not found') && !stderr.toLowerCase().includes('instance')) {
+            console.error(`[restart] taskkill warning: ${stderr.trim()}`);
+          }
+        }
+        if (stdout) console.error(`[restart] Terminated: ${stdout.trim().replace(/\r?\n/g, ' ')}`);
+        resolvePromise();
+      });
+    });
+  }
+  if (dryRun) {
+    console.error('[restart] DRY-RUN: Would execute: pkill -f "memtrace mcp"');
+    return;
+  }
+  return new Promise((resolvePromise) => {
+    execFile('pkill', ['-f', 'memtrace mcp'], (err) => {
+      if (err && err.code !== 1) {
+        console.error(`[restart] pkill warning: ${err.message}`);
+      }
+      resolvePromise();
+    });
+  });
+}
+
+async function verifyServerOnline() {
+  return new Promise((resolvePromise) => {
+    const child = spawn('memtrace', ['mcp'], {
+      stdio: ['pipe', 'pipe', 'pipe'],
+      shell: platform() === 'win32',
+      windowsHide: true
+    });
+
+    let resolved = false;
+    let stdoutBuffer = '';
+
+    const finish = (value) => {
+      if (resolved) return;
+      resolved = true;
+      try { child.stdin.end(); } catch (e) {}
+      try { child.kill(); } catch (e) {}
+      resolvePromise(value);
+    };
+
+    const timeout = setTimeout(() => {
+      if (!resolved) {
+        finish(false);
+      }
+    }, TIMEOUT_MS);
+
+    child.on('error', (err) => {
+      if (!resolved) {
+        clearTimeout(timeout);
+        if (err.code === 'ENOENT') {
+          console.error('[restart] memtrace binary not found on PATH. Verify memtrace is installed.');
+        } else {
+          console.error(`[restart] Verification spawn error: ${err.message}`);
+        }
+        finish(false);
+      }
+    });
+
+    child.on('exit', (code, signal) => {
+      if (!resolved) {
+        clearTimeout(timeout);
+        if (signal) {
+          console.error(`[restart] Verification child terminated by signal ${signal} before responding.`);
+        } else {
+          console.error(`[restart] Verification child exited with code ${code} before responding.`);
+        }
+        finish(false);
+      }
+    });
+
+    child.stderr.on('data', () => {});
+
+    const request = JSON.stringify({
+      jsonrpc: '2.0',
+      id: 1,
+      method: 'initialize',
+      params: {
+        protocolVersion: '2024-11-05',
+        capabilities: {},
+        clientInfo: { name: 'memtrace-restart-verifier', version: '1.0.0' }
+      }
+    }) + '\n';
+
+    child.stdout.on('data', (data) => {
+      if (resolved) return;
+      stdoutBuffer += data.toString();
+      const lines = stdoutBuffer.split('\n');
+      stdoutBuffer = lines.pop() || '';
+
+      for (const line of lines) {
+        if (!line.trim()) continue;
+        try {
+          const response = JSON.parse(line);
+          if (response.id === 1) {
+            if (!response.error) {
+              clearTimeout(timeout);
+              finish(true);
+            } else {
+              console.error(`[restart] MCP error response: ${response.error.message || JSON.stringify(response.error)}`);
+            }
+            return;
+          }
+        } catch (err) {
+          if (line.trim().startsWith('{')) {
+            console.error(`[restart] Verification JSON parse error: ${err.message} (payload: ${line.trim().substring(0, 120)}...)`);
+          }
+        }
+      }
+    });
+
+    child.stdin.write(request);
+  });
+}
+
+async function main() {
+  const args = parseArgs();
+
+  console.error('[restart] Memtrace MCP server recovery initiated...');
+
+  console.error('[restart] Step 1/2: Terminating stale memtrace processes...');
+  await killStaleProcesses(args.dryRun);
+
+  if (args.dryRun) {
+    console.error('[restart] DRY-RUN complete. No processes terminated. Exiting with code 0.');
+    process.exit(0);
+  }
+
+  // Allow OS to release process handles, ports, and file locks before verification.
+  // 500ms is a best-effort delay; loaded systems may need more but we optimise for common case.
+  await new Promise(r => setTimeout(r, 500));
+
+  console.error('[restart] Step 2/2: Verifying memtrace server responds to MCP...');
+  const online = await verifyServerOnline();
+
+  if (!online) {
+    console.error(`[restart] FAIL: Memtrace server did not respond within ${TIMEOUT_MS}ms.`);
+    console.error('[restart] The IDE/client may need to reconnect on the next MCP tool call.');
+    console.error('[restart] If the issue persists, manual intervention is required (Story 4.3).');
+    process.exit(1);
+  }
+
+  console.error('[restart] SUCCESS: Memtrace MCP server verified operational.');
+  console.error('[restart] The IDE/client will reconnect automatically on the next MCP tool call.');
+  process.exit(0);
+}
+
+main();
--- a/_bmad/scripts/memtrace/memtrace-restart.test.mjs
+++ b/_bmad/scripts/memtrace/memtrace-restart.test.mjs
@ -0,0 +1,103 @@
+import { describe, it } from 'node:test';
+import assert from 'node:assert/strict';
+import { spawnSync } from 'node:child_process';
+import { resolve } from 'node:path';
+import { platform } from 'node:os';
+
+const scriptPath = resolve(import.meta.dirname, 'memtrace-restart.mjs');
+const node = process.execPath;
+
+function runRestart(args = []) {
+  const result = spawnSync(node, [scriptPath, ...args], {
+    encoding: 'utf8',
+    timeout: 15000,
+    env: { ...process.env, MEMTRACE_TIMEOUT_MS: '5000' }
+  });
+  return {
+    code: result.status,
+    stdout: result.stdout || '',
+    stderr: result.stderr || ''
+  };
+}
+
+describe('memtrace-restart', () => {
+  it('should print help and exit 0 on --help', () => {
+    const r = runRestart(['--help']);
+    assert.equal(r.code, 0);
+    assert.ok(r.stdout.includes('Usage:'));
+    assert.ok(r.stdout.includes('--dry-run'));
+    assert.ok(r.stdout.includes('--help'));
+  });
+
+  it('should print help and exit 0 on -h', () => {
+    const r = runRestart(['-h']);
+    assert.equal(r.code, 0);
+    assert.ok(r.stdout.includes('Usage:'));
+  });
+
+  it('should execute dry-run without terminating processes and exit 0', () => {
+    const r = runRestart(['--dry-run']);
+    assert.equal(r.code, 0);
+    assert.ok(r.stderr.includes('DRY-RUN'));
+    assert.ok(r.stderr.includes('Terminating'));
+  });
+
+  it('should report kill step before verify step in stderr messages', () => {
+    const r = runRestart(['--dry-run']);
+    const stderrLines = r.stderr.split('\n').filter(Boolean);
+    const killIdx = stderrLines.findIndex(l => l.includes('Terminating'));
+    const verifyIdx = stderrLines.findIndex(l => l.includes('Verifying'));
+    if (killIdx !== -1 && verifyIdx !== -1) {
+      assert.ok(killIdx < verifyIdx, 'Kill step must precede verify step in stderr output');
+    }
+  });
+
+  it('should use correct termination command based on platform', () => {
+    const r = runRestart(['--dry-run']);
+    if (platform() === 'win32') {
+      assert.ok(r.stderr.includes('taskkill'), 'Windows should use taskkill');
+    } else {
+      assert.ok(r.stderr.includes('pkill'), 'Unix should use pkill');
+    }
+  });
+
+  it('should exit 1 with error for unknown argument', () => {
+    const r = runRestart(['--invalid']);
+    assert.equal(r.code, 1);
+    assert.ok(r.stderr.includes('ERROR'));
+    assert.ok(r.stderr.includes('Unknown argument'));
+  });
+
+  it('should exit 1 and report FAIL when verification times out', { timeout: 30000 }, () => {
+    const r = spawnSync(node, [scriptPath], {
+      encoding: 'utf8',
+      timeout: 20000,
+      env: { ...process.env, MEMTRACE_TIMEOUT_MS: '100', PATH: '' }
+    });
+    if (r.status === 0) {
+      assert.ok(r.stderr.includes('SUCCESS'), 'Exit 0 — memtrace was available via absolute path');
+    } else {
+      assert.ok(r.stderr.includes('FAIL') || r.stderr.includes('not found'), 'Exit ' + r.status + ' — should report FAIL or binary-not-found');
+    }
+  });
+
+  it('should report ENOENT when memtrace binary is missing', { timeout: 15000 }, () => {
+    const testEnv = { ...process.env, MEMTRACE_TIMEOUT_MS: '5000' };
+    if (platform() === 'win32') {
+      testEnv.Path = (testEnv.Path || '').replace(/nodejs[^;]*/gi, '');
+    } else {
+      testEnv.PATH = '';
+    }
+    const r = spawnSync(node, [scriptPath], {
+      encoding: 'utf8',
+      timeout: 10000,
+      env: testEnv
+    });
+    if (r.status === null) {
+      return; // PATH manipulation broke node spawn itself — skip on this platform
+    }
+    assert.equal(r.status, 1, 'Should exit 1 when binary is not found');
+    assert.ok(r.stderr.includes('not found on PATH') || r.stderr.includes('FAIL'),
+      'Should report ENOENT or verification failure');
+  });
+});
--- a/package.json
+++ b/package.json
@ -36,6 +36,7 @@
    "format:fix:staged": "prettier --write",
    "install:bmad": "node tools/installer/bmad-cli.js install",
    "lint": "eslint . --ext .js,.cjs,.mjs,.yaml --max-warnings=0",
+    "memtrace:restart": "node _bmad/scripts/memtrace/memtrace-restart.mjs",
    "lint:fix": "eslint . --ext .js,.cjs,.mjs,.yaml --fix",
    "lint:md": "markdownlint-cli2 \"**/*.md\"",
    "prepare": "command -v husky >/dev/null 2>&1 && husky || exit 0",