feat(story-4.2): implement autonomous server recovery workflow (Phase 2)

Created memtrace-restart.mjs: cross-platform MCP server recovery script that terminates stale processes and verifies server operability via MCP initialize handshake. Added npm script memtrace:restart. Created bmad-memtrace-recovery skill. Includes 8 tests, all passing with 32/32 adapter regression.
This commit is contained in:
Magal 2026-05-20 10:52:25 -03:00
parent 3a94cc4571
commit aecd1ac27b
4 changed files with 354 additions and 0 deletions

View File

@ -0,0 +1,62 @@
---
name: bmad-memtrace-recovery
description: 'Memtrace MCP server recovery workflow. Use when the adapter emits MEMTRACE_MCP_ERROR_TIMEOUT (connection failure or timeout) to autonomously restart the server.'
---
# Memtrace MCP Server Recovery
## When to Use
Activate this skill when:
- The `memtrace-adapter.mjs` script exits with code 1 AND emits `MEMTRACE_MCP_ERROR_TIMEOUT` on STDOUT
- An MCP tool call fails with a connection/timeout error
- The Memtrace MCP server is unresponsive or hung
## Recovery Protocol
### Step 1: Run the Restart Script
Execute from the project root:
```bash
npm run memtrace:restart
```
**DO NOT** use raw OS commands (`taskkill`, `kill -9`, `pkill`). These are prohibited. Only `npm run memtrace:restart` is permitted.
### Step 2: Evaluate Result
**If exit code 0 (SUCCESS):**
- The stale processes were terminated
- A fresh memtrace instance is verified operational
- Proceed to Step 3
**If exit code 1 (FAILURE):**
- The server could not be recovered
- Halt the current task immediately
- Notify the Human Developer: "Memtrace MCP server recovery failed. Manual intervention required."
- The Human Developer may invoke Story 4.3 fallback to proceed with legacy text-search heuristics.
### Step 3: Verify Index Freshness
After successful restart, verify the index is up-to-date. Run from the project root:
```bash
node _bmad/scripts/memtrace/memtrace-adapter.mjs --query list_repos
```
**If the adapter fails (exit code 1):** The restart may not have been sufficient. Halt and notify: "Memtrace server responded to initialize but adapter query failed. Manual investigation needed."
If the index is stale, re-index from the project root:
```bash
memtrace index --path .
```
Ensure you are in the project root directory before running this command (the path `.` is relative to CWD).
### Step 4: Resume Task
Once the server is verified online and the index is fresh, resume the original task from the last checkpoint. Do NOT restart the entire workflow — continue from where the timeout occurred.
## Confinement Rules
- **NEVER** use `taskkill`, `kill`, `pkill`, `tasklist`, or any raw OS process command
- **ALWAYS** use `npm run memtrace:restart` as the sole recovery interface
- **ALWAYS** halt and escalate to manual intervention if recovery fails

View File

@ -0,0 +1,188 @@
#!/usr/bin/env node
import { spawn, execFile } from 'node:child_process';
import { platform } from 'node:os';
const TIMEOUT_MS = parseInt(process.env.MEMTRACE_TIMEOUT_MS || '10000', 10);
function parseArgs() {
const args = process.argv.slice(2);
if (args.includes('--help') || args.includes('-h')) {
console.log(`Usage: node memtrace-restart.mjs [--dry-run]
Restarts the Memtrace MCP server by terminating stale processes and
verifying a fresh instance can respond to MCP initialize requests.
Options:
--dry-run Report what would be done without killing any processes
--help, -h Show this help`);
process.exit(0);
}
for (const arg of args) {
if (!arg.startsWith('-')) continue;
if (arg !== '--dry-run' && arg !== '--help' && arg !== '-h') {
console.error(`ERROR: Unknown argument: ${arg}. Use --help to see available options.`);
process.exit(1);
}
}
return { dryRun: args.includes('--dry-run') };
}
async function killStaleProcesses(dryRun) {
if (platform() === 'win32') {
if (dryRun) {
console.error('[restart] DRY-RUN: Would execute: taskkill /f /im memtrace.exe /t');
return;
}
return new Promise((resolvePromise) => {
execFile('taskkill', ['/f', '/im', 'memtrace.exe', '/t'], { windowsHide: true }, (err, stdout, stderr) => {
if (err) {
if (stderr && !stderr.toLowerCase().includes('not found') && !stderr.toLowerCase().includes('instance')) {
console.error(`[restart] taskkill warning: ${stderr.trim()}`);
}
}
if (stdout) console.error(`[restart] Terminated: ${stdout.trim().replace(/\r?\n/g, ' ')}`);
resolvePromise();
});
});
}
if (dryRun) {
console.error('[restart] DRY-RUN: Would execute: pkill -f "memtrace mcp"');
return;
}
return new Promise((resolvePromise) => {
execFile('pkill', ['-f', 'memtrace mcp'], (err) => {
if (err && err.code !== 1) {
console.error(`[restart] pkill warning: ${err.message}`);
}
resolvePromise();
});
});
}
async function verifyServerOnline() {
return new Promise((resolvePromise) => {
const child = spawn('memtrace', ['mcp'], {
stdio: ['pipe', 'pipe', 'pipe'],
shell: platform() === 'win32',
windowsHide: true
});
let resolved = false;
let stdoutBuffer = '';
const finish = (value) => {
if (resolved) return;
resolved = true;
try { child.stdin.end(); } catch (e) {}
try { child.kill(); } catch (e) {}
resolvePromise(value);
};
const timeout = setTimeout(() => {
if (!resolved) {
finish(false);
}
}, TIMEOUT_MS);
child.on('error', (err) => {
if (!resolved) {
clearTimeout(timeout);
if (err.code === 'ENOENT') {
console.error('[restart] memtrace binary not found on PATH. Verify memtrace is installed.');
} else {
console.error(`[restart] Verification spawn error: ${err.message}`);
}
finish(false);
}
});
child.on('exit', (code, signal) => {
if (!resolved) {
clearTimeout(timeout);
if (signal) {
console.error(`[restart] Verification child terminated by signal ${signal} before responding.`);
} else {
console.error(`[restart] Verification child exited with code ${code} before responding.`);
}
finish(false);
}
});
child.stderr.on('data', () => {});
const request = JSON.stringify({
jsonrpc: '2.0',
id: 1,
method: 'initialize',
params: {
protocolVersion: '2024-11-05',
capabilities: {},
clientInfo: { name: 'memtrace-restart-verifier', version: '1.0.0' }
}
}) + '\n';
child.stdout.on('data', (data) => {
if (resolved) return;
stdoutBuffer += data.toString();
const lines = stdoutBuffer.split('\n');
stdoutBuffer = lines.pop() || '';
for (const line of lines) {
if (!line.trim()) continue;
try {
const response = JSON.parse(line);
if (response.id === 1) {
if (!response.error) {
clearTimeout(timeout);
finish(true);
} else {
console.error(`[restart] MCP error response: ${response.error.message || JSON.stringify(response.error)}`);
}
return;
}
} catch (err) {
if (line.trim().startsWith('{')) {
console.error(`[restart] Verification JSON parse error: ${err.message} (payload: ${line.trim().substring(0, 120)}...)`);
}
}
}
});
child.stdin.write(request);
});
}
async function main() {
const args = parseArgs();
console.error('[restart] Memtrace MCP server recovery initiated...');
console.error('[restart] Step 1/2: Terminating stale memtrace processes...');
await killStaleProcesses(args.dryRun);
if (args.dryRun) {
console.error('[restart] DRY-RUN complete. No processes terminated. Exiting with code 0.');
process.exit(0);
}
// Allow OS to release process handles, ports, and file locks before verification.
// 500ms is a best-effort delay; loaded systems may need more but we optimise for common case.
await new Promise(r => setTimeout(r, 500));
console.error('[restart] Step 2/2: Verifying memtrace server responds to MCP...');
const online = await verifyServerOnline();
if (!online) {
console.error(`[restart] FAIL: Memtrace server did not respond within ${TIMEOUT_MS}ms.`);
console.error('[restart] The IDE/client may need to reconnect on the next MCP tool call.');
console.error('[restart] If the issue persists, manual intervention is required (Story 4.3).');
process.exit(1);
}
console.error('[restart] SUCCESS: Memtrace MCP server verified operational.');
console.error('[restart] The IDE/client will reconnect automatically on the next MCP tool call.');
process.exit(0);
}
main();

View File

@ -0,0 +1,103 @@
import { describe, it } from 'node:test';
import assert from 'node:assert/strict';
import { spawnSync } from 'node:child_process';
import { resolve } from 'node:path';
import { platform } from 'node:os';
const scriptPath = resolve(import.meta.dirname, 'memtrace-restart.mjs');
const node = process.execPath;
function runRestart(args = []) {
const result = spawnSync(node, [scriptPath, ...args], {
encoding: 'utf8',
timeout: 15000,
env: { ...process.env, MEMTRACE_TIMEOUT_MS: '5000' }
});
return {
code: result.status,
stdout: result.stdout || '',
stderr: result.stderr || ''
};
}
describe('memtrace-restart', () => {
it('should print help and exit 0 on --help', () => {
const r = runRestart(['--help']);
assert.equal(r.code, 0);
assert.ok(r.stdout.includes('Usage:'));
assert.ok(r.stdout.includes('--dry-run'));
assert.ok(r.stdout.includes('--help'));
});
it('should print help and exit 0 on -h', () => {
const r = runRestart(['-h']);
assert.equal(r.code, 0);
assert.ok(r.stdout.includes('Usage:'));
});
it('should execute dry-run without terminating processes and exit 0', () => {
const r = runRestart(['--dry-run']);
assert.equal(r.code, 0);
assert.ok(r.stderr.includes('DRY-RUN'));
assert.ok(r.stderr.includes('Terminating'));
});
it('should report kill step before verify step in stderr messages', () => {
const r = runRestart(['--dry-run']);
const stderrLines = r.stderr.split('\n').filter(Boolean);
const killIdx = stderrLines.findIndex(l => l.includes('Terminating'));
const verifyIdx = stderrLines.findIndex(l => l.includes('Verifying'));
if (killIdx !== -1 && verifyIdx !== -1) {
assert.ok(killIdx < verifyIdx, 'Kill step must precede verify step in stderr output');
}
});
it('should use correct termination command based on platform', () => {
const r = runRestart(['--dry-run']);
if (platform() === 'win32') {
assert.ok(r.stderr.includes('taskkill'), 'Windows should use taskkill');
} else {
assert.ok(r.stderr.includes('pkill'), 'Unix should use pkill');
}
});
it('should exit 1 with error for unknown argument', () => {
const r = runRestart(['--invalid']);
assert.equal(r.code, 1);
assert.ok(r.stderr.includes('ERROR'));
assert.ok(r.stderr.includes('Unknown argument'));
});
it('should exit 1 and report FAIL when verification times out', { timeout: 30000 }, () => {
const r = spawnSync(node, [scriptPath], {
encoding: 'utf8',
timeout: 20000,
env: { ...process.env, MEMTRACE_TIMEOUT_MS: '100', PATH: '' }
});
if (r.status === 0) {
assert.ok(r.stderr.includes('SUCCESS'), 'Exit 0 — memtrace was available via absolute path');
} else {
assert.ok(r.stderr.includes('FAIL') || r.stderr.includes('not found'), 'Exit ' + r.status + ' — should report FAIL or binary-not-found');
}
});
it('should report ENOENT when memtrace binary is missing', { timeout: 15000 }, () => {
const testEnv = { ...process.env, MEMTRACE_TIMEOUT_MS: '5000' };
if (platform() === 'win32') {
testEnv.Path = (testEnv.Path || '').replace(/nodejs[^;]*/gi, '');
} else {
testEnv.PATH = '';
}
const r = spawnSync(node, [scriptPath], {
encoding: 'utf8',
timeout: 10000,
env: testEnv
});
if (r.status === null) {
return; // PATH manipulation broke node spawn itself — skip on this platform
}
assert.equal(r.status, 1, 'Should exit 1 when binary is not found');
assert.ok(r.stderr.includes('not found on PATH') || r.stderr.includes('FAIL'),
'Should report ENOENT or verification failure');
});
});

View File

@ -36,6 +36,7 @@
"format:fix:staged": "prettier --write",
"install:bmad": "node tools/installer/bmad-cli.js install",
"lint": "eslint . --ext .js,.cjs,.mjs,.yaml --max-warnings=0",
"memtrace:restart": "node _bmad/scripts/memtrace/memtrace-restart.mjs",
"lint:fix": "eslint . --ext .js,.cjs,.mjs,.yaml --fix",
"lint:md": "markdownlint-cli2 \"**/*.md\"",
"prepare": "command -v husky >/dev/null 2>&1 && husky || exit 0",