// BMAD-METHOD/bmad-claude-integration/tests/ai-judge/judge.test.js

const { describe, test, expect, beforeAll, afterAll } = require('@jest/globals');
const OpenAI = require('openai');
const BMADMessageQueue = require('../../core/message-queue');
const ElicitationBroker = require('../../core/elicitation-broker');
const SessionManager = require('../../core/session-manager');
const BMADLoader = require('../../core/bmad-loader');
// AI Judge class for evaluating test results using o3
class AIJudge {
  constructor() {
    const apiKey = process.env.OPENAI_API_KEY;
    if (!apiKey) {
      throw new Error('OPENAI_API_KEY environment variable is required for AI Judge tests');
    }
    this.openai = new OpenAI({ apiKey });
  }
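  /**
   * Ask the judge model to score a test scenario against a list of criteria.
   * Resolves to the parsed JSON verdict requested in the prompt:
   * { scores: [...], overall_score, pass, feedback }.
   */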
  async evaluate(prompt, criteria, model = 'o3') {
    try {
      const response = await this.openai.chat.completions.create({
        model,
        messages: [{
          role: 'user',
          content: `You are an expert AI judge evaluating a BMAD-METHOD Claude Code integration test.

${prompt}

Evaluation Criteria:
${criteria.map((c, i) => `${i + 1}. ${c}`).join('\n')}

Provide:
1. A score from 0-10 for each criterion
2. A brief explanation for each score
3. An overall pass/fail determination (pass requires all scores >= 7)
4. Specific feedback for improvements

Format your response as JSON:
{
  "scores": [{"criterion": 1, "score": X, "explanation": "..."}],
  "overall_score": X,
  "pass": boolean,
  "feedback": "..."
}`
        }],
        // o-series reasoning models do not accept a custom temperature or the legacy
        // max_tokens parameter; max_completion_tokens caps reasoning plus visible output.
        max_completion_tokens: 2000,
        response_format: { type: 'json_object' }
      });
      return JSON.parse(response.choices[0].message.content);
    } catch (error) {
      console.error('AI Judge error:', error);
      throw error;
    }
  }
}
// Skip the judge-backed suites entirely when no API key is configured;
// otherwise the beforeAll hook and every call to judge.evaluate() would fail.
const describeIfApiKey = process.env.OPENAI_API_KEY ? describe : describe.skip;

describeIfApiKey('BMAD Claude Integration - AI Judge Tests', () => {
  let queue, broker, sessionManager, loader, judge;

  beforeAll(async () => {
    queue = new BMADMessageQueue({ basePath: './test-bmad' });
    broker = new ElicitationBroker(queue);
    sessionManager = new SessionManager(queue, broker);
    loader = new BMADLoader();
    judge = new AIJudge();

    await queue.initialize();
    await sessionManager.initialize();
  });

  afterAll(async () => {
    // Cleanup test directories
    const fs = require('fs').promises;
    await fs.rm('./test-bmad', { recursive: true, force: true });
  });
  describe('Context Preservation', () => {
    test('should maintain full context through agent handoffs', async () => {
      // Create complex scenario
      const initialContext = {
        user_request: "Create a microservices architecture for e-commerce with user stories",
        constraints: ["Must support 100k concurrent users", "Budget $50k", "3 month timeline"],
        files: ["requirements.md", "existing-api.yaml"],
        technical_requirements: {
          languages: ["TypeScript", "Python"],
          databases: ["PostgreSQL", "Redis"],
          deployment: "Kubernetes"
        }
      };

      // Simulate PM agent session
      const pmSession = await sessionManager.createAgentSession('pm', initialContext);
      const pmMessage = await queue.sendMessage({
        agent: 'pm',
        type: 'execute',
        session_id: pmSession.id,
        context: initialContext
      });

      // Add conversation entries
      await sessionManager.addToConversation(pmSession.id, {
        type: 'user',
        content: initialContext.user_request
      });
      await sessionManager.addToConversation(pmSession.id, {
        type: 'agent',
        content: 'I need to understand your user base better. What are the main user personas?'
      });

      // Simulate architect session with handoff
      const architectSession = await sessionManager.createAgentSession('architect', {
        ...initialContext,
        previous_agent: 'pm',
        pm_output: 'User stories created for authentication, catalog, and checkout'
      });

      // Get final context
      const finalPMSession = await sessionManager.loadSession(pmSession.id);
      const finalArchSession = await sessionManager.loadSession(architectSession.id);

      // AI Judge evaluation
      const evaluation = await judge.evaluate(
        `Context Preservation Test:

Initial Context: ${JSON.stringify(initialContext, null, 2)}

PM Session Final State: ${JSON.stringify(finalPMSession, null, 2)}

Architect Session State: ${JSON.stringify(finalArchSession, null, 2)}

Evaluate whether the context was properly preserved across agent handoffs.`,
        [
          'All initial constraints are preserved in both sessions',
          'Technical requirements remain intact',
          'File references are maintained',
          'User request is accurately captured',
          'Agent handoff includes relevant context from PM to Architect'
        ]
      );

      expect(evaluation.pass).toBe(true);
      expect(evaluation.overall_score).toBeGreaterThanOrEqual(7);
    }, 30000);
  });
  describe('Elicitation Quality', () => {
    test('should handle elicitation phases naturally', async () => {
      const userRequest = "Create a user story for a payment processing feature";

      // Create elicitation session
      const elicitSession = await broker.createSession('pm', {
        user_request: userRequest,
        project_context: 'E-commerce platform'
      });

      // Simulate elicitation flow
      const questions = [
        "What payment methods should be supported?",
        "Do you need to handle recurring payments?",
        "What are the compliance requirements (PCI-DSS, etc.)?"
      ];
      const responses = [
        "Credit cards, PayPal, and Apple Pay",
        "Yes, for subscription products",
        "Full PCI-DSS compliance is required"
      ];
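      // Assumes the broker assigns sequential question ids (q1, q2, ...) in the
      // order questions are added; addResponse() pairs each answer with that id.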
      for (let i = 0; i < questions.length; i++) {
        await broker.addQuestion(elicitSession.id, questions[i], {
          phase: 'requirements_gathering',
          importance: 'high'
        });
        await broker.addResponse(elicitSession.id, responses[i], `q${i + 1}`);
      }

      const completedSession = await broker.completeSession(elicitSession.id, {
        user_story: "As a customer, I want to pay using multiple payment methods..."
      });

      // AI Judge evaluation
      const evaluation = await judge.evaluate(
        `Elicitation Quality Test:

User Request: ${userRequest}

Elicitation Flow:
${questions.map((q, i) => `Q: ${q}\nA: ${responses[i]}`).join('\n\n')}

Completed Session: ${JSON.stringify(completedSession, null, 2)}

Evaluate the quality of the elicitation process.`,
        [
          'Questions are relevant to the user request',
          'Questions progressively gather necessary details',
          'Questions avoid redundancy',
          'Response format is natural (no special syntax required)',
          'Elicitation history is properly tracked'
        ]
      );

      expect(evaluation.pass).toBe(true);
      expect(evaluation.overall_score).toBeGreaterThanOrEqual(8);
    }, 30000);
  });
  describe('Multi-Agent Orchestration', () => {
    test('should handle concurrent agent sessions effectively', async () => {
      // Create multiple concurrent sessions
      const sessions = await Promise.all([
        sessionManager.createAgentSession('pm', { task: 'Create user stories' }),
        sessionManager.createAgentSession('architect', { task: 'Design system architecture' }),
        sessionManager.createAgentSession('qa', { task: 'Create test plan' })
      ]);

      // Simulate switching between sessions
      await sessionManager.switchSession(sessions[1].id);
      await sessionManager.suspendSession(sessions[1].id, 'user_switch');
      await sessionManager.switchSession(sessions[0].id);

      // Add activities to different sessions
      for (const session of sessions) {
        await sessionManager.addToConversation(session.id, {
          type: 'user',
          content: `Working on ${session.context.task}`
        });
      }

      // Get session list
      const sessionList = sessionManager.formatSessionList();

      // AI Judge evaluation
      const evaluation = await judge.evaluate(
        `Multi-Agent Orchestration Test:

Created Sessions: ${sessions.map(s => `${s.agent}: ${s.context.task}`).join(', ')}

Session List Output:
${sessionList}

Session States: ${JSON.stringify(sessions.map(s => ({
          id: s.id,
          agent: s.agent,
          status: s.status,
          ui: s.ui
        })), null, 2)}

Evaluate the multi-agent session management.`,
        [
          'Each agent has clear visual identification (icon + name)',
          'Session status is clearly indicated (active/suspended)',
          'Session switching commands are provided',
          'Concurrent sessions are properly isolated',
          'User can easily understand which agent they are talking to'
        ]
      );

      expect(evaluation.pass).toBe(true);
      expect(evaluation.overall_score).toBeGreaterThanOrEqual(8);
    }, 30000);
  });
  describe('BMAD Agent Behavior Preservation', () => {
    test('should preserve original BMAD agent behavior', async () => {
      // Load original BMAD agent
      const pmAgent = await loader.loadAgent('pm');

      // Verify agent structure
      const evaluation = await judge.evaluate(
        `BMAD Agent Preservation Test:

Loaded PM Agent Structure:
- Title: ${pmAgent.title}
- Agent Info: ${JSON.stringify(pmAgent.agent, null, 2)}
- Commands: ${JSON.stringify(pmAgent.commands?.slice(0, 5), null, 2)}
- Dependencies: ${JSON.stringify(pmAgent.dependencies, null, 2)}

Evaluate whether the BMAD loader properly preserves the original agent structure and behavior.`,
        [
          'Agent metadata is correctly extracted',
          'Commands are properly parsed',
          'Dependencies are maintained',
          'YAML configuration is correctly loaded',
          'Original agent logic can be executed without modification'
        ]
      );

      expect(evaluation.pass).toBe(true);
      expect(pmAgent.agent.name).toBe('Product Manager');
    }, 30000);
  });
  describe('Error Recovery', () => {
    test('should handle errors gracefully', async () => {
      const errorScenarios = [];

      // Test 1: Invalid session ID
      try {
        await sessionManager.switchSession('invalid-session-id');
      } catch (error) {
        errorScenarios.push({
          scenario: 'Invalid session ID',
          error: error.message,
          handled: true
        });
      }

      // Test 2: Message queue retry
      const failingMessage = await queue.sendMessage({
        agent: 'test-agent',
        type: 'failing',
        simulateFailure: true
      });
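      // Assumes queue.retry() and queue.getMessage() accept the handle returned
      // by sendMessage(), and that a retried message tracks a retries counter.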
      await queue.retry(failingMessage);
      const retriedMessage = await queue.getMessage(failingMessage);
      errorScenarios.push({
        scenario: 'Message retry',
        retries: retriedMessage.retries,
        status: retriedMessage.status
      });

      // Test 3: Elicitation session not found
      try {
        await broker.loadSession('non-existent-session');
      } catch (error) {
        errorScenarios.push({
          scenario: 'Elicitation session not found',
          error: error.message,
          handled: true
        });
      }

      // AI Judge evaluation
      const evaluation = await judge.evaluate(
        `Error Recovery Test:

Error Scenarios Tested:
${JSON.stringify(errorScenarios, null, 2)}

Evaluate the error handling and recovery mechanisms.`,
        [
          'Errors provide clear, actionable messages',
          'System maintains stability after errors',
          'Retry mechanisms work correctly',
          'Error states are properly tracked',
          'Recovery suggestions are provided'
        ]
      );

      expect(evaluation.pass).toBe(true);
      expect(errorScenarios.every(s => s.handled !== false)).toBe(true);
    }, 30000);
  });
});
// Integration test with actual agent execution
describeIfApiKey('End-to-End Integration', () => {
  test('should complete a full BMAD workflow', async () => {
    const judge = new AIJudge();

    // This test would require an actual Claude Code environment;
    // for now, we simulate the expected behavior.
    const workflowSteps = [
      { agent: 'pm', action: 'Create user story', status: 'completed' },
      { agent: 'architect', action: 'Design architecture', status: 'completed' },
      { agent: 'dev', action: 'Implementation plan', status: 'completed' },
      { agent: 'qa', action: 'Test strategy', status: 'completed' }
    ];

    const evaluation = await judge.evaluate(
      `End-to-End Workflow Test:

Workflow Steps: ${JSON.stringify(workflowSteps, null, 2)}

This represents a complete BMAD workflow from requirements to test strategy.
Each agent should maintain context from previous agents while adding their expertise.`,
      [
        'Workflow progresses logically through agents',
        'Each agent adds value without losing context',
        'Handoffs between agents are smooth',
        'Final output incorporates all agent contributions',
        'User can track progress throughout workflow'
      ]
    );

    expect(evaluation.pass).toBe(true);
  }, 30000);
});