const { describe, test, expect, beforeAll, afterAll } = require('@jest/globals');
const OpenAI = require('openai');
const BMADMessageQueue = require('../../core/message-queue');
const ElicitationBroker = require('../../core/elicitation-broker');
const SessionManager = require('../../core/session-manager');
const BMADLoader = require('../../core/bmad-loader');

// AI Judge class for evaluating test results using o3
class AIJudge {
  constructor() {
    const apiKey = process.env.OPENAI_API_KEY;
    if (!apiKey) {
      throw new Error('OPENAI_API_KEY environment variable is required for AI Judge tests');
    }
    this.openai = new OpenAI({ apiKey: apiKey });
  }

  async evaluate(prompt, criteria, model = 'o3-2025-01-17') {
    try {
      const response = await this.openai.chat.completions.create({
        model,
        messages: [{
          role: 'user',
          content: `You are an expert AI judge evaluating a BMAD-METHOD Claude Code integration test.

${prompt}

Evaluation Criteria:
${criteria.map((c, i) => `${i + 1}. ${c}`).join('\n')}

Provide:
1. A score from 0-10 for each criterion
2. Brief explanation for each score
3. Overall pass/fail determination (pass requires all scores >= 7)
4. Specific feedback for improvements

Format your response as JSON:
{
  "scores": [{"criterion": 1, "score": X, "explanation": "..."}],
  "overall_score": X,
  "pass": boolean,
  "feedback": "..."
}`
        }],
        temperature: 0.3,
        max_tokens: 1000,
        response_format: { type: "json_object" }
      });

      return JSON.parse(response.choices[0].message.content);
    } catch (error) {
      console.error('AI Judge error:', error);
      throw error;
    }
  }
}

describe('BMAD Claude Integration - AI Judge Tests', () => {
  let queue, broker, sessionManager, loader, judge;

  const skipIfNoApiKey = () => {
    if (!process.env.OPENAI_API_KEY) {
      return describe.skip;
    }
    return describe;
  };

  beforeAll(async () => {
    queue = new BMADMessageQueue({ basePath: './test-bmad' });
    broker = new ElicitationBroker(queue);
    sessionManager = new SessionManager(queue, broker);
    loader = new BMADLoader();

    // Only create the judge if we have an API key
    if (process.env.OPENAI_API_KEY) {
      judge = new AIJudge();
    }

    await queue.initialize();
    await sessionManager.initialize();
  });

  afterAll(async () => {
    // Clean up test directories
    const fs = require('fs').promises;
    await fs.rm('./test-bmad', { recursive: true, force: true });
  });

  describe('Context Preservation', () => {
    test('should maintain full context through agent handoffs', async () => {
      // Create complex scenario
      const initialContext = {
        user_request: "Create a microservices architecture for e-commerce with user stories",
        constraints: ["Must support 100k concurrent users", "Budget $50k", "3 month timeline"],
        files: ["requirements.md", "existing-api.yaml"],
        technical_requirements: {
          languages: ["TypeScript", "Python"],
          databases: ["PostgreSQL", "Redis"],
          deployment: "Kubernetes"
        }
      };

      // Simulate PM agent session
      const pmSession = await sessionManager.createAgentSession('pm', initialContext);
      const pmMessage = await queue.sendMessage({
        agent: 'pm',
        type: 'execute',
        session_id: pmSession.id,
        context: initialContext
      });

      // Add conversation entries
      await sessionManager.addToConversation(pmSession.id, {
        type: 'user',
        content: initialContext.user_request
      });

      await sessionManager.addToConversation(pmSession.id, {
        type: 'agent',
        content: 'I need to understand your user base better. What are the main user personas?'
      });
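
      // Illustrative addition (not in the original fixture): close the loop on the
      // PM agent's persona question before the handoff so the conversation log holds
      // a complete exchange. The answer text below is a made-up example; it reuses
      // the same addToConversation API already exercised above.
      await sessionManager.addToConversation(pmSession.id, {
        type: 'user',
        content: 'Main personas are shoppers, merchants, and customer support staff'
      });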

      // Simulate architect session with handoff
      const architectSession = await sessionManager.createAgentSession('architect', {
        ...initialContext,
        previous_agent: 'pm',
        pm_output: 'User stories created for authentication, catalog, and checkout'
      });

      // Get final context
      const finalPMSession = await sessionManager.loadSession(pmSession.id);
      const finalArchSession = await sessionManager.loadSession(architectSession.id);

      // AI Judge evaluation
      const evaluation = await judge.evaluate(
        `Context Preservation Test:

Initial Context: ${JSON.stringify(initialContext, null, 2)}

PM Session Final State: ${JSON.stringify(finalPMSession, null, 2)}

Architect Session State: ${JSON.stringify(finalArchSession, null, 2)}

Evaluate whether the context was properly preserved across agent handoffs.`,
        [
          'All initial constraints are preserved in both sessions',
          'Technical requirements remain intact',
          'File references are maintained',
          'User request is accurately captured',
          'Agent handoff includes relevant context from PM to Architect'
        ]
      );

      expect(evaluation.pass).toBe(true);
      expect(evaluation.overall_score).toBeGreaterThanOrEqual(7);
    }, 30000);
  });

  describe('Elicitation Quality', () => {
    test('should handle elicitation phases naturally', async () => {
      const userRequest = "Create a user story for a payment processing feature";

      // Create elicitation session
      const elicitSession = await broker.createSession('pm', {
        user_request: userRequest,
        project_context: 'E-commerce platform'
      });

      // Simulate elicitation flow
      const questions = [
        "What payment methods should be supported?",
        "Do you need to handle recurring payments?",
        "What are the compliance requirements (PCI-DSS, etc.)?"
      ];

      const responses = [
        "Credit cards, PayPal, and Apple Pay",
        "Yes, for subscription products",
        "Full PCI-DSS compliance is required"
      ];

      for (let i = 0; i < questions.length; i++) {
        await broker.addQuestion(elicitSession.id, questions[i], {
          phase: 'requirements_gathering',
          importance: 'high'
        });
        await broker.addResponse(elicitSession.id, responses[i], `q${i + 1}`);
      }

      const completedSession = await broker.completeSession(elicitSession.id, {
        user_story: "As a customer, I want to pay using multiple payment methods..."
      });
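
      // Hedged sanity check (added): the exact shape returned by completeSession is
      // broker-defined, so only existence is asserted before handing it to the judge.
      expect(completedSession).toBeDefined();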

      // AI Judge evaluation
      const evaluation = await judge.evaluate(
        `Elicitation Quality Test:

User Request: ${userRequest}

Elicitation Flow:
${questions.map((q, i) => `Q: ${q}\nA: ${responses[i]}`).join('\n\n')}

Completed Session: ${JSON.stringify(completedSession, null, 2)}

Evaluate the quality of the elicitation process.`,
        [
          'Questions are relevant to the user request',
          'Questions progressively gather necessary details',
          'Questions avoid redundancy',
          'Response format is natural (no special syntax required)',
          'Elicitation history is properly tracked'
        ]
      );

      expect(evaluation.pass).toBe(true);
      expect(evaluation.overall_score).toBeGreaterThanOrEqual(8);
    }, 30000);
  });

  describe('Multi-Agent Orchestration', () => {
    test('should handle concurrent agent sessions effectively', async () => {
      // Create multiple concurrent sessions
      const sessions = await Promise.all([
        sessionManager.createAgentSession('pm', { task: 'Create user stories' }),
        sessionManager.createAgentSession('architect', { task: 'Design system architecture' }),
        sessionManager.createAgentSession('qa', { task: 'Create test plan' })
      ]);

      // Simulate switching between sessions
      await sessionManager.switchSession(sessions[1].id);
      await sessionManager.suspendSession(sessions[1].id, 'user_switch');
      await sessionManager.switchSession(sessions[0].id);

      // Add activities to different sessions
      for (const session of sessions) {
        await sessionManager.addToConversation(session.id, {
          type: 'user',
          content: `Working on ${session.context.task}`
        });
      }

      // Get session list
      const sessionList = sessionManager.formatSessionList();

      // AI Judge evaluation
      const evaluation = await judge.evaluate(
        `Multi-Agent Orchestration Test:

Created Sessions: ${sessions.map(s => `${s.agent}: ${s.context.task}`).join(', ')}

Session List Output:
${sessionList}

Session States: ${JSON.stringify(
          sessions.map(s => ({ id: s.id, agent: s.agent, status: s.status, ui: s.ui })),
          null, 2
        )}

Evaluate the multi-agent session management.`,
        [
          'Each agent has clear visual identification (icon + name)',
          'Session status is clearly indicated (active/suspended)',
          'Session switching commands are provided',
          'Concurrent sessions are properly isolated',
          'User can easily understand which agent they are talking to'
        ]
      );

      expect(evaluation.pass).toBe(true);
      expect(evaluation.overall_score).toBeGreaterThanOrEqual(8);
    }, 30000);
  });

  describe('BMAD Agent Behavior Preservation', () => {
    test('should preserve original BMAD agent behavior', async () => {
      // Load original BMAD agent
      const pmAgent = await loader.loadAgent('pm');

      // Verify agent structure
      const evaluation = await judge.evaluate(
        `BMAD Agent Preservation Test:

Loaded PM Agent Structure:
- Title: ${pmAgent.title}
- Agent Info: ${JSON.stringify(pmAgent.agent, null, 2)}
- Commands: ${JSON.stringify(pmAgent.commands?.slice(0, 5), null, 2)}
- Dependencies: ${JSON.stringify(pmAgent.dependencies, null, 2)}

Evaluate whether the BMAD loader properly preserves the original agent structure and behavior.`,
        [
          'Agent metadata is correctly extracted',
          'Commands are properly parsed',
          'Dependencies are maintained',
          'YAML configuration is correctly loaded',
          'Original agent logic can be executed without modification'
        ]
      );

      expect(evaluation.pass).toBe(true);
      expect(pmAgent.agent.name).toBe('Product Manager');
    }, 30000);
  });

  describe('Error Recovery', () => {
    test('should handle errors gracefully', async () => {
      const errorScenarios = [];

      // Test 1: Invalid session ID
      try {
        await sessionManager.switchSession('invalid-session-id');
      } catch (error) {
        errorScenarios.push({
          scenario: 'Invalid session ID',
          error: error.message,
          handled: true
        });
      }
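
      // Added check (hedged): Test 1 is expected to throw; asserting the scenario was
      // recorded keeps the later `every(...)` assertion from passing vacuously if
      // switchSession ever starts resolving for unknown ids.
      expect(errorScenarios).toHaveLength(1);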

      // Test 2: Message queue retry
      const failingMessage = await queue.sendMessage({
        agent: 'test-agent',
        type: 'failing',
        simulateFailure: true
      });

      await queue.retry(failingMessage);
      const retriedMessage = await queue.getMessage(failingMessage);

      errorScenarios.push({
        scenario: 'Message retry',
        retries: retriedMessage.retries,
        status: retriedMessage.status
      });

      // Test 3: Elicitation session not found
      try {
        await broker.loadSession('non-existent-session');
      } catch (error) {
        errorScenarios.push({
          scenario: 'Elicitation session not found',
          error: error.message,
          handled: true
        });
      }

      // AI Judge evaluation
      const evaluation = await judge.evaluate(
        `Error Recovery Test:

Error Scenarios Tested: ${JSON.stringify(errorScenarios, null, 2)}

Evaluate the error handling and recovery mechanisms.`,
        [
          'Errors provide clear, actionable messages',
          'System maintains stability after errors',
          'Retry mechanisms work correctly',
          'Error states are properly tracked',
          'Recovery suggestions are provided'
        ]
      );

      expect(evaluation.pass).toBe(true);
      expect(errorScenarios.every(s => s.handled !== false)).toBe(true);
    }, 30000);
  });
});

// Integration test with actual agent execution
describe('End-to-End Integration', () => {
  test('should complete a full BMAD workflow', async () => {
    const judge = new AIJudge();

    // This test would require the actual Claude Code environment;
    // for now, we simulate the expected behavior
    const workflowSteps = [
      { agent: 'pm', action: 'Create user story', status: 'completed' },
      { agent: 'architect', action: 'Design architecture', status: 'completed' },
      { agent: 'dev', action: 'Implementation plan', status: 'completed' },
      { agent: 'qa', action: 'Test strategy', status: 'completed' }
    ];

    const evaluation = await judge.evaluate(
      `End-to-End Workflow Test:

Workflow Steps: ${JSON.stringify(workflowSteps, null, 2)}

This represents a complete BMAD workflow from requirements to test strategy.
Each agent should maintain context from previous agents while adding their expertise.`,
      [
        'Workflow progresses logically through agents',
        'Each agent adds value without losing context',
        'Handoffs between agents are smooth',
        'Final output incorporates all agent contributions',
        'User can track progress throughout workflow'
      ]
    );

    expect(evaluation.pass).toBe(true);
  }, 30000);
});
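
// A possible refactor (sketch only, not wired in): gate the judge-backed suites on the
// API key so they skip cleanly instead of failing when `judge` is undefined. The
// `describeWithJudge` name below is illustrative; note the existing `skipIfNoApiKey`
// helper is defined above but never applied.
//
//   const describeWithJudge = process.env.OPENAI_API_KEY ? describe : describe.skip;
//   describeWithJudge('End-to-End Integration', () => { /* ... */ });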