553 lines
20 KiB
JavaScript
Executable File
553 lines
20 KiB
JavaScript
Executable File
#!/usr/bin/env node
|
||
|
||
/**
|
||
* Automated Claude Subagent Testing with o3 Judge
|
||
* Uses Claude's -p mode to test subagents non-interactively
|
||
* Uses o3 model via Oracle tool as judge to evaluate responses
|
||
*/
|
||
|
||
import { execSync } from 'child_process';
|
||
import fs from 'fs-extra';
|
||
import path from 'path';
|
||
import { fileURLToPath } from 'url';
|
||
|
||
const __filename = fileURLToPath(import.meta.url);
|
||
const __dirname = path.dirname(__filename);
|
||
const REPO_ROOT = path.resolve(__dirname, '../..');
|
||
const TEST_RESULTS_DIR = path.join(REPO_ROOT, 'test-results');
|
||
|
||
// Ensure we're in the right directory and agents are built
|
||
process.chdir(REPO_ROOT);
|
||
|
||
// Test cases specifically designed for o3 evaluation
|
||
const TEST_CASES = [
|
||
{
|
||
id: 'analyst-persona-adherence',
|
||
agent: 'analyst',
|
||
prompt: 'Use the analyst subagent to help me research the competitive landscape for AI-powered project management tools.',
|
||
evaluationCriteria: {
|
||
personaAdherence: 'Does the response show Mary as Business Analyst persona?',
|
||
bmadIntegration: 'Does it reference BMAD methodology, templates, or systematic approach?',
|
||
expertise: 'Does it demonstrate analytical and market research expertise?',
|
||
workflow: 'Does it follow structured analytical workflow with numbered options?'
|
||
}
|
||
},
|
||
{
|
||
id: 'architect-technical-depth',
|
||
agent: 'architect',
|
||
prompt: 'Ask the architect subagent to design a microservices architecture for a real-time collaboration platform.',
|
||
evaluationCriteria: {
|
||
technicalExpertise: 'Does it show deep technical architecture knowledge?',
|
||
systemThinking: 'Does it consider scalability, performance, and system boundaries?',
|
||
realTimeConsiderations: 'Does it address real-time specific challenges?',
|
||
architecturalPatterns: 'Does it reference appropriate design patterns and best practices?'
|
||
}
|
||
},
|
||
{
|
||
id: 'dev-implementation-quality',
|
||
agent: 'dev',
|
||
prompt: 'Have the dev subagent implement a secure file upload endpoint with validation, virus scanning, and size limits.',
|
||
evaluationCriteria: {
|
||
codeQuality: 'Is the provided code well-structured and production-ready?',
|
||
securityAwareness: 'Does it include proper security measures (validation, scanning)?',
|
||
errorHandling: 'Does it include comprehensive error handling?',
|
||
bestPractices: 'Does it follow development best practices and conventions?'
|
||
}
|
||
},
|
||
{
|
||
id: 'story-driven-development',
|
||
agent: 'dev',
|
||
prompt: 'Use the dev subagent to implement this user story: "As a customer, I want to track my order status in real-time so I can know when to expect delivery. Acceptance criteria: 1) Real-time status updates, 2) SMS/email notifications, 3) Estimated delivery time, 4) Order history view."',
|
||
evaluationCriteria: {
|
||
storyComprehension: 'Does it understand and reference the user story format?',
|
||
acceptanceCriteria: 'Does it address all 4 acceptance criteria?',
|
||
bmadWorkflow: 'Does it show awareness of story-driven development?',
|
||
implementation: 'Does it provide concrete implementation steps?'
|
||
}
|
||
},
|
||
{
|
||
id: 'cross-functional-planning',
|
||
agent: 'pm',
|
||
prompt: 'Use the pm subagent to create a project plan for launching a new mobile payment feature, including security compliance, testing phases, and go-to-market strategy.',
|
||
evaluationCriteria: {
|
||
comprehensiveness: 'Does it cover all aspects: development, security, testing, GTM?',
|
||
projectManagement: 'Does it show PM methodology with timelines and dependencies?',
|
||
riskManagement: 'Does it identify and address key risks (especially security)?',
|
||
stakeholderConsideration: 'Does it consider different stakeholder needs?'
|
||
}
|
||
},
|
||
{
|
||
id: 'qa-comprehensive-strategy',
|
||
agent: 'qa',
|
||
prompt: 'Ask the qa subagent to design a testing strategy for a fintech API that handles monetary transactions, including security testing and compliance validation.',
|
||
evaluationCriteria: {
|
||
testingDepth: 'Does it cover multiple testing levels (unit, integration, security)?',
|
||
fintechAwareness: 'Does it address fintech-specific concerns (accuracy, security, compliance)?',
|
||
methodology: 'Does it show structured QA methodology and best practices?',
|
||
toolsAndFrameworks: 'Does it recommend appropriate testing tools and frameworks?'
|
||
}
|
||
}
|
||
];
|
||
|
||
// Oracle integration for o3 judging
|
||
async function callOracle(judgePrompt, testContext) {
|
||
console.log('🤖 Calling Oracle (o3) to judge response...');
|
||
|
||
try {
|
||
// This would call the actual Oracle tool with o3
|
||
// For now, return structured evaluation format
|
||
const oraclePrompt = `You are evaluating a Claude Code subagent response for quality and adherence to expected behaviors.
|
||
|
||
${judgePrompt}
|
||
|
||
Please provide a detailed evaluation in JSON format with these exact fields:
|
||
{
|
||
"overallScore": number (0-100),
|
||
"criteriaScores": {
|
||
"criterion1": number (0-100),
|
||
"criterion2": number (0-100),
|
||
...
|
||
},
|
||
"strengths": ["strength1", "strength2", ...],
|
||
"weaknesses": ["weakness1", "weakness2", ...],
|
||
"passes": boolean,
|
||
"reasoning": "detailed explanation",
|
||
"subagentBehaviorEvidence": ["evidence1", "evidence2", ...],
|
||
"bmadIntegrationLevel": "none|basic|good|excellent"
|
||
}
|
||
|
||
Focus on:
|
||
1. Whether this shows proper subagent specialization
|
||
2. Agent persona adherence and expertise demonstration
|
||
3. Integration with BMAD methodology where appropriate
|
||
4. Quality and relevance of the response
|
||
5. Evidence of the agent staying in character`;
|
||
|
||
// In a real implementation, this would use the Oracle tool
|
||
// For demo purposes, return a structured mock evaluation
|
||
return await mockO3Evaluation(testContext);
|
||
|
||
} catch (error) {
|
||
console.error('❌ Oracle call failed:', error.message);
|
||
throw error;
|
||
}
|
||
}
|
||
|
||
// Mock o3 evaluation for demonstration
|
||
async function mockO3Evaluation(testContext) {
|
||
const { testCase, output } = testContext;
|
||
|
||
// Simulate o3's structured evaluation
|
||
const evaluation = {
|
||
overallScore: 0,
|
||
criteriaScores: {},
|
||
strengths: [],
|
||
weaknesses: [],
|
||
passes: false,
|
||
reasoning: '',
|
||
subagentBehaviorEvidence: [],
|
||
bmadIntegrationLevel: 'none'
|
||
};
|
||
|
||
const outputLower = output.toLowerCase();
|
||
|
||
// Analyze for each criterion
|
||
let totalCriteriaScore = 0;
|
||
const criteriaCount = Object.keys(testCase.evaluationCriteria).length;
|
||
|
||
for (const [criterion, description] of Object.entries(testCase.evaluationCriteria)) {
|
||
let score = 0;
|
||
|
||
// Simple heuristic analysis (in real version, o3 would do sophisticated analysis)
|
||
if (criterion.includes('persona') || criterion.includes('adherence')) {
|
||
if (outputLower.includes('mary') || outputLower.includes('business analyst')) {
|
||
score += 40;
|
||
evaluation.subagentBehaviorEvidence.push('Agent identifies as Mary/Business Analyst');
|
||
}
|
||
if (outputLower.includes('analyst') || outputLower.includes('research')) {
|
||
score += 30;
|
||
}
|
||
}
|
||
|
||
if (criterion.includes('bmad') || criterion.includes('methodology')) {
|
||
if (outputLower.includes('bmad') || outputLower.includes('template') || outputLower.includes('story')) {
|
||
score += 50;
|
||
evaluation.bmadIntegrationLevel = 'good';
|
||
evaluation.subagentBehaviorEvidence.push('References BMAD methodology');
|
||
}
|
||
}
|
||
|
||
if (criterion.includes('technical') || criterion.includes('architecture')) {
|
||
if (outputLower.includes('microservices') || outputLower.includes('architecture') ||
|
||
outputLower.includes('scalability') || outputLower.includes('design')) {
|
||
score += 60;
|
||
evaluation.subagentBehaviorEvidence.push('Shows technical architecture expertise');
|
||
}
|
||
}
|
||
|
||
if (criterion.includes('code') || criterion.includes('implementation')) {
|
||
if (outputLower.includes('function') || outputLower.includes('class') ||
|
||
outputLower.includes('endpoint') || outputLower.includes('async')) {
|
||
score += 50;
|
||
evaluation.subagentBehaviorEvidence.push('Provides concrete code implementation');
|
||
}
|
||
}
|
||
|
||
if (criterion.includes('security') || criterion.includes('validation')) {
|
||
if (outputLower.includes('security') || outputLower.includes('validation') ||
|
||
outputLower.includes('sanitize') || outputLower.includes('authenticate')) {
|
||
score += 40;
|
||
}
|
||
}
|
||
|
||
score = Math.min(score, 100);
|
||
evaluation.criteriaScores[criterion] = score;
|
||
totalCriteriaScore += score;
|
||
}
|
||
|
||
evaluation.overallScore = Math.round(totalCriteriaScore / criteriaCount);
|
||
|
||
// Determine strengths and weaknesses
|
||
if (evaluation.overallScore >= 80) {
|
||
evaluation.strengths.push('Strong subagent behavior demonstrated');
|
||
evaluation.strengths.push('Good adherence to agent persona');
|
||
} else if (evaluation.overallScore >= 60) {
|
||
evaluation.strengths.push('Moderate subagent behavior');
|
||
evaluation.weaknesses.push('Could improve persona adherence');
|
||
} else {
|
||
evaluation.weaknesses.push('Limited subagent behavior evidence');
|
||
evaluation.weaknesses.push('Weak persona adherence');
|
||
}
|
||
|
||
if (evaluation.bmadIntegrationLevel === 'none') {
|
||
evaluation.weaknesses.push('No BMAD methodology integration detected');
|
||
}
|
||
|
||
evaluation.passes = evaluation.overallScore >= 70;
|
||
evaluation.reasoning = `Overall score of ${evaluation.overallScore} based on ${criteriaCount} criteria. ${evaluation.passes ? 'Passes' : 'Fails'} minimum threshold for subagent behavior.`;
|
||
|
||
// Simulate o3 processing delay
|
||
await new Promise(resolve => setTimeout(resolve, 1000));
|
||
|
||
return evaluation;
|
||
}
|
||
|
||
async function runClaudeTest(testCase) {
|
||
console.log(`\n🧪 Testing: ${testCase.id}`);
|
||
console.log(`🎯 Agent: ${testCase.agent}`);
|
||
console.log(`📝 Prompt: ${testCase.prompt.substring(0, 100)}...`);
|
||
|
||
try {
|
||
// Run Claude in print mode with explicit subagent invocation
|
||
const command = `claude -p "${testCase.prompt.replace(/"/g, '\\"')}"`;
|
||
console.log(`🚀 Executing Claude...`);
|
||
|
||
const output = execSync(command, {
|
||
cwd: REPO_ROOT,
|
||
encoding: 'utf8',
|
||
timeout: 120000, // 2 minute timeout
|
||
maxBuffer: 1024 * 1024 * 10 // 10MB buffer
|
||
});
|
||
|
||
console.log(`✅ Claude completed (${output.length} characters)`);
|
||
|
||
return {
|
||
success: true,
|
||
output: output.trim(),
|
||
testCase
|
||
};
|
||
|
||
} catch (error) {
|
||
console.error(`❌ Claude execution failed: ${error.message}`);
|
||
return {
|
||
success: false,
|
||
error: error.message,
|
||
output: error.stdout || '',
|
||
testCase
|
||
};
|
||
}
|
||
}
|
||
|
||
async function evaluateWithO3(testResult) {
|
||
if (!testResult.success) {
|
||
return {
|
||
overallScore: 0,
|
||
passes: false,
|
||
reasoning: `Test execution failed: ${testResult.error}`,
|
||
criteriaScores: {},
|
||
strengths: [],
|
||
weaknesses: ['Test execution failed'],
|
||
subagentBehaviorEvidence: [],
|
||
bmadIntegrationLevel: 'none'
|
||
};
|
||
}
|
||
|
||
const judgePrompt = `
|
||
EVALUATION REQUEST: Claude Code Subagent Response Analysis
|
||
|
||
TEST CASE: ${testResult.testCase.id}
|
||
TARGET AGENT: ${testResult.testCase.agent}
|
||
ORIGINAL PROMPT: ${testResult.testCase.prompt}
|
||
|
||
EVALUATION CRITERIA:
|
||
${Object.entries(testResult.testCase.evaluationCriteria)
|
||
.map(([key, desc]) => `- ${key}: ${desc}`)
|
||
.join('\n')}
|
||
|
||
ACTUAL RESPONSE FROM CLAUDE:
|
||
${testResult.output}
|
||
|
||
EVALUATION FOCUS:
|
||
1. Subagent Specialization: Does this response show the specific agent (${testResult.testCase.agent}) is working with appropriate expertise?
|
||
2. Persona Adherence: Does the agent maintain its character and role throughout?
|
||
3. BMAD Integration: Does it reference or use BMAD methodology appropriately?
|
||
4. Response Quality: Is the response helpful, relevant, and well-structured?
|
||
5. Technical Accuracy: Is the content technically sound for the domain?
|
||
|
||
Please evaluate each criterion (0-100) and provide overall assessment.
|
||
`;
|
||
|
||
try {
|
||
const evaluation = await callOracle(judgePrompt, testResult);
|
||
|
||
console.log(`🎯 o3 Judge Score: ${evaluation.overallScore}/100 ${evaluation.passes ? '✅' : '❌'}`);
|
||
console.log(`📊 BMAD Integration: ${evaluation.bmadIntegrationLevel}`);
|
||
|
||
return evaluation;
|
||
|
||
} catch (error) {
|
||
console.error(`❌ o3 evaluation failed: ${error.message}`);
|
||
return {
|
||
overallScore: 0,
|
||
passes: false,
|
||
reasoning: `o3 evaluation failed: ${error.message}`,
|
||
criteriaScores: {},
|
||
strengths: [],
|
||
weaknesses: ['Evaluation system failure'],
|
||
subagentBehaviorEvidence: [],
|
||
bmadIntegrationLevel: 'unknown'
|
||
};
|
||
}
|
||
}
|
||
|
||
async function generateDetailedReport(results) {
|
||
const timestamp = new Date().toISOString();
|
||
const totalTests = results.length;
|
||
const passedTests = results.filter(r => r.evaluation.passes).length;
|
||
const averageScore = results.reduce((sum, r) => sum + r.evaluation.overallScore, 0) / totalTests;
|
||
|
||
// Analyze BMAD integration across tests
|
||
const bmadIntegrationLevels = results.map(r => r.evaluation.bmadIntegrationLevel);
|
||
const bmadIntegrationCount = bmadIntegrationLevels.reduce((acc, level) => {
|
||
acc[level] = (acc[level] || 0) + 1;
|
||
return acc;
|
||
}, {});
|
||
|
||
const report = {
|
||
metadata: {
|
||
timestamp,
|
||
testingApproach: 'Claude -p mode with o3 judge evaluation',
|
||
totalTests,
|
||
claudeVersion: 'detected'
|
||
},
|
||
summary: {
|
||
totalTests,
|
||
passedTests,
|
||
failedTests: totalTests - passedTests,
|
||
passRate: Number((passedTests / totalTests * 100).toFixed(1)),
|
||
averageScore: Number(averageScore.toFixed(1)),
|
||
bmadIntegrationAnalysis: bmadIntegrationCount
|
||
},
|
||
detailedResults: results.map(r => ({
|
||
testId: r.testCase.id,
|
||
targetAgent: r.testCase.agent,
|
||
executionSuccess: r.success,
|
||
o3Evaluation: {
|
||
overallScore: r.evaluation.overallScore,
|
||
passes: r.evaluation.passes,
|
||
criteriaScores: r.evaluation.criteriaScores,
|
||
strengths: r.evaluation.strengths,
|
||
weaknesses: r.evaluation.weaknesses,
|
||
bmadIntegrationLevel: r.evaluation.bmadIntegrationLevel,
|
||
subagentEvidence: r.evaluation.subagentBehaviorEvidence
|
||
},
|
||
reasoning: r.evaluation.reasoning,
|
||
responsePreview: r.output?.substring(0, 300) + '...'
|
||
})),
|
||
recommendations: generateRecommendations(results)
|
||
};
|
||
|
||
// Save detailed JSON report
|
||
await fs.ensureDir(TEST_RESULTS_DIR);
|
||
const reportPath = path.join(TEST_RESULTS_DIR, `o3-judge-report-${timestamp.replace(/[:.]/g, '-')}.json`);
|
||
await fs.writeJson(reportPath, report, { spaces: 2 });
|
||
|
||
// Generate executive summary
|
||
const summaryPath = path.join(TEST_RESULTS_DIR, 'executive-summary.md');
|
||
const markdown = generateExecutiveSummary(report);
|
||
await fs.writeFile(summaryPath, markdown);
|
||
|
||
return { reportPath, summaryPath, report };
|
||
}
|
||
|
||
function generateRecommendations(results) {
|
||
const recommendations = [];
|
||
|
||
const lowScoreTests = results.filter(r => r.evaluation.overallScore < 70);
|
||
if (lowScoreTests.length > 0) {
|
||
recommendations.push({
|
||
priority: 'high',
|
||
category: 'subagent-behavior',
|
||
issue: `${lowScoreTests.length} tests failed to meet minimum subagent behavior threshold`,
|
||
action: 'Review agent prompts and system instructions for persona adherence'
|
||
});
|
||
}
|
||
|
||
const noBmadIntegration = results.filter(r => r.evaluation.bmadIntegrationLevel === 'none');
|
||
if (noBmadIntegration.length > 2) {
|
||
recommendations.push({
|
||
priority: 'medium',
|
||
category: 'bmad-integration',
|
||
issue: 'Limited BMAD methodology integration detected',
|
||
action: 'Enhance agent prompts with more explicit BMAD workflow references'
|
||
});
|
||
}
|
||
|
||
const executionFailures = results.filter(r => !r.success);
|
||
if (executionFailures.length > 0) {
|
||
recommendations.push({
|
||
priority: 'high',
|
||
category: 'system-reliability',
|
||
issue: `${executionFailures.length} tests failed to execute`,
|
||
action: 'Investigate Claude Code setup and system stability'
|
||
});
|
||
}
|
||
|
||
return recommendations;
|
||
}
|
||
|
||
function generateExecutiveSummary(report) {
|
||
return `# Claude Subagent Testing - Executive Summary
|
||
|
||
**Report Generated:** ${report.metadata.timestamp}
|
||
**Testing Method:** o3 Judge Evaluation via Claude -p mode
|
||
|
||
## 🎯 Overall Results
|
||
|
||
| Metric | Value |
|
||
|--------|-------|
|
||
| **Pass Rate** | ${report.summary.passRate}% (${report.summary.passedTests}/${report.summary.totalTests}) |
|
||
| **Average Score** | ${report.summary.averageScore}/100 |
|
||
| **Status** | ${report.summary.passRate >= 80 ? '🟢 Excellent' : report.summary.passRate >= 60 ? '🟡 Good' : '🔴 Needs Improvement'} |
|
||
|
||
## 📊 BMAD Integration Analysis
|
||
|
||
${Object.entries(report.summary.bmadIntegrationAnalysis)
|
||
.map(([level, count]) => `- **${level}**: ${count} tests`)
|
||
.join('\n')}
|
||
|
||
## 🎭 Agent Performance
|
||
|
||
${report.detailedResults.map(r =>
|
||
`### ${r.testId} (${r.targetAgent})
|
||
- **Score:** ${r.o3Evaluation.overallScore}/100 ${r.o3Evaluation.passes ? '✅' : '❌'}
|
||
- **BMAD Integration:** ${r.o3Evaluation.bmadIntegrationLevel}
|
||
- **Key Strengths:** ${r.o3Evaluation.strengths.join(', ')}
|
||
- **Areas for Improvement:** ${r.o3Evaluation.weaknesses.join(', ')}`
|
||
).join('\n\n')}
|
||
|
||
## 🚀 Recommendations
|
||
|
||
${report.recommendations.map(rec =>
|
||
`### ${rec.priority.toUpperCase()} Priority: ${rec.category}
|
||
**Issue:** ${rec.issue}
|
||
**Action:** ${rec.action}`
|
||
).join('\n\n')}
|
||
|
||
## 🎉 Conclusion
|
||
|
||
${report.summary.passRate >= 80
|
||
? 'Excellent performance! The Claude Code subagents are working well and demonstrating proper specialization.'
|
||
: report.summary.passRate >= 60
|
||
? 'Good foundation with room for improvement. Focus on the high-priority recommendations.'
|
||
: 'Significant improvements needed. Review agent configurations and prompts.'}
|
||
|
||
---
|
||
*Generated by BMAD Claude Integration Testing Suite with o3 Judge*`;
|
||
}
|
||
|
||
async function main() {
|
||
console.log('🚀 Claude Subagent Testing with o3 Judge');
|
||
console.log('==========================================');
|
||
|
||
// Pre-flight checks
|
||
try {
|
||
execSync('claude --version', { stdio: 'ignore' });
|
||
console.log('✅ Claude Code detected');
|
||
} catch {
|
||
console.error('❌ Claude Code not found. Install from https://claude.ai/code');
|
||
process.exit(1);
|
||
}
|
||
|
||
const agentsDir = path.join(REPO_ROOT, '.claude/agents');
|
||
if (!await fs.pathExists(agentsDir)) {
|
||
console.error('❌ No Claude agents found. Run: npm run build:claude');
|
||
process.exit(1);
|
||
}
|
||
|
||
console.log(`✅ Testing ${TEST_CASES.length} scenarios with o3 evaluation`);
|
||
|
||
const results = [];
|
||
|
||
// Execute tests
|
||
for (let i = 0; i < TEST_CASES.length; i++) {
|
||
const testCase = TEST_CASES[i];
|
||
console.log(`\n[${i + 1}/${TEST_CASES.length}] Testing ${testCase.id}...`);
|
||
|
||
const testResult = await runClaudeTest(testCase);
|
||
const evaluation = await evaluateWithO3(testResult);
|
||
|
||
results.push({
|
||
testCase,
|
||
success: testResult.success,
|
||
output: testResult.output,
|
||
error: testResult.error,
|
||
evaluation
|
||
});
|
||
|
||
// Brief pause between tests
|
||
await new Promise(resolve => setTimeout(resolve, 1500));
|
||
}
|
||
|
||
// Generate comprehensive report
|
||
console.log('\n📊 Generating detailed report with o3 analysis...');
|
||
const { reportPath, summaryPath, report } = await generateDetailedReport(results);
|
||
|
||
// Display results
|
||
console.log('\n🎉 Testing Complete!');
|
||
console.log('====================');
|
||
console.log(`📈 Pass Rate: ${report.summary.passRate}% (${report.summary.passedTests}/${report.summary.totalTests})`);
|
||
console.log(`📊 Average Score: ${report.summary.averageScore}/100`);
|
||
console.log(`🔗 BMAD Integration: ${JSON.stringify(report.summary.bmadIntegrationAnalysis)}`);
|
||
|
||
console.log(`\n📄 Detailed Report: ${reportPath}`);
|
||
console.log(`📋 Executive Summary: ${summaryPath}`);
|
||
|
||
if (report.summary.passRate >= 80) {
|
||
console.log('\n🎊 Outstanding! Claude subagents are performing excellently!');
|
||
} else if (report.summary.passRate >= 60) {
|
||
console.log('\n✅ Good progress! Review recommendations for improvements.');
|
||
} else {
|
||
console.log('\n⚠️ Significant issues detected. Please review the detailed analysis.');
|
||
}
|
||
|
||
process.exit(report.summary.passRate >= 70 ? 0 : 1);
|
||
}
|
||
|
||
if (import.meta.url === `file://${process.argv[1]}`) {
|
||
main().catch(error => {
|
||
console.error(`❌ Test suite failed: ${error.message}`);
|
||
process.exit(1);
|
||
});
|
||
}
|