BMAD-METHOD/scripts/lib/llm-task-verifier.py

347 lines
12 KiB
Python
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
LLM-Powered Task Verification - Use Claude Haiku to ACTUALLY verify code quality
Purpose: Don't guess with regex - have Claude READ the code and verify it's real
Method: For each task, read mentioned files, ask Claude "is this actually implemented?"
Created: 2026-01-02
Cost: ~$0.13 per story with Haiku (50 tasks × 3K tokens × $1.25/1M)
Full platform: 511 stories × $0.13 = ~$66 total
"""
import json
import os
import re
import sys
from pathlib import Path
from typing import Dict, List
from anthropic import Anthropic
class LLMTaskVerifier:
"""Uses Claude API to verify tasks by reading and analyzing actual code"""
def __init__(self, api_key: str = None):
self.api_key = api_key or os.environ.get('ANTHROPIC_API_KEY')
if not self.api_key:
raise ValueError("ANTHROPIC_API_KEY required")
self.client = Anthropic(api_key=self.api_key)
self.model = 'claude-haiku-4-20250514' # Fast + cheap for verification tasks
self.repo_root = Path('.')
def verify_task(self, task_text: str, is_checked: bool, story_context: Dict) -> Dict:
"""
Use Claude to verify if a task is actually complete
Args:
task_text: The task description (e.g., "Implement UserService")
is_checked: Whether task is checked [x] or not [ ]
story_context: Context about the story (files, epic, etc.)
Returns:
{
'task': task_text,
'is_checked': bool,
'actually_complete': bool,
'confidence': 'very_high' | 'high' | 'medium' | 'low',
'evidence': str,
'issues_found': [list of issues],
'verification_status': 'correct' | 'false_positive' | 'false_negative'
}
"""
# Extract file references from task
file_refs = self._extract_file_references(task_text)
# Read the files
file_contents = {}
for file_ref in file_refs[:5]: # Limit to 5 files per task
content = self._read_file(file_ref)
if content:
file_contents[file_ref] = content
# If no files found, try reading files from story context
if not file_contents and story_context.get('files'):
for file_path in story_context['files'][:5]:
content = self._read_file(file_path)
if content:
file_contents[file_path] = content
# Build prompt for Claude
prompt = self._build_verification_prompt(task_text, is_checked, file_contents, story_context)
# Call Claude API
try:
response = self.client.messages.create(
model=self.model,
max_tokens=2000,
temperature=0, # Deterministic
messages=[{
'role': 'user',
'content': prompt
}]
)
# Parse response
result_text = response.content[0].text
result = self._parse_claude_response(result_text)
# Add metadata
result['task'] = task_text
result['is_checked'] = is_checked
result['tokens_used'] = response.usage.input_tokens + response.usage.output_tokens
# Determine verification status
if is_checked == result['actually_complete']:
result['verification_status'] = 'correct'
elif is_checked and not result['actually_complete']:
result['verification_status'] = 'false_positive'
else:
result['verification_status'] = 'false_negative'
return result
except Exception as e:
return {
'task': task_text,
'error': str(e),
'verification_status': 'error'
}
def _build_verification_prompt(self, task: str, is_checked: bool, files: Dict, context: Dict) -> str:
"""Build prompt for Claude to verify task completion"""
files_section = ""
if files:
files_section = "\n\n## Files Provided\n\n"
for file_path, content in files.items():
files_section += f"### {file_path}\n```typescript\n{content[:2000]}\n```\n\n"
else:
files_section = "\n\n## Files Provided\n\nNone - task may not reference specific files.\n"
prompt = f"""You are a code verification expert. Your job is to verify whether a task from a user story is actually complete.
## Task to Verify
**Task:** {task}
**Claimed Status:** {'[x] Complete' if is_checked else '[ ] Not complete'}
## Story Context
**Story:** {context.get('story_id', 'Unknown')}
**Epic:** {context.get('epic', 'Unknown')}
{files_section}
## Your Task
Analyze the files (if provided) and determine:
1. **Is the task actually complete?**
- If files provided: Does the code actually implement what the task describes?
- Is it real implementation or just stubs/TODOs?
- Are there tests? Do they pass?
2. **Confidence level:**
- very_high: Clear evidence (tests passing, full implementation)
- high: Strong evidence (code exists with logic, no stubs)
- medium: Some evidence but incomplete
- low: No files or cannot verify
3. **Evidence:**
- What did you find that proves/disproves completion?
- Specific line numbers or code snippets
- Test results if applicable
4. **Issues (if any):**
- Stub code or TODOs
- Missing error handling
- No multi-tenant isolation (dealerId filters)
- Security vulnerabilities
- Missing tests
## Response Format (JSON)
{{
"actually_complete": true/false,
"confidence": "very_high|high|medium|low",
"evidence": "Detailed explanation of what you found",
"issues_found": ["issue 1", "issue 2"],
"recommendation": "What needs to be done (if incomplete)"
}}
**Be objective. If code is a stub with TODOs, it's NOT complete even if files exist.**
"""
return prompt
def _parse_claude_response(self, response_text: str) -> Dict:
"""Parse Claude's JSON response"""
try:
# Extract JSON from response (may have markdown)
json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
if json_match:
return json.loads(json_match.group(0))
else:
# Fallback: parse manually
return {
'actually_complete': 'complete' in response_text.lower() and 'not complete' not in response_text.lower(),
'confidence': 'low',
'evidence': response_text[:500],
'issues_found': [],
}
except:
return {
'actually_complete': False,
'confidence': 'low',
'evidence': 'Failed to parse response',
'issues_found': ['Parse error'],
}
def _extract_file_references(self, task_text: str) -> List[str]:
"""Extract file paths from task text"""
paths = []
# Common patterns
patterns = [
r'[\w/-]+/[\w-]+\.[\w]+', # Explicit paths
r'\b([A-Z][\w-]+\.(ts|tsx|service|controller|repository))', # Files
]
for pattern in patterns:
matches = re.findall(pattern, task_text)
if isinstance(matches[0], tuple) if matches else False:
paths.extend([m[0] for m in matches])
else:
paths.extend(matches)
return list(set(paths))[:5] # Max 5 files per task
def _read_file(self, file_ref: str) -> str:
"""Find and read file from repository"""
# Try exact path
if (self.repo_root / file_ref).exists():
try:
return (self.repo_root / file_ref).read_text()[:5000] # Max 5K chars
except:
return None
# Search for file
import subprocess
try:
result = subprocess.run(
['find', '.', '-name', Path(file_ref).name, '-type', 'f'],
capture_output=True,
text=True,
cwd=self.repo_root,
timeout=5
)
if result.stdout.strip():
file_path = result.stdout.strip().split('\n')[0]
return Path(file_path).read_text()[:5000]
except:
pass
return None
def verify_story_with_llm(story_file_path: str) -> Dict:
"""
Verify entire story using LLM for each task
Cost: ~$1.50 per story (50 tasks × 3K tokens/task × $15/1M)
Time: ~2-3 minutes per story
"""
verifier = LLMTaskVerifier()
story_path = Path(story_file_path)
if not story_path.exists():
return {'error': 'Story file not found'}
content = story_path.read_text()
# Extract story context
story_id = story_path.stem
epic_match = re.search(r'Epic:\*?\*?\s*(\w+)', content, re.IGNORECASE)
epic = epic_match.group(1) if epic_match else 'Unknown'
# Extract files from Dev Agent Record
file_list_match = re.search(r'### File List\n\n(.+?)###', content, re.DOTALL)
files = []
if file_list_match:
file_section = file_list_match.group(1)
files = re.findall(r'[\w/-]+\.[\w]+', file_section)
story_context = {
'story_id': story_id,
'epic': epic,
'files': files
}
# Extract all tasks
task_pattern = r'^-\s*\[([ xX])\]\s*(.+)$'
tasks = re.findall(task_pattern, content, re.MULTILINE)
if not tasks:
return {'error': 'No tasks found'}
# Verify each task with LLM
print(f"\n🔍 Verifying {len(tasks)} tasks with Claude...", file=sys.stderr)
task_results = []
for idx, (checkbox, task_text) in enumerate(tasks):
is_checked = checkbox.lower() == 'x'
print(f" {idx+1}/{len(tasks)}: {task_text[:60]}...", file=sys.stderr)
result = verifier.verify_task(task_text, is_checked, story_context)
task_results.append(result)
# Calculate summary
total = len(task_results)
correct = sum(1 for r in task_results if r.get('verification_status') == 'correct')
false_positives = sum(1 for r in task_results if r.get('verification_status') == 'false_positive')
false_negatives = sum(1 for r in task_results if r.get('verification_status') == 'false_negative')
return {
'story_id': story_id,
'total_tasks': total,
'correct': correct,
'false_positives': false_positives,
'false_negatives': false_negatives,
'verification_score': round((correct / total * 100), 1) if total > 0 else 0,
'task_results': task_results
}
if __name__ == '__main__':
if len(sys.argv) < 2:
print("Usage: llm-task-verifier.py <story-file>")
sys.exit(1)
results = verify_story_with_llm(sys.argv[1])
if 'error' in results:
print(f"{results['error']}")
sys.exit(1)
# Print summary
print(f"\n📊 Story: {results['story_id']}")
print(f"Verification Score: {results['verification_score']}/100")
print(f"✅ Correct: {results['correct']}")
print(f"❌ False Positives: {results['false_positives']}")
print(f"⚠️ False Negatives: {results['false_negatives']}")
# Show false positives
if results['false_positives'] > 0:
print(f"\n❌ FALSE POSITIVES (claimed done but not implemented):")
for task in results['task_results']:
if task.get('verification_status') == 'false_positive':
print(f" - {task['task'][:80]}")
print(f" {task.get('evidence', 'No evidence')}")
# Output JSON
if '--json' in sys.argv:
print(json.dumps(results, indent=2))