#!/usr/bin/env python3
"""
LLM-Powered Task Verification - Use Claude Haiku to ACTUALLY verify code quality

Purpose: Don't guess with regex - have Claude READ the code and verify it's real
Method: For each task, read mentioned files, ask Claude "is this actually implemented?"

Created: 2026-01-02
Cost: ~$0.19 per story with Haiku (50 tasks × 3K tokens × $1.25/1M)
Full platform: 511 stories × $0.19 ≈ $96 total
"""

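# Example invocation (illustrative; the story path is hypothetical):
#   export ANTHROPIC_API_KEY=sk-ant-...
#   python3 llm-task-verifier.py docs/stories/story-1.2.md
#   python3 llm-task-verifier.py docs/stories/story-1.2.md --json
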
import json
import os
import re
import subprocess
import sys
from pathlib import Path
from typing import Dict, List, Optional

from anthropic import Anthropic


class LLMTaskVerifier:
    """Uses Claude API to verify tasks by reading and analyzing actual code"""

    def __init__(self, api_key: Optional[str] = None):
        self.api_key = api_key or os.environ.get('ANTHROPIC_API_KEY')
        if not self.api_key:
            raise ValueError("ANTHROPIC_API_KEY required")

        self.client = Anthropic(api_key=self.api_key)
        self.model = 'claude-haiku-4-20250514'  # Fast + cheap for verification tasks
        self.repo_root = Path('.')

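    # Example construction (illustrative key value):
    #   verifier = LLMTaskVerifier()                      # reads ANTHROPIC_API_KEY from the env
    #   verifier = LLMTaskVerifier(api_key='sk-ant-...')  # or pass a key explicitly
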
    def verify_task(self, task_text: str, is_checked: bool, story_context: Dict) -> Dict:
        """
        Use Claude to verify whether a task is actually complete.

        Args:
            task_text: The task description (e.g., "Implement UserService")
            is_checked: Whether the task is checked [x] or not [ ]
            story_context: Context about the story (files, epic, etc.)

        Returns:
            {
                'task': task_text,
                'is_checked': bool,
                'actually_complete': bool,
                'confidence': 'very_high' | 'high' | 'medium' | 'low',
                'evidence': str,
                'issues_found': [list of issues],
                'verification_status': 'correct' | 'false_positive' | 'false_negative'
            }
        """
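        # Example call (illustrative; the task text and context values are hypothetical):
        #   r = verifier.verify_task(
        #       "Implement UserService in src/services/user.service.ts",
        #       is_checked=True,
        #       story_context={'story_id': '1.2', 'epic': 'auth', 'files': []})
        #   r['verification_status']  # 'correct', 'false_positive', or 'false_negative'
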
        # Extract file references from task
        file_refs = self._extract_file_references(task_text)

        # Read the files
        file_contents = {}
        for file_ref in file_refs[:5]:  # Limit to 5 files per task
            content = self._read_file(file_ref)
            if content:
                file_contents[file_ref] = content

        # If no files found, try reading files from story context
        if not file_contents and story_context.get('files'):
            for file_path in story_context['files'][:5]:
                content = self._read_file(file_path)
                if content:
                    file_contents[file_path] = content

        # Build prompt for Claude
        prompt = self._build_verification_prompt(task_text, is_checked, file_contents, story_context)

        # Call Claude API
        try:
            response = self.client.messages.create(
                model=self.model,
                max_tokens=2000,
                temperature=0,  # Deterministic
                messages=[{
                    'role': 'user',
                    'content': prompt
                }]
            )

            # Parse response
            result_text = response.content[0].text
            result = self._parse_claude_response(result_text)

            # Add metadata
            result['task'] = task_text
            result['is_checked'] = is_checked
            result['tokens_used'] = response.usage.input_tokens + response.usage.output_tokens

            # Determine verification status (default to incomplete if the model omitted the key)
            actually_complete = result.get('actually_complete', False)
            if is_checked == actually_complete:
                result['verification_status'] = 'correct'
            elif is_checked and not actually_complete:
                result['verification_status'] = 'false_positive'
            else:
                result['verification_status'] = 'false_negative'

            return result

        except Exception as e:
            return {
                'task': task_text,
                'error': str(e),
                'verification_status': 'error'
            }

    def _build_verification_prompt(self, task: str, is_checked: bool, files: Dict, context: Dict) -> str:
        """Build prompt for Claude to verify task completion"""

        files_section = ""
        if files:
            files_section = "\n\n## Files Provided\n\n"
            for file_path, content in files.items():
                # Tag the fence with the file's own extension rather than assuming TypeScript
                lang = Path(file_path).suffix.lstrip('.') or 'text'
                files_section += f"### {file_path}\n```{lang}\n{content[:2000]}\n```\n\n"
        else:
            files_section = "\n\n## Files Provided\n\nNone - the task may not reference specific files.\n"

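        # For one TypeScript file the rendered section looks roughly like (illustrative):
        #   ## Files Provided
        #   ### src/services/user.service.ts
        #   ```ts
        #   <first 2000 chars of the file>
        #   ```
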
prompt = f"""You are a code verification expert. Your job is to verify whether a task from a user story is actually complete.
|
||
|
||
## Task to Verify
|
||
|
||
**Task:** {task}
|
||
**Claimed Status:** {'[x] Complete' if is_checked else '[ ] Not complete'}
|
||
|
||
## Story Context
|
||
|
||
**Story:** {context.get('story_id', 'Unknown')}
|
||
**Epic:** {context.get('epic', 'Unknown')}
|
||
|
||
{files_section}
|
||
|
||
## Your Task
|
||
|
||
Analyze the files (if provided) and determine:
|
||
|
||
1. **Is the task actually complete?**
|
||
- If files provided: Does the code actually implement what the task describes?
|
||
- Is it real implementation or just stubs/TODOs?
|
||
- Are there tests? Do they pass?
|
||
|
||
2. **Confidence level:**
|
||
- very_high: Clear evidence (tests passing, full implementation)
|
||
- high: Strong evidence (code exists with logic, no stubs)
|
||
- medium: Some evidence but incomplete
|
||
- low: No files or cannot verify
|
||
|
||
3. **Evidence:**
|
||
- What did you find that proves/disproves completion?
|
||
- Specific line numbers or code snippets
|
||
- Test results if applicable
|
||
|
||
4. **Issues (if any):**
|
||
- Stub code or TODOs
|
||
- Missing error handling
|
||
- No multi-tenant isolation (dealerId filters)
|
||
- Security vulnerabilities
|
||
- Missing tests
|
||
|
||
## Response Format (JSON)
|
||
|
||
{{
|
||
"actually_complete": true/false,
|
||
"confidence": "very_high|high|medium|low",
|
||
"evidence": "Detailed explanation of what you found",
|
||
"issues_found": ["issue 1", "issue 2"],
|
||
"recommendation": "What needs to be done (if incomplete)"
|
||
}}
|
||
|
||
**Be objective. If code is a stub with TODOs, it's NOT complete even if files exist.**
|
||
"""
|
||
return prompt
|
||
|
||
    def _parse_claude_response(self, response_text: str) -> Dict:
        """Parse Claude's JSON response"""
        try:
            # Extract JSON from response (may be wrapped in markdown)
            json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
            if json_match:
                return json.loads(json_match.group(0))
            else:
                # Fallback: crude keyword check on the raw text
                return {
                    'actually_complete': 'complete' in response_text.lower() and 'not complete' not in response_text.lower(),
                    'confidence': 'low',
                    'evidence': response_text[:500],
                    'issues_found': [],
                }
        except Exception:
            return {
                'actually_complete': False,
                'confidence': 'low',
                'evidence': 'Failed to parse response',
                'issues_found': ['Parse error'],
            }

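    # Example (illustrative): a fence-wrapped reply is still captured by the brace regex.
    #   _parse_claude_response('```json\n{"actually_complete": true, "confidence": "high"}\n```')
    #   -> {'actually_complete': True, 'confidence': 'high'}
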
    def _extract_file_references(self, task_text: str) -> List[str]:
        """Extract file paths from task text"""
        paths = []

        # Common patterns
        patterns = [
            r'\b[\w-]+(?:/[\w.-]+)+\.\w+',  # Explicit paths (keeps multi-dot names like user.service.ts)
            r'\b([A-Z][\w-]+\.(ts|tsx|service|controller|repository))',  # Bare file names
        ]

        for pattern in patterns:
            matches = re.findall(pattern, task_text)
            if matches and isinstance(matches[0], tuple):
                # Grouped patterns return tuples; keep the full-match group
                paths.extend(m[0] for m in matches)
            else:
                paths.extend(matches)

        return list(set(paths))[:5]  # Max 5 files per task

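    # Example (illustrative input):
    #   _extract_file_references("Implement UserService in src/services/user.service.ts")
    #   -> ['src/services/user.service.ts']
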
    def _read_file(self, file_ref: str) -> Optional[str]:
        """Find and read file from repository"""
        # Try exact path
        if (self.repo_root / file_ref).exists():
            try:
                return (self.repo_root / file_ref).read_text()[:5000]  # Max 5K chars
            except Exception:
                return None

        # Search for file by name
        try:
            result = subprocess.run(
                ['find', '.', '-name', Path(file_ref).name, '-type', 'f'],
                capture_output=True,
                text=True,
                cwd=self.repo_root,
                timeout=5
            )

            if result.stdout.strip():
                file_path = result.stdout.strip().split('\n')[0]
                return Path(file_path).read_text()[:5000]
        except Exception:
            pass

        return None
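
    # A portable alternative to shelling out to `find` (a sketch; rglob has no
    # timeout, so it may be slow on very large repos):
    #   matches = list(self.repo_root.rglob(Path(file_ref).name))
    #   if matches:
    #       return matches[0].read_text()[:5000]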


def verify_story_with_llm(story_file_path: str) -> Dict:
    """
    Verify an entire story using the LLM for each task.

    Cost: ~$0.19 per story with Haiku (50 tasks × 3K tokens/task × $1.25/1M)
    Time: ~2-3 minutes per story
    """
    verifier = LLMTaskVerifier()
    story_path = Path(story_file_path)

    if not story_path.exists():
        return {'error': 'Story file not found'}

    content = story_path.read_text()

    # Extract story context
    story_id = story_path.stem
    epic_match = re.search(r'Epic:\*?\*?\s*(\w+)', content, re.IGNORECASE)
    epic = epic_match.group(1) if epic_match else 'Unknown'

    # Extract files from Dev Agent Record
    file_list_match = re.search(r'### File List\n\n(.+?)###', content, re.DOTALL)
    files = []
    if file_list_match:
        file_section = file_list_match.group(1)
        files = re.findall(r'[\w/-]+\.[\w]+', file_section)

    story_context = {
        'story_id': story_id,
        'epic': epic,
        'files': files
    }

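    # The checklist regex below matches GitHub-style task items, e.g. (illustrative):
    #   - [x] Implement UserService CRUD endpoints
    #   - [ ] Add integration tests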
    # Extract all tasks
    task_pattern = r'^-\s*\[([ xX])\]\s*(.+)$'
    tasks = re.findall(task_pattern, content, re.MULTILINE)

    if not tasks:
        return {'error': 'No tasks found'}

    # Verify each task with LLM
    print(f"\n🔍 Verifying {len(tasks)} tasks with Claude...", file=sys.stderr)

    task_results = []
    for idx, (checkbox, task_text) in enumerate(tasks):
        is_checked = checkbox.lower() == 'x'

        print(f" {idx+1}/{len(tasks)}: {task_text[:60]}...", file=sys.stderr)

        result = verifier.verify_task(task_text, is_checked, story_context)
        task_results.append(result)

    # Calculate summary
    total = len(task_results)
    correct = sum(1 for r in task_results if r.get('verification_status') == 'correct')
    false_positives = sum(1 for r in task_results if r.get('verification_status') == 'false_positive')
    false_negatives = sum(1 for r in task_results if r.get('verification_status') == 'false_negative')

    return {
        'story_id': story_id,
        'total_tasks': total,
        'correct': correct,
        'false_positives': false_positives,
        'false_negatives': false_negatives,
        'verification_score': round((correct / total * 100), 1) if total > 0 else 0,
        'task_results': task_results
    }


if __name__ == '__main__':
    if len(sys.argv) < 2:
        print("Usage: llm-task-verifier.py <story-file> [--json]")
        sys.exit(1)

    results = verify_story_with_llm(sys.argv[1])

    if 'error' in results:
        print(f"❌ {results['error']}")
        sys.exit(1)

    # Print summary
    print(f"\n📊 Story: {results['story_id']}")
    print(f"Verification Score: {results['verification_score']}/100")
    print(f"✅ Correct: {results['correct']}")
    print(f"❌ False Positives: {results['false_positives']}")
    print(f"⚠️ False Negatives: {results['false_negatives']}")

    # Show false positives
    if results['false_positives'] > 0:
        print(f"\n❌ FALSE POSITIVES (claimed done but not implemented):")
        for task in results['task_results']:
            if task.get('verification_status') == 'false_positive':
                print(f" - {task['task'][:80]}")
                print(f"   {task.get('evidence', 'No evidence')}")

    # Also emit machine-readable JSON if requested
    if '--json' in sys.argv:
        print(json.dumps(results, indent=2))