#!/usr/bin/env python3
"""
LLM-Powered Task Verification - Use Claude Haiku to ACTUALLY verify code quality

Purpose: Don't guess with regex - have Claude READ the code and verify it's real
Method: For each task, read mentioned files, ask Claude "is this actually implemented?"
Created: 2026-01-02
Cost: ~$0.19 per story with Haiku (50 tasks × 3K tokens × $1.25/1M)
Full platform: 511 stories × ~$0.19 ≈ $96 total
"""

import json
import os
import re
import subprocess
import sys
from pathlib import Path
from typing import Dict, List, Optional

from anthropic import Anthropic


class LLMTaskVerifier:
    """Uses the Claude API to verify tasks by reading and analyzing actual code"""

    def __init__(self, api_key: Optional[str] = None):
        self.api_key = api_key or os.environ.get('ANTHROPIC_API_KEY')
        if not self.api_key:
            raise ValueError("ANTHROPIC_API_KEY required")

        self.client = Anthropic(api_key=self.api_key)
        self.model = 'claude-haiku-4-20250514'  # Fast + cheap for verification tasks
        self.repo_root = Path('.')

    def verify_task(self, task_text: str, is_checked: bool, story_context: Dict) -> Dict:
        """
        Use Claude to verify whether a task is actually complete.

        Args:
            task_text: The task description (e.g., "Implement UserService")
            is_checked: Whether the task is checked [x] or not [ ]
            story_context: Context about the story (files, epic, etc.)

        Returns:
            {
                'task': task_text,
                'is_checked': bool,
                'actually_complete': bool,
                'confidence': 'very_high' | 'high' | 'medium' | 'low',
                'evidence': str,
                'issues_found': [list of issues],
                'verification_status': 'correct' | 'false_positive' | 'false_negative'
            }
        """
        # Extract file references from the task text
        file_refs = self._extract_file_references(task_text)

        # Read the referenced files (limit to 5 per task)
        file_contents = {}
        for file_ref in file_refs[:5]:
            content = self._read_file(file_ref)
            if content:
                file_contents[file_ref] = content

        # If no files were found, fall back to the files listed in the story context
        if not file_contents and story_context.get('files'):
            for file_path in story_context['files'][:5]:
                content = self._read_file(file_path)
                if content:
                    file_contents[file_path] = content

        # Build the prompt for Claude
        prompt = self._build_verification_prompt(task_text, is_checked, file_contents, story_context)

        # Call the Claude API
        try:
            response = self.client.messages.create(
                model=self.model,
                max_tokens=2000,
                temperature=0,  # Deterministic
                messages=[{'role': 'user', 'content': prompt}]
            )

            # Parse the response
            result_text = response.content[0].text
            result = self._parse_claude_response(result_text)

            # Add metadata
            result['task'] = task_text
            result['is_checked'] = is_checked
            result['tokens_used'] = response.usage.input_tokens + response.usage.output_tokens

            # Determine verification status
            actually_complete = result.get('actually_complete', False)
            if is_checked == actually_complete:
                result['verification_status'] = 'correct'
            elif is_checked and not actually_complete:
                result['verification_status'] = 'false_positive'
            else:
                result['verification_status'] = 'false_negative'

            return result

        except Exception as e:
            return {
                'task': task_text,
                'error': str(e),
                'verification_status': 'error'
            }

    def _build_verification_prompt(self, task: str, is_checked: bool, files: Dict, context: Dict) -> str:
        """Build the prompt asking Claude to verify task completion"""
        if files:
            files_section = "\n\n## Files Provided\n\n"
            for file_path, content in files.items():
                files_section += f"### {file_path}\n```typescript\n{content[:2000]}\n```\n\n"
        else:
            files_section = "\n\n## Files Provided\n\nNone - task may not reference specific files.\n"

        prompt = f"""You are a code verification expert. Your job is to verify whether a task from a user story is actually complete.

## Task to Verify

**Task:** {task}
**Claimed Status:** {'[x] Complete' if is_checked else '[ ] Not complete'}

## Story Context

**Story:** {context.get('story_id', 'Unknown')}
**Epic:** {context.get('epic', 'Unknown')}
{files_section}
## Your Task

Analyze the files (if provided) and determine:

1. **Is the task actually complete?**
   - If files provided: Does the code actually implement what the task describes?
   - Is it real implementation or just stubs/TODOs?
   - Are there tests? Do they pass?

2. **Confidence level:**
   - very_high: Clear evidence (tests passing, full implementation)
   - high: Strong evidence (code exists with logic, no stubs)
   - medium: Some evidence but incomplete
   - low: No files or cannot verify

3. **Evidence:**
   - What did you find that proves/disproves completion?
   - Specific line numbers or code snippets
   - Test results if applicable

4. **Issues (if any):**
   - Stub code or TODOs
   - Missing error handling
   - No multi-tenant isolation (dealerId filters)
   - Security vulnerabilities
   - Missing tests

## Response Format (JSON)

{{
    "actually_complete": true/false,
    "confidence": "very_high|high|medium|low",
    "evidence": "Detailed explanation of what you found",
    "issues_found": ["issue 1", "issue 2"],
    "recommendation": "What needs to be done (if incomplete)"
}}

**Be objective. If code is a stub with TODOs, it's NOT complete even if files exist.**
"""
        return prompt

    def _parse_claude_response(self, response_text: str) -> Dict:
        """Parse Claude's JSON response"""
        try:
            # Extract JSON from the response (it may be wrapped in markdown)
            json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
            if json_match:
                return json.loads(json_match.group(0))
            # Fallback: infer completion from the plain text
            return {
                'actually_complete': ('complete' in response_text.lower()
                                      and 'not complete' not in response_text.lower()),
                'confidence': 'low',
                'evidence': response_text[:500],
                'issues_found': [],
            }
        except json.JSONDecodeError:
            return {
                'actually_complete': False,
                'confidence': 'low',
                'evidence': 'Failed to parse response',
                'issues_found': ['Parse error'],
            }

    def _extract_file_references(self, task_text: str) -> List[str]:
        """Extract file paths from task text"""
        paths = []

        # Common patterns. The first handles explicit paths, including
        # multi-dot names like user.service.ts; the second handles bare
        # file names ('tsx' is tried before 'ts' so .tsx files match fully).
        patterns = [
            r'[\w/-]+/[\w.-]+\.\w+',  # Explicit paths
            r'\b([A-Z][\w-]+\.(tsx|ts|service|controller|repository))\b',  # File names
        ]

        for pattern in patterns:
            matches = re.findall(pattern, task_text)
            if matches and isinstance(matches[0], tuple):
                # Grouped patterns yield tuples; keep the full match
                paths.extend([m[0] for m in matches])
            else:
                paths.extend(matches)

        return list(set(paths))[:5]  # Max 5 files per task

    def _read_file(self, file_ref: str) -> Optional[str]:
        """Find and read a file from the repository (first 5K chars), or None"""
        # Try the exact path first
        if (self.repo_root / file_ref).exists():
            try:
                return (self.repo_root / file_ref).read_text()[:5000]
            except (OSError, UnicodeDecodeError):
                return None

        # Fall back to searching the repository by file name
        try:
            result = subprocess.run(
                ['find', '.', '-name', Path(file_ref).name, '-type', 'f'],
                capture_output=True,
                text=True,
                cwd=self.repo_root,
                timeout=5
            )
            if result.stdout.strip():
                file_path = result.stdout.strip().split('\n')[0]
                return Path(file_path).read_text()[:5000]
        except (subprocess.SubprocessError, OSError, UnicodeDecodeError):
            pass

        return None

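# Illustrative sketch of the extraction helpers above. The task text is a
# hypothetical example, not taken from a real story file. Given:
#
#   task = "Mark UserService.ts done - see src/services/user.service.ts"
#
# LLMTaskVerifier._extract_file_references(task) returns, in no guaranteed
# order (results pass through set()):
#
#   ['UserService.ts', 'src/services/user.service.ts']
#
# _read_file() then tries each reference as a path under repo_root and falls
# back to `find . -name <basename>` when the exact path does not exist.
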
def verify_story_with_llm(story_file_path: str) -> Dict:
    """
    Verify an entire story using the LLM for each task.

    Cost: ~$0.19 per story with Haiku (50 tasks × 3K tokens/task × $1.25/1M)
    Time: ~2-3 minutes per story
    """
    verifier = LLMTaskVerifier()

    story_path = Path(story_file_path)
    if not story_path.exists():
        return {'error': 'Story file not found'}

    content = story_path.read_text()

    # Extract story context
    story_id = story_path.stem
    epic_match = re.search(r'Epic:\*?\*?\s*(\w+)', content, re.IGNORECASE)
    epic = epic_match.group(1) if epic_match else 'Unknown'

    # Extract files from the Dev Agent Record
    file_list_match = re.search(r'### File List\n\n(.+?)###', content, re.DOTALL)
    files = []
    if file_list_match:
        file_section = file_list_match.group(1)
        files = re.findall(r'[\w/-]+\.[\w]+', file_section)

    story_context = {
        'story_id': story_id,
        'epic': epic,
        'files': files
    }

    # Extract all tasks
    task_pattern = r'^-\s*\[([ xX])\]\s*(.+)$'
    tasks = re.findall(task_pattern, content, re.MULTILINE)

    if not tasks:
        return {'error': 'No tasks found'}

    # Verify each task with the LLM
    print(f"\n🔍 Verifying {len(tasks)} tasks with Claude...", file=sys.stderr)

    task_results = []
    for idx, (checkbox, task_text) in enumerate(tasks):
        is_checked = checkbox.lower() == 'x'
        print(f"  {idx + 1}/{len(tasks)}: {task_text[:60]}...", file=sys.stderr)

        result = verifier.verify_task(task_text, is_checked, story_context)
        task_results.append(result)

    # Calculate summary
    total = len(task_results)
    correct = sum(1 for r in task_results if r.get('verification_status') == 'correct')
    false_positives = sum(1 for r in task_results if r.get('verification_status') == 'false_positive')
    false_negatives = sum(1 for r in task_results if r.get('verification_status') == 'false_negative')

    return {
        'story_id': story_id,
        'total_tasks': total,
        'correct': correct,
        'false_positives': false_positives,
        'false_negatives': false_negatives,
        'verification_score': round((correct / total * 100), 1) if total > 0 else 0,
        'task_results': task_results
    }


if __name__ == '__main__':
    if len(sys.argv) < 2:
        print("Usage: llm-task-verifier.py <story-file.md> [--json]")
        sys.exit(1)

    results = verify_story_with_llm(sys.argv[1])

    if 'error' in results:
        print(f"❌ {results['error']}")
        sys.exit(1)

    # Print summary
    print(f"\n📊 Story: {results['story_id']}")
    print(f"Verification Score: {results['verification_score']}/100")
    print(f"✅ Correct: {results['correct']}")
    print(f"❌ False Positives: {results['false_positives']}")
    print(f"⚠️ False Negatives: {results['false_negatives']}")

    # Show false positives
    if results['false_positives'] > 0:
        print("\n❌ FALSE POSITIVES (claimed done but not implemented):")
        for task in results['task_results']:
            if task.get('verification_status') == 'false_positive':
                print(f"  - {task['task'][:80]}")
                print(f"    {task.get('evidence', 'No evidence')}")

    # Output full JSON if requested
    if '--json' in sys.argv:
        print(json.dumps(results, indent=2))
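
# Illustrative usage (a sketch; the story path and task text below are
# hypothetical examples, not real project files):
#
#   $ export ANTHROPIC_API_KEY=sk-ant-...
#   $ python llm-task-verifier.py docs/stories/story-1.2.md --json
#
# Or, calling the verifier programmatically for a single task:
#
#   verifier = LLMTaskVerifier()
#   result = verifier.verify_task(
#       task_text='Implement UserService with dealerId filtering',
#       is_checked=True,
#       story_context={'story_id': 'story-1.2', 'epic': 'auth', 'files': []},
#   )
#   print(result['verification_status'])  # 'correct', 'false_positive', or 'error'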