#!/usr/bin/env python3
"""
Task Verification Engine - Verify story task checkboxes match ACTUAL CODE

Purpose: Prevent false positives where tasks are checked but code doesn't exist
Method: Parse task text, infer what files/functions should exist, verify in codebase
Created: 2026-01-02
Part of: Comprehensive validation solution
"""

import re
import subprocess
from pathlib import Path
from typing import Dict, List, Optional


class TaskVerificationEngine:
    """Verifies that checked tasks correspond to actual code in the repository"""

    def __init__(self, repo_root: Path = Path(".")):
        self.repo_root = repo_root

    def verify_task(self, task_text: str, is_checked: bool) -> Dict:
        """
        Verify a single task against codebase reality

        DEEP VERIFICATION - Not just file existence, but:
        - Files exist AND have real implementation (not stubs)
        - Tests exist AND are passing
        - No TODO/FIXME comments in implementation
        - Code has actual logic (not empty classes)

        Returns:
            {
                'task': task_text,
                'is_checked': bool,
                'should_be_checked': bool,
                'confidence': 'very high'|'high'|'medium'|'low',
                'evidence': [list of evidence],
                'verification_status': 'correct'|'false_positive'|'false_negative'|'uncertain'
            }
        """
        # Extract potential file paths from task text
        file_refs = self._extract_file_references(task_text)

        # Extract class/function names
        code_refs = self._extract_code_references(task_text)

        # Extract test requirements
        test_refs = self._extract_test_references(task_text)

        # Verify file existence AND implementation quality
        files_exist = []
        files_missing = []
        for file_ref in file_refs:
            if self._file_exists(file_ref):
                # DEEP CHECK: Is it really implemented or just a stub?
                if self._verify_real_implementation(file_ref, None):
                    files_exist.append(file_ref)
                else:
                    files_missing.append(f"{file_ref} (stub/TODO)")
            else:
                files_missing.append(file_ref)

        # Verify code existence AND implementation
        code_found = []
        code_missing = []
        for code_ref in code_refs:
            if self._code_exists(code_ref):
                code_found.append(code_ref)
            else:
                code_missing.append(code_ref)

        # Verify tests exist AND pass
        tests_passing = []
        tests_failing_or_missing = []
        for test_ref in test_refs:
            test_status = self._verify_test_exists_and_passes(test_ref)
            if test_status == 'passing':
                tests_passing.append(test_ref)
            else:
                tests_failing_or_missing.append(f"{test_ref} ({test_status})")

        # Build evidence with DEEP verification
        evidence = []
        confidence = 'low'
        should_be_checked = False

        # STRONGEST evidence: Tests exist AND pass
        if tests_passing:
            evidence.append(f"{len(tests_passing)} tests passing (VERIFIED)")
            confidence = 'very high'
            should_be_checked = True

        # Strong evidence: Files exist with real implementation
        if files_exist and not files_missing:
            evidence.append(f"All {len(files_exist)} files exist with real code (no stubs)")
            if confidence != 'very high':
                confidence = 'high'
            should_be_checked = True

        # Strong evidence: Code found with implementation
        if code_found and not code_missing:
            evidence.append(f"All {len(code_found)} code elements implemented")
            if confidence == 'low':
                confidence = 'high'
            should_be_checked = True

        # NEGATIVE evidence: Tests missing or failing
        if tests_failing_or_missing:
            evidence.append(f"{len(tests_failing_or_missing)} tests missing/failing")
            # Even if files exist, no passing tests = NOT done
            should_be_checked = False
            confidence = 'medium'

        # NEGATIVE evidence: Mixed results
        if files_exist and files_missing:
            evidence.append(f"{len(files_exist)} files OK, {len(files_missing)} missing/stubs")
            confidence = 'medium'
            should_be_checked = False  # Incomplete

        # Strong evidence of incompletion
        if not files_exist and files_missing:
            evidence.append(f"All {len(files_missing)} files missing or stubs")
            confidence = 'high'
            should_be_checked = False

        if not code_found and code_missing:
            evidence.append(f"Code not found: {', '.join(code_missing[:3])}")
            confidence = 'medium'
            should_be_checked = False

        # No file/code/test references - use heuristics
        if not file_refs and not code_refs and not test_refs:
            # Check for action keywords
            if self._has_completion_keywords(task_text):
                evidence.append("Research/analysis task (no code artifacts)")
                confidence = 'low'
                # Can't verify - trust the checkbox
                should_be_checked = is_checked
            else:
                evidence.append("No verifiable references")
                confidence = 'low'
                should_be_checked = is_checked

        # Determine verification status
        if is_checked == should_be_checked:
            verification_status = 'correct'
        elif is_checked and not should_be_checked:
            verification_status = 'false_positive'  # Checked but code missing
        elif not is_checked and should_be_checked:
            verification_status = 'false_negative'  # Unchecked but code exists
        else:
            verification_status = 'uncertain'

        return {
            'task': task_text,
            'is_checked': is_checked,
            'should_be_checked': should_be_checked,
            'confidence': confidence,
            'evidence': evidence,
            'verification_status': verification_status,
            'files_exist': files_exist,
            'files_missing': files_missing,
            'code_found': code_found,
            'code_missing': code_missing,
        }

    def _extract_file_references(self, task_text: str) -> List[str]:
        """Extract file path references from task text"""
        paths = []

        # Pattern 1: Explicit paths (src/foo/bar.ts)
        explicit_paths = re.findall(r'[\w/-]+/[\w-]+\.[\w]+', task_text)
        paths.extend(explicit_paths)

        # Pattern 2: "Create Foo.ts" or "Implement Bar.service.ts"
        file_mentions = re.findall(r'\b([A-Z][\w.-]+\.(ts|tsx|js|jsx|py|md|yaml|json))\b', task_text)
        paths.extend([f[0] for f in file_mentions])

        # Pattern 3: "in components/Widget.tsx"
        contextual = re.findall(r'in\s+([\w/-]+\.[\w]+)', task_text, re.IGNORECASE)
        paths.extend(contextual)

        return list(set(paths))  # Deduplicate

    def _extract_code_references(self, task_text: str) -> List[str]:
        """Extract class/function/interface names from task text"""
        code_refs = []

        # Pattern 1: "Create FooService class"
        class_patterns = re.findall(
            r'(?:Create|Implement|Add)\s+(\w+(?:Service|Controller|Repository|Component|Interface|Type))',
            task_text, re.IGNORECASE
        )
        code_refs.extend(class_patterns)

        # Pattern 2: "Implement getFoo method"
        method_patterns = re.findall(
            r'(?:Implement|Add|Create)\s+(\w+)\s+(?:method|function)',
            task_text, re.IGNORECASE
        )
        code_refs.extend(method_patterns)

        # Pattern 3: Camel/PascalCase references
        camelcase = re.findall(r'\b([A-Z][a-z]+(?:[A-Z][a-z]+)+)\b', task_text)
        code_refs.extend(camelcase)

        return list(set(code_refs))
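
    # Illustrative example (assumed task phrasing, not a guaranteed format):
    # a task line such as
    #     "Create OrderService class in src/services/orders.ts"
    # yields a file reference ('src/services/orders.ts') from
    # _extract_file_references and a code reference ('OrderService') from
    # _extract_code_references; the _file_exists/_code_exists helpers below
    # then look for those artifacts in the repository.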

    def _file_exists(self, file_path: str) -> bool:
        """Check if file exists in repository"""
        # Try exact path first
        if (self.repo_root / file_path).exists():
            return True

        # Try common locations
        search_dirs = [
            'apps/backend/',
            'apps/frontend/',
            'packages/',
            'src/',
            'infrastructure/',
        ]

        for search_dir in search_dirs:
            if (self.repo_root / search_dir).exists():
                # Use find command
                try:
                    result = subprocess.run(
                        ['find', search_dir, '-name', Path(file_path).name, '-type', 'f'],
                        capture_output=True,
                        text=True,
                        cwd=self.repo_root,
                        timeout=5
                    )
                    if result.returncode == 0 and result.stdout.strip():
                        return True
                except Exception:
                    pass

        return False

    def _code_exists(self, code_ref: str) -> bool:
        """Check if class/function/interface exists AND is actually implemented (not just a stub)"""
        try:
            # Search for class, interface, function, or type declaration
            patterns = [
                f'class {code_ref}',
                f'interface {code_ref}',
                f'function {code_ref}',
                f'export const {code_ref}',
                f'export function {code_ref}',
                f'type {code_ref}',
            ]

            for pattern in patterns:
                result = subprocess.run(
                    ['grep', '-r', '-l', pattern, '.',
                     '--include=*.ts', '--include=*.tsx', '--include=*.js'],
                    capture_output=True,
                    text=True,
                    cwd=self.repo_root,
                    timeout=10
                )
                if result.returncode == 0 and result.stdout.strip():
                    # Found the declaration - now verify it's not a stub
                    file_path = result.stdout.strip().split('\n')[0]
                    if self._verify_real_implementation(file_path, code_ref):
                        return True
        except Exception:
            pass

        return False

    def _verify_real_implementation(self, file_path: str, code_ref: Optional[str]) -> bool:
        """
        Verify code is REALLY implemented, not just a stub or TODO

        Checks for:
        - File has substantial code (not just empty class)
        - No TODO/FIXME comments near the code
        - Has actual methods/logic (not just interface)
        """
        try:
            full_path = self.repo_root / file_path
            if not full_path.exists():
                return False

            content = full_path.read_text()

            # Find the code reference; when no specific symbol is given
            # (file-level check), inspect the start of the file instead
            if code_ref:
                code_index = content.find(code_ref)
                if code_index == -1:
                    return False
            else:
                code_index = 0

            # Get 500 chars after the reference (the implementation)
            code_snippet = content[code_index:code_index + 500]

            # RED FLAGS - indicates stub/incomplete code
            red_flags = [
                'TODO',
                'FIXME',
                'throw new Error(\'Not implemented',
                'return null;',
                '// Placeholder',
                '// Stub',
                'return {};',
                'return [];',
                'return undefined;',
            ]

            for flag in red_flags:
                if flag in code_snippet:
                    return False  # Found stub/placeholder

            # GREEN FLAGS - indicates real implementation
            green_flags = [
                'return',   # Has return statements
                'this.',    # Uses instance members
                'await',    # Has async logic
                'if (',     # Has conditional logic
                'for (',    # Has loops
                'const ',   # Has variables
            ]

            green_count = sum(1 for flag in green_flags if flag in code_snippet)

            # Need at least 3 green flags for "real" implementation
            return green_count >= 3

        except Exception:
            return False
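
    # Illustrative example of the heuristic above (assumed TypeScript snippets):
    # a body like
    #     export class FooService { /* TODO: implement */ }
    # is rejected (it contains the 'TODO' red flag and hits fewer than 3 green
    # flags), whereas a method body containing e.g. 'const ', 'if (' and
    # 'return', with no red flags, is accepted as real code.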

    def _extract_test_references(self, task_text: str) -> List[str]:
        """Extract test file references from task text"""
        test_refs = []

        # Pattern 1: Explicit test files
        test_files = re.findall(r'([\w/-]+\.(?:spec|test)\.(?:ts|tsx|js))', task_text)
        test_refs.extend(test_files)

        # Pattern 2: "Write tests for X" or "Add test coverage"
        if re.search(r'\b(?:test|tests|testing|coverage)\b', task_text, re.IGNORECASE):
            # Extract potential test subjects
            subjects = re.findall(
                r'(?:for|to)\s+(\w+(?:Service|Controller|Component|Repository|Widget))',
                task_text
            )
            test_refs.extend([f"{subj}.spec.ts" for subj in subjects])

        return list(set(test_refs))

    def _verify_test_exists_and_passes(self, test_ref: str) -> str:
        """
        Verify test file exists AND tests are passing

        Returns: 'passing' | 'failing' | 'missing' | 'not_run' | 'timeout'
        """
        # Find test file
        if not self._file_exists(test_ref):
            return 'missing'

        # Try to run the test
        try:
            # Find the actual test file path
            result = subprocess.run(
                ['find', '.', '-name', Path(test_ref).name, '-type', 'f'],
                capture_output=True,
                text=True,
                cwd=self.repo_root,
                timeout=5
            )

            if not result.stdout.strip():
                return 'missing'

            test_file_path = result.stdout.strip().split('\n')[0]

            # Run the test (with timeout - don't hang)
            test_result = subprocess.run(
                ['pnpm', 'test', '--', test_file_path, '--run'],
                capture_output=True,
                text=True,
                cwd=self.repo_root,
                timeout=30  # 30 second timeout per test file
            )

            # Check output for pass/fail
            output = test_result.stdout + test_result.stderr
            if 'PASS' in output or 'passing' in output.lower():
                return 'passing'
            elif 'FAIL' in output or 'failing' in output.lower():
                return 'failing'
            else:
                return 'not_run'

        except subprocess.TimeoutExpired:
            return 'timeout'
        except Exception:
            return 'not_run'

    def _has_completion_keywords(self, task_text: str) -> bool:
        """Check if task has action-oriented keywords"""
        keywords = [
            'research', 'investigate', 'analyze', 'review', 'document',
            'plan', 'design', 'decide', 'choose', 'evaluate', 'assess'
        ]
        text_lower = task_text.lower()
        return any(keyword in text_lower for keyword in keywords)


def verify_story_tasks(story_file_path: str) -> Dict:
    """
    Verify all tasks in a story file

    Returns:
        {
            'total_tasks': int,
            'checked_tasks': int,
            'correct_checkboxes': int,
            'false_positives': int,   # Checked but code missing
            'false_negatives': int,   # Unchecked but code exists
            'uncertain': int,
            'verification_score': float,  # 0-100
            'task_details': [...],
        }
    """
    story_path = Path(story_file_path)
    if not story_path.exists():
        return {'error': 'Story file not found'}

    content = story_path.read_text()

    # Extract all tasks (- [ ] or - [x])
    task_pattern = r'^-\s*\[([ xX])\]\s*(.+)$'
    tasks = re.findall(task_pattern, content, re.MULTILINE)

    if not tasks:
        return {
            'total_tasks': 0,
            'error': 'No task list found in story file'
        }

    # Verify each task
    engine = TaskVerificationEngine(story_path.parent.parent)  # Go up to repo root
    task_verifications = []

    for checkbox, task_text in tasks:
        is_checked = checkbox.lower() == 'x'
        verification = engine.verify_task(task_text, is_checked)
        task_verifications.append(verification)

    # Calculate summary
    total_tasks = len(task_verifications)
    checked_tasks = sum(1 for v in task_verifications if v['is_checked'])
    correct = sum(1 for v in task_verifications if v['verification_status'] == 'correct')
    false_positives = sum(1 for v in task_verifications if v['verification_status'] == 'false_positive')
    false_negatives = sum(1 for v in task_verifications if v['verification_status'] == 'false_negative')
    uncertain = sum(1 for v in task_verifications if v['verification_status'] == 'uncertain')

    # Verification score: (correct / total) * 100
    verification_score = (correct / total_tasks * 100) if total_tasks > 0 else 0

    return {
        'total_tasks': total_tasks,
        'checked_tasks': checked_tasks,
        'correct_checkboxes': correct,
        'false_positives': false_positives,
        'false_negatives': false_negatives,
        'uncertain': uncertain,
        'verification_score': round(verification_score, 1),
        'task_details': task_verifications,
    }


def main():
    """CLI entry point"""
    import sys
    import json

    if len(sys.argv) < 2:
        print("Usage: task-verification-engine.py <story-file.md>", file=sys.stderr)
        sys.exit(1)

    story_file = sys.argv[1]
    results = verify_story_tasks(story_file)

    # Print summary
    print(f"\nšŸ“‹ Task Verification Report: {Path(story_file).name}")
    print("=" * 80)

    if 'error' in results:
        print(f"āŒ {results['error']}")
        sys.exit(1)

    print(f"Total tasks: {results['total_tasks']}")
    print(f"Checked: {results['checked_tasks']}")
    print(f"Verification score: {results['verification_score']}/100")
    print()
    print(f"āœ… Correct: {results['correct_checkboxes']}")
    print(f"āŒ False positives: {results['false_positives']} (checked but code missing)")
    print(f"āŒ False negatives: {results['false_negatives']} (unchecked but code exists)")
    print(f"ā” Uncertain: {results['uncertain']}")

    # Show false positives
    if results['false_positives'] > 0:
        print("\nāš ļø FALSE POSITIVES (checked but no evidence):")
        for task in results['task_details']:
            if task['verification_status'] == 'false_positive':
                print(f"  - {task['task'][:80]}")
                print(f"    Evidence: {', '.join(task['evidence'])}")

    # Show false negatives
    if results['false_negatives'] > 0:
        print("\nšŸ’” FALSE NEGATIVES (unchecked but code exists):")
        for task in results['task_details']:
            if task['verification_status'] == 'false_negative':
                print(f"  - {task['task'][:80]}")
                print(f"    Evidence: {', '.join(task['evidence'])}")

    # Output JSON for programmatic use
    if '--json' in sys.argv:
        print("\n" + json.dumps(results, indent=2))


if __name__ == '__main__':
    main()
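
# Example usage (illustrative; the story path below is an assumption, not part
# of this script):
#
#   A story file is expected to contain a markdown task list such as
#       - [x] Create OrderService class in src/services/orders.ts
#       - [ ] Write tests for OrderService
#
#   CLI:
#       python task-verification-engine.py docs/stories/story-1.md --json
#
#   Programmatic (assuming this file is importable, e.g. saved as
#   task_verification_engine.py):
#       from task_verification_engine import verify_story_tasks
#       results = verify_story_tasks("docs/stories/story-1.md")
#       if results.get("false_positives"):
#           print("Some checked tasks have no supporting code")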