# Semantic Search Engine

## Advanced Semantic Search and Knowledge Retrieval for Enhanced BMAD System

The Semantic Search Engine provides intelligent, context-aware search capabilities across all knowledge domains, using advanced vector embeddings, semantic understanding, and multi-modal search techniques.

### Semantic Search Architecture

#### Multi-Modal Search Framework

```yaml
semantic_search_architecture:
  search_modalities:
    text_search:
      - natural_language_queries: "Find authentication patterns for microservices"
      - code_search: "Search for functions similar to getUserProfile()"
      - concept_search: "Search for concepts related to caching strategies"
      - intent_search: "Search by development intent and goals"

    code_search:
      - semantic_code_search: "Find semantically similar code blocks"
      - structural_search: "Search by code structure and patterns"
      - functional_search: "Search by function signature and behavior"
      - ast_pattern_search: "Search by abstract syntax tree patterns"

    visual_search:
      - diagram_search: "Search architectural diagrams and flowcharts"
      - ui_mockup_search: "Search UI designs and wireframes"
      - chart_search: "Search data visualizations and metrics"
      - code_visualization_search: "Search code structure visualizations"

    contextual_search:
      - project_context_search: "Search within specific project contexts"
      - temporal_search: "Search by time periods and development phases"
      - team_context_search: "Search by team activities and contributions"
      - domain_context_search: "Search within specific technical domains"

  embedding_models:
    text_embeddings:
      - transformer_models: "BERT, RoBERTa, T5 for natural language"
      - domain_specific: "SciBERT for technical documentation"
      - multilingual: "mBERT for multiple languages"
      - instruction_tuned: "Instruction-following models"

    code_embeddings:
      - codebert: "Microsoft CodeBERT for code understanding"
      - graphcodebert: "Graph-based code representation"
      - codet5: "Code-text dual encoder"
      - unixcoder: "Unified cross-modal code representation"

    multimodal_embeddings:
      - clip_variants: "CLIP for text-image understanding"
      - code_clip: "Code-diagram understanding"
      - technical_clip: "Technical document understanding"
      - architectural_embeddings: "Architecture diagram understanding"

  search_strategies:
    similarity_search:
      - cosine_similarity: "Vector cosine similarity matching"
      - euclidean_distance: "L2 distance for vector proximity"
      - dot_product: "Inner product similarity"
      - learned_similarity: "Neural similarity functions"

    hybrid_search:
      - dense_sparse_fusion: "Combine vector and keyword search"
      - multi_vector_search: "Multiple embedding spaces"
      - cross_modal_search: "Search across different modalities"
      - contextual_reranking: "Context-aware result reranking"

    graph_search:
      - knowledge_graph_traversal: "Search through graph relationships"
      - semantic_path_finding: "Find semantic paths between concepts"
      - graph_embedding_search: "Node2Vec and Graph2Vec search"
      - community_detection_search: "Search within knowledge communities"
```
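The `graph_search` strategies listed above are only referenced in the implementation that follows (a `GraphSearch` helper is instantiated but never shown), so the sketch below illustrates the `semantic_path_finding` idea with `networkx`. The graph contents, the edge `weight` attribute, and the function name are illustrative assumptions, not part of the BMAD codebase.

```python
# Minimal sketch of the semantic_path_finding strategy (illustrative only).
# Assumes the knowledge base is a networkx graph whose edges carry a
# 'weight' attribute, where lower weight means a stronger semantic link.
import networkx as nx


def semantic_path_finding(knowledge_graph, source, target):
    """Return the strongest semantic path between two concepts, or None."""
    try:
        return nx.shortest_path(knowledge_graph, source, target, weight='weight')
    except nx.NetworkXNoPath:
        return None


# Toy example
kg = nx.Graph()
kg.add_edge('Auth', 'Session', weight=0.2)
kg.add_edge('Session', 'Database', weight=0.4)
print(semantic_path_finding(kg, 'Auth', 'Database'))  # ['Auth', 'Session', 'Database']
```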
#### Advanced Search Engine Implementation

```python
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
import spacy
from collections import defaultdict
from datetime import datetime
import asyncio


class SemanticSearchEngine:
    """
    Advanced semantic search engine for multi-modal knowledge retrieval
    """

    def __init__(self):
        # Initialize embedding models
        self.text_encoder = SentenceTransformer('all-MiniLM-L6-v2')
        self.code_encoder = AutoModel.from_pretrained('microsoft/codebert-base')
        self.code_tokenizer = AutoTokenizer.from_pretrained('microsoft/codebert-base')

        # NLP pipeline for query analysis (entity extraction); assumes an
        # English spaCy model such as en_core_web_sm is installed
        self.nlp = spacy.load('en_core_web_sm')

        # Initialize search indices
        self.text_index = None
        self.code_index = None
        self.multimodal_index = None
        self.graph_index = None

        # Initialize search strategies
        self.search_strategies = {
            'semantic_similarity': SemanticSimilaritySearch(),
            'hybrid_search': HybridSearch(),
            'graph_search': GraphSearch(),
            'contextual_search': ContextualSearch()
        }

        # Search result cache
        self.search_cache = {}
        self.cache_ttl = 3600  # 1 hour

    async def initialize_search_indices(self, knowledge_base):
        """
        Initialize all search indices from knowledge base
        """
        initialization_results = {
            'text_index': await self.build_text_index(knowledge_base),
            'code_index': await self.build_code_index(knowledge_base),
            'multimodal_index': await self.build_multimodal_index(knowledge_base),
            'graph_index': await self.build_graph_index(knowledge_base)
        }

        return initialization_results

    async def build_text_index(self, knowledge_base):
        """
        Build FAISS index for text-based semantic search
        """
        text_documents = []
        document_metadata = []

        # Extract text content from knowledge base
        for node_id, node_data in knowledge_base.nodes(data=True):
            if 'text_content' in node_data:
                text_documents.append(node_data['text_content'])
                document_metadata.append({
                    'node_id': node_id,
                    'type': node_data.get('type', 'unknown'),
                    'domain': node_data.get('domain', 'general'),
                    'timestamp': node_data.get('timestamp'),
                    'importance': node_data.get('importance_score', 1.0)
                })

        # Generate embeddings and L2-normalize them so that the inner-product
        # index below scores cosine similarity
        text_embeddings = self.text_encoder.encode(text_documents).astype('float32')
        faiss.normalize_L2(text_embeddings)

        # Build FAISS index
        dimension = text_embeddings.shape[1]
        self.text_index = faiss.IndexFlatIP(dimension)  # Inner product for similarity
        self.text_index.add(text_embeddings)

        # Store metadata
        self.text_metadata = document_metadata

        return {
            'index_type': 'text',
            'documents_indexed': len(text_documents),
            'embedding_dimension': dimension,
            'index_size_mb': self.text_index.ntotal * dimension * 4 / 1024 / 1024
        }

    async def build_code_index(self, knowledge_base):
        """
        Build specialized index for code-based semantic search
        """
        code_documents = []
        code_metadata = []

        # Extract code content from knowledge base
        for node_id, node_data in knowledge_base.nodes(data=True):
            if 'code_content' in node_data:
                code_documents.append(node_data['code_content'])
                code_metadata.append({
                    'node_id': node_id,
                    'language': node_data.get('language', 'unknown'),
                    'file_path': node_data.get('file_path'),
                    'function_name': node_data.get('function_name'),
                    'class_name': node_data.get('class_name'),
                    'complexity': node_data.get('complexity_score', 1.0)
                })

        # Generate code embeddings using CodeBERT
        code_embeddings = []
        for code in code_documents:
            embedding = await self.generate_code_embedding(code)
            code_embeddings.append(embedding)

        dimension = 0
        if code_embeddings:
            code_embeddings = np.array(code_embeddings)

            # Build FAISS index for code
            dimension = code_embeddings.shape[1]
            self.code_index = faiss.IndexFlatIP(dimension)
            self.code_index.add(code_embeddings.astype('float32'))

        # Store metadata
        self.code_metadata = code_metadata

        return {
            'index_type': 'code',
            'documents_indexed': len(code_documents),
            'embedding_dimension': dimension,
            'languages_indexed': set(meta['language'] for meta in code_metadata)
        }

    async def generate_code_embedding(self, code_content):
        """
        Generate embeddings for code using CodeBERT
        """
        # Tokenize code
        tokens = self.code_tokenizer(
            code_content,
            return_tensors="pt",
            truncation=True,
            max_length=512,
            padding=True
        )

        # Generate embeddings
        with torch.no_grad():
            outputs = self.code_encoder(**tokens)

        # Use mean pooling of last hidden state
        embedding = outputs.last_hidden_state.mean(dim=1).squeeze()

        # L2-normalize so inner-product scores behave as cosine similarity
        vector = embedding.numpy()
        return vector / (np.linalg.norm(vector) + 1e-12)

    async def semantic_search(self, query, search_config=None):
        """
        Perform advanced semantic search across all knowledge modalities
        """
        if search_config is None:
            search_config = {
                'modalities': ['text', 'code', 'multimodal'],
                'max_results': 10,
                'similarity_threshold': 0.7,
                'context_filters': {},
                'rerank_results': True
            }

        search_session = {
            'query': query,
            'search_config': search_config,
            'modality_results': {},
            'fused_results': [],
            'search_metadata': {}
        }

        # Analyze query to determine optimal search strategy
        query_analysis = await self.analyze_search_query(query)
        search_session['query_analysis'] = query_analysis

        # Execute searches across modalities
        search_tasks = []

        if 'text' in search_config['modalities']:
            search_tasks.append(self.search_text_modality(query, search_config))

        if 'code' in search_config['modalities']:
            search_tasks.append(self.search_code_modality(query, search_config))

        if 'multimodal' in search_config['modalities']:
            search_tasks.append(self.search_multimodal_content(query, search_config))

        if 'graph' in search_config['modalities']:
            search_tasks.append(self.search_graph_relationships(query, search_config))

        # Execute searches in parallel
        modality_results = await asyncio.gather(*search_tasks)

        # Combine and fuse results
        fused_results = await self.fuse_search_results(
            modality_results, query_analysis, search_config
        )

        # Apply contextual filtering
        filtered_results = await self.apply_contextual_filters(
            fused_results, search_config.get('context_filters', {})
        )

        # Rerank results if requested
        if search_config.get('rerank_results', True):
            final_results = await self.rerank_search_results(
                filtered_results, query, query_analysis
            )
        else:
            final_results = filtered_results

        search_session.update({
            'modality_results': {f'modality_{i}': result for i, result in enumerate(modality_results)},
            'fused_results': fused_results,
            'final_results': final_results[:search_config['max_results']],
            'search_metadata': {
                'total_results_before_filtering': len(fused_results),
                'total_results_after_filtering': len(filtered_results),
                'final_result_count': len(final_results[:search_config['max_results']]),
                'search_time': datetime.utcnow()
            }
        })

        return search_session

    async def search_text_modality(self, query, search_config):
        """
        Search text content using semantic embeddings
        """
        if self.text_index is None:
            return {'results': [], 'modality': 'text', 'error': 'Text index not initialized'}

        # Generate query embedding (normalized to match the index)
        query_embedding = self.text_encoder.encode([query]).astype('float32')
        faiss.normalize_L2(query_embedding)

        # Search in FAISS index
        similarities, indices = self.text_index.search(
            query_embedding,
            min(search_config.get('max_results', 10) * 2, self.text_index.ntotal)
        )

        # Build results with metadata
        results = []
        for similarity, idx in zip(similarities[0], indices[0]):
            if similarity >= search_config.get('similarity_threshold', 0.7):
                result = {
                    'content_id': self.text_metadata[idx]['node_id'],
                    'similarity_score': float(similarity),
                    'content_type': 'text',
                    'metadata': self.text_metadata[idx],
                    'modality': 'text'
                }
                results.append(result)

        return {
            'results': results,
            'modality': 'text',
            'search_method': 'semantic_embedding',
            'total_candidates': len(indices[0])
        }

    async def search_code_modality(self, query, search_config):
        """
        Search code content using specialized code embeddings
        """
        if self.code_index is None:
            return {
                'results': [],
                'modality': 'code',
                'error': 'Code index not initialized'
            }

        # Generate query embedding for code search
        query_embedding = await self.generate_code_embedding(query)

        # Search in code FAISS index
        similarities, indices = self.code_index.search(
            query_embedding.reshape(1, -1).astype('float32'),
            min(search_config.get('max_results', 10) * 2, self.code_index.ntotal)
        )

        # Build results with metadata
        results = []
        for similarity, idx in zip(similarities[0], indices[0]):
            if similarity >= search_config.get('similarity_threshold', 0.7):
                result = {
                    'content_id': self.code_metadata[idx]['node_id'],
                    'similarity_score': float(similarity),
                    'content_type': 'code',
                    'metadata': self.code_metadata[idx],
                    'modality': 'code'
                }
                results.append(result)

        return {
            'results': results,
            'modality': 'code',
            'search_method': 'code_semantic_embedding',
            'total_candidates': len(indices[0])
        }

    async def analyze_search_query(self, query):
        """
        Analyze search query to determine optimal search strategy
        """
        query_analysis = {
            'query_type': 'general',
            'intent': 'information_retrieval',
            'complexity': 'medium',
            'domains': [],
            'entities': [],
            'temporal_indicators': [],
            'code_indicators': []
        }

        # Analyze query characteristics
        query_lower = query.lower()

        # Detect query type
        if any(keyword in query_lower for keyword in ['function', 'method', 'class', 'code']):
            query_analysis['query_type'] = 'code'
        elif any(keyword in query_lower for keyword in ['pattern', 'architecture', 'design']):
            query_analysis['query_type'] = 'architectural'
        elif any(keyword in query_lower for keyword in ['how to', 'implement', 'create']):
            query_analysis['query_type'] = 'procedural'
        elif any(keyword in query_lower for keyword in ['similar', 'like', 'related']):
            query_analysis['query_type'] = 'similarity'

        # Detect intent
        if any(keyword in query_lower for keyword in ['find', 'search', 'show']):
            query_analysis['intent'] = 'information_retrieval'
        elif any(keyword in query_lower for keyword in ['compare', 'difference', 'versus']):
            query_analysis['intent'] = 'comparison'
        elif any(keyword in query_lower for keyword in ['recommend', 'suggest', 'best']):
            query_analysis['intent'] = 'recommendation'
        elif any(keyword in query_lower for keyword in ['explain', 'understand', 'learn']):
            query_analysis['intent'] = 'explanation'

        # Extract entities using NLP
        doc = self.nlp(query)
        query_analysis['entities'] = [ent.text for ent in doc.ents]

        # Detect temporal indicators
        temporal_keywords = ['recent', 'latest', 'old', 'previous', 'current', 'new']
        query_analysis['temporal_indicators'] = [word for word in temporal_keywords if word in query_lower]

        # Detect code indicators (lower-cased to match query_lower)
        code_keywords = ['function', 'method', 'class', 'variable', 'api', 'library', 'framework']
        query_analysis['code_indicators'] = [word for word in code_keywords if word in query_lower]

        return query_analysis

    async def fuse_search_results(self, modality_results, query_analysis, search_config):
        """
        Fuse results from different search modalities
        """
        all_results = []

        # Collect all results
        for modality_result in modality_results:
            if 'results' in modality_result:
                all_results.extend(modality_result['results'])

        # Remove duplicates based on content_id
        seen_ids = set()
        unique_results = []
        for result in all_results:
            if result['content_id'] not in seen_ids:
                unique_results.append(result)
                seen_ids.add(result['content_id'])

        # Apply fusion scoring
        fused_results = []
        for result in unique_results:
            # Calculate fusion score
            fusion_score = await self.calculate_fusion_score(
                result, query_analysis, search_config
            )
            result['fusion_score'] = fusion_score
            fused_results.append(result)

        # Sort by fusion score
        fused_results.sort(key=lambda x: x['fusion_score'], reverse=True)

        return fused_results

    async def calculate_fusion_score(self, result, query_analysis, search_config):
        """
        Calculate fusion score combining multiple factors
        """
        base_similarity = result['similarity_score']

        # Modality bonus based on query type
        modality_bonus = 0.0
        if query_analysis['query_type'] == 'code' and result['modality'] == 'code':
            modality_bonus = 0.2
        elif query_analysis['query_type'] == 'architectural' and result['modality'] == 'text':
            modality_bonus = 0.1

        # Recency bonus
        recency_bonus = 0.0
        if 'timestamp' in result['metadata'] and result['metadata']['timestamp']:
            days_old = (datetime.utcnow() - datetime.fromisoformat(result['metadata']['timestamp'])).days
            recency_bonus = max(0, 0.1 - (days_old / 365) * 0.1)  # Decay over time

        # Importance bonus
        importance_bonus = result['metadata'].get('importance', 1.0) * 0.05

        # Calculate final fusion score
        fusion_score = base_similarity + modality_bonus + recency_bonus + importance_bonus

        return min(fusion_score, 1.0)  # Cap at 1.0
```
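The class above also references helpers that are not shown in this section (`build_multimodal_index`, `build_graph_index`, `apply_contextual_filters`, `rerank_search_results`, and the strategy classes wired up in `__init__`). Assuming those pieces exist elsewhere in the BMAD system, a hypothetical usage sketch might look like the following; the toy knowledge base and query are illustrative only.

```python
# Hypothetical usage sketch for SemanticSearchEngine (illustrative only).
# Assumes the helper methods and strategy classes referenced above but not
# shown in this section are provided elsewhere in the BMAD system.
import asyncio
import networkx as nx


async def main():
    engine = SemanticSearchEngine()

    # Toy knowledge base: nodes carry 'text_content' or 'code_content'
    # attributes, as expected by the index builders above
    knowledge_base = nx.DiGraph()
    knowledge_base.add_node(
        'doc-1',
        text_content='JWT-based authentication pattern for microservices',
        type='documentation',
        domain='security'
    )
    knowledge_base.add_node(
        'code-1',
        code_content='def get_user_profile(user_id):\n    return db.find(user_id)',
        language='python'
    )

    # Build only the indices implemented in this section
    await engine.build_text_index(knowledge_base)
    await engine.build_code_index(knowledge_base)

    session = await engine.semantic_search(
        'authentication patterns for microservices',
        search_config={
            'modalities': ['text', 'code'],
            'max_results': 5,
            'similarity_threshold': 0.3,
            'context_filters': {},
            'rerank_results': False
        }
    )

    for result in session['final_results']:
        print(result['content_id'], round(result['fusion_score'], 3))


asyncio.run(main())
```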
#### Advanced Search Features

```python
class ContextualSearch:
    """
    Context-aware search that considers project, team, and temporal context
    """

    def __init__(self):
        self.context_weights = {
            'project': 0.3,
            'team': 0.2,
            'temporal': 0.2,
            'domain': 0.3
        }

    async def contextual_search(self, query, context, knowledge_base):
        """
        Perform search with rich contextual understanding
        """
        contextual_results = {
            'base_search_results': [],
            'context_enhanced_results': [],
            'context_analysis': {},
            'relevance_scoring': {}
        }

        # Perform base semantic search
        base_results = await self.base_semantic_search(query, knowledge_base)
        contextual_results['base_search_results'] = base_results

        # Analyze context
        context_analysis = await self.analyze_search_context(context)
        contextual_results['context_analysis'] = context_analysis

        # Enhance results with context
        enhanced_results = []
        for result in base_results:
            enhanced_result = await self.enhance_result_with_context(
                result, context_analysis, knowledge_base
            )
            enhanced_results.append(enhanced_result)

        # Re-rank based on contextual relevance
        contextually_ranked = await self.rank_by_contextual_relevance(
            enhanced_results, context_analysis
        )
        contextual_results['context_enhanced_results'] = contextually_ranked

        return contextual_results

    async def enhance_result_with_context(self, result, context_analysis, knowledge_base):
        """
        Enhance search result with contextual information
        """
        enhanced_result = {
            **result,
            'contextual_relevance': {},
            'context_connections': [],
            'contextual_score': 0.0
        }

        # Analyze project context relevance
        if 'project' in context_analysis:
            project_relevance = await self.calculate_project_relevance(
                result, context_analysis['project'], knowledge_base
            )
            enhanced_result['contextual_relevance']['project'] = project_relevance

        # Analyze team context relevance
        if 'team' in context_analysis:
            team_relevance = await self.calculate_team_relevance(
                result, context_analysis['team'], knowledge_base
            )
            enhanced_result['contextual_relevance']['team'] = team_relevance

        # Analyze temporal context relevance
        if 'temporal' in context_analysis:
            temporal_relevance = await self.calculate_temporal_relevance(
                result, context_analysis['temporal']
            )
            enhanced_result['contextual_relevance']['temporal'] = temporal_relevance

        # Calculate overall contextual score
        contextual_score = 0.0
        for context_type, weight in self.context_weights.items():
            if context_type in enhanced_result['contextual_relevance']:
                contextual_score += enhanced_result['contextual_relevance'][context_type] * weight

        enhanced_result['contextual_score'] = contextual_score

        return enhanced_result


class HybridSearch:
    """
    Hybrid search combining dense vector search with sparse keyword search
    """

    def __init__(self):
        self.dense_weight = 0.7
        self.sparse_weight = 0.3
        self.keyword_index = {}

    async def hybrid_search(self, query, knowledge_base, search_config):
        """
        Perform hybrid search combining dense and sparse methods
        """
        hybrid_results = {
            'dense_results': [],
            'sparse_results': [],
            'fused_results': [],
            'fusion_metadata': {}
        }

        # Perform dense vector search
        dense_results = await self.dense_vector_search(query, knowledge_base)
        hybrid_results['dense_results'] = dense_results

        # Perform sparse keyword search
        sparse_results = await self.sparse_keyword_search(query, knowledge_base)
        hybrid_results['sparse_results'] = sparse_results

        # Fuse results using reciprocal rank fusion
        fused_results = await self.reciprocal_rank_fusion(
            dense_results, sparse_results, search_config
        )
        hybrid_results['fused_results'] = fused_results

        return hybrid_results

    async def reciprocal_rank_fusion(self, dense_results, sparse_results, search_config):
        """
        Fuse dense and sparse results using reciprocal rank fusion
        """
        k = search_config.get('rrf_k', 60)  # RRF parameter

        # Create unified result set
        all_results = {}

        # Add dense results with RRF scoring
        for rank, result in enumerate(dense_results, 1):
            content_id = result['content_id']
            rrf_score = 1.0 / (k + rank)

            if content_id in all_results:
                all_results[content_id]['rrf_score'] += self.dense_weight * rrf_score
            else:
                all_results[content_id] = {
                    **result,
                    'rrf_score': self.dense_weight * rrf_score,
                    'dense_rank': rank,
                    'sparse_rank': None
                }

        # Add sparse results with RRF scoring
        for rank, result in enumerate(sparse_results, 1):
            content_id = result['content_id']
            rrf_score = 1.0 / (k + rank)

            if content_id in all_results:
                all_results[content_id]['rrf_score'] += self.sparse_weight * rrf_score
                all_results[content_id]['sparse_rank'] = rank
            else:
                all_results[content_id] = {
                    **result,
                    'rrf_score': self.sparse_weight * rrf_score,
                    'dense_rank': None,
                    'sparse_rank': rank
                }

        # Sort by RRF score
        fused_results = sorted(
            all_results.values(),
            key=lambda x: x['rrf_score'],
            reverse=True
        )

        return fused_results
```
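`HybridSearch` calls a `sparse_keyword_search` method that is not defined in this section. The sketch below is one possible minimal implementation, scoring documents by keyword overlap; the method body and result shape are assumptions chosen to match what `reciprocal_rank_fusion` expects (a `content_id` plus a score), and a production system would more likely use BM25 or a dedicated inverted index.

```python
# Minimal sketch of a sparse_keyword_search implementation (illustrative only).
# Scores text nodes by query-term overlap, normalized by document length.
import re
from collections import Counter


async def sparse_keyword_search(self, query, knowledge_base):
    """Rank knowledge-base text nodes by keyword overlap with the query."""
    query_terms = set(re.findall(r'\w+', query.lower()))
    results = []

    for node_id, node_data in knowledge_base.nodes(data=True):
        text = node_data.get('text_content', '')
        if not text:
            continue

        term_counts = Counter(re.findall(r'\w+', text.lower()))
        overlap = sum(term_counts[term] for term in query_terms)
        if overlap:
            results.append({
                'content_id': node_id,
                'similarity_score': overlap / max(sum(term_counts.values()), 1),
                'content_type': 'text',
                'metadata': node_data,
                'modality': 'text'
            })

    results.sort(key=lambda r: r['similarity_score'], reverse=True)
    return results


# One way to attach the sketch to the class defined above
HybridSearch.sparse_keyword_search = sparse_keyword_search
```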
### Search Engine Commands

```bash
# Basic semantic search
bmad search --query "authentication patterns for microservices"
bmad search --code "function getUserProfile" --language "javascript"
bmad search --semantic "caching strategies" --context "high-performance"

# Advanced search options
bmad search --hybrid "database connection pooling" --modalities "text,code"
bmad search --contextual "error handling" --project-context "current"
bmad search --graph-search "relationships between Auth and Database"

# Search configuration and optimization
bmad search config --similarity-threshold 0.8 --max-results 20
bmad search index --rebuild --include-recent-changes
bmad search analyze --query-performance --optimization-suggestions

# Search result management
bmad search export --results "last-search" --format "json"
bmad search feedback --result-id "uuid" --relevance-score 0.9
bmad search history --show-patterns --time-period "last-week"
```

This Semantic Search Engine provides sophisticated, multi-modal search capabilities that understand context, intent, and semantic relationships, enabling developers to find relevant knowledge quickly and accurately across all domains of their development activities.