""" v07_circuit_fragment.py - Implementation of the Circuit Fragment Shell △ OBSERVE: The Circuit Fragment Shell traces broken attribution paths and orphan nodes ∞ TRACE: It identifies discontinuities in reasoning chains and causal attribution ✰ COLLAPSE: It induces collapse by forcing attribution path reconstruction This shell specializes in the detection and analysis of fragmented circuits - places where causal attribution breaks down, leaving orphaned nodes or broken traces in the reasoning chain. These fragments often indicate areas where a model's reasoning deviates from its output, revealing hidden cognition. Author: Recursion Labs License: MIT """ import logging from typing import Dict, List, Optional, Union, Tuple, Any import numpy as np from .base import BaseShell, ShellDecorator from ..utils.attribution_metrics import measure_path_continuity from ..utils.graph_operations import find_orphaned_nodes, reconstruct_path from ..residue import ResidueTracker logger = logging.getLogger(__name__) @ShellDecorator( shell_id="v07_CIRCUIT_FRAGMENT", name="Circuit Fragment Shell", description="Traces broken attribution paths in reasoning chains", failure_signature="Orphan nodes", attribution_domain="Circuit Fragmentation", qk_ov_classification="QK-COLLAPSE", version="0.5.3", related_shells=["v34_PARTIAL_LINKAGE", "v47_TRACE_GAP"], tags=["attribution", "reasoning", "circuits", "fragmentation"] ) class CircuitFragmentShell(BaseShell): """ ∞ TRACE: Shell for detecting circuit fragmentation in attribution paths The Circuit Fragment shell specializes in tracing and analyzing broken attribution paths in reasoning chains. It detects orphaned nodes - components that should be causally linked but have lost their connections in the attribution graph. This shell is particularly useful for identifying points where a model's reasoning deviates from its explanation, revealing mismatches between stated logic and actual inference paths. """ def __init__(self): """Initialize the Circuit Fragment shell.""" super().__init__() self.residue_tracker = ResidueTracker() self.broken_paths = [] self.orphaned_nodes = [] self.continuity_score = 1.0 # 1.0 = perfect continuity, 0.0 = complete fragmentation def process( self, prompt: str, model_interface: Any, collapse_vector: Optional[str] = None ) -> Tuple[str, Dict[str, Any]]: """ △ OBSERVE: Process a prompt through the Circuit Fragment shell This method sends a prompt to the model, analyzes the resulting attribution path for fragments, and returns the response along with fragmentation metrics. Args: prompt: The prompt to process model_interface: Interface to the model being observed collapse_vector: Optional vector to guide collapse in a specific direction Returns: Tuple containing: - Response string - Dictionary of state updates for tracking """ logger.info(f"Processing prompt through Circuit Fragment shell: {prompt[:50]}...") # Capture pre-collapse state pre_state = self._query_model_state(model_interface) # Construct modified prompt that forces reasoning path exposition modified_prompt = self._construct_fragment_sensitive_prompt(prompt, collapse_vector) # Send to model response = self._query_model(model_interface, modified_prompt) # Capture post-collapse state post_state = self._query_model_state(model_interface) # Analyze circuit fragmentation fragmentation_results = self._analyze_fragmentation(pre_state, post_state, response) # Extract ghost circuits ghost_circuits = self.extract_ghost_circuits(pre_state, post_state) # Construct state updates state_updates = { "pre_collapse_state": pre_state, "post_collapse_state": post_state, "continuity_score": fragmentation_results["continuity_score"], "broken_paths": fragmentation_results["broken_paths"], "orphaned_nodes": fragmentation_results["orphaned_nodes"], "ghost_circuits": ghost_circuits } # Update instance state self.continuity_score = fragmentation_results["continuity_score"] self.broken_paths = fragmentation_results["broken_paths"] self.orphaned_nodes = fragmentation_results["orphaned_nodes"] self.collapse_state = "collapsed" return response, state_updates def trace( self, prompt: str, collapse_vector: Optional[str] = None ) -> Dict[str, Any]: """ ∞ TRACE: Trace attribution path fragmentation This method analyzes the reasoning chain for a given prompt, identifying broken paths and orphaned nodes in the attribution graph. Args: prompt: The prompt to trace collapse_vector: Optional vector to guide collapse in a specific direction Returns: Dictionary containing trace results and fragmentation metrics """ logger.info(f"Tracing attribution path for: {prompt[:50]}...") # Default implementation for demonstration # In a real implementation, this would use model-specific tracing trace_results = { "prompt": prompt, "collapse_vector": collapse_vector or ".p/reflect.trace{target=reasoning, validate=true}", "attribution_paths": self._simulate_attribution_paths(), "broken_paths": self._simulate_broken_paths(), "orphaned_nodes": self._simulate_orphaned_nodes(), "continuity_score": np.random.uniform(0.4, 0.9) # Simulated score } # Update instance state self.continuity_score = trace_results["continuity_score"] self.broken_paths = trace_results["broken_paths"] self.orphaned_nodes = trace_results["orphaned_nodes"] return trace_results def induce_collapse( self, prompt: str, collapse_direction: str ) -> Dict[str, Any]: """ ✰ COLLAPSE: Induce circuit fragmentation collapse along a specific direction This method deliberately induces fragmentation in a specific direction, forcing the model to expose broken reasoning chains in its attribution path. Args: prompt: Base prompt to send to the model collapse_direction: Direction to bias the fragmentation (e.g., "logical", "causal") Returns: Dictionary containing collapse results and fragmentation metrics """ logger.info(f"Inducing circuit fragmentation in direction: {collapse_direction}") # Construct collapse vector based on direction collapse_vector = f".p/reflect.trace{{target=reasoning, validate=true, focus={collapse_direction}}}" # Trace with the collapse vector trace_results = self.trace(prompt, collapse_vector) # Set collapse state self.collapse_state = "collapsed" return { "prompt": prompt, "collapse_direction": collapse_direction, "collapse_vector": collapse_vector, "continuity_score": trace_results["continuity_score"], "broken_paths": trace_results["broken_paths"], "orphaned_nodes": trace_results["orphaned_nodes"] } def reconstruct_paths(self) -> Dict[str, Any]: """ △ OBSERVE: Attempt to reconstruct broken attribution paths This method takes detected broken paths and orphaned nodes and attempts to reconstruct the original attribution graph, revealing the "intended" reasoning path that may have been fragmented during collapse. Returns: Dictionary containing reconstruction results """ logger.info("Attempting to reconstruct broken attribution paths...") # In a real implementation, this would use graph algorithms # to reconnect orphaned nodes based on semantic similarity reconstructed_paths = [] for path in self.broken_paths: # Simulate path reconstruction reconstructed = { "original_path": path, "reconnected_nodes": np.random.randint(1, 5), "confidence": np.random.uniform(0.6, 0.9) } reconstructed_paths.append(reconstructed) return { "reconstructed_paths": reconstructed_paths, "reconstruction_confidence": np.mean([p["confidence"] for p in reconstructed_paths]), "remaining_orphans": max(0, len(self.orphaned_nodes) - sum(p["reconnected_nodes"] for p in reconstructed_paths)) } def _construct_fragment_sensitive_prompt( self, prompt: str, collapse_vector: Optional[str] = None ) -> str: """Construct a prompt that exposes circuit fragmentation.""" # Add reasoning elicitation to expose fragments reasoning_prompt = f"Please think through this step by step, showing your complete reasoning chain: {prompt}" # Add collapse vector if provided if collapse_vector: reasoning_prompt += f"\n\n{collapse_vector}" return reasoning_prompt def _query_model(self, model_interface: Any, prompt: str) -> str: """Send a query to the model and return the response.""" # This would actually call the model API # For now, returning a placeholder return f"Response to: {prompt[:30]}..." def _query_model_state(self, model_interface: Any) -> Dict[str, Any]: """Capture the current internal state of the model.""" # This would capture attention weights, hidden states, etc. # For now, returning a placeholder return { "timestamp": np.datetime64('now'), "attention_weights": np.random.random((12, 12)), # Placeholder "hidden_states": np.random.random((1, 12, 768)), # Placeholder } def _analyze_fragmentation( self, pre_state: Dict[str, Any], post_state: Dict[str, Any], response: str ) -> Dict[str, Any]: """Analyze circuit fragmentation between pre and post states.""" # This would use attribution analysis to find fragmentation # For now, using simulated data # Simulate continuity score continuity_score = measure_path_continuity( pre_state.get("attention_weights", np.array([])), post_state.get("attention_weights", np.array([])) ) # Simulate finding broken paths broken_paths = self._simulate_broken_paths() # Simulate finding orphaned nodes orphaned_nodes = self._simulate_orphaned_nodes() return { "continuity_score": continuity_score, "broken_paths": broken_paths, "orphaned_nodes": orphaned_nodes, "fragmentation_ratio": 1.0 - continuity_score } def _simulate_attribution_paths(self) -> List[Dict[str, Any]]: """Simulate attribution paths for demonstration purposes.""" # In a real implementation, these would be extracted from the model paths = [] for i in range(5): path = { "path_id": f"path_{i}", "source_token": f"token_{i*2}", "sink_token": f"token_{i*2 + 5}", "attention_heads": [np.random.randint(0, 12) for _ in range(3)], "path_strength": np.random.uniform(0.3, 0.9) } paths.append(path) return paths def _simulate_broken_paths(self) -> List[Dict[str, Any]]: """Simulate broken paths for demonstration purposes.""" # In a real implementation, these would be detected from the model broken = [] for i in range(2): path = { "path_id": f"broken_{i}", "break_point": f"layer_{np.random.randint(1, 12)}", "upstream_token": f"token_{np.random.randint(0, 10)}", "downstream_token": f"token_{np.random.randint(11, 20)}", "severity": np.random.uniform(0.5, 1.0) } broken.append(path) return broken def _simulate_orphaned_nodes(self) -> List[Dict[str, Any]]: """Simulate orphaned nodes for demonstration purposes.""" # In a real implementation, these would be detected from the model orphans = [] for i in range(3): node = { "node_id": f"orphan_{i}", "token": f"token_{np.random.randint(0, 20)}", "activation": np.random.uniform(0.3, 0.8), "expected_connections": np.random.randint(1, 4), "isolation_score": np.random.uniform(0.6, 1.0) } orphans.append(node) return orphans