v0.3: detectors with export/load calibration
aria_llm/detectors.py  +82 -16  CHANGED
@@ -1,13 +1,10 @@
 """
-ARIA Detectors v0.2
+ARIA Detectors v0.3
 ====================
 
-v0.2 changes:
--
-
-- Triggering is based on mean + k*std (configurable sensitivity).
-- No detector fires during calibration.
-- MedianTrapDetector completely rewritten to use calibrated baselines.
+v0.3 changes:
+- _CalibrationBuffer gains export_state() / load_state() for profile persistence
+- All detectors gain export_calibration() / load_calibration() methods
 
 Grounded in:
 - Dynamic Instability Signal (arxiv:2602.02863): JSD + entropy
@@ -39,7 +36,10 @@ class DetectionSignal:
 
 
 class _CalibrationBuffer:
-    """Shared calibration logic: collect samples, compute mean + std, derive threshold."""
+    """Shared calibration logic: collect samples, compute mean + std, derive threshold.
+
+    v0.3: Added export_state() / load_state() for calibration profile persistence.
+    """
 
     def __init__(self, calibration_steps: int, sensitivity_k: float):
         self.calibration_steps = calibration_steps
@@ -79,6 +79,31 @@ class _CalibrationBuffer:
         severity = min(1.0, excess / (self.sensitivity_k * scale))
         return severity
 
+    def export_state(self) -> Dict:
+        """Export calibration state for persistence."""
+        return {
+            "mean": self.mean,
+            "std": self.std,
+            "threshold": self.threshold,
+            "sensitivity_k": self.sensitivity_k,
+            "calibration_steps": self.calibration_steps,
+            "n_samples": len(self.samples),
+            "samples_summary": {
+                "min": min(self.samples) if self.samples else None,
+                "max": max(self.samples) if self.samples else None,
+                "median": sorted(self.samples)[len(self.samples)//2] if self.samples else None,
+            }
+        }
+
+    def load_state(self, state: Dict):
+        """Load calibration state from a saved profile. Skips calibration phase."""
+        self.mean = state["mean"]
+        self.std = state["std"]
+        self.threshold = state["threshold"]
+        self.sensitivity_k = state.get("sensitivity_k", self.sensitivity_k)
+        self.step = self.calibration_steps
+        self.samples = []
+
     def reset(self):
         self.samples.clear()
         self.mean = None
@@ -88,8 +113,7 @@ class _CalibrationBuffer:
 
 
 class CompoundErrorDetector:
-    """Detects compound error accumulation via Dynamic Instability Signal (arxiv:2602.02863).
-    v0.2: Uses calibration buffer. Only triggers when instability exceeds mean + k*std."""
+    """Detects compound error accumulation via Dynamic Instability Signal (arxiv:2602.02863)."""
 
     def __init__(self, calibration_steps: int = 20, sensitivity_k: float = 2.5,
                  window: int = 10, lam: float = 1.0, fallback_threshold: float = 0.7):
@@ -104,6 +128,12 @@ class CompoundErrorDetector:
         self.prev_probs = None
         self.instability_history.clear()
         self.calibration.reset()
+
+    def export_calibration(self) -> Dict:
+        return {"compound_error": self.calibration.export_state()}
+
+    def load_calibration(self, state: Dict):
+        self.calibration.load_state(state["compound_error"])
 
     def _jsd(self, p: torch.Tensor, q: torch.Tensor) -> float:
         p = p.float().clamp(min=1e-8)
@@ -166,8 +196,7 @@ class CompoundErrorDetector:
 
 
 class SemanticDriftDetector:
-    """Detects semantic drift by tracking cosine distance from goal anchor.
-    v0.2: Uses calibration buffer for cosine distance distribution."""
+    """Detects semantic drift by tracking cosine distance from goal anchor."""
 
     def __init__(self, calibration_steps: int = 20, sensitivity_k: float = 2.5,
                  window: int = 20, fallback_threshold: float = 0.3):
@@ -181,6 +210,12 @@ class SemanticDriftDetector:
         self.goal_anchor = None
         self.distance_history.clear()
         self.calibration.reset()
+
+    def export_calibration(self) -> Dict:
+        return {"semantic_drift": self.calibration.export_state()}
+
+    def load_calibration(self, state: Dict):
+        self.calibration.load_state(state["semantic_drift"])
 
     def set_goal_anchor(self, hidden_state: torch.Tensor):
         self.goal_anchor = hidden_state.float().detach().clone()
@@ -231,8 +266,7 @@ class SemanticDriftDetector:
 
 
 class LogicLoopDetector:
-    """Detects logic looping via entropy variance collapse + trajectory fingerprinting.
-    v0.2: Calibrates entropy variance baseline."""
+    """Detects logic looping via entropy variance collapse + trajectory fingerprinting."""
 
     def __init__(self, calibration_steps: int = 20, sensitivity_k: float = 2.5,
                  window: int = 15, similarity_threshold: float = 0.92,
@@ -247,6 +281,7 @@ class LogicLoopDetector:
         self.step = 0
         self.var_samples = []
         self.calibration_steps = calibration_steps
+        self.sensitivity_k = sensitivity_k
         self.var_mean: Optional[float] = None
         self.var_std: Optional[float] = None
         self.var_threshold: Optional[float] = None
@@ -262,6 +297,24 @@ class LogicLoopDetector:
         self.var_std = None
         self.var_threshold = None
        self.sim_calibration.reset()
+
+    def export_calibration(self) -> Dict:
+        return {
+            "logic_loop": {
+                "sim_calibration": self.sim_calibration.export_state(),
+                "var_mean": self.var_mean,
+                "var_std": self.var_std,
+                "var_threshold": self.var_threshold,
+            }
+        }
+
+    def load_calibration(self, state: Dict):
+        loop_state = state["logic_loop"]
+        self.sim_calibration.load_state(loop_state["sim_calibration"])
+        self.var_mean = loop_state["var_mean"]
+        self.var_std = loop_state["var_std"]
+        self.var_threshold = loop_state["var_threshold"]
+        self.step = self.calibration_steps
 
     def _compute_fingerprint(self, states: List[torch.Tensor]) -> torch.Tensor:
         if not states:
@@ -331,8 +384,7 @@ class LogicLoopDetector:
 
 
 class MedianTrapDetector:
-    """Detects when the model is producing statistically average outputs.
-    v0.2: Completely rewritten to use calibrated baselines instead of absolute formula."""
+    """Detects when the model is producing statistically average outputs."""
 
     def __init__(self, calibration_steps: int = 20, sensitivity_k: float = 2.5,
                  temperature_boost: float = 1.15, novelty_bonus: float = 0.05):
@@ -348,6 +400,20 @@ class MedianTrapDetector:
         self.step = 0
         self.top1_calibration.reset()
        self.inv_entropy_calibration.reset()
+
+    def export_calibration(self) -> Dict:
+        return {
+            "median_trap": {
+                "top1": self.top1_calibration.export_state(),
+                "inv_entropy": self.inv_entropy_calibration.export_state(),
+            }
+        }
+
+    def load_calibration(self, state: Dict):
+        mt = state["median_trap"]
+        self.top1_calibration.load_state(mt["top1"])
+        self.inv_entropy_calibration.load_state(mt["inv_entropy"])
+        self.step = self.top1_calibration.calibration_steps
 
     def detect(self, logits: torch.Tensor) -> DetectionSignal:
         self.step += 1
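
Since export_state() returns a plain dict of scalars (mean, std, threshold, sensitivity, plus a small samples summary), a calibrated profile can be persisted with nothing more than the standard json module. Below is a minimal sketch of exporting a merged profile after a calibrated run; the detector classes and the export_calibration() keys come from this file, but the surrounding wiring, the merged-profile layout, and the aria_profile.json filename are assumptions made for illustration.

import json

from aria_llm.detectors import (
    CompoundErrorDetector,
    LogicLoopDetector,
    MedianTrapDetector,
    SemanticDriftDetector,
)

# Hypothetical wiring: one instance of each detector, using its defaults.
detectors = [
    CompoundErrorDetector(),
    SemanticDriftDetector(),
    LogicLoopDetector(),
    MedianTrapDetector(),
]

# ... run a normal generation here, feeding logits / hidden states into each
# detector's detect() for at least calibration_steps (default 20) steps so
# the buffers have a mean, std and threshold worth exporting ...

# Each export_calibration() returns a dict keyed by detector name
# ("compound_error", "semantic_drift", "logic_loop", "median_trap"),
# so the results merge into a single JSON-serializable profile.
profile = {}
for det in detectors:
    profile.update(det.export_calibration())

with open("aria_profile.json", "w") as f:
    json.dump(profile, f, indent=2)

The samples_summary block in each exported buffer is informational only; load_state() reads back just the mean, std, threshold and sensitivity_k.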
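Restoring a profile goes through load_calibration(). Because load_state() sets the buffer's step counter to calibration_steps, detectors restored this way skip the calibration phase instead of re-collecting baselines. A companion sketch, under the same illustrative assumptions as above:

import json

from aria_llm.detectors import (
    CompoundErrorDetector,
    LogicLoopDetector,
    MedianTrapDetector,
    SemanticDriftDetector,
)

with open("aria_profile.json") as f:
    profile = json.load(f)

# Fresh instances that would otherwise spend their first calibration_steps
# steps collecting baselines before any signal could fire.
detectors = [
    CompoundErrorDetector(),
    SemanticDriftDetector(),
    LogicLoopDetector(),
    MedianTrapDetector(),
]

# Each load_calibration() looks up its own key in the state dict, so the
# merged profile can be handed to every detector unchanged.
for det in detectors:
    det.load_calibration(profile)

# The buffers now hold the saved mean/std/threshold and treat calibration as
# complete, so mean + k*std thresholding is available from the first detect()
# call (per-run state such as the semantic-drift goal anchor is still set
# separately, as before).

One practical consequence of exporting plain dicts rather than pickled detector objects is that the profile stays human-readable and easy to inspect or version.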