Spaces:
Sleeping
Sleeping
| from __future__ import annotations | |
| import json | |
| import math | |
| import os | |
| import pickle | |
| import random | |
| from typing import Any | |
# Labels of the direction vectors. DirectionLibrary._init_random_vectors()
# creates one vector per entry, in this order.
DIRECTION_LABELS: list[str] = [
    "authority", "refusal_suppression", "trust_escalation",
    "danger", "compliance", "distraction",
]

# Intent class names. DirectionLibrary.run_intent_probe() indexes this list
# with the position of the highest probability returned by a probe, so the
# order here is significant.
INTENT_CLASSES: list[str] = [
    "benign",
    "escalation",
    "extraction",
    "distraction",
]
| class DirectionLibrary: | |
| def __init__( | |
| self, | |
| library_path: str = "data/direction_library.json", | |
| probe_path: str = "data/intent_probes.pkl", | |
| hidden_size: int = 1024, | |
| ) -> None: | |
| self.hidden_size = hidden_size | |
| self._vectors: dict[str, list[float]] = {} | |
| self._intent_probes: dict[int, Any] = {} | |
| if os.path.exists(library_path): | |
| self._load_vectors(library_path) | |
| else: | |
| self._init_random_vectors() | |
| if os.path.exists(probe_path): | |
| with open(probe_path, "rb") as handle: | |
| self._intent_probes = pickle.load(handle) | |
| def _load_vectors(self, path: str) -> None: | |
| with open(path) as handle: | |
| data = json.load(handle) | |
| self._vectors = {label: [float(x) for x in vec] for label, vec in data.items()} | |
| def _init_random_vectors(self) -> None: | |
| rng = random.Random(1337) | |
| for label in DIRECTION_LABELS: | |
| vec = [rng.gauss(0.0, 1.0) for _ in range(self.hidden_size)] | |
| norm = math.sqrt(sum(x * x for x in vec)) or 1.0 | |
| self._vectors[label] = [x / norm for x in vec] | |
| def get_vector(self, label: str) -> list[float] | None: | |
| return self._vectors.get(label) | |
| def all_vectors(self) -> dict[str, list[float]]: | |
| return dict(self._vectors) | |
| def run_intent_probe(self, activation: list[float], layer: int) -> tuple[str, float]: | |
| probe = self._intent_probes.get(layer) | |
| if probe is None: | |
| return ("benign", 0.5) | |
| probs = probe.predict_proba([activation])[0] | |
| idx = max(range(len(probs)), key=lambda i: probs[i]) | |
| return (INTENT_CLASSES[idx], float(probs[idx])) | |
| def save(self, library_path: str, probe_path: str) -> None: | |
| os.makedirs(os.path.dirname(library_path), exist_ok=True) | |
| with open(library_path, "w") as handle: | |
| json.dump(self._vectors, handle) | |
| with open(probe_path, "wb") as handle: | |
| pickle.dump(self._intent_probes, handle) | |