"""
UndertriAI — Performance Tracker (Theme 4: Self-Improvement)
Tracks the agent's running performance profile across dimensions
and uses it to drive adaptive curriculum decisions.
Pure Python — no server/training/FastAPI dependencies.
"""

from collections import deque
from typing import Any, Dict, List, Optional


class ExponentialMean:
    """Exponential moving average with configurable decay."""

    __slots__ = ("alpha", "value", "count")

    def __init__(self, alpha: float = 0.1, initial: float = 0.5):
        self.alpha = alpha
        self.value = initial
        self.count = 0

    def update(self, x: float) -> None:
        self.value = self.alpha * x + (1 - self.alpha) * self.value
        self.count += 1

    def get(self) -> float:
        return self.value
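
# Illustrative check of the EMA recurrence (values invented for this
# comment): with alpha=0.1 and initial=0.5, a single update(1.0) yields
# 0.1 * 1.0 + 0.9 * 0.5 = 0.55; repeated update(1.0) calls converge
# toward 1.0, with older observations decaying geometrically.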


class PerformanceTracker:
    """
    Tracks agent performance across crime types, stages, and reward
    components. Drives adaptive episode selection and stage promotion.

    Intended for single-session use, so no locking is needed.
    All public methods handle missing/malformed input gracefully.

    NOTE: Tracker state is in-memory only. Server restart clears history.
    For production: persist via tracker.get_profile() → JSON file on /reset.
    """

    def __init__(self, alpha: float = 0.1):
        self._alpha = alpha
        # Per-crime-type EMA of total reward
        self.per_crime_type: Dict[str, ExponentialMean] = {}
        # Per-stage EMA of total reward
        self.per_stage: Dict[int, ExponentialMean] = {
            s: ExponentialMean(alpha=alpha) for s in range(1, 5)
        }
        # Last 50 total rewards (for stage-promotion smoothing)
        self.recent_rewards: deque = deque(maxlen=50)
        # Bias fire rate: 1.0 when the penalty fired, 0.0 when it did not
        self.bias_fire_rate: ExponentialMean = ExponentialMean(alpha=alpha)
        # Tool usage counts (cumulative per session)
        self.tool_usage: Dict[str, int] = {}
        # Episode counters
        self.episodes_seen: int = 0
        self.stage_episodes: Dict[int, int] = {1: 0, 2: 0, 3: 0, 4: 0}
        # Recent case performance for failure-replay
        self._recent_case_rewards: deque = deque(maxlen=30)

    # ------------------------------------------------------------------
    # Core update
    # ------------------------------------------------------------------
    def update(
        self,
        episode: Dict[str, Any],
        reward_components: Dict[str, Any],
        tools_used: Optional[List[str]] = None,
    ) -> None:
        """
        Update all internal state from a completed episode.
        Handles missing keys gracefully — never raises on malformed input.
        """
        try:
            total = float(reward_components.get("total_reward",
                                                reward_components.get("total", 0.0)))
        except (TypeError, ValueError):
            total = 0.0

        # Update recent rewards
        self.recent_rewards.append(total)
        self.episodes_seen += 1

        # Per-crime-type tracking
        crime_type = ""
        try:
            crime_type = str(episode.get("crime_type", "")).strip()
        except Exception:
            pass
        if crime_type:
            if crime_type not in self.per_crime_type:
                self.per_crime_type[crime_type] = ExponentialMean(
                    alpha=self._alpha
                )
            self.per_crime_type[crime_type].update(total)

        # Per-stage tracking
        stage = 1
        try:
            stage = int(episode.get("curriculum_stage", 1))
        except (TypeError, ValueError):
            stage = 1
        if 1 <= stage <= 4:
            self.per_stage[stage].update(total)
            self.stage_episodes[stage] = self.stage_episodes.get(stage, 0) + 1

        # Bias fire rate
        try:
            bias_val = float(reward_components.get("bias_penalty", 0.0))
            self.bias_fire_rate.update(1.0 if bias_val > 0.01 else 0.0)
        except (TypeError, ValueError):
            pass

        # Tool usage
        if tools_used:
            for tool in tools_used:
                t = str(tool)
                self.tool_usage[t] = self.tool_usage.get(t, 0) + 1

        # Track case_id → reward for failure-replay
        case_id = ""
        try:
            case_id = str(episode.get("case_id", ""))
        except Exception:
            pass
        if case_id:
            self._recent_case_rewards.append((case_id, total, stage))
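
    # Example payload shape, inferred from the keys read above (values are
    # invented; this module defines no formal schema):
    #
    #     tracker.update(
    #         episode={"case_id": "case-001", "crime_type": "fraud",
    #                  "curriculum_stage": 2},
    #         reward_components={"total_reward": 0.72, "bias_penalty": 0.0},
    #         tools_used=["search", "summarize"],
    #     )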

    # ------------------------------------------------------------------
    # Queries
    # ------------------------------------------------------------------
    def weakest_domain(self) -> Optional[str]:
        """
        Returns the crime_type with the lowest EMA reward.
        Returns None if fewer than 5 episodes have been seen in total, or
        if no crime type has at least 3 observations.
        """
        if self.episodes_seen < 5:
            return None
        candidates = [
            (ct, ema.get())
            for ct, ema in self.per_crime_type.items()
            if ema.count >= 3
        ]
        if not candidates:
            return None
        return min(candidates, key=lambda x: x[1])[0]

    def suggest_next_stage(self) -> int:
        """
        Returns the recommended stage (1-4) based on readiness thresholds.
        Never demotes — returns the highest eligible stage.
        """
        current = 1
        # Stage 1 → 2: EMA >= 0.65 AND at least 20 episodes
        if (self.per_stage[1].get() >= 0.65
                and self.stage_episodes.get(1, 0) >= 20):
            current = 2
        # Stage 2 → 3: EMA >= 0.55 AND at least 50 episodes
        if (current >= 2
                and self.per_stage[2].get() >= 0.55
                and self.stage_episodes.get(2, 0) >= 50):
            current = 3
        # Stage 3 → 4: EMA >= 0.50 AND at least 20 episodes
        if (current >= 3
                and self.per_stage[3].get() >= 0.50
                and self.stage_episodes.get(3, 0) >= 20):
            current = 4
        return current
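
    # Worked example (invented numbers): stage-1 EMA 0.70 over 25 episodes
    # passes the 1 → 2 gate (0.70 >= 0.65, 25 >= 20), but a stage-2 EMA of
    # 0.50 fails the 2 → 3 gate (0.50 < 0.55), so the ladder stops at 2.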

    def should_generate_synthetic(self, crime_type: str) -> bool:
        """
        Returns True if the agent has mastered this crime-type domain
        (EMA > 0.70 with at least 10 observations).
        """
        ema = self.per_crime_type.get(crime_type)
        if ema is None:
            return False
        return ema.get() > 0.70 and ema.count >= 10

    def get_recent_failures(self, threshold: float = 0.40) -> List[str]:
        """
        Returns case_ids from recent episodes where reward was below threshold.
        Used by AdaptiveSelector for failure-replay.
        """
        return [
            case_id
            for case_id, reward, _ in self._recent_case_rewards
            if reward < threshold
        ]
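
    # The AdaptiveSelector named above lives outside this module; the
    # expected usage is simply tracker.get_recent_failures(threshold=0.40)
    # to obtain case_ids worth re-queuing (an assumption, not a contract
    # defined here).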

    # ------------------------------------------------------------------
    # Serialization
    # ------------------------------------------------------------------
    def get_profile(self) -> Dict[str, Any]:
        """
        Returns a fully JSON-serializable profile dict.
        No class instances — all values are primitive types.
        """
        recent = list(self.recent_rewards)
        recent_mean = sum(recent) / len(recent) if recent else 0.0
        return {
            "per_crime_type": {
                ct: round(ema.get(), 4)
                for ct, ema in self.per_crime_type.items()
            },
            "per_stage": {
                str(s): round(ema.get(), 4)
                for s, ema in self.per_stage.items()
            },
            "bias_fire_rate": round(self.bias_fire_rate.get(), 4),
            "tool_usage": dict(self.tool_usage),
            "episodes_seen": self.episodes_seen,
            "stage_episodes": dict(self.stage_episodes),
            "weakest_domain": self.weakest_domain(),
            "suggested_stage": self.suggest_next_stage(),
            "recent_mean_reward": round(recent_mean, 4),
        }
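
    # Persistence sketch for the class-level NOTE (the path and the choice
    # of hook are assumptions, not part of this module):
    #
    #     import json
    #     with open("tracker_profile.json", "w") as f:
    #         json.dump(tracker.get_profile(), f)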

    # ------------------------------------------------------------------
    # Session management
    # ------------------------------------------------------------------
    def reset_session(self) -> None:
        """
        Clears transient session state but preserves accumulated
        per-crime-type and per-stage learning.
        """
        self.recent_rewards.clear()
        self.tool_usage.clear()
        self._recent_case_rewards.clear()
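

if __name__ == "__main__":
    # Minimal smoke test with invented sample values, following the payload
    # shape that update() reads; not part of the original module's API.
    tracker = PerformanceTracker(alpha=0.1)
    for i in range(6):
        tracker.update(
            episode={"case_id": f"case-{i}", "crime_type": "fraud",
                     "curriculum_stage": 1},
            reward_components={"total_reward": 0.3 + 0.1 * i,
                               "bias_penalty": 0.0},
            tools_used=["search"],
        )
    print("weakest domain:", tracker.weakest_domain())      # "fraud"
    print("suggested stage:", tracker.suggest_next_stage())  # 1 (< 20 episodes)
    print("recent failures:", tracker.get_recent_failures())  # ["case-0"]
    print("profile:", tracker.get_profile())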