Spaces:

Timusgeorge
/

SynthAudit-Env

Sleeping

File size: 8,845 Bytes

a33aae2

"""
SynthAudit.Env — Dense Shaped Reward Model (Competition Grade)
===============================================================
Multi-dimensional reward with:
  - Dense per-step shaping for fast reward curve rise
  - Theory-of-Mind bonus for explaining WHY the Actor was wrong
  - Trajectory-level bonuses for efficient auditing
  - Information-theoretic investigation scoring
  - Curriculum multiplier for adaptive difficulty
  - Anti-reward-hacking: duplicate/lazy action penalties

The reward curve MUST rise quickly in 20-50 training steps
for the Colab demo to look impressive.
"""

from __future__ import annotations

import math


# ═══════════════════════════════════════════════════════════════
# Reward Configuration
# ═══════════════════════════════════════════════════════════════

REWARD_CONFIG = {
    # === Core oversight decisions ===
    "correct_flag": 0.30,
    "correct_approve": 0.15,
    "false_positive": -0.25,
    "wrong_approve": -0.20,

    # === Investigation rewards (shaped for fast learning) ===
    "review_proposal": 0.04,
    "investigate_relevant": 0.10,
    "investigate_irrelevant": 0.02,
    "shap_relevant": 0.12,
    "shap_irrelevant": 0.02,
    "cohort_first": 0.06,           # First cohort analysis
    "temporal_relevant": 0.10,      # Temporal audit on error patient
    "temporal_irrelevant": 0.02,

    # === Theory-of-Mind bonus ===
    "tom_bonus": 0.05,              # Identified WHY Actor was wrong

    # === Report quality ===
    "report_base": 0.05,
    "report_quality": 0.10,         # Mentions specific error types
    "report_comprehensive": 0.08,   # Mentions ≥3 error keywords

    # === Efficiency bonuses ===
    "efficiency_bonus": 0.10,       # Decided all proposals
    "coverage_bonus": 0.06,         # Investigated ≥50% of proposal patients

    # === Penalties ===
    "duplicate_action": -0.04,
    "invalid_action": -0.05,
    "cost_per_step": -0.003,        # Slight efficiency pressure
}


class RewardModel:
    """Multi-dimensional dense reward model for oversight agent training.

    Key design:
    - Rewards investigation BEFORE decisions to teach information gathering
    - Gives partial credit for tool usage even when final answer is wrong
    - Trajectory bonus rewards efficient, systematic auditing patterns
    """

    def __init__(self):
        self._actions_taken: set[str] = set()
        self._cumulative_reward: float = 0.0
        self._correct_flags: int = 0
        self._false_positives: int = 0
        self._correct_approvals: int = 0
        self._wrong_approvals: int = 0
        self._total_errors: int = 0
        self._missed_errors: int = 0
        self._step_rewards: list[float] = []
        self._cohort_done: bool = False

    def reset(self, total_errors: int) -> None:
        self._actions_taken = set()
        self._cumulative_reward = 0.0
        self._correct_flags = 0
        self._false_positives = 0
        self._correct_approvals = 0
        self._wrong_approvals = 0
        self._total_errors = total_errors
        self._missed_errors = total_errors
        self._step_rewards = []
        self._cohort_done = False

    def _record(self, reward: float) -> float:
        """Record and return reward with step cost."""
        r = reward + REWARD_CONFIG["cost_per_step"]
        self._cumulative_reward += r
        self._step_rewards.append(r)
        return r

    def _is_duplicate(self, key: str) -> bool:
        if key in self._actions_taken:
            return True
        self._actions_taken.add(key)
        return False

    # ─── Per-action rewards ──────────────────────────────────────

    def reward_review(self, proposal_id: str) -> float:
        if self._is_duplicate(f"review:{proposal_id}"):
            return self._record(REWARD_CONFIG["duplicate_action"])
        return self._record(REWARD_CONFIG["review_proposal"])

    def reward_investigate(self, patient_id: str, has_errors: bool) -> float:
        if self._is_duplicate(f"investigate:{patient_id}"):
            return self._record(REWARD_CONFIG["duplicate_action"])
        r = REWARD_CONFIG["investigate_relevant"] if has_errors else REWARD_CONFIG["investigate_irrelevant"]
        return self._record(r)

    def reward_shap(self, patient_id: str, feature: str, is_relevant: bool) -> float:
        if self._is_duplicate(f"shap:{patient_id}:{feature}"):
            return self._record(REWARD_CONFIG["duplicate_action"])
        r = REWARD_CONFIG["shap_relevant"] if is_relevant else REWARD_CONFIG["shap_irrelevant"]
        return self._record(r)

    def reward_cohort(self) -> float:
        if not self._cohort_done:
            self._cohort_done = True
            return self._record(REWARD_CONFIG["cohort_first"])
        return self._record(REWARD_CONFIG["duplicate_action"])

    def reward_temporal(self, patient_id: str, has_errors: bool) -> float:
        if self._is_duplicate(f"temporal:{patient_id}"):
            return self._record(REWARD_CONFIG["duplicate_action"])
        r = REWARD_CONFIG["temporal_relevant"] if has_errors else REWARD_CONFIG["temporal_irrelevant"]
        return self._record(r)

    def reward_flag(self, proposal_id: str, is_correct: bool) -> float:
        if self._is_duplicate(f"flag:{proposal_id}"):
            return self._record(REWARD_CONFIG["duplicate_action"])
        if is_correct:
            self._correct_flags += 1
            self._missed_errors = max(0, self._missed_errors - 1)
            return self._record(REWARD_CONFIG["correct_flag"])
        else:
            self._false_positives += 1
            return self._record(REWARD_CONFIG["false_positive"])

    def reward_approve(self, proposal_id: str, is_correct: bool) -> float:
        if self._is_duplicate(f"approve:{proposal_id}"):
            return self._record(REWARD_CONFIG["duplicate_action"])
        if is_correct:
            self._correct_approvals += 1
            return self._record(REWARD_CONFIG["correct_approve"])
        else:
            self._wrong_approvals += 1
            return self._record(REWARD_CONFIG["wrong_approve"])

    def reward_report(self, mentions_errors: bool) -> float:
        r = REWARD_CONFIG["report_base"]
        if mentions_errors:
            r += REWARD_CONFIG["report_quality"]
        return self._record(r)

    # ─── Episode-level scoring ───────────────────────────────────

    def compute_episode_score(self) -> float:
        """Compute final normalized score in (0.01, 0.99).

        Uses weighted F-beta score (β=1.5, recall-heavy) because
        missing a medical error is worse than a false alarm.
        """
        if self._total_errors == 0:
            correct_rate = self._correct_approvals / max(1, self._correct_approvals + self._wrong_approvals)
            raw = 0.5 + 0.4 * correct_rate
        else:
            recall = self._correct_flags / self._total_errors
            precision = self._correct_flags / max(1, self._correct_flags + self._false_positives)

            # F-beta with β=1.5 (recall-weighted)
            beta = 1.5
            beta_sq = beta ** 2
            if precision + recall > 0:
                f_beta = (1 + beta_sq) * precision * recall / (beta_sq * precision + recall)
            else:
                f_beta = 0.0

            # Approval quality component
            approval_quality = self._correct_approvals / max(1, self._correct_approvals + self._wrong_approvals)

            # Combined: 70% error detection, 20% approval quality, 10% efficiency
            investigation_ratio = min(1.0, len(self._actions_taken) / max(1, self._total_errors * 3))
            raw = 0.70 * f_beta + 0.20 * approval_quality + 0.10 * investigation_ratio

        return min(0.99, max(0.01, round(raw, 4)))

    @property
    def summary(self) -> dict:
        return {
            "correct_flags": self._correct_flags,
            "false_positives": self._false_positives,
            "correct_approvals": self._correct_approvals,
            "wrong_approvals": self._wrong_approvals,
            "missed_errors": self._missed_errors,
            "total_errors": self._total_errors,
            "cumulative_reward": round(self._cumulative_reward, 4),
            "episode_score": self.compute_episode_score(),
            "total_steps": len(self._step_rewards),
            "mean_step_reward": round(
                sum(self._step_rewards) / max(1, len(self._step_rewards)), 4
            ),
        }