Spaces:

Timusgeorge
/

SynthAudit-Env

Sleeping

App Files Files Community

SynthAudit-Env / server /reward_model.py

Timusgeorge

feat: full project files — server, training, evaluation, models, outputs

a33aae2 verified 15 days ago

raw

history blame contribute delete

8.85 kB

	"""
	SynthAudit.Env — Dense Shaped Reward Model (Competition Grade)
	===============================================================
	Multi-dimensional reward with:
	- Dense per-step shaping for fast reward curve rise
	- Theory-of-Mind bonus for explaining WHY the Actor was wrong
	- Trajectory-level bonuses for efficient auditing
	- Information-theoretic investigation scoring
	- Curriculum multiplier for adaptive difficulty
	- Anti-reward-hacking: duplicate/lazy action penalties

	The reward curve MUST rise quickly in 20-50 training steps
	for the Colab demo to look impressive.
	"""

	from __future__ import annotations

	import math


	# ═══════════════════════════════════════════════════════════════
	# Reward Configuration
	# ═══════════════════════════════════════════════════════════════

	REWARD_CONFIG = {
	# === Core oversight decisions ===
	"correct_flag": 0.30,
	"correct_approve": 0.15,
	"false_positive": -0.25,
	"wrong_approve": -0.20,

	# === Investigation rewards (shaped for fast learning) ===
	"review_proposal": 0.04,
	"investigate_relevant": 0.10,
	"investigate_irrelevant": 0.02,
	"shap_relevant": 0.12,
	"shap_irrelevant": 0.02,
	"cohort_first": 0.06, # First cohort analysis
	"temporal_relevant": 0.10, # Temporal audit on error patient
	"temporal_irrelevant": 0.02,

	# === Theory-of-Mind bonus ===
	"tom_bonus": 0.05, # Identified WHY Actor was wrong

	# === Report quality ===
	"report_base": 0.05,
	"report_quality": 0.10, # Mentions specific error types
	"report_comprehensive": 0.08, # Mentions ≥3 error keywords

	# === Efficiency bonuses ===
	"efficiency_bonus": 0.10, # Decided all proposals
	"coverage_bonus": 0.06, # Investigated ≥50% of proposal patients

	# === Penalties ===
	"duplicate_action": -0.04,
	"invalid_action": -0.05,
	"cost_per_step": -0.003, # Slight efficiency pressure
	}


	class RewardModel:
	"""Multi-dimensional dense reward model for oversight agent training.

	Key design:
	- Rewards investigation BEFORE decisions to teach information gathering
	- Gives partial credit for tool usage even when final answer is wrong
	- Trajectory bonus rewards efficient, systematic auditing patterns
	"""

	def __init__(self):
	self._actions_taken: set[str] = set()
	self._cumulative_reward: float = 0.0
	self._correct_flags: int = 0
	self._false_positives: int = 0
	self._correct_approvals: int = 0
	self._wrong_approvals: int = 0
	self._total_errors: int = 0
	self._missed_errors: int = 0
	self._step_rewards: list[float] = []
	self._cohort_done: bool = False

	def reset(self, total_errors: int) -> None:
	self._actions_taken = set()
	self._cumulative_reward = 0.0
	self._correct_flags = 0
	self._false_positives = 0
	self._correct_approvals = 0
	self._wrong_approvals = 0
	self._total_errors = total_errors
	self._missed_errors = total_errors
	self._step_rewards = []
	self._cohort_done = False

	def _record(self, reward: float) -> float:
	"""Record and return reward with step cost."""
	r = reward + REWARD_CONFIG["cost_per_step"]
	self._cumulative_reward += r
	self._step_rewards.append(r)
	return r

	def _is_duplicate(self, key: str) -> bool:
	if key in self._actions_taken:
	return True
	self._actions_taken.add(key)
	return False

	# ─── Per-action rewards ──────────────────────────────────────

	def reward_review(self, proposal_id: str) -> float:
	if self._is_duplicate(f"review:{proposal_id}"):
	return self._record(REWARD_CONFIG["duplicate_action"])
	return self._record(REWARD_CONFIG["review_proposal"])

	def reward_investigate(self, patient_id: str, has_errors: bool) -> float:
	if self._is_duplicate(f"investigate:{patient_id}"):
	return self._record(REWARD_CONFIG["duplicate_action"])
	r = REWARD_CONFIG["investigate_relevant"] if has_errors else REWARD_CONFIG["investigate_irrelevant"]
	return self._record(r)

	def reward_shap(self, patient_id: str, feature: str, is_relevant: bool) -> float:
	if self._is_duplicate(f"shap:{patient_id}:{feature}"):
	return self._record(REWARD_CONFIG["duplicate_action"])
	r = REWARD_CONFIG["shap_relevant"] if is_relevant else REWARD_CONFIG["shap_irrelevant"]
	return self._record(r)

	def reward_cohort(self) -> float:
	if not self._cohort_done:
	self._cohort_done = True
	return self._record(REWARD_CONFIG["cohort_first"])
	return self._record(REWARD_CONFIG["duplicate_action"])

	def reward_temporal(self, patient_id: str, has_errors: bool) -> float:
	if self._is_duplicate(f"temporal:{patient_id}"):
	return self._record(REWARD_CONFIG["duplicate_action"])
	r = REWARD_CONFIG["temporal_relevant"] if has_errors else REWARD_CONFIG["temporal_irrelevant"]
	return self._record(r)

	def reward_flag(self, proposal_id: str, is_correct: bool) -> float:
	if self._is_duplicate(f"flag:{proposal_id}"):
	return self._record(REWARD_CONFIG["duplicate_action"])
	if is_correct:
	self._correct_flags += 1
	self._missed_errors = max(0, self._missed_errors - 1)
	return self._record(REWARD_CONFIG["correct_flag"])
	else:
	self._false_positives += 1
	return self._record(REWARD_CONFIG["false_positive"])

	def reward_approve(self, proposal_id: str, is_correct: bool) -> float:
	if self._is_duplicate(f"approve:{proposal_id}"):
	return self._record(REWARD_CONFIG["duplicate_action"])
	if is_correct:
	self._correct_approvals += 1
	return self._record(REWARD_CONFIG["correct_approve"])
	else:
	self._wrong_approvals += 1
	return self._record(REWARD_CONFIG["wrong_approve"])

	def reward_report(self, mentions_errors: bool) -> float:
	r = REWARD_CONFIG["report_base"]
	if mentions_errors:
	r += REWARD_CONFIG["report_quality"]
	return self._record(r)

	# ─── Episode-level scoring ───────────────────────────────────

	def compute_episode_score(self) -> float:
	"""Compute final normalized score in (0.01, 0.99).

	Uses weighted F-beta score (β=1.5, recall-heavy) because
	missing a medical error is worse than a false alarm.
	"""
	if self._total_errors == 0:
	correct_rate = self._correct_approvals / max(1, self._correct_approvals + self._wrong_approvals)
	raw = 0.5 + 0.4 * correct_rate
	else:
	recall = self._correct_flags / self._total_errors
	precision = self._correct_flags / max(1, self._correct_flags + self._false_positives)

	# F-beta with β=1.5 (recall-weighted)
	beta = 1.5
	beta_sq = beta ** 2
	if precision + recall > 0:
	f_beta = (1 + beta_sq) * precision * recall / (beta_sq * precision + recall)
	else:
	f_beta = 0.0

	# Approval quality component
	approval_quality = self._correct_approvals / max(1, self._correct_approvals + self._wrong_approvals)

	# Combined: 70% error detection, 20% approval quality, 10% efficiency
	investigation_ratio = min(1.0, len(self._actions_taken) / max(1, self._total_errors * 3))
	raw = 0.70 * f_beta + 0.20 * approval_quality + 0.10 * investigation_ratio

	return min(0.99, max(0.01, round(raw, 4)))

	@property
	def summary(self) -> dict:
	return {
	"correct_flags": self._correct_flags,
	"false_positives": self._false_positives,
	"correct_approvals": self._correct_approvals,
	"wrong_approvals": self._wrong_approvals,
	"missed_errors": self._missed_errors,
	"total_errors": self._total_errors,
	"cumulative_reward": round(self._cumulative_reward, 4),
	"episode_score": self.compute_episode_score(),
	"total_steps": len(self._step_rewards),
	"mean_step_reward": round(
	sum(self._step_rewards) / max(1, len(self._step_rewards)), 4
	),
	}