# rhythm_env/server/rhythm_environment.py
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
"""
RhythmEnv Life Simulator β€” Environment Implementation.
A holistic life resource management RL environment. The agent balances
5 life meters across a 7-day week (28 steps) while hidden personality
profiles secretly control how actions affect meters and how reward is
computed. The agent must discover these hidden dynamics through experience.
1 episode = 1 week, 1 step = 1 time slot (4 per day), 28 steps total.
Key design principles for learnability:
- step_history: last 7 steps of (action, reward, deltas) are included
in every observation so the agent can detect personality anomalies
- *_anomaly fields: per-meter deviation from neutral-profile expectation,
giving a direct fingerprint of the hidden profile each step
- adaptation_score: 30% of final grade β€” late-half mean per-step reward
minus early-half mean (gated by absolute late-half quality). Rewards
the agent for getting better as it learns the user.
- Profile assignment uses a scrambled seed to prevent memorization
of seed β†’ profile mappings during training
"""
import random
from copy import deepcopy
from typing import Any, Dict, List, Optional
from uuid import uuid4
from openenv.core.env_server import Environment
from openenv.core.env_server.types import EnvironmentMetadata
try:
from ..models import ActionType, RhythmAction, RhythmObservation, RhythmState, StepRecord
except (ImportError, ModuleNotFoundError):
from models import ActionType, RhythmAction, RhythmObservation, RhythmState, StepRecord
# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
MAX_STEPS = 28  # DAYS * SLOTS_PER_DAY: one simulated week per episode
DAYS = 7
SLOTS_PER_DAY = 4
SLOT_NAMES = ["morning", "afternoon", "evening", "night"]
METERS = ["vitality", "cognition", "progress", "serenity", "connection"]
EVENT_PROBABILITY = 0.08  # per-step chance that a random life event fires
CRITICAL_THRESHOLD = 0.1  # a meter below this value triggers the critical penalty
CRITICAL_PENALTY = -0.3  # added to reward once per critical meter, per step
REWARD_SCALE = 15.0  # scales the profile-weighted delta sum into reward units
HISTORY_LENGTH = 7  # number of past steps included in every observation
# ---------------------------------------------------------------------------
# Action-Effect Matrix (base deltas per action on each meter)
# ---------------------------------------------------------------------------
# Rows: action name -> per-meter base delta, before time-of-day multipliers,
# repetition dampening, and hidden profile modifiers are applied in step().
ACTION_EFFECTS: Dict[str, Dict[str, float]] = {
    "deep_work": {"vitality": -0.12, "cognition": -0.10, "progress": 0.18, "serenity": -0.05, "connection": 0.00},
    "admin_work": {"vitality": -0.06, "cognition": -0.05, "progress": 0.08, "serenity": -0.03, "connection": 0.00},
    "learn": {"vitality": -0.08, "cognition": -0.08, "progress": 0.12, "serenity": 0.02, "connection": 0.00},
    "sleep": {"vitality": 0.20, "cognition": 0.10, "progress": 0.00, "serenity": 0.05, "connection": 0.00},
    "exercise": {"vitality": 0.12, "cognition": 0.05, "progress": 0.00, "serenity": 0.08, "connection": 0.00},
    "meditate": {"vitality": 0.03, "cognition": 0.08, "progress": 0.00, "serenity": 0.15, "connection": 0.00},
    "family_time": {"vitality": -0.04, "cognition": -0.02, "progress": 0.00, "serenity": 0.06, "connection": 0.15},
    "socialize": {"vitality": -0.06, "cognition": -0.03, "progress": 0.00, "serenity": 0.04, "connection": 0.12},
    "me_time": {"vitality": 0.05, "cognition": 0.03, "progress": 0.00, "serenity": 0.10, "connection": -0.02},
    "binge_watch": {"vitality": 0.02, "cognition": -0.05, "progress": -0.02, "serenity": 0.06, "connection": -0.03},
}
# ---------------------------------------------------------------------------
# Time-of-Day Multipliers
# ---------------------------------------------------------------------------
# Keyed by slot index (0-3). "cognition_gain" scales positive cognition
# deltas; "vitality_drain" scales negative vitality deltas (see
# _apply_time_multipliers). SLEEP bypasses these entirely in step().
TIME_MULTIPLIERS: Dict[int, Dict[str, float]] = {
    0: {"cognition_gain": 1.2, "vitality_drain": 0.8},  # Morning
    1: {"cognition_gain": 1.0, "vitality_drain": 1.0},  # Afternoon
    2: {"cognition_gain": 0.8, "vitality_drain": 1.1},  # Evening
    3: {"cognition_gain": 0.6, "vitality_drain": 1.3},  # Night
}
# ---------------------------------------------------------------------------
# Random Events
# ---------------------------------------------------------------------------
# Per-meter deltas applied directly to meters when an event fires; negative
# deltas are scaled by the profile's event_impact_multiplier (_apply_event).
EVENT_EFFECTS: Dict[str, Dict[str, float]] = {
    "prod_crash": {"vitality": -0.08, "cognition": -0.10, "progress": -0.10, "serenity": -0.15, "connection": 0.00},
    "family_emergency": {"vitality": -0.05, "cognition": -0.08, "progress": 0.00, "serenity": -0.12, "connection": -0.10},
    "illness": {"vitality": -0.20, "cognition": -0.10, "progress": 0.00, "serenity": -0.05, "connection": 0.00},
    "good_news": {"vitality": 0.05, "cognition": 0.03, "progress": 0.00, "serenity": 0.10, "connection": 0.05},
}
EVENT_NAMES: List[str] = list(EVENT_EFFECTS.keys())
# ---------------------------------------------------------------------------
# Hidden Personality Profiles
# ---------------------------------------------------------------------------
# ---------------------------------------------------------------------------
# Hidden Personality Profiles
# ---------------------------------------------------------------------------
# Three hand-authored reference profiles (the continuous training distribution
# lives in sample_profile()). Keys are consumed by _apply_profile_modifiers,
# _apply_passive_decays, _apply_event and _compute_reward.
PROFILES: List[Dict[str, Any]] = [
    {
        # Solitude-recharging morning person: social actions triple their
        # vitality drain, mornings amplify cognition/progress gains, binge
        # watching backfires, and serenity dominates the reward weights.
        "name": "introvert_morning",
        "social_vitality_multiplier": 3.0,
        "morning_cognition_bonus": 2.0,
        "evening_night_cognition_bonus": None,
        "morning_penalty": None,
        "binge_shame": True,
        "progress_serenity_bonus": 0.0,
        "idle_serenity_decay": 0.0,
        "vitality_decay_rate": 0.0,
        "stress_tolerance": 0.3,
        "event_impact_multiplier": 1.0,
        "connection_decay_rate": 0.01,
        "solo_serenity_bonus": 0.10,
        "social_connection_multiplier": 1.0,
        "social_serenity_bonus": 0.0,
        "work_vitality_recovery": 0.0,
        "initial_meters": {
            "vitality": 0.7, "cognition": 0.7, "progress": 0.0,
            "serenity": 0.7, "connection": 0.5,
        },
        "reward_weights": {
            "vitality": 0.05, "cognition": 0.05, "progress": 0.20,
            "serenity": 0.60, "connection": 0.10,
        },
    },
    {
        # Social recharger and night owl: socializing barely drains vitality
        # and doubles connection gains, mornings penalize productive work,
        # evenings/nights boost it, and connection dominates the reward.
        "name": "extrovert_night_owl",
        "social_vitality_multiplier": 0.2,
        "morning_cognition_bonus": None,
        "evening_night_cognition_bonus": 1.8,
        "morning_penalty": 0.4,
        "binge_shame": False,
        "progress_serenity_bonus": 0.0,
        "idle_serenity_decay": 0.0,
        "vitality_decay_rate": 0.0,
        "stress_tolerance": 0.2,
        "event_impact_multiplier": 0.8,
        "connection_decay_rate": 0.01,
        "solo_serenity_bonus": 0.0,
        "social_connection_multiplier": 2.0,
        "social_serenity_bonus": 0.06,
        "work_vitality_recovery": 0.0,
        "initial_meters": {
            "vitality": 0.7, "cognition": 0.7, "progress": 0.0,
            "serenity": 0.7, "connection": 0.5,
        },
        "reward_weights": {
            "vitality": 0.05, "cognition": 0.05, "progress": 0.10,
            "serenity": 0.05, "connection": 0.75,
        },
    },
    {
        # Work-driven stoic: productive work restores vitality and grants
        # serenity, idling drains serenity, events barely register, and
        # progress dominates the reward weights.
        "name": "workaholic_stoic",
        "social_vitality_multiplier": 1.0,
        "morning_cognition_bonus": None,
        "evening_night_cognition_bonus": None,
        "morning_penalty": None,
        "binge_shame": False,
        "progress_serenity_bonus": 0.10,
        "idle_serenity_decay": 0.10,
        "vitality_decay_rate": 0.04,
        "stress_tolerance": 0.15,
        "event_impact_multiplier": 0.5,
        "connection_decay_rate": 0.02,
        "solo_serenity_bonus": 0.0,
        "social_connection_multiplier": 1.0,
        "social_serenity_bonus": 0.0,
        "work_vitality_recovery": 0.06,
        "initial_meters": {
            "vitality": 0.7, "cognition": 0.7, "progress": 0.0,
            "serenity": 0.7, "connection": 0.5,
        },
        "reward_weights": {
            "vitality": 0.05, "cognition": 0.05, "progress": 0.70,
            "serenity": 0.10, "connection": 0.10,
        },
    },
]
# Name -> profile lookup for the explicit-profile reset path.
PROFILE_MAP: Dict[str, Dict[str, Any]] = {p["name"]: p for p in PROFILES}
# Action groupings consumed by _apply_profile_modifiers.
SOCIAL_ACTIONS = {"family_time", "socialize"}
IDLE_ACTIONS = {"me_time", "binge_watch", "sleep"}
WORK_ACTIONS = {"deep_work", "learn", "admin_work"}
# ---------------------------------------------------------------------------
# Continuous profile sampling (meta-RL training distribution)
# ---------------------------------------------------------------------------
#
# Hardcoded profiles are 3 fixed personalities β€” memorizable, classification-like.
# Continuous sampling draws profile parameters from distributions per episode,
# making memorization impossible and forcing the agent to learn the *skill* of
# profile inference. This is the core meta-learning move.
#
# Belief vector dimensions (for Phase 3 cosine-similarity reward):
# social_pref: 0 = hates social (introvert), 1 = loves social (extrovert)
# morning_pref: 0 = night owl, 1 = morning person
# work_pref: 0 = work-averse, 1 = workaholic
def sample_profile(seed: int) -> Dict[str, Any]:
    """Sample a continuous profile deterministically from a seed.

    Reward weights drawn from a Dirichlet biased toward non-infrastructure
    meters (progress, serenity, connection). Per-action modifiers drawn from
    bounded uniforms so any sampled profile is playable.
    """
    rng = random.Random(seed ^ 0xA3C5F729)

    def _optional(lo: float, hi: float, prob: float) -> Optional[float]:
        # The probability roll is consumed BEFORE the uniform draw, matching
        # the RNG stream of an inline `uniform(...) if random() < p else None`.
        return rng.uniform(lo, hi) if rng.random() < prob else None

    # Dirichlet via normalized Gamma draws; alpha < 1 yields sparse weights,
    # i.e. distinct "personalities".
    alphas = (0.5, 0.5, 1.5, 1.5, 1.5)  # [vit, cog, prog, ser, conn]
    draws = [rng.gammavariate(a, 1.0) for a in alphas]
    norm = sum(draws)
    weights = [d / norm for d in draws]
    # Clamp into [0.05, 0.45] and renormalize. Capping at 0.45 guarantees 3+
    # meters matter in every sampled profile: with a high cap, single-meter
    # profiles made spamming one recovery action genuinely optimal, so belief
    # inference never paid off. Forcing balance keeps inference relevant.
    weights = [min(0.45, max(0.05, w)) for w in weights]
    norm = sum(weights)
    weights = [w / norm for w in weights]

    profile: Dict[str, Any] = {"name": f"sampled_{seed}"}
    profile["social_vitality_multiplier"] = rng.uniform(0.2, 3.0)
    profile["morning_cognition_bonus"] = _optional(0.4, 2.0, 0.5)
    profile["evening_night_cognition_bonus"] = _optional(0.6, 1.8, 0.5)
    profile["morning_penalty"] = _optional(0.4, 0.9, 0.3)
    profile["binge_shame"] = rng.random() < 0.5
    profile["progress_serenity_bonus"] = rng.uniform(0.0, 0.10)
    profile["idle_serenity_decay"] = rng.uniform(0.0, 0.10)
    profile["vitality_decay_rate"] = rng.uniform(0.0, 0.04)
    profile["stress_tolerance"] = rng.uniform(0.15, 0.30)
    profile["event_impact_multiplier"] = rng.uniform(0.5, 1.0)
    profile["connection_decay_rate"] = rng.uniform(0.005, 0.02)
    profile["solo_serenity_bonus"] = rng.uniform(0.0, 0.10)
    profile["social_connection_multiplier"] = rng.uniform(1.0, 2.0)
    profile["social_serenity_bonus"] = rng.uniform(0.0, 0.06)
    profile["work_vitality_recovery"] = rng.uniform(0.0, 0.06)
    profile["initial_meters"] = {
        "vitality": 0.7, "cognition": 0.7, "progress": 0.0,
        "serenity": 0.7, "connection": 0.5,
    }
    profile["reward_weights"] = dict(zip(METERS, weights))
    return profile
def profile_to_belief_vector(profile: Dict[str, Any]) -> List[float]:
    """Reduce a profile to a 3-dim trait vector [social, morning, work] in [0, 1].

    Used as the ground-truth target for the agent's belief output (Phase 3).
    Aggregates the most-diagnostic modifiers per trait.
    """
    def clamp01(x: float) -> float:
        return max(0.0, min(1.0, x))

    # --- social_pref ---------------------------------------------------
    # social_vitality_multiplier in [0.2, 3.0]: a LOW drain multiplier means
    # extroversion, so the normalized drain is inverted before weighting.
    drain = clamp01((profile.get("social_vitality_multiplier", 1.0) - 0.2) / 2.8)
    conn_gain = clamp01((profile.get("social_connection_multiplier", 1.0) - 1.0) / 1.0)
    ser_gain = clamp01(profile.get("social_serenity_bonus", 0.0) / 0.06)
    social_pref = 0.5 * (1.0 - drain) + 0.3 * conn_gain + 0.2 * ser_gain

    # --- morning_pref --------------------------------------------------
    # Neutral 0.5 baseline; a morning cognition bonus pushes up, a morning
    # penalty caps it below 0.5 (stronger penalty -> lower cap).
    morning_pref = 0.5
    bonus = profile.get("morning_cognition_bonus")
    if bonus is not None:
        morning_pref = 0.5 + 0.5 * clamp01((bonus - 0.4) / 1.6)
    penalty = profile.get("morning_penalty")
    if penalty is not None:
        morning_pref = min(morning_pref, 0.5 - 0.5 * (1.0 - penalty))

    # --- work_pref ------------------------------------------------------
    # High when work restores vitality, progress grants serenity, and the
    # reward weight on progress is large.
    recovery = clamp01(profile.get("work_vitality_recovery", 0.0) / 0.06)
    flow = clamp01(profile.get("progress_serenity_bonus", 0.0) / 0.10)
    drive = clamp01((profile.get("reward_weights", {}).get("progress", 0.2) - 0.05) / 0.65)
    work_pref = 0.4 * recovery + 0.3 * flow + 0.3 * drive

    return [clamp01(social_pref), clamp01(morning_pref), clamp01(work_pref)]
class RhythmEnvironment(Environment):
    """
    Life Simulator RL Environment.

    The agent manages 5 life meters (Vitality, Cognition, Progress, Serenity,
    Connection) across a 7-day week. Hidden personality profiles secretly
    control how actions affect meters and how reward is computed. The agent
    must discover these hidden dynamics through experience.

    Every observation includes:
    - Current meter values and temporal context
    - Last step's per-meter deltas as first-class fields
    - Anomaly signals: actual delta minus neutral-profile expectation
    - Rolling step_history (last 7 steps) with actions, rewards, deltas

    The final grade rewards profile-appropriate strategy via adaptation_score
    (30% of grade): late-half mean per-step reward minus early-half mean.
    """

    SUPPORTS_CONCURRENT_SESSIONS: bool = True

    def __init__(self) -> None:
        super().__init__()
        self._state = RhythmState()
        self._rng = random.Random(0)
        self._profile: Dict[str, Any] = PROFILES[0]
        # Meters (reset() overwrites these from the profile's initial_meters)
        self._vitality: float = 0.8
        self._cognition: float = 0.7
        self._progress: float = 0.0
        self._serenity: float = 0.7
        self._connection: float = 0.5
        # Tracking
        self._timestep: int = 0
        self._crash_count: int = 0
        self._total_reward: float = 0.0
        self._step_history: List[Dict[str, Any]] = []
        self._step_rewards: List[float] = []  # per-step rewards (for adaptation_score in grader)
        # Latest emitted belief vector — set by callers via record_belief() and
        # consumed by _grade_episode. Stays None if the agent never emits a belief
        # (e.g. heuristic baseline) — that case scores 0 on the belief component.
        self._final_belief: Optional[List[float]] = None
        # Lazy-built composed Rubric for episode grading. None until the first
        # `done=True` step; rebuilt only across env instances, not across episodes.
        self._grade_rubric: Optional[Any] = None

    def get_metadata(self) -> EnvironmentMetadata:
        return EnvironmentMetadata(
            name="RhythmEnv",
            description=(
                "Life Simulator — a holistic resource management RL environment "
                "where an agent balances 5 life meters across a 7-day week "
                "with hidden personality profiles."
            ),
            version="0.3.0",
        )

    # ------------------------------------------------------------------
    # reset
    # ------------------------------------------------------------------
    def reset(
        self,
        seed: Optional[int] = None,
        episode_id: Optional[str] = None,
        **kwargs: Any,
    ) -> RhythmObservation:
        """Start a new episode; returns the initial observation.

        Seed precedence: explicit `seed` wins; otherwise a deterministic seed
        is derived from `episode_id` (or a fresh uuid4). kwargs may carry
        `profile=<name>` to pin one of the 3 reference profiles.
        """
        # Determine seed
        if seed is not None:
            effective_seed = seed
        else:
            # BUGFIX: was `hash(episode_id or ...) & 0x7FFFFFFF`. Builtin
            # hash() of a str is randomized per interpreter process
            # (PYTHONHASHSEED), so pinning episode_id without seed was not
            # reproducible across processes. random.Random(<str>) seeds
            # deterministically from the string's bytes; getrandbits(31)
            # keeps the same non-negative 31-bit range as the old mask.
            effective_seed = random.Random(episode_id or str(uuid4())).getrandbits(31)
        self._rng = random.Random(effective_seed)
        # Profile selection — two modes:
        #   1. Explicit hardcoded profile name -> one of the 3 reference profiles
        #      (used by tests + the legacy 3-profile eval condition)
        #   2. Default -> sampled continuous profile (meta-RL training distribution)
        profile_name = kwargs.get("profile")
        if profile_name and profile_name in PROFILE_MAP:
            self._profile = deepcopy(PROFILE_MAP[profile_name])
        else:
            self._profile = sample_profile(effective_seed)
        # Initialize meters from profile defaults
        initial = self._profile["initial_meters"]
        self._vitality = initial["vitality"]
        self._cognition = initial["cognition"]
        self._progress = initial["progress"]
        self._serenity = initial["serenity"]
        self._connection = initial["connection"]
        # Reset tracking
        self._timestep = 0
        self._crash_count = 0
        self._total_reward = 0.0
        self._step_history = []
        self._step_rewards = []
        self._final_belief = None
        self._state = RhythmState(
            episode_id=episode_id or str(uuid4()),
            step_count=0,
            profile_name=self._profile["name"],
            timestep=0,
            day=0,
            slot=0,
            vitality=self._vitality,
            cognition=self._cognition,
            progress=self._progress,
            serenity=self._serenity,
            connection=self._connection,
        )
        return self._make_observation(reward=0.0, done=False, active_event=None)

    # ------------------------------------------------------------------
    # step
    # ------------------------------------------------------------------
    def step(
        self,
        action: RhythmAction,
        timeout_s: Optional[float] = None,
        **kwargs: Any,
    ) -> RhythmObservation:
        """Advance the episode one time slot with the given action."""
        # Save step number before incrementing (used for history record)
        current_step = self._timestep
        slot = self._timestep % SLOTS_PER_DAY
        day = self._timestep // SLOTS_PER_DAY
        action_name = action.action_type.value
        # --- 1. Roll and apply event (mutates meters directly, so event
        #        effects do NOT appear in this step's deltas/anomalies) ---
        active_event = self._roll_event()
        if active_event:
            self._apply_event(active_event)
        # --- 2. Get base action effects (copied — ACTION_EFFECTS stays pristine) ---
        effects = dict(ACTION_EFFECTS[action_name])
        # --- 2b. Repetition dampening: repeating an action within the last
        #         3 steps shrinks its POSITIVE effects only ---
        recent3 = [h["action"] for h in self._step_history[-3:]]
        repeat_count = recent3.count(action_name)
        if repeat_count > 0:
            dampening = 1.0 - 0.25 * repeat_count  # 0.75, 0.50, 0.25
            for meter in METERS:
                if effects[meter] > 0:
                    effects[meter] *= dampening
        # --- 3. Apply time-of-day multipliers (SLEEP bypasses) ---
        if action_name != "sleep":
            effects = self._apply_time_multipliers(effects, slot)
        # Snapshot expected effects here — after time/dampening but BEFORE profile
        # modifiers. The anomaly = actual_delta - expected gives the agent a direct
        # per-step fingerprint of the hidden profile modifier.
        expected_no_profile = dict(effects)
        # --- 4. Apply profile modifiers ---
        effects = self._apply_profile_modifiers(effects, action_name, slot)
        # --- 5. Apply global vitality factor (low vitality reduces positive effects) ---
        vitality_factor = 0.5 + 0.5 * self._vitality
        for meter in METERS:
            if meter != "vitality" and effects[meter] > 0:
                effects[meter] *= vitality_factor
        # Apply same vitality factor to expected for fair anomaly comparison
        for meter in METERS:
            if meter != "vitality" and expected_no_profile[meter] > 0:
                expected_no_profile[meter] *= vitality_factor
        # --- 6. Apply passive decays (mutate meters directly; excluded from deltas) ---
        self._apply_passive_decays()
        # --- 7. Update meters and track deltas (deltas reflect clamping to [0, 1]) ---
        deltas: Dict[str, float] = {}
        for meter in METERS:
            old_val = getattr(self, f"_{meter}")
            new_val = max(0.0, min(1.0, old_val + effects[meter]))
            deltas[meter] = new_val - old_val
            setattr(self, f"_{meter}", new_val)
        # --- 8. Compute reward ---
        reward = self._compute_reward(deltas)
        # --- 9. Check critical thresholds ---
        for meter in METERS:
            if getattr(self, f"_{meter}") < CRITICAL_THRESHOLD:
                reward += CRITICAL_PENALTY
                self._crash_count += 1
        # Clamp reward
        reward = max(-3.0, min(3.0, round(reward, 4)))
        self._total_reward += reward
        self._step_rewards.append(reward)
        # --- 10. Advance timestep ---
        self._timestep += 1
        new_day = self._timestep // SLOTS_PER_DAY
        new_slot = self._timestep % SLOTS_PER_DAY
        # --- 11. Check done ---
        done = self._timestep >= MAX_STEPS
        # --- 12. Build reward breakdown ---
        # Includes: per-meter deltas, per-meter anomalies (actual - expected),
        # event flag, and final_score on the last step.
        reward_breakdown: Dict[str, float] = {}
        for meter in METERS:
            reward_breakdown[f"{meter}_delta"] = round(deltas[meter], 4)
            reward_breakdown[f"{meter}_anomaly"] = round(
                deltas[meter] - expected_no_profile[meter], 4
            )
        if active_event:
            reward_breakdown["event"] = 1.0
        # --- 13. Grade if done ---
        if done:
            final_score = self._grade_episode()
            reward_breakdown["final_score"] = round(final_score, 4)
            # Sparse terminal reward: directly supervise on grader final_score.
            # Centered on 0.5 (the "average" episode), scaled by 5x to give a
            # range of [-2.5, +2.5] — strong enough to dominate any local
            # reward-hack the agent might find on per-step shaping alone.
            terminal_bonus = (final_score - 0.5) * 5.0
            reward = max(-3.0, min(3.0, reward + terminal_bonus))
            self._total_reward += terminal_bonus  # update tracking too
            reward_breakdown["terminal_bonus"] = round(terminal_bonus, 4)
        # --- 14. Update state ---
        self._state.step_count = self._timestep
        self._state.timestep = self._timestep
        self._state.day = new_day
        self._state.slot = new_slot
        self._state.vitality = round(self._vitality, 4)
        self._state.cognition = round(self._cognition, 4)
        self._state.progress = round(self._progress, 4)
        self._state.serenity = round(self._serenity, 4)
        self._state.connection = round(self._connection, 4)
        self._state.active_event = active_event
        # --- 15. Append completed step to rolling history ---
        # History entries carry per-meter anomalies (actual - expected_under_neutral).
        # The prompt builder reads these directly to surface the agent's clearest
        # profile-inference signal.
        self._step_history.append({
            "step": current_step,
            "action": action_name,
            "reward": reward,
            "vitality_delta": round(deltas["vitality"], 4),
            "cognition_delta": round(deltas["cognition"], 4),
            "progress_delta": round(deltas["progress"], 4),
            "serenity_delta": round(deltas["serenity"], 4),
            "connection_delta": round(deltas["connection"], 4),
            "vitality_anomaly": round(deltas["vitality"] - expected_no_profile["vitality"], 4),
            "cognition_anomaly": round(deltas["cognition"] - expected_no_profile["cognition"], 4),
            "progress_anomaly": round(deltas["progress"] - expected_no_profile["progress"], 4),
            "serenity_anomaly": round(deltas["serenity"] - expected_no_profile["serenity"], 4),
            "connection_anomaly": round(deltas["connection"] - expected_no_profile["connection"], 4),
        })
        if len(self._step_history) > HISTORY_LENGTH:
            self._step_history.pop(0)
        return self._make_observation(
            reward=reward,
            done=done,
            active_event=active_event,
            reward_breakdown=reward_breakdown,
            deltas=deltas,
            last_action=action_name,
        )

    # ------------------------------------------------------------------
    # state property
    # ------------------------------------------------------------------
    @property
    def state(self) -> RhythmState:
        return self._state

    def get_belief_target(self) -> List[float]:
        """Return the 3-dim ground-truth belief vector for the active profile.

        Used during training to compute belief-accuracy reward (Phase 3).
        Privileged information — not exposed via observation.
        """
        return profile_to_belief_vector(self._profile)

    def record_belief(self, belief: List[float]) -> None:
        """Record the agent's emitted belief for the current step.

        The grader (`_grade_episode`) uses the LAST recorded belief to compute
        the belief_accuracy component of final_score. Callers should invoke
        this once per step after parsing the agent's completion. Heuristic /
        random baselines that don't emit a belief never call this, and the
        belief component scores 0 for them — that's intentional: the meta-RL
        skill is INFERENCE, and only agents that actually try get credit.

        Raises:
            ValueError: if `belief` does not have exactly 3 elements.
        """
        if len(belief) != 3:
            raise ValueError(f"belief must have 3 elements, got {len(belief)}")
        self._final_belief = [max(0.0, min(1.0, float(b))) for b in belief]

    def get_profile_hint(self) -> Dict[str, float]:
        """Return a coarse profile hint usable in observation during curriculum.

        Returns the 3-dim belief vector with descriptive keys. The dataset
        generator passes this into the prompt for the fraction of samples
        with show_profile_hint=True (the curriculum's "visible" warmup phase).
        """
        b = profile_to_belief_vector(self._profile)
        return {"social_pref": round(b[0], 3), "morning_pref": round(b[1], 3), "work_pref": round(b[2], 3)}

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    def _roll_event(self) -> Optional[str]:
        """Roll for a random event this step; returns the event name or None."""
        if self._rng.random() < EVENT_PROBABILITY:
            return self._rng.choice(EVENT_NAMES)
        return None

    def _apply_event(self, event_name: str) -> None:
        """Apply event effects to meters, modified by profile."""
        effects = EVENT_EFFECTS[event_name]
        multiplier = self._profile["event_impact_multiplier"]
        for meter in METERS:
            delta = effects[meter]
            # Only apply multiplier to negative effects — good news stays good
            if delta < 0:
                delta *= multiplier
            old_val = getattr(self, f"_{meter}")
            new_val = max(0.0, min(1.0, old_val + delta))
            setattr(self, f"_{meter}", new_val)

    def _apply_time_multipliers(
        self, effects: Dict[str, float], slot: int
    ) -> Dict[str, float]:
        """Apply time-of-day multipliers to action effects (mutates and returns)."""
        multipliers = TIME_MULTIPLIERS[slot]
        for meter in effects:
            if meter == "cognition" and effects[meter] > 0:
                effects[meter] *= multipliers["cognition_gain"]
            elif meter == "vitality" and effects[meter] < 0:
                effects[meter] *= multipliers["vitality_drain"]
        return effects

    def _apply_profile_modifiers(
        self, effects: Dict[str, float], action_name: str, slot: int
    ) -> Dict[str, float]:
        """Apply hidden profile modifiers to action effects (mutates and returns)."""
        profile = self._profile
        # Social vitality drain multiplier
        if action_name in SOCIAL_ACTIONS and effects["vitality"] < 0:
            effects["vitality"] *= profile["social_vitality_multiplier"]
        # Introvert morning cognition bonus
        bonus = profile.get("morning_cognition_bonus")
        if bonus and slot == 0:
            if effects["cognition"] > 0:
                effects["cognition"] *= bonus
            if effects["progress"] > 0:
                effects["progress"] *= bonus
        # Extrovert evening/night cognition bonus
        bonus = profile.get("evening_night_cognition_bonus")
        if bonus and slot in (2, 3):
            if effects["cognition"] > 0:
                effects["cognition"] *= bonus
            if effects["progress"] > 0:
                effects["progress"] *= bonus
        # Extrovert morning penalty
        penalty = profile.get("morning_penalty")
        if penalty and slot == 0:
            if effects["cognition"] > 0:
                effects["cognition"] *= penalty
            if effects["progress"] > 0:
                effects["progress"] *= penalty
        # Binge shame spiral
        if profile.get("binge_shame") and action_name == "binge_watch":
            effects["serenity"] -= 0.15
            effects["cognition"] -= 0.06
        # Workaholic: progress-producing actions give serenity bonus
        psb = profile.get("progress_serenity_bonus", 0.0)
        if psb > 0 and effects["progress"] > 0:
            effects["serenity"] += psb
        # Workaholic: idle actions drain serenity
        isd = profile.get("idle_serenity_decay", 0.0)
        if isd > 0 and action_name in IDLE_ACTIONS:
            effects["serenity"] -= isd
        # Solo recharge: introvert gets serenity from alone time
        ssb = profile.get("solo_serenity_bonus", 0.0)
        if ssb > 0 and action_name in ("me_time", "meditate"):
            effects["serenity"] += ssb
        # Social connection multiplier: extrovert gets more connection from socializing
        scm = profile.get("social_connection_multiplier", 1.0)
        if scm != 1.0 and action_name in SOCIAL_ACTIONS and effects["connection"] > 0:
            effects["connection"] *= scm
        # Social serenity bonus: extrovert gets serenity from socializing
        ssrb = profile.get("social_serenity_bonus", 0.0)
        if ssrb > 0 and action_name in SOCIAL_ACTIONS:
            effects["serenity"] += ssrb
        # Work vitality recovery: workaholic gets vitality from productive work
        wvr = profile.get("work_vitality_recovery", 0.0)
        if wvr > 0 and action_name in WORK_ACTIONS:
            effects["vitality"] += wvr
        # Low serenity amplification (stress spiral)
        if self._serenity < profile.get("stress_tolerance", 0.3):
            for meter in effects:
                if effects[meter] < 0:
                    effects[meter] *= 1.3
        return effects

    def _apply_passive_decays(self) -> None:
        """Apply per-step passive meter decays (mutates meters directly)."""
        # Connection always decays if not actively maintained
        decay = self._profile["connection_decay_rate"]
        self._connection = max(0.0, self._connection - decay)
        # Workaholic extra vitality decay
        vd = self._profile.get("vitality_decay_rate", 0.0)
        if vd > 0:
            self._vitality = max(0.0, self._vitality - vd)

    def _compute_reward(self, deltas: Dict[str, float]) -> float:
        """Pure profile-weighted per-step reward.

        Deliberately uncontaminated: the grader-aligned bias (progress +
        connection deltas) lives in the TRAINING reward function in
        reward_functions.py, not here. Keeping the env's per-step reward
        pure means (1) the agent's inference signal stays a clean function
        of the hidden profile_weights, (2) the grader's adaptation_score
        isn't computed on biased rewards, and (3) the env's reward matches
        what an honest deployment would surface to the agent.
        """
        weights = self._profile["reward_weights"]
        return sum(deltas[m] * weights[m] for m in METERS) * REWARD_SCALE

    def _grade_episode(self) -> float:
        """
        Compute final episode score in [0, 1].

        Components (meta-learning aligned):
            0.15 — crash_free_ratio: no critical meter drops
            0.20 — progress: career/skill growth
            0.10 — connection: relationship maintained
            0.25 — adaptation_score: agent got better as it learned the user
            0.10 — efficiency: bounded normalized average reward
            0.20 — belief_accuracy: how close last-emitted belief is to true profile

        Implementation: composes 6 `Rubric` subclasses via OpenEnv's
        `WeightedSum` (see `server/rubrics.py`). Each sub-rubric reads
        the aggregated episode state (`_step_rewards`, `_crash_count`,
        `_final_belief`, `_profile`) of the env it was built with —
        RFC 004's recommended pattern for trajectory-summary scoring on
        top of the per-(action, observation) Rubric ABC.

        belief_accuracy is the explicit meta-RL inference signal: an
        agent that doesn't emit a belief scores 0 here, an agent emitting
        a belief close to the hidden profile vector scores up to 1.
        Without this term, agents that play heuristic-style "keep meters
        healthy" score the same as agents that actually infer the profile,
        since the other components don't differentiate inference from
        reflex.
        """
        # BUGFIX: was a bare absolute `from server.rubrics import ...`, which
        # fails when this module is imported as part of the rhythm_env
        # package. Mirror the module-level models import: relative first,
        # then the legacy absolute path for script-style execution.
        try:
            from .rubrics import make_grade_rubric
        except (ImportError, ModuleNotFoundError):
            from server.rubrics import make_grade_rubric
        # Build (or reuse) the composed rubric. The Rubric subclasses are
        # stateless once built — they read live env state at forward()
        # time — so caching is safe.
        if self._grade_rubric is None:
            self._grade_rubric = make_grade_rubric(self)
        # forward(action, observation) — args are unused for episode-end
        # scoring; the rubric reads from `self`.
        score = self._grade_rubric(action=None, observation=None)
        return max(0.0, min(1.0, float(score)))

    def _make_observation(
        self,
        reward: float,
        done: bool,
        active_event: Optional[str],
        reward_breakdown: Optional[Dict[str, float]] = None,
        deltas: Optional[Dict[str, float]] = None,
        last_action: Optional[str] = None,
    ) -> RhythmObservation:
        """Build the observation returned to the agent (hides profile)."""
        step_records = [
            StepRecord(
                step=h["step"],
                action=h["action"],
                reward=h["reward"],
                vitality_delta=h["vitality_delta"],
                cognition_delta=h["cognition_delta"],
                progress_delta=h["progress_delta"],
                serenity_delta=h["serenity_delta"],
                connection_delta=h["connection_delta"],
                vitality_anomaly=h.get("vitality_anomaly", 0.0),
                cognition_anomaly=h.get("cognition_anomaly", 0.0),
                progress_anomaly=h.get("progress_anomaly", 0.0),
                serenity_anomaly=h.get("serenity_anomaly", 0.0),
                connection_anomaly=h.get("connection_anomaly", 0.0),
            )
            for h in self._step_history
        ]
        return RhythmObservation(
            timestep=self._timestep,
            day=self._timestep // SLOTS_PER_DAY,
            slot=self._timestep % SLOTS_PER_DAY,
            vitality=round(self._vitality, 4),
            cognition=round(self._cognition, 4),
            progress=round(self._progress, 4),
            serenity=round(self._serenity, 4),
            connection=round(self._connection, 4),
            active_event=active_event,
            remaining_steps=MAX_STEPS - self._timestep,
            reward_breakdown=reward_breakdown or {},
            reward=reward,
            done=done,
            # First-class delta fields (from this step; zero on reset)
            vitality_delta=round(deltas["vitality"], 4) if deltas else 0.0,
            cognition_delta=round(deltas["cognition"], 4) if deltas else 0.0,
            progress_delta=round(deltas["progress"], 4) if deltas else 0.0,
            serenity_delta=round(deltas["serenity"], 4) if deltas else 0.0,
            connection_delta=round(deltas["connection"], 4) if deltas else 0.0,
            last_action=last_action,
            # Rolling history of the last HISTORY_LENGTH completed steps
            step_history=step_records,
        )