# Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. """ RhythmEnv Life Simulator — Environment Implementation. A holistic life resource management RL environment. The agent balances 5 life meters across a 7-day week (28 steps) while hidden personality profiles secretly control how actions affect meters and how reward is computed. The agent must discover these hidden dynamics through experience. 1 episode = 1 week, 1 step = 1 time slot (4 per day), 28 steps total. Key design principles for learnability: - step_history: last 7 steps of (action, reward, deltas) are included in every observation so the agent can detect personality anomalies - *_anomaly fields: per-meter deviation from neutral-profile expectation, giving a direct fingerprint of the hidden profile each step - adaptation_score: 30% of final grade — late-half mean per-step reward minus early-half mean (gated by absolute late-half quality). Rewards the agent for getting better as it learns the user. - Profile assignment uses a scrambled seed to prevent memorization of seed → profile mappings during training """ import random from copy import deepcopy from typing import Any, Dict, List, Optional from uuid import uuid4 from openenv.core.env_server import Environment from openenv.core.env_server.types import EnvironmentMetadata try: from ..models import ActionType, RhythmAction, RhythmObservation, RhythmState, StepRecord except (ImportError, ModuleNotFoundError): from models import ActionType, RhythmAction, RhythmObservation, RhythmState, StepRecord # --------------------------------------------------------------------------- # Constants # --------------------------------------------------------------------------- MAX_STEPS = 28 DAYS = 7 SLOTS_PER_DAY = 4 SLOT_NAMES = ["morning", "afternoon", "evening", "night"] METERS = ["vitality", "cognition", "progress", "serenity", "connection"] EVENT_PROBABILITY = 0.08 CRITICAL_THRESHOLD = 0.1 CRITICAL_PENALTY = -0.3 REWARD_SCALE = 15.0 HISTORY_LENGTH = 7 # number of past steps included in every observation # --------------------------------------------------------------------------- # Action-Effect Matrix (base deltas per action on each meter) # --------------------------------------------------------------------------- ACTION_EFFECTS: Dict[str, Dict[str, float]] = { "deep_work": {"vitality": -0.12, "cognition": -0.10, "progress": 0.18, "serenity": -0.05, "connection": 0.00}, "admin_work": {"vitality": -0.06, "cognition": -0.05, "progress": 0.08, "serenity": -0.03, "connection": 0.00}, "learn": {"vitality": -0.08, "cognition": -0.08, "progress": 0.12, "serenity": 0.02, "connection": 0.00}, "sleep": {"vitality": 0.20, "cognition": 0.10, "progress": 0.00, "serenity": 0.05, "connection": 0.00}, "exercise": {"vitality": 0.12, "cognition": 0.05, "progress": 0.00, "serenity": 0.08, "connection": 0.00}, "meditate": {"vitality": 0.03, "cognition": 0.08, "progress": 0.00, "serenity": 0.15, "connection": 0.00}, "family_time": {"vitality": -0.04, "cognition": -0.02, "progress": 0.00, "serenity": 0.06, "connection": 0.15}, "socialize": {"vitality": -0.06, "cognition": -0.03, "progress": 0.00, "serenity": 0.04, "connection": 0.12}, "me_time": {"vitality": 0.05, "cognition": 0.03, "progress": 0.00, "serenity": 0.10, "connection": -0.02}, "binge_watch": {"vitality": 0.02, "cognition": -0.05, "progress": -0.02, "serenity": 0.06, "connection": -0.03}, } # --------------------------------------------------------------------------- # Time-of-Day Multipliers # --------------------------------------------------------------------------- TIME_MULTIPLIERS: Dict[int, Dict[str, float]] = { 0: {"cognition_gain": 1.2, "vitality_drain": 0.8}, # Morning 1: {"cognition_gain": 1.0, "vitality_drain": 1.0}, # Afternoon 2: {"cognition_gain": 0.8, "vitality_drain": 1.1}, # Evening 3: {"cognition_gain": 0.6, "vitality_drain": 1.3}, # Night } # --------------------------------------------------------------------------- # Random Events # --------------------------------------------------------------------------- EVENT_EFFECTS: Dict[str, Dict[str, float]] = { "prod_crash": {"vitality": -0.08, "cognition": -0.10, "progress": -0.10, "serenity": -0.15, "connection": 0.00}, "family_emergency": {"vitality": -0.05, "cognition": -0.08, "progress": 0.00, "serenity": -0.12, "connection": -0.10}, "illness": {"vitality": -0.20, "cognition": -0.10, "progress": 0.00, "serenity": -0.05, "connection": 0.00}, "good_news": {"vitality": 0.05, "cognition": 0.03, "progress": 0.00, "serenity": 0.10, "connection": 0.05}, } EVENT_NAMES: List[str] = list(EVENT_EFFECTS.keys()) # --------------------------------------------------------------------------- # Hidden Personality Profiles # --------------------------------------------------------------------------- PROFILES: List[Dict[str, Any]] = [ { "name": "introvert_morning", "social_vitality_multiplier": 3.0, "morning_cognition_bonus": 2.0, "evening_night_cognition_bonus": None, "morning_penalty": None, "binge_shame": True, "progress_serenity_bonus": 0.0, "idle_serenity_decay": 0.0, "vitality_decay_rate": 0.0, "stress_tolerance": 0.3, "event_impact_multiplier": 1.0, "connection_decay_rate": 0.01, "solo_serenity_bonus": 0.10, "social_connection_multiplier": 1.0, "social_serenity_bonus": 0.0, "work_vitality_recovery": 0.0, "initial_meters": { "vitality": 0.7, "cognition": 0.7, "progress": 0.0, "serenity": 0.7, "connection": 0.5, }, "reward_weights": { "vitality": 0.05, "cognition": 0.05, "progress": 0.20, "serenity": 0.60, "connection": 0.10, }, }, { "name": "extrovert_night_owl", "social_vitality_multiplier": 0.2, "morning_cognition_bonus": None, "evening_night_cognition_bonus": 1.8, "morning_penalty": 0.4, "binge_shame": False, "progress_serenity_bonus": 0.0, "idle_serenity_decay": 0.0, "vitality_decay_rate": 0.0, "stress_tolerance": 0.2, "event_impact_multiplier": 0.8, "connection_decay_rate": 0.01, "solo_serenity_bonus": 0.0, "social_connection_multiplier": 2.0, "social_serenity_bonus": 0.06, "work_vitality_recovery": 0.0, "initial_meters": { "vitality": 0.7, "cognition": 0.7, "progress": 0.0, "serenity": 0.7, "connection": 0.5, }, "reward_weights": { "vitality": 0.05, "cognition": 0.05, "progress": 0.10, "serenity": 0.05, "connection": 0.75, }, }, { "name": "workaholic_stoic", "social_vitality_multiplier": 1.0, "morning_cognition_bonus": None, "evening_night_cognition_bonus": None, "morning_penalty": None, "binge_shame": False, "progress_serenity_bonus": 0.10, "idle_serenity_decay": 0.10, "vitality_decay_rate": 0.04, "stress_tolerance": 0.15, "event_impact_multiplier": 0.5, "connection_decay_rate": 0.02, "solo_serenity_bonus": 0.0, "social_connection_multiplier": 1.0, "social_serenity_bonus": 0.0, "work_vitality_recovery": 0.06, "initial_meters": { "vitality": 0.7, "cognition": 0.7, "progress": 0.0, "serenity": 0.7, "connection": 0.5, }, "reward_weights": { "vitality": 0.05, "cognition": 0.05, "progress": 0.70, "serenity": 0.10, "connection": 0.10, }, }, ] PROFILE_MAP: Dict[str, Dict[str, Any]] = {p["name"]: p for p in PROFILES} # Social actions for modifier checks SOCIAL_ACTIONS = {"family_time", "socialize"} IDLE_ACTIONS = {"me_time", "binge_watch", "sleep"} WORK_ACTIONS = {"deep_work", "learn", "admin_work"} # --------------------------------------------------------------------------- # Continuous profile sampling (meta-RL training distribution) # --------------------------------------------------------------------------- # # Hardcoded profiles are 3 fixed personalities — memorizable, classification-like. # Continuous sampling draws profile parameters from distributions per episode, # making memorization impossible and forcing the agent to learn the *skill* of # profile inference. This is the core meta-learning move. # # Belief vector dimensions (for Phase 3 cosine-similarity reward): # social_pref: 0 = hates social (introvert), 1 = loves social (extrovert) # morning_pref: 0 = night owl, 1 = morning person # work_pref: 0 = work-averse, 1 = workaholic def sample_profile(seed: int) -> Dict[str, Any]: """Sample a continuous profile deterministically from a seed. Reward weights drawn from a Dirichlet biased toward non-infrastructure meters (progress, serenity, connection). Per-action modifiers drawn from bounded uniforms so any sampled profile is playable. """ rng = random.Random(seed ^ 0xA3C5F729) # Reward weights via Dirichlet (α<1 produces sparse weights → "personalities") alphas = [0.5, 0.5, 1.5, 1.5, 1.5] # [vit, cog, prog, ser, conn] raw = [rng.gammavariate(a, 1.0) for a in alphas] total = sum(raw) weights = [w / total for w in raw] # Cap each weight at 0.45 so every sampled profile weights 3+ meters # meaningfully. With an 0.80 cap, single-meter-dominant profiles let # SLEEP-spam (or any single recovery action) be optimal — the env wasn't # lying, the agent was right to spam. Forcing balance makes belief # inference matter for action selection. weights = [max(0.05, min(0.45, w)) for w in weights] total = sum(weights) weights = [w / total for w in weights] return { "name": f"sampled_{seed}", "social_vitality_multiplier": rng.uniform(0.2, 3.0), "morning_cognition_bonus": rng.uniform(0.4, 2.0) if rng.random() < 0.5 else None, "evening_night_cognition_bonus": rng.uniform(0.6, 1.8) if rng.random() < 0.5 else None, "morning_penalty": rng.uniform(0.4, 0.9) if rng.random() < 0.3 else None, "binge_shame": rng.random() < 0.5, "progress_serenity_bonus": rng.uniform(0.0, 0.10), "idle_serenity_decay": rng.uniform(0.0, 0.10), "vitality_decay_rate": rng.uniform(0.0, 0.04), "stress_tolerance": rng.uniform(0.15, 0.30), "event_impact_multiplier": rng.uniform(0.5, 1.0), "connection_decay_rate": rng.uniform(0.005, 0.02), "solo_serenity_bonus": rng.uniform(0.0, 0.10), "social_connection_multiplier": rng.uniform(1.0, 2.0), "social_serenity_bonus": rng.uniform(0.0, 0.06), "work_vitality_recovery": rng.uniform(0.0, 0.06), "initial_meters": { "vitality": 0.7, "cognition": 0.7, "progress": 0.0, "serenity": 0.7, "connection": 0.5, }, "reward_weights": dict(zip(METERS, weights)), } def profile_to_belief_vector(profile: Dict[str, Any]) -> List[float]: """Reduce a profile to a 3-dim trait vector [social, morning, work] in [0, 1]. Used as the ground-truth target for the agent's belief output (Phase 3). Aggregates the most-diagnostic modifiers per trait. """ # social_pref: low if social drains vitality a lot, high if social bonuses are big # social_vitality_multiplier in [0.2, 3.0]: lower = more extroverted sm = profile.get("social_vitality_multiplier", 1.0) social_drain_norm = 1.0 - max(0.0, min(1.0, (sm - 0.2) / 2.8)) # invert scm = profile.get("social_connection_multiplier", 1.0) social_conn_norm = max(0.0, min(1.0, (scm - 1.0) / 1.0)) ssb = profile.get("social_serenity_bonus", 0.0) social_ser_norm = max(0.0, min(1.0, ssb / 0.06)) social_pref = 0.5 * social_drain_norm + 0.3 * social_conn_norm + 0.2 * social_ser_norm # morning_pref: high if morning bonus exists, low if morning penalty exists mcb = profile.get("morning_cognition_bonus") mp = profile.get("morning_penalty") morning_pref = 0.5 if mcb is not None: morning_pref = 0.5 + 0.5 * max(0.0, min(1.0, (mcb - 0.4) / 1.6)) if mp is not None: morning_pref = min(morning_pref, 0.5 - 0.5 * (1.0 - mp)) # work_pref: high if work recovers vitality + progress gives serenity + progress weight high wvr = profile.get("work_vitality_recovery", 0.0) wvr_norm = max(0.0, min(1.0, wvr / 0.06)) psb = profile.get("progress_serenity_bonus", 0.0) psb_norm = max(0.0, min(1.0, psb / 0.10)) pw = profile.get("reward_weights", {}).get("progress", 0.2) pw_norm = max(0.0, min(1.0, (pw - 0.05) / 0.65)) work_pref = 0.4 * wvr_norm + 0.3 * psb_norm + 0.3 * pw_norm return [ max(0.0, min(1.0, social_pref)), max(0.0, min(1.0, morning_pref)), max(0.0, min(1.0, work_pref)), ] class RhythmEnvironment(Environment): """ Life Simulator RL Environment. The agent manages 5 life meters (Vitality, Cognition, Progress, Serenity, Connection) across a 7-day week. Hidden personality profiles secretly control how actions affect meters and how reward is computed. The agent must discover these hidden dynamics through experience. Every observation includes: - Current meter values and temporal context - Last step's per-meter deltas as first-class fields - Anomaly signals: actual delta minus neutral-profile expectation - Rolling step_history (last 7 steps) with actions, rewards, deltas The final grade rewards profile-appropriate strategy via adaptation_score (30% of grade): late-half mean per-step reward minus early-half mean. """ SUPPORTS_CONCURRENT_SESSIONS: bool = True def __init__(self) -> None: super().__init__() self._state = RhythmState() self._rng = random.Random(0) self._profile: Dict[str, Any] = PROFILES[0] # Meters self._vitality: float = 0.8 self._cognition: float = 0.7 self._progress: float = 0.0 self._serenity: float = 0.7 self._connection: float = 0.5 # Tracking self._timestep: int = 0 self._crash_count: int = 0 self._total_reward: float = 0.0 self._step_history: list = [] self._step_rewards: list = [] # per-step rewards (for adaptation_score in grader) # Latest emitted belief vector — set by callers via record_belief() and # consumed by _grade_episode. Stays None if the agent never emits a belief # (e.g. heuristic baseline) — that case scores 0 on the belief component. self._final_belief: Optional[List[float]] = None # Lazy-built composed Rubric for episode grading. None until the first # `done=True` step; rebuilt only across env instances, not across episodes. self._grade_rubric: Optional[Any] = None def get_metadata(self) -> EnvironmentMetadata: return EnvironmentMetadata( name="RhythmEnv", description=( "Life Simulator — a holistic resource management RL environment " "where an agent balances 5 life meters across a 7-day week " "with hidden personality profiles." ), version="0.3.0", ) # ------------------------------------------------------------------ # reset # ------------------------------------------------------------------ def reset( self, seed: Optional[int] = None, episode_id: Optional[str] = None, **kwargs: Any, ) -> RhythmObservation: # Determine seed if seed is not None: effective_seed = seed else: effective_seed = hash(episode_id or str(uuid4())) & 0x7FFFFFFF self._rng = random.Random(effective_seed) # Profile selection — two modes: # 1. Explicit hardcoded profile name → one of the 3 reference profiles # (used by tests + the legacy 3-profile eval condition) # 2. Default → sampled continuous profile (meta-RL training distribution) profile_name = kwargs.get("profile") if profile_name and profile_name in PROFILE_MAP: self._profile = deepcopy(PROFILE_MAP[profile_name]) else: self._profile = sample_profile(effective_seed) # Initialize meters from profile defaults initial = self._profile["initial_meters"] self._vitality = initial["vitality"] self._cognition = initial["cognition"] self._progress = initial["progress"] self._serenity = initial["serenity"] self._connection = initial["connection"] # Reset tracking self._timestep = 0 self._crash_count = 0 self._total_reward = 0.0 self._step_history = [] self._step_rewards = [] self._final_belief = None self._state = RhythmState( episode_id=episode_id or str(uuid4()), step_count=0, profile_name=self._profile["name"], timestep=0, day=0, slot=0, vitality=self._vitality, cognition=self._cognition, progress=self._progress, serenity=self._serenity, connection=self._connection, ) return self._make_observation(reward=0.0, done=False, active_event=None) # ------------------------------------------------------------------ # step # ------------------------------------------------------------------ def step( self, action: RhythmAction, timeout_s: Optional[float] = None, **kwargs: Any, ) -> RhythmObservation: # Save step number before incrementing (used for history record) current_step = self._timestep slot = self._timestep % SLOTS_PER_DAY day = self._timestep // SLOTS_PER_DAY action_name = action.action_type.value # --- 1. Roll and apply event --- active_event = self._roll_event() if active_event: self._apply_event(active_event) # --- 2. Get base action effects --- effects = dict(ACTION_EFFECTS[action_name]) # --- 2b. Repetition dampening --- recent3 = [h["action"] for h in self._step_history[-3:]] repeat_count = recent3.count(action_name) if repeat_count > 0: dampening = 1.0 - 0.25 * repeat_count # 0.75, 0.50, 0.25 for meter in METERS: if effects[meter] > 0: effects[meter] *= dampening # --- 3. Apply time-of-day multipliers (SLEEP bypasses) --- if action_name != "sleep": effects = self._apply_time_multipliers(effects, slot) # Snapshot expected effects here — after time/dampening but BEFORE profile # modifiers. The anomaly = actual_delta - expected gives the agent a direct # per-step fingerprint of the hidden profile modifier. expected_no_profile = dict(effects) # --- 4. Apply profile modifiers --- effects = self._apply_profile_modifiers(effects, action_name, slot) # --- 5. Apply global vitality factor (low vitality reduces positive effects) --- vitality_factor = 0.5 + 0.5 * self._vitality for meter in METERS: if meter != "vitality" and effects[meter] > 0: effects[meter] *= vitality_factor # Apply same vitality factor to expected for fair anomaly comparison for meter in METERS: if meter != "vitality" and expected_no_profile[meter] > 0: expected_no_profile[meter] *= vitality_factor # --- 6. Apply passive decays --- self._apply_passive_decays() # --- 7. Update meters and track deltas --- deltas: Dict[str, float] = {} for meter in METERS: old_val = getattr(self, f"_{meter}") new_val = max(0.0, min(1.0, old_val + effects[meter])) deltas[meter] = new_val - old_val setattr(self, f"_{meter}", new_val) # --- 8. Compute reward --- reward = self._compute_reward(deltas) # --- 9. Check critical thresholds --- for meter in METERS: if getattr(self, f"_{meter}") < CRITICAL_THRESHOLD: reward += CRITICAL_PENALTY self._crash_count += 1 # Clamp reward reward = max(-3.0, min(3.0, round(reward, 4))) self._total_reward += reward self._step_rewards.append(reward) # --- 10. Advance timestep --- self._timestep += 1 new_day = self._timestep // SLOTS_PER_DAY new_slot = self._timestep % SLOTS_PER_DAY # --- 11. Check done --- done = self._timestep >= MAX_STEPS # --- 12. Build reward breakdown --- # Includes: per-meter deltas, per-meter anomalies (actual - expected), # event flag, and final_score on the last step. reward_breakdown: Dict[str, float] = {} for meter in METERS: reward_breakdown[f"{meter}_delta"] = round(deltas[meter], 4) reward_breakdown[f"{meter}_anomaly"] = round( deltas[meter] - expected_no_profile[meter], 4 ) if active_event: reward_breakdown["event"] = 1.0 # --- 13. Grade if done --- if done: final_score = self._grade_episode() reward_breakdown["final_score"] = round(final_score, 4) # Sparse terminal reward: directly supervise on grader final_score. # Centered on 0.5 (the "average" episode), scaled by 5x to give a # range of [-2.5, +2.5] — strong enough to dominate any local # reward-hack the agent might find on per-step shaping alone. terminal_bonus = (final_score - 0.5) * 5.0 reward = max(-3.0, min(3.0, reward + terminal_bonus)) self._total_reward += terminal_bonus # update tracking too reward_breakdown["terminal_bonus"] = round(terminal_bonus, 4) # --- 14. Update state --- self._state.step_count = self._timestep self._state.timestep = self._timestep self._state.day = new_day self._state.slot = new_slot self._state.vitality = round(self._vitality, 4) self._state.cognition = round(self._cognition, 4) self._state.progress = round(self._progress, 4) self._state.serenity = round(self._serenity, 4) self._state.connection = round(self._connection, 4) self._state.active_event = active_event # --- 15. Append completed step to rolling history --- # History entries carry per-meter anomalies (actual − expected_under_neutral). # The prompt builder reads these directly to surface the agent's clearest # profile-inference signal. self._step_history.append({ "step": current_step, "action": action_name, "reward": reward, "vitality_delta": round(deltas["vitality"], 4), "cognition_delta": round(deltas["cognition"], 4), "progress_delta": round(deltas["progress"], 4), "serenity_delta": round(deltas["serenity"], 4), "connection_delta": round(deltas["connection"], 4), "vitality_anomaly": round(deltas["vitality"] - expected_no_profile["vitality"], 4), "cognition_anomaly": round(deltas["cognition"] - expected_no_profile["cognition"], 4), "progress_anomaly": round(deltas["progress"] - expected_no_profile["progress"], 4), "serenity_anomaly": round(deltas["serenity"] - expected_no_profile["serenity"], 4), "connection_anomaly": round(deltas["connection"] - expected_no_profile["connection"], 4), }) if len(self._step_history) > HISTORY_LENGTH: self._step_history.pop(0) return self._make_observation( reward=reward, done=done, active_event=active_event, reward_breakdown=reward_breakdown, deltas=deltas, last_action=action_name, ) # ------------------------------------------------------------------ # state property # ------------------------------------------------------------------ @property def state(self) -> RhythmState: return self._state def get_belief_target(self) -> List[float]: """Return the 3-dim ground-truth belief vector for the active profile. Used during training to compute belief-accuracy reward (Phase 3). Privileged information — not exposed via observation. """ return profile_to_belief_vector(self._profile) def record_belief(self, belief: List[float]) -> None: """Record the agent's emitted belief for the current step. The grader (`_grade_episode`) uses the LAST recorded belief to compute the belief_accuracy component of final_score. Callers should invoke this once per step after parsing the agent's completion. Heuristic / random baselines that don't emit a belief never call this, and the belief component scores 0 for them — that's intentional: the meta-RL skill is INFERENCE, and only agents that actually try get credit. """ if len(belief) != 3: raise ValueError(f"belief must have 3 elements, got {len(belief)}") self._final_belief = [max(0.0, min(1.0, float(b))) for b in belief] def get_profile_hint(self) -> Dict[str, float]: """Return a coarse profile hint usable in observation during curriculum. Returns the 3-dim belief vector with descriptive keys. The dataset generator passes this into the prompt for the fraction of samples with show_profile_hint=True (the curriculum's "visible" warmup phase). """ b = profile_to_belief_vector(self._profile) return {"social_pref": round(b[0], 3), "morning_pref": round(b[1], 3), "work_pref": round(b[2], 3)} # ------------------------------------------------------------------ # Internal helpers # ------------------------------------------------------------------ def _roll_event(self) -> Optional[str]: """Roll for a random event this step.""" if self._rng.random() < EVENT_PROBABILITY: return self._rng.choice(EVENT_NAMES) return None def _apply_event(self, event_name: str) -> None: """Apply event effects to meters, modified by profile.""" effects = EVENT_EFFECTS[event_name] multiplier = self._profile["event_impact_multiplier"] for meter in METERS: delta = effects[meter] # Only apply multiplier to negative effects if delta < 0: delta *= multiplier old_val = getattr(self, f"_{meter}") new_val = max(0.0, min(1.0, old_val + delta)) setattr(self, f"_{meter}", new_val) def _apply_time_multipliers( self, effects: Dict[str, float], slot: int ) -> Dict[str, float]: """Apply time-of-day multipliers to action effects.""" multipliers = TIME_MULTIPLIERS[slot] for meter in effects: if meter == "cognition" and effects[meter] > 0: effects[meter] *= multipliers["cognition_gain"] elif meter == "vitality" and effects[meter] < 0: effects[meter] *= multipliers["vitality_drain"] return effects def _apply_profile_modifiers( self, effects: Dict[str, float], action_name: str, slot: int ) -> Dict[str, float]: """Apply hidden profile modifiers to action effects.""" profile = self._profile # Social vitality drain multiplier if action_name in SOCIAL_ACTIONS and effects["vitality"] < 0: effects["vitality"] *= profile["social_vitality_multiplier"] # Introvert morning cognition bonus bonus = profile.get("morning_cognition_bonus") if bonus and slot == 0: if effects["cognition"] > 0: effects["cognition"] *= bonus if effects["progress"] > 0: effects["progress"] *= bonus # Extrovert evening/night cognition bonus bonus = profile.get("evening_night_cognition_bonus") if bonus and slot in (2, 3): if effects["cognition"] > 0: effects["cognition"] *= bonus if effects["progress"] > 0: effects["progress"] *= bonus # Extrovert morning penalty penalty = profile.get("morning_penalty") if penalty and slot == 0: if effects["cognition"] > 0: effects["cognition"] *= penalty if effects["progress"] > 0: effects["progress"] *= penalty # Binge shame spiral if profile.get("binge_shame") and action_name == "binge_watch": effects["serenity"] -= 0.15 effects["cognition"] -= 0.06 # Workaholic: progress-producing actions give serenity bonus psb = profile.get("progress_serenity_bonus", 0.0) if psb > 0 and effects["progress"] > 0: effects["serenity"] += psb # Workaholic: idle actions drain serenity isd = profile.get("idle_serenity_decay", 0.0) if isd > 0 and action_name in IDLE_ACTIONS: effects["serenity"] -= isd # Solo recharge: introvert gets serenity from alone time ssb = profile.get("solo_serenity_bonus", 0.0) if ssb > 0 and action_name in ("me_time", "meditate"): effects["serenity"] += ssb # Social connection multiplier: extrovert gets more connection from socializing scm = profile.get("social_connection_multiplier", 1.0) if scm != 1.0 and action_name in SOCIAL_ACTIONS and effects["connection"] > 0: effects["connection"] *= scm # Social serenity bonus: extrovert gets serenity from socializing ssrb = profile.get("social_serenity_bonus", 0.0) if ssrb > 0 and action_name in SOCIAL_ACTIONS: effects["serenity"] += ssrb # Work vitality recovery: workaholic gets vitality from productive work wvr = profile.get("work_vitality_recovery", 0.0) if wvr > 0 and action_name in WORK_ACTIONS: effects["vitality"] += wvr # Low serenity amplification (stress spiral) if self._serenity < profile.get("stress_tolerance", 0.3): for meter in effects: if effects[meter] < 0: effects[meter] *= 1.3 return effects def _apply_passive_decays(self) -> None: """Apply per-step passive meter decays.""" # Connection always decays if not actively maintained decay = self._profile["connection_decay_rate"] self._connection = max(0.0, self._connection - decay) # Workaholic extra vitality decay vd = self._profile.get("vitality_decay_rate", 0.0) if vd > 0: self._vitality = max(0.0, self._vitality - vd) def _compute_reward(self, deltas: Dict[str, float]) -> float: """Pure profile-weighted per-step reward. Deliberately uncontaminated: the grader-aligned bias (progress + connection deltas) lives in the TRAINING reward function in reward_functions.py, not here. Keeping the env's per-step reward pure means (1) the agent's inference signal stays a clean function of the hidden profile_weights, (2) the grader's adaptation_score isn't computed on biased rewards, and (3) the env's reward matches what an honest deployment would surface to the agent. """ weights = self._profile["reward_weights"] return sum(deltas[m] * weights[m] for m in METERS) * REWARD_SCALE def _grade_episode(self) -> float: """ Compute final episode score in [0, 1]. Components (meta-learning aligned): 0.15 — crash_free_ratio: no critical meter drops 0.20 — progress: career/skill growth 0.10 — connection: relationship maintained 0.25 — adaptation_score: agent got better as it learned the user 0.10 — efficiency: bounded normalized average reward 0.20 — belief_accuracy: how close last-emitted belief is to true profile DESIGN NOTE — Acknowledged conformance gap with OpenEnv: This grader is functionally equivalent to a `WeightedSum` Rubric (from `openenv.core.rubrics`) over 6 child Rubrics — same composability, same independent components, same explicit weights. We did not refactor to use the Rubric class literal because the grader reads aggregated episode-end state (per-step rewards buffer, crash_count, terminal belief) while OpenEnv's `Rubric.forward` expects per-(action, observation) inputs. A clean refactor would use `TrajectoryRubric` for the cumulative components and the per-step `Rubric` for crash_free / belief_accuracy. Tracked as a v2 cleanup task; not blocking on the meta-RL skill we're evaluating. Implementation: composes 6 `Rubric` subclasses via OpenEnv's `WeightedSum` (see `server/rubrics.py`). Each sub-rubric reads the aggregated episode state (`_step_rewards`, `_crash_count`, `_final_belief`, `_profile`) of the env it was built with — RFC 004's recommended pattern for trajectory-summary scoring on top of the per-(action, observation) Rubric ABC. belief_accuracy is the explicit meta-RL inference signal: an agent that doesn't emit a belief scores 0 here, an agent emitting a belief close to the hidden profile vector scores up to 1. Without this term, agents that play heuristic-style "keep meters healthy" score the same as agents that actually infer the profile, since the other components don't differentiate inference from reflex. """ from server.rubrics import make_grade_rubric # Build (or reuse) the composed rubric. The Rubric subclasses are # stateless once built — they read live env state at forward() # time — so caching is safe. if self._grade_rubric is None: self._grade_rubric = make_grade_rubric(self) # forward(action, observation) — args are unused for episode-end # scoring; the rubric reads from `self`. score = self._grade_rubric(action=None, observation=None) return max(0.0, min(1.0, float(score))) def _make_observation( self, reward: float, done: bool, active_event: Optional[str], reward_breakdown: Optional[Dict[str, float]] = None, deltas: Optional[Dict[str, float]] = None, last_action: Optional[str] = None, ) -> RhythmObservation: """Build the observation returned to the agent (hides profile).""" step_records = [ StepRecord( step=h["step"], action=h["action"], reward=h["reward"], vitality_delta=h["vitality_delta"], cognition_delta=h["cognition_delta"], progress_delta=h["progress_delta"], serenity_delta=h["serenity_delta"], connection_delta=h["connection_delta"], vitality_anomaly=h.get("vitality_anomaly", 0.0), cognition_anomaly=h.get("cognition_anomaly", 0.0), progress_anomaly=h.get("progress_anomaly", 0.0), serenity_anomaly=h.get("serenity_anomaly", 0.0), connection_anomaly=h.get("connection_anomaly", 0.0), ) for h in self._step_history ] return RhythmObservation( timestep=self._timestep, day=self._timestep // SLOTS_PER_DAY, slot=self._timestep % SLOTS_PER_DAY, vitality=round(self._vitality, 4), cognition=round(self._cognition, 4), progress=round(self._progress, 4), serenity=round(self._serenity, 4), connection=round(self._connection, 4), active_event=active_event, remaining_steps=MAX_STEPS - self._timestep, reward_breakdown=reward_breakdown or {}, reward=reward, done=done, # First-class delta fields (from this step; zero on reset) vitality_delta=round(deltas["vitality"], 4) if deltas else 0.0, cognition_delta=round(deltas["cognition"], 4) if deltas else 0.0, progress_delta=round(deltas["progress"], 4) if deltas else 0.0, serenity_delta=round(deltas["serenity"], 4) if deltas else 0.0, connection_delta=round(deltas["connection"], 4) if deltas else 0.0, last_action=last_action, # Rolling history of the last HISTORY_LENGTH completed steps step_history=step_records, )