# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
"""
RhythmEnv Life Simulator β Environment Implementation.
A holistic life resource management RL environment. The agent balances
5 life meters across a 7-day week (28 steps) while hidden personality
profiles secretly control how actions affect meters and how reward is
computed. The agent must discover these hidden dynamics through experience.
1 episode = 1 week, 1 step = 1 time slot (4 per day), 28 steps total.
Key design principles for learnability:
- step_history: last 7 steps of (action, reward, deltas) are included
in every observation so the agent can detect personality anomalies
- *_anomaly fields: per-meter deviation from neutral-profile expectation,
giving a direct fingerprint of the hidden profile each step
- adaptation_score: 30% of final grade β late-half mean per-step reward
minus early-half mean (gated by absolute late-half quality). Rewards
the agent for getting better as it learns the user.
- Profile assignment uses a scrambled seed to prevent memorization
of seed β profile mappings during training
"""
import random
from copy import deepcopy
from typing import Any, Dict, List, Optional
from uuid import uuid4

from openenv.core.env_server import Environment
from openenv.core.env_server.types import EnvironmentMetadata

try:
    from ..models import ActionType, RhythmAction, RhythmObservation, RhythmState, StepRecord
except (ImportError, ModuleNotFoundError):
    from models import ActionType, RhythmAction, RhythmObservation, RhythmState, StepRecord

# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
MAX_STEPS = 28
DAYS = 7
SLOTS_PER_DAY = 4
SLOT_NAMES = ["morning", "afternoon", "evening", "night"]
METERS = ["vitality", "cognition", "progress", "serenity", "connection"]
EVENT_PROBABILITY = 0.08
CRITICAL_THRESHOLD = 0.1
CRITICAL_PENALTY = -0.3
REWARD_SCALE = 15.0
HISTORY_LENGTH = 7 # number of past steps included in every observation
# ---------------------------------------------------------------------------
# Action-Effect Matrix (base deltas per action on each meter)
# ---------------------------------------------------------------------------
ACTION_EFFECTS: Dict[str, Dict[str, float]] = {
    "deep_work": {"vitality": -0.12, "cognition": -0.10, "progress": 0.18, "serenity": -0.05, "connection": 0.00},
    "admin_work": {"vitality": -0.06, "cognition": -0.05, "progress": 0.08, "serenity": -0.03, "connection": 0.00},
    "learn": {"vitality": -0.08, "cognition": -0.08, "progress": 0.12, "serenity": 0.02, "connection": 0.00},
    "sleep": {"vitality": 0.20, "cognition": 0.10, "progress": 0.00, "serenity": 0.05, "connection": 0.00},
    "exercise": {"vitality": 0.12, "cognition": 0.05, "progress": 0.00, "serenity": 0.08, "connection": 0.00},
    "meditate": {"vitality": 0.03, "cognition": 0.08, "progress": 0.00, "serenity": 0.15, "connection": 0.00},
    "family_time": {"vitality": -0.04, "cognition": -0.02, "progress": 0.00, "serenity": 0.06, "connection": 0.15},
    "socialize": {"vitality": -0.06, "cognition": -0.03, "progress": 0.00, "serenity": 0.04, "connection": 0.12},
    "me_time": {"vitality": 0.05, "cognition": 0.03, "progress": 0.00, "serenity": 0.10, "connection": -0.02},
    "binge_watch": {"vitality": 0.02, "cognition": -0.05, "progress": -0.02, "serenity": 0.06, "connection": -0.03},
}
# ---------------------------------------------------------------------------
# Time-of-Day Multipliers
# ---------------------------------------------------------------------------
TIME_MULTIPLIERS: Dict[int, Dict[str, float]] = {
    0: {"cognition_gain": 1.2, "vitality_drain": 0.8},  # Morning
    1: {"cognition_gain": 1.0, "vitality_drain": 1.0},  # Afternoon
    2: {"cognition_gain": 0.8, "vitality_drain": 1.1},  # Evening
    3: {"cognition_gain": 0.6, "vitality_drain": 1.3},  # Night
}
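# Worked example (illustrative): "deep_work" at night (slot 3) has its
# vitality cost scaled by the night drain, -0.12 * 1.3 = -0.156, while its
# progress gain (0.18) is untouched: the multipliers act only on positive
# cognition effects and negative vitality effects (see _apply_time_multipliers).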
# ---------------------------------------------------------------------------
# Random Events
# ---------------------------------------------------------------------------
EVENT_EFFECTS: Dict[str, Dict[str, float]] = {
    "prod_crash": {"vitality": -0.08, "cognition": -0.10, "progress": -0.10, "serenity": -0.15, "connection": 0.00},
    "family_emergency": {"vitality": -0.05, "cognition": -0.08, "progress": 0.00, "serenity": -0.12, "connection": -0.10},
    "illness": {"vitality": -0.20, "cognition": -0.10, "progress": 0.00, "serenity": -0.05, "connection": 0.00},
    "good_news": {"vitality": 0.05, "cognition": 0.03, "progress": 0.00, "serenity": 0.10, "connection": 0.05},
}
EVENT_NAMES: List[str] = list(EVENT_EFFECTS.keys())
# ---------------------------------------------------------------------------
# Hidden Personality Profiles
# ---------------------------------------------------------------------------
PROFILES: List[Dict[str, Any]] = [
    {
        "name": "introvert_morning",
        "social_vitality_multiplier": 3.0,
        "morning_cognition_bonus": 2.0,
        "evening_night_cognition_bonus": None,
        "morning_penalty": None,
        "binge_shame": True,
        "progress_serenity_bonus": 0.0,
        "idle_serenity_decay": 0.0,
        "vitality_decay_rate": 0.0,
        "stress_tolerance": 0.3,
        "event_impact_multiplier": 1.0,
        "connection_decay_rate": 0.01,
        "solo_serenity_bonus": 0.10,
        "social_connection_multiplier": 1.0,
        "social_serenity_bonus": 0.0,
        "work_vitality_recovery": 0.0,
        "initial_meters": {
            "vitality": 0.7, "cognition": 0.7, "progress": 0.0,
            "serenity": 0.7, "connection": 0.5,
        },
        "reward_weights": {
            "vitality": 0.05, "cognition": 0.05, "progress": 0.20,
            "serenity": 0.60, "connection": 0.10,
        },
    },
    {
        "name": "extrovert_night_owl",
        "social_vitality_multiplier": 0.2,
        "morning_cognition_bonus": None,
        "evening_night_cognition_bonus": 1.8,
        "morning_penalty": 0.4,
        "binge_shame": False,
        "progress_serenity_bonus": 0.0,
        "idle_serenity_decay": 0.0,
        "vitality_decay_rate": 0.0,
        "stress_tolerance": 0.2,
        "event_impact_multiplier": 0.8,
        "connection_decay_rate": 0.01,
        "solo_serenity_bonus": 0.0,
        "social_connection_multiplier": 2.0,
        "social_serenity_bonus": 0.06,
        "work_vitality_recovery": 0.0,
        "initial_meters": {
            "vitality": 0.7, "cognition": 0.7, "progress": 0.0,
            "serenity": 0.7, "connection": 0.5,
        },
        "reward_weights": {
            "vitality": 0.05, "cognition": 0.05, "progress": 0.10,
            "serenity": 0.05, "connection": 0.75,
        },
    },
    {
        "name": "workaholic_stoic",
        "social_vitality_multiplier": 1.0,
        "morning_cognition_bonus": None,
        "evening_night_cognition_bonus": None,
        "morning_penalty": None,
        "binge_shame": False,
        "progress_serenity_bonus": 0.10,
        "idle_serenity_decay": 0.10,
        "vitality_decay_rate": 0.04,
        "stress_tolerance": 0.15,
        "event_impact_multiplier": 0.5,
        "connection_decay_rate": 0.02,
        "solo_serenity_bonus": 0.0,
        "social_connection_multiplier": 1.0,
        "social_serenity_bonus": 0.0,
        "work_vitality_recovery": 0.06,
        "initial_meters": {
            "vitality": 0.7, "cognition": 0.7, "progress": 0.0,
            "serenity": 0.7, "connection": 0.5,
        },
        "reward_weights": {
            "vitality": 0.05, "cognition": 0.05, "progress": 0.70,
            "serenity": 0.10, "connection": 0.10,
        },
    },
]
PROFILE_MAP: Dict[str, Dict[str, Any]] = {p["name"]: p for p in PROFILES}
# Social actions for modifier checks
SOCIAL_ACTIONS = {"family_time", "socialize"}
IDLE_ACTIONS = {"me_time", "binge_watch", "sleep"}
WORK_ACTIONS = {"deep_work", "learn", "admin_work"}
# ---------------------------------------------------------------------------
# Continuous profile sampling (meta-RL training distribution)
# ---------------------------------------------------------------------------
#
# Hardcoded profiles are 3 fixed personalities: memorizable, classification-like.
# Continuous sampling draws profile parameters from distributions per episode,
# making memorization impossible and forcing the agent to learn the *skill* of
# profile inference. This is the core meta-learning move.
#
# Belief vector dimensions (for Phase 3 cosine-similarity reward):
#   social_pref:  0 = hates social (introvert), 1 = loves social (extrovert)
#   morning_pref: 0 = night owl, 1 = morning person
#   work_pref:    0 = work-averse, 1 = workaholic
def sample_profile(seed: int) -> Dict[str, Any]:
    """Sample a continuous profile deterministically from a seed.

    Reward weights are drawn from a Dirichlet biased toward non-infrastructure
    meters (progress, serenity, connection). Per-action modifiers are drawn
    from bounded uniforms so any sampled profile is playable.
    """
    rng = random.Random(seed ^ 0xA3C5F729)
    # Reward weights via Dirichlet (alpha < 1 produces sparse weights, i.e. "personalities")
    alphas = [0.5, 0.5, 1.5, 1.5, 1.5]  # [vit, cog, prog, ser, conn]
    raw = [rng.gammavariate(a, 1.0) for a in alphas]
    total = sum(raw)
    weights = [w / total for w in raw]
    # Cap each weight at 0.45 so every sampled profile weights 3+ meters
    # meaningfully. With a 0.80 cap, single-meter-dominant profiles let
    # SLEEP-spam (or any single recovery action) be optimal; the env wasn't
    # lying, the agent was right to spam. Forcing balance makes belief
    # inference matter for action selection.
    weights = [max(0.05, min(0.45, w)) for w in weights]
    total = sum(weights)
    weights = [w / total for w in weights]
    return {
        "name": f"sampled_{seed}",
        "social_vitality_multiplier": rng.uniform(0.2, 3.0),
        "morning_cognition_bonus": rng.uniform(0.4, 2.0) if rng.random() < 0.5 else None,
        "evening_night_cognition_bonus": rng.uniform(0.6, 1.8) if rng.random() < 0.5 else None,
        "morning_penalty": rng.uniform(0.4, 0.9) if rng.random() < 0.3 else None,
        "binge_shame": rng.random() < 0.5,
        "progress_serenity_bonus": rng.uniform(0.0, 0.10),
        "idle_serenity_decay": rng.uniform(0.0, 0.10),
        "vitality_decay_rate": rng.uniform(0.0, 0.04),
        "stress_tolerance": rng.uniform(0.15, 0.30),
        "event_impact_multiplier": rng.uniform(0.5, 1.0),
        "connection_decay_rate": rng.uniform(0.005, 0.02),
        "solo_serenity_bonus": rng.uniform(0.0, 0.10),
        "social_connection_multiplier": rng.uniform(1.0, 2.0),
        "social_serenity_bonus": rng.uniform(0.0, 0.06),
        "work_vitality_recovery": rng.uniform(0.0, 0.06),
        "initial_meters": {
            "vitality": 0.7, "cognition": 0.7, "progress": 0.0,
            "serenity": 0.7, "connection": 0.5,
        },
        "reward_weights": dict(zip(METERS, weights)),
    }
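
# Minimal usage sketch (illustrative): sampling is a pure function of the
# seed, so equal seeds reproduce equal profiles, and the capped Dirichlet
# weights are renormalized to sum to 1:
#
#     p = sample_profile(42)
#     assert p == sample_profile(42)  # deterministic across calls
#     assert abs(sum(p["reward_weights"].values()) - 1.0) < 1e-9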


def profile_to_belief_vector(profile: Dict[str, Any]) -> List[float]:
    """Reduce a profile to a 3-dim trait vector [social, morning, work] in [0, 1].

    Used as the ground-truth target for the agent's belief output (Phase 3).
    Aggregates the most-diagnostic modifiers per trait.
    """
    # social_pref: low if social drains vitality a lot, high if social bonuses are big
    # social_vitality_multiplier in [0.2, 3.0]: lower = more extroverted
    sm = profile.get("social_vitality_multiplier", 1.0)
    social_drain_norm = 1.0 - max(0.0, min(1.0, (sm - 0.2) / 2.8))  # invert
    scm = profile.get("social_connection_multiplier", 1.0)
    social_conn_norm = max(0.0, min(1.0, (scm - 1.0) / 1.0))
    ssb = profile.get("social_serenity_bonus", 0.0)
    social_ser_norm = max(0.0, min(1.0, ssb / 0.06))
    social_pref = 0.5 * social_drain_norm + 0.3 * social_conn_norm + 0.2 * social_ser_norm
    # morning_pref: high if a morning bonus exists, low if a morning penalty exists
    mcb = profile.get("morning_cognition_bonus")
    mp = profile.get("morning_penalty")
    morning_pref = 0.5
    if mcb is not None:
        morning_pref = 0.5 + 0.5 * max(0.0, min(1.0, (mcb - 0.4) / 1.6))
    if mp is not None:
        morning_pref = min(morning_pref, 0.5 - 0.5 * (1.0 - mp))
    # work_pref: high if work recovers vitality + progress gives serenity + progress weight high
    wvr = profile.get("work_vitality_recovery", 0.0)
    wvr_norm = max(0.0, min(1.0, wvr / 0.06))
    psb = profile.get("progress_serenity_bonus", 0.0)
    psb_norm = max(0.0, min(1.0, psb / 0.10))
    pw = profile.get("reward_weights", {}).get("progress", 0.2)
    pw_norm = max(0.0, min(1.0, (pw - 0.05) / 0.65))
    work_pref = 0.4 * wvr_norm + 0.3 * psb_norm + 0.3 * pw_norm
    return [
        max(0.0, min(1.0, social_pref)),
        max(0.0, min(1.0, morning_pref)),
        max(0.0, min(1.0, work_pref)),
    ]
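
# Worked example (computed from the hardcoded profiles above, approximate):
# PROFILES[0] ("introvert_morning") has social_vitality_multiplier=3.0 (max
# social drain, so social_drain_norm=0.0) and morning_cognition_bonus=2.0
# (top of the range, so morning_pref=1.0); with no work-side modifiers and a
# 0.20 progress weight it maps to roughly [0.0, 1.0, 0.07].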


class RhythmEnvironment(Environment):
    """
    Life Simulator RL Environment.

    The agent manages 5 life meters (Vitality, Cognition, Progress, Serenity,
    Connection) across a 7-day week. Hidden personality profiles control how
    actions affect meters and how reward is computed. The agent must discover
    these hidden dynamics through experience.

    Every observation includes:
    - Current meter values and temporal context
    - Last step's per-meter deltas as first-class fields
    - Anomaly signals: actual delta minus the neutral-profile expectation
    - Rolling step_history (last 7 steps) with actions, rewards, deltas

    The final grade rewards profile-appropriate strategy via adaptation_score
    (25% of the grade): late-half mean per-step reward minus early-half mean.
    """

    SUPPORTS_CONCURRENT_SESSIONS: bool = True

    def __init__(self) -> None:
        super().__init__()
        self._state = RhythmState()
        self._rng = random.Random(0)
        self._profile: Dict[str, Any] = PROFILES[0]
        # Meters
        self._vitality: float = 0.8
        self._cognition: float = 0.7
        self._progress: float = 0.0
        self._serenity: float = 0.7
        self._connection: float = 0.5
        # Tracking
        self._timestep: int = 0
        self._crash_count: int = 0
        self._total_reward: float = 0.0
        self._step_history: list = []
        self._step_rewards: list = []  # per-step rewards (for adaptation_score in grader)
        # Latest emitted belief vector: set by callers via record_belief() and
        # consumed by _grade_episode. Stays None if the agent never emits a
        # belief (e.g. a heuristic baseline); that case scores 0 on the belief
        # component.
        self._final_belief: Optional[List[float]] = None
        # Lazy-built composed Rubric for episode grading. None until the first
        # `done=True` step; built once per env instance and reused across
        # episodes.
        self._grade_rubric: Optional[Any] = None

    def get_metadata(self) -> EnvironmentMetadata:
        return EnvironmentMetadata(
            name="RhythmEnv",
            description=(
                "Life Simulator: a holistic resource management RL environment "
                "where an agent balances 5 life meters across a 7-day week "
                "with hidden personality profiles."
            ),
            version="0.3.0",
        )

    # ------------------------------------------------------------------
    # reset
    # ------------------------------------------------------------------
    def reset(
        self,
        seed: Optional[int] = None,
        episode_id: Optional[str] = None,
        **kwargs: Any,
    ) -> RhythmObservation:
        # Determine seed
        if seed is not None:
            effective_seed = seed
        else:
            effective_seed = hash(episode_id or str(uuid4())) & 0x7FFFFFFF
        self._rng = random.Random(effective_seed)
        # Profile selection, two modes:
        #   1. Explicit hardcoded profile name -> one of the 3 reference profiles
        #      (used by tests + the legacy 3-profile eval condition)
        #   2. Default -> sampled continuous profile (meta-RL training distribution)
        profile_name = kwargs.get("profile")
        if profile_name and profile_name in PROFILE_MAP:
            self._profile = deepcopy(PROFILE_MAP[profile_name])
        else:
            self._profile = sample_profile(effective_seed)
        # Initialize meters from profile defaults
        initial = self._profile["initial_meters"]
        self._vitality = initial["vitality"]
        self._cognition = initial["cognition"]
        self._progress = initial["progress"]
        self._serenity = initial["serenity"]
        self._connection = initial["connection"]
        # Reset tracking
        self._timestep = 0
        self._crash_count = 0
        self._total_reward = 0.0
        self._step_history = []
        self._step_rewards = []
        self._final_belief = None
        self._state = RhythmState(
            episode_id=episode_id or str(uuid4()),
            step_count=0,
            profile_name=self._profile["name"],
            timestep=0,
            day=0,
            slot=0,
            vitality=self._vitality,
            cognition=self._cognition,
            progress=self._progress,
            serenity=self._serenity,
            connection=self._connection,
        )
        return self._make_observation(reward=0.0, done=False, active_event=None)

    # ------------------------------------------------------------------
    # step
    # ------------------------------------------------------------------
    def step(
        self,
        action: RhythmAction,
        timeout_s: Optional[float] = None,
        **kwargs: Any,
    ) -> RhythmObservation:
        # Save step number before incrementing (used for the history record)
        current_step = self._timestep
        slot = self._timestep % SLOTS_PER_DAY
        day = self._timestep // SLOTS_PER_DAY
        action_name = action.action_type.value
        # --- 1. Roll and apply event ---
        active_event = self._roll_event()
        if active_event:
            self._apply_event(active_event)
        # --- 2. Get base action effects ---
        effects = dict(ACTION_EFFECTS[action_name])
        # --- 2b. Repetition dampening ---
        recent3 = [h["action"] for h in self._step_history[-3:]]
        repeat_count = recent3.count(action_name)
        if repeat_count > 0:
            dampening = 1.0 - 0.25 * repeat_count  # 0.75, 0.50, 0.25
            for meter in METERS:
                if effects[meter] > 0:
                    effects[meter] *= dampening
        # --- 3. Apply time-of-day multipliers (SLEEP bypasses) ---
        if action_name != "sleep":
            effects = self._apply_time_multipliers(effects, slot)
        # Snapshot expected effects here, after time/dampening but BEFORE
        # profile modifiers. The anomaly (actual_delta - expected) gives the
        # agent a direct per-step fingerprint of the hidden profile modifier.
        expected_no_profile = dict(effects)
        # --- 4. Apply profile modifiers ---
        effects = self._apply_profile_modifiers(effects, action_name, slot)
        # --- 5. Apply global vitality factor (low vitality reduces positive effects) ---
        vitality_factor = 0.5 + 0.5 * self._vitality
        for meter in METERS:
            if meter != "vitality" and effects[meter] > 0:
                effects[meter] *= vitality_factor
        # Apply the same vitality factor to the expectation for a fair anomaly comparison
        for meter in METERS:
            if meter != "vitality" and expected_no_profile[meter] > 0:
                expected_no_profile[meter] *= vitality_factor
        # --- 6. Apply passive decays ---
        self._apply_passive_decays()
        # --- 7. Update meters and track deltas ---
        deltas: Dict[str, float] = {}
        for meter in METERS:
            old_val = getattr(self, f"_{meter}")
            new_val = max(0.0, min(1.0, old_val + effects[meter]))
            deltas[meter] = new_val - old_val
            setattr(self, f"_{meter}", new_val)
        # --- 8. Compute reward ---
        reward = self._compute_reward(deltas)
        # --- 9. Check critical thresholds ---
        for meter in METERS:
            if getattr(self, f"_{meter}") < CRITICAL_THRESHOLD:
                reward += CRITICAL_PENALTY
                self._crash_count += 1
        # Clamp reward
        reward = max(-3.0, min(3.0, round(reward, 4)))
        self._total_reward += reward
        self._step_rewards.append(reward)
        # --- 10. Advance timestep ---
        self._timestep += 1
        new_day = self._timestep // SLOTS_PER_DAY
        new_slot = self._timestep % SLOTS_PER_DAY
        # --- 11. Check done ---
        done = self._timestep >= MAX_STEPS
        # --- 12. Build reward breakdown ---
        # Includes: per-meter deltas, per-meter anomalies (actual - expected),
        # event flag, and final_score on the last step.
        reward_breakdown: Dict[str, float] = {}
        for meter in METERS:
            reward_breakdown[f"{meter}_delta"] = round(deltas[meter], 4)
            reward_breakdown[f"{meter}_anomaly"] = round(
                deltas[meter] - expected_no_profile[meter], 4
            )
        if active_event:
            reward_breakdown["event"] = 1.0
        # --- 13. Grade if done ---
        if done:
            final_score = self._grade_episode()
            reward_breakdown["final_score"] = round(final_score, 4)
            # Sparse terminal reward: directly supervise on the grader's
            # final_score. Centered on 0.5 (the "average" episode), scaled by
            # 5x to give a range of [-2.5, +2.5]: strong enough to dominate
            # any local reward-hack the agent might find on per-step shaping
            # alone.
            terminal_bonus = (final_score - 0.5) * 5.0
            reward = max(-3.0, min(3.0, reward + terminal_bonus))
            self._total_reward += terminal_bonus  # update tracking too
            reward_breakdown["terminal_bonus"] = round(terminal_bonus, 4)
        # --- 14. Update state ---
        self._state.step_count = self._timestep
        self._state.timestep = self._timestep
        self._state.day = new_day
        self._state.slot = new_slot
        self._state.vitality = round(self._vitality, 4)
        self._state.cognition = round(self._cognition, 4)
        self._state.progress = round(self._progress, 4)
        self._state.serenity = round(self._serenity, 4)
        self._state.connection = round(self._connection, 4)
        self._state.active_event = active_event
        # --- 15. Append completed step to rolling history ---
        # History entries carry per-meter anomalies (actual - expected_under_neutral).
        # The prompt builder reads these directly to surface the agent's clearest
        # profile-inference signal.
        self._step_history.append({
            "step": current_step,
            "action": action_name,
            "reward": reward,
            "vitality_delta": round(deltas["vitality"], 4),
            "cognition_delta": round(deltas["cognition"], 4),
            "progress_delta": round(deltas["progress"], 4),
            "serenity_delta": round(deltas["serenity"], 4),
            "connection_delta": round(deltas["connection"], 4),
            "vitality_anomaly": round(deltas["vitality"] - expected_no_profile["vitality"], 4),
            "cognition_anomaly": round(deltas["cognition"] - expected_no_profile["cognition"], 4),
            "progress_anomaly": round(deltas["progress"] - expected_no_profile["progress"], 4),
            "serenity_anomaly": round(deltas["serenity"] - expected_no_profile["serenity"], 4),
            "connection_anomaly": round(deltas["connection"] - expected_no_profile["connection"], 4),
        })
        if len(self._step_history) > HISTORY_LENGTH:
            self._step_history.pop(0)
        return self._make_observation(
            reward=reward,
            done=done,
            active_event=active_event,
            reward_breakdown=reward_breakdown,
            deltas=deltas,
            last_action=action_name,
        )

    # ------------------------------------------------------------------
    # state property
    # ------------------------------------------------------------------
    @property
    def state(self) -> RhythmState:
        return self._state

    def get_belief_target(self) -> List[float]:
        """Return the 3-dim ground-truth belief vector for the active profile.

        Used during training to compute the belief-accuracy reward (Phase 3).
        Privileged information; not exposed via the observation.
        """
        return profile_to_belief_vector(self._profile)

    def record_belief(self, belief: List[float]) -> None:
        """Record the agent's emitted belief for the current step.

        The grader (`_grade_episode`) uses the LAST recorded belief to compute
        the belief_accuracy component of final_score. Callers should invoke
        this once per step after parsing the agent's completion. Heuristic /
        random baselines that don't emit a belief never call this, and the
        belief component scores 0 for them. That's intentional: the meta-RL
        skill is INFERENCE, and only agents that actually try get credit.
        """
        if len(belief) != 3:
            raise ValueError(f"belief must have 3 elements, got {len(belief)}")
        self._final_belief = [max(0.0, min(1.0, float(b))) for b in belief]
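
    # Illustrative call pattern for record_belief (hypothetical caller, not
    # part of this module):
    #     belief = parse_belief(completion_text)  # hypothetical parser -> e.g. [0.2, 0.9, 0.4]
    #     env.record_belief(belief)               # clamped into [0, 1] above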

    def get_profile_hint(self) -> Dict[str, float]:
        """Return a coarse profile hint usable in the observation during curriculum.

        Returns the 3-dim belief vector with descriptive keys. The dataset
        generator passes this into the prompt for the fraction of samples
        with show_profile_hint=True (the curriculum's "visible" warmup phase).
        """
        b = profile_to_belief_vector(self._profile)
        return {"social_pref": round(b[0], 3), "morning_pref": round(b[1], 3), "work_pref": round(b[2], 3)}

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    def _roll_event(self) -> Optional[str]:
        """Roll for a random event this step."""
        if self._rng.random() < EVENT_PROBABILITY:
            return self._rng.choice(EVENT_NAMES)
        return None

    def _apply_event(self, event_name: str) -> None:
        """Apply event effects to meters, modified by the profile."""
        effects = EVENT_EFFECTS[event_name]
        multiplier = self._profile["event_impact_multiplier"]
        for meter in METERS:
            delta = effects[meter]
            # Only apply the multiplier to negative effects
            if delta < 0:
                delta *= multiplier
            old_val = getattr(self, f"_{meter}")
            new_val = max(0.0, min(1.0, old_val + delta))
            setattr(self, f"_{meter}", new_val)

    def _apply_time_multipliers(
        self, effects: Dict[str, float], slot: int
    ) -> Dict[str, float]:
        """Apply time-of-day multipliers to action effects."""
        multipliers = TIME_MULTIPLIERS[slot]
        for meter in effects:
            if meter == "cognition" and effects[meter] > 0:
                effects[meter] *= multipliers["cognition_gain"]
            elif meter == "vitality" and effects[meter] < 0:
                effects[meter] *= multipliers["vitality_drain"]
        return effects

    def _apply_profile_modifiers(
        self, effects: Dict[str, float], action_name: str, slot: int
    ) -> Dict[str, float]:
        """Apply hidden profile modifiers to action effects."""
        profile = self._profile
        # Social vitality drain multiplier
        if action_name in SOCIAL_ACTIONS and effects["vitality"] < 0:
            effects["vitality"] *= profile["social_vitality_multiplier"]
        # Introvert morning cognition bonus
        bonus = profile.get("morning_cognition_bonus")
        if bonus and slot == 0:
            if effects["cognition"] > 0:
                effects["cognition"] *= bonus
            if effects["progress"] > 0:
                effects["progress"] *= bonus
        # Extrovert evening/night cognition bonus
        bonus = profile.get("evening_night_cognition_bonus")
        if bonus and slot in (2, 3):
            if effects["cognition"] > 0:
                effects["cognition"] *= bonus
            if effects["progress"] > 0:
                effects["progress"] *= bonus
        # Extrovert morning penalty
        penalty = profile.get("morning_penalty")
        if penalty and slot == 0:
            if effects["cognition"] > 0:
                effects["cognition"] *= penalty
            if effects["progress"] > 0:
                effects["progress"] *= penalty
        # Binge shame spiral
        if profile.get("binge_shame") and action_name == "binge_watch":
            effects["serenity"] -= 0.15
            effects["cognition"] -= 0.06
        # Workaholic: progress-producing actions give a serenity bonus
        psb = profile.get("progress_serenity_bonus", 0.0)
        if psb > 0 and effects["progress"] > 0:
            effects["serenity"] += psb
        # Workaholic: idle actions drain serenity
        isd = profile.get("idle_serenity_decay", 0.0)
        if isd > 0 and action_name in IDLE_ACTIONS:
            effects["serenity"] -= isd
        # Solo recharge: introvert gets serenity from alone time
        ssb = profile.get("solo_serenity_bonus", 0.0)
        if ssb > 0 and action_name in ("me_time", "meditate"):
            effects["serenity"] += ssb
        # Social connection multiplier: extrovert gets more connection from socializing
        scm = profile.get("social_connection_multiplier", 1.0)
        if scm != 1.0 and action_name in SOCIAL_ACTIONS and effects["connection"] > 0:
            effects["connection"] *= scm
        # Social serenity bonus: extrovert gets serenity from socializing
        ssrb = profile.get("social_serenity_bonus", 0.0)
        if ssrb > 0 and action_name in SOCIAL_ACTIONS:
            effects["serenity"] += ssrb
        # Work vitality recovery: workaholic gets vitality from productive work
        wvr = profile.get("work_vitality_recovery", 0.0)
        if wvr > 0 and action_name in WORK_ACTIONS:
            effects["vitality"] += wvr
        # Low serenity amplification (stress spiral)
        if self._serenity < profile.get("stress_tolerance", 0.3):
            for meter in effects:
                if effects[meter] < 0:
                    effects[meter] *= 1.3
        return effects

    def _apply_passive_decays(self) -> None:
        """Apply per-step passive meter decays."""
        # Connection always decays if not actively maintained
        decay = self._profile["connection_decay_rate"]
        self._connection = max(0.0, self._connection - decay)
        # Workaholic extra vitality decay
        vd = self._profile.get("vitality_decay_rate", 0.0)
        if vd > 0:
            self._vitality = max(0.0, self._vitality - vd)

    def _compute_reward(self, deltas: Dict[str, float]) -> float:
        """Pure profile-weighted per-step reward.

        Deliberately uncontaminated: the grader-aligned bias (progress +
        connection deltas) lives in the TRAINING reward function in
        reward_functions.py, not here. Keeping the env's per-step reward
        pure means (1) the agent's inference signal stays a clean function
        of the hidden reward_weights, (2) the grader's adaptation_score
        isn't computed on biased rewards, and (3) the env's reward matches
        what an honest deployment would surface to the agent.
        """
        weights = self._profile["reward_weights"]
        return sum(deltas[m] * weights[m] for m in METERS) * REWARD_SCALE
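
    # Worked example for _compute_reward (illustrative): with reward_weights
    # giving progress 0.70 and a step where progress rises by +0.05 while the
    # other meters are flat, the step reward is 0.05 * 0.70 * 15.0 = 0.525.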

    def _grade_episode(self) -> float:
        """
        Compute the final episode score in [0, 1].

        Components (meta-learning aligned):
            0.15 - crash_free_ratio: no critical meter drops
            0.20 - progress: career/skill growth
            0.10 - connection: relationship maintained
            0.25 - adaptation_score: agent got better as it learned the user
            0.10 - efficiency: bounded normalized average reward
            0.20 - belief_accuracy: how close the last-emitted belief is to
                   the true profile

        DESIGN NOTE (acknowledged conformance gap with OpenEnv):
        This grader composes `Rubric` subclasses with explicit weights,
        but it does not honor OpenEnv's per-step contract: it reads
        aggregated episode-end state (the per-step rewards buffer,
        crash_count, the terminal belief) while OpenEnv's `Rubric.forward`
        expects per-(action, observation) inputs. A clean refactor would
        use `TrajectoryRubric` for the cumulative components and the
        per-step `Rubric` for crash_free / belief_accuracy. Tracked as
        a v2 cleanup task; not blocking on the meta-RL skill we're
        evaluating.

        Implementation: composes 6 `Rubric` subclasses via OpenEnv's
        `WeightedSum` (see `server/rubrics.py`). Each sub-rubric reads
        the aggregated episode state (`_step_rewards`, `_crash_count`,
        `_final_belief`, `_profile`) of the env it was built with, which
        is RFC 004's recommended pattern for trajectory-summary scoring
        on top of the per-(action, observation) Rubric ABC.

        belief_accuracy is the explicit meta-RL inference signal: an
        agent that doesn't emit a belief scores 0 here; an agent emitting
        a belief close to the hidden profile vector scores up to 1.
        Without this term, agents that play heuristic-style "keep meters
        healthy" score the same as agents that actually infer the profile,
        since the other components don't differentiate inference from
        reflex.
        """
        from server.rubrics import make_grade_rubric

        # Build (or reuse) the composed rubric. The Rubric subclasses are
        # stateless once built (they read live env state at forward() time),
        # so caching is safe.
        if self._grade_rubric is None:
            self._grade_rubric = make_grade_rubric(self)
        # forward(action, observation): the args are unused for episode-end
        # scoring; the rubric reads from `self`.
        score = self._grade_rubric(action=None, observation=None)
        return max(0.0, min(1.0, float(score)))

    def _make_observation(
        self,
        reward: float,
        done: bool,
        active_event: Optional[str],
        reward_breakdown: Optional[Dict[str, float]] = None,
        deltas: Optional[Dict[str, float]] = None,
        last_action: Optional[str] = None,
    ) -> RhythmObservation:
        """Build the observation returned to the agent (hides the profile)."""
        step_records = [
            StepRecord(
                step=h["step"],
                action=h["action"],
                reward=h["reward"],
                vitality_delta=h["vitality_delta"],
                cognition_delta=h["cognition_delta"],
                progress_delta=h["progress_delta"],
                serenity_delta=h["serenity_delta"],
                connection_delta=h["connection_delta"],
                vitality_anomaly=h.get("vitality_anomaly", 0.0),
                cognition_anomaly=h.get("cognition_anomaly", 0.0),
                progress_anomaly=h.get("progress_anomaly", 0.0),
                serenity_anomaly=h.get("serenity_anomaly", 0.0),
                connection_anomaly=h.get("connection_anomaly", 0.0),
            )
            for h in self._step_history
        ]
        return RhythmObservation(
            timestep=self._timestep,
            day=self._timestep // SLOTS_PER_DAY,
            slot=self._timestep % SLOTS_PER_DAY,
            vitality=round(self._vitality, 4),
            cognition=round(self._cognition, 4),
            progress=round(self._progress, 4),
            serenity=round(self._serenity, 4),
            connection=round(self._connection, 4),
            active_event=active_event,
            remaining_steps=MAX_STEPS - self._timestep,
            reward_breakdown=reward_breakdown or {},
            reward=reward,
            done=done,
            # First-class delta fields (from this step; zero on reset)
            vitality_delta=round(deltas["vitality"], 4) if deltas else 0.0,
            cognition_delta=round(deltas["cognition"], 4) if deltas else 0.0,
            progress_delta=round(deltas["progress"], 4) if deltas else 0.0,
            serenity_delta=round(deltas["serenity"], 4) if deltas else 0.0,
            connection_delta=round(deltas["connection"], 4) if deltas else 0.0,
            last_action=last_action,
            # Rolling history of the last HISTORY_LENGTH completed steps
            step_history=step_records,
        )
|