# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
"""
RhythmEnv Life Simulator β Environment Implementation.
A holistic life resource management RL environment. The agent balances
5 life meters across a 7-day week (28 steps) while hidden personality
profiles secretly control how actions affect meters and how reward is
computed. The agent must discover these hidden dynamics through experience.
1 episode = 1 week, 1 step = 1 time slot (4 per day), 28 steps total.
Key design principles for learnability:
- step_history: last 7 steps of (action, reward, deltas) are included
in every observation so the agent can detect personality anomalies
- *_anomaly fields: per-meter deviation from neutral-profile expectation,
giving a direct fingerprint of the hidden profile each step
- adaptation_score: 30% of final grade β late-half mean per-step reward
minus early-half mean (gated by absolute late-half quality). Rewards
the agent for getting better as it learns the user.
- Profile assignment uses a scrambled seed to prevent memorization
of seed β profile mappings during training
"""
import random
from copy import deepcopy
from typing import Any, Dict, List, Optional
from uuid import uuid4

from openenv.core.env_server import Environment
from openenv.core.env_server.types import EnvironmentMetadata

try:
    from ..models import ActionType, RhythmAction, RhythmObservation, RhythmState, StepRecord
except (ImportError, ModuleNotFoundError):
    from models import ActionType, RhythmAction, RhythmObservation, RhythmState, StepRecord

# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
MAX_STEPS = 28
DAYS = 7
SLOTS_PER_DAY = 4
SLOT_NAMES = ["morning", "afternoon", "evening", "night"]
METERS = ["vitality", "cognition", "progress", "serenity", "connection"]
EVENT_PROBABILITY = 0.08
CRITICAL_THRESHOLD = 0.1
CRITICAL_PENALTY = -0.3
REWARD_SCALE = 15.0
HISTORY_LENGTH = 7 # number of past steps included in every observation
# ---------------------------------------------------------------------------
# Action-Effect Matrix (base deltas per action on each meter)
# ---------------------------------------------------------------------------
ACTION_EFFECTS: Dict[str, Dict[str, float]] = {
    "deep_work": {"vitality": -0.12, "cognition": -0.10, "progress": 0.18, "serenity": -0.05, "connection": 0.00},
    "admin_work": {"vitality": -0.06, "cognition": -0.05, "progress": 0.08, "serenity": -0.03, "connection": 0.00},
    "learn": {"vitality": -0.08, "cognition": -0.08, "progress": 0.12, "serenity": 0.02, "connection": 0.00},
    "sleep": {"vitality": 0.20, "cognition": 0.10, "progress": 0.00, "serenity": 0.05, "connection": 0.00},
    "exercise": {"vitality": 0.12, "cognition": 0.05, "progress": 0.00, "serenity": 0.08, "connection": 0.00},
    "meditate": {"vitality": 0.03, "cognition": 0.08, "progress": 0.00, "serenity": 0.15, "connection": 0.00},
    "family_time": {"vitality": -0.04, "cognition": -0.02, "progress": 0.00, "serenity": 0.06, "connection": 0.15},
    "socialize": {"vitality": -0.06, "cognition": -0.03, "progress": 0.00, "serenity": 0.04, "connection": 0.12},
    "me_time": {"vitality": 0.05, "cognition": 0.03, "progress": 0.00, "serenity": 0.10, "connection": -0.02},
    "binge_watch": {"vitality": 0.02, "cognition": -0.05, "progress": -0.02, "serenity": 0.06, "connection": -0.03},
}
# ---------------------------------------------------------------------------
# Time-of-Day Multipliers
# ---------------------------------------------------------------------------
TIME_MULTIPLIERS: Dict[int, Dict[str, float]] = {
    0: {"cognition_gain": 1.2, "vitality_drain": 0.8},  # Morning
    1: {"cognition_gain": 1.0, "vitality_drain": 1.0},  # Afternoon
    2: {"cognition_gain": 0.8, "vitality_drain": 1.1},  # Evening
    3: {"cognition_gain": 0.6, "vitality_drain": 1.3},  # Night
}
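# Worked example (illustrative): "deep_work" at night (slot 3) has its
# vitality cost scaled by the night drain, -0.12 * 1.3 = -0.156, while its
# progress gain (0.18) is untouched: the multipliers act only on positive
# cognition effects and negative vitality effects (see _apply_time_multipliers).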
# ---------------------------------------------------------------------------
# Random Events
# ---------------------------------------------------------------------------
EVENT_EFFECTS: Dict[str, Dict[str, float]] = {
    "prod_crash": {"vitality": -0.08, "cognition": -0.10, "progress": -0.10, "serenity": -0.15, "connection": 0.00},
    "family_emergency": {"vitality": -0.05, "cognition": -0.08, "progress": 0.00, "serenity": -0.12, "connection": -0.10},
    "illness": {"vitality": -0.20, "cognition": -0.10, "progress": 0.00, "serenity": -0.05, "connection": 0.00},
    "good_news": {"vitality": 0.05, "cognition": 0.03, "progress": 0.00, "serenity": 0.10, "connection": 0.05},
}
EVENT_NAMES: List[str] = list(EVENT_EFFECTS.keys())
# ---------------------------------------------------------------------------
# Hidden Personality Profiles
# ---------------------------------------------------------------------------
PROFILES: List[Dict[str, Any]] = [
    {
        "name": "introvert_morning",
        "social_vitality_multiplier": 3.0,
        "morning_cognition_bonus": 2.0,
        "evening_night_cognition_bonus": None,
        "morning_penalty": None,
        "binge_shame": True,
        "progress_serenity_bonus": 0.0,
        "idle_serenity_decay": 0.0,
        "vitality_decay_rate": 0.0,
        "stress_tolerance": 0.3,
        "event_impact_multiplier": 1.0,
        "connection_decay_rate": 0.01,
        "solo_serenity_bonus": 0.10,
        "social_connection_multiplier": 1.0,
        "social_serenity_bonus": 0.0,
        "work_vitality_recovery": 0.0,
        "initial_meters": {
            "vitality": 0.7, "cognition": 0.7, "progress": 0.0,
            "serenity": 0.7, "connection": 0.5,
        },
        "reward_weights": {
            "vitality": 0.05, "cognition": 0.05, "progress": 0.20,
            "serenity": 0.60, "connection": 0.10,
        },
    },
    {
        "name": "extrovert_night_owl",
        "social_vitality_multiplier": 0.2,
        "morning_cognition_bonus": None,
        "evening_night_cognition_bonus": 1.8,
        "morning_penalty": 0.4,
        "binge_shame": False,
        "progress_serenity_bonus": 0.0,
        "idle_serenity_decay": 0.0,
        "vitality_decay_rate": 0.0,
        "stress_tolerance": 0.2,
        "event_impact_multiplier": 0.8,
        "connection_decay_rate": 0.01,
        "solo_serenity_bonus": 0.0,
        "social_connection_multiplier": 2.0,
        "social_serenity_bonus": 0.06,
        "work_vitality_recovery": 0.0,
        "initial_meters": {
            "vitality": 0.7, "cognition": 0.7, "progress": 0.0,
            "serenity": 0.7, "connection": 0.5,
        },
        "reward_weights": {
            "vitality": 0.05, "cognition": 0.05, "progress": 0.10,
            "serenity": 0.05, "connection": 0.75,
        },
    },
    {
        "name": "workaholic_stoic",
        "social_vitality_multiplier": 1.0,
        "morning_cognition_bonus": None,
        "evening_night_cognition_bonus": None,
        "morning_penalty": None,
        "binge_shame": False,
        "progress_serenity_bonus": 0.10,
        "idle_serenity_decay": 0.10,
        "vitality_decay_rate": 0.04,
        "stress_tolerance": 0.15,
        "event_impact_multiplier": 0.5,
        "connection_decay_rate": 0.02,
        "solo_serenity_bonus": 0.0,
        "social_connection_multiplier": 1.0,
        "social_serenity_bonus": 0.0,
        "work_vitality_recovery": 0.06,
        "initial_meters": {
            "vitality": 0.7, "cognition": 0.7, "progress": 0.0,
            "serenity": 0.7, "connection": 0.5,
        },
        "reward_weights": {
            "vitality": 0.05, "cognition": 0.05, "progress": 0.70,
            "serenity": 0.10, "connection": 0.10,
        },
    },
]
PROFILE_MAP: Dict[str, Dict[str, Any]] = {p["name"]: p for p in PROFILES}
# Social actions for modifier checks
SOCIAL_ACTIONS = {"family_time", "socialize"}
IDLE_ACTIONS = {"me_time", "binge_watch", "sleep"}
WORK_ACTIONS = {"deep_work", "learn", "admin_work"}
# ---------------------------------------------------------------------------
# Continuous profile sampling (meta-RL training distribution)
# ---------------------------------------------------------------------------
#
# Hardcoded profiles are 3 fixed personalities: memorizable, classification-like.
# Continuous sampling draws profile parameters from distributions per episode,
# making memorization impossible and forcing the agent to learn the *skill* of
# profile inference. This is the core meta-learning move.
#
# Belief vector dimensions (for Phase 3 cosine-similarity reward):
#   social_pref:  0 = hates social (introvert), 1 = loves social (extrovert)
#   morning_pref: 0 = night owl, 1 = morning person
#   work_pref:    0 = work-averse, 1 = workaholic
def sample_profile(seed: int) -> Dict[str, Any]:
    """Sample a continuous profile deterministically from a seed.

    Reward weights are drawn from a Dirichlet biased toward non-infrastructure
    meters (progress, serenity, connection). Per-action modifiers are drawn
    from bounded uniforms so any sampled profile is playable.
    """
    rng = random.Random(seed ^ 0xA3C5F729)
    # Reward weights via Dirichlet (alpha < 1 produces sparse weights, i.e. "personalities")
    alphas = [0.5, 0.5, 1.5, 1.5, 1.5]  # [vit, cog, prog, ser, conn]
    raw = [rng.gammavariate(a, 1.0) for a in alphas]
    total = sum(raw)
    weights = [w / total for w in raw]
    # Cap each weight at 0.45 so every sampled profile weights 3+ meters
    # meaningfully. With a 0.80 cap, single-meter-dominant profiles let
    # SLEEP-spam (or any single recovery action) be optimal; the env wasn't
    # lying, the agent was right to spam. Forcing balance makes belief
    # inference matter for action selection.
    weights = [max(0.05, min(0.45, w)) for w in weights]
    total = sum(weights)
    weights = [w / total for w in weights]
    return {
        "name": f"sampled_{seed}",
        "social_vitality_multiplier": rng.uniform(0.2, 3.0),
        "morning_cognition_bonus": rng.uniform(0.4, 2.0) if rng.random() < 0.5 else None,
        "evening_night_cognition_bonus": rng.uniform(0.6, 1.8) if rng.random() < 0.5 else None,
        "morning_penalty": rng.uniform(0.4, 0.9) if rng.random() < 0.3 else None,
        "binge_shame": rng.random() < 0.5,
        "progress_serenity_bonus": rng.uniform(0.0, 0.10),
        "idle_serenity_decay": rng.uniform(0.0, 0.10),
        "vitality_decay_rate": rng.uniform(0.0, 0.04),
        "stress_tolerance": rng.uniform(0.15, 0.30),
        "event_impact_multiplier": rng.uniform(0.5, 1.0),
        "connection_decay_rate": rng.uniform(0.005, 0.02),
        "solo_serenity_bonus": rng.uniform(0.0, 0.10),
        "social_connection_multiplier": rng.uniform(1.0, 2.0),
        "social_serenity_bonus": rng.uniform(0.0, 0.06),
        "work_vitality_recovery": rng.uniform(0.0, 0.06),
        "initial_meters": {
            "vitality": 0.7, "cognition": 0.7, "progress": 0.0,
            "serenity": 0.7, "connection": 0.5,
        },
        "reward_weights": dict(zip(METERS, weights)),
    }
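
# Minimal usage sketch (illustrative): sampling is a pure function of the
# seed, so equal seeds reproduce equal profiles, and the capped Dirichlet
# weights are renormalized to sum to 1:
#
#     p = sample_profile(42)
#     assert p == sample_profile(42)  # deterministic across calls
#     assert abs(sum(p["reward_weights"].values()) - 1.0) < 1e-9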


def profile_to_belief_vector(profile: Dict[str, Any]) -> List[float]:
    """Reduce a profile to a 3-dim trait vector [social, morning, work] in [0, 1].

    Used as the ground-truth target for the agent's belief output (Phase 3).
    Aggregates the most-diagnostic modifiers per trait.
    """
    # social_pref: low if social drains vitality a lot, high if social bonuses are big
    # social_vitality_multiplier in [0.2, 3.0]: lower = more extroverted
    sm = profile.get("social_vitality_multiplier", 1.0)
    social_drain_norm = 1.0 - max(0.0, min(1.0, (sm - 0.2) / 2.8))  # invert
    scm = profile.get("social_connection_multiplier", 1.0)
    social_conn_norm = max(0.0, min(1.0, (scm - 1.0) / 1.0))
    ssb = profile.get("social_serenity_bonus", 0.0)
    social_ser_norm = max(0.0, min(1.0, ssb / 0.06))
    social_pref = 0.5 * social_drain_norm + 0.3 * social_conn_norm + 0.2 * social_ser_norm
    # morning_pref: high if a morning bonus exists, low if a morning penalty exists
    mcb = profile.get("morning_cognition_bonus")
    mp = profile.get("morning_penalty")
    morning_pref = 0.5
    if mcb is not None:
        morning_pref = 0.5 + 0.5 * max(0.0, min(1.0, (mcb - 0.4) / 1.6))
    if mp is not None:
        morning_pref = min(morning_pref, 0.5 - 0.5 * (1.0 - mp))
    # work_pref: high if work recovers vitality + progress gives serenity + progress weight high
    wvr = profile.get("work_vitality_recovery", 0.0)
    wvr_norm = max(0.0, min(1.0, wvr / 0.06))
    psb = profile.get("progress_serenity_bonus", 0.0)
    psb_norm = max(0.0, min(1.0, psb / 0.10))
    pw = profile.get("reward_weights", {}).get("progress", 0.2)
    pw_norm = max(0.0, min(1.0, (pw - 0.05) / 0.65))
    work_pref = 0.4 * wvr_norm + 0.3 * psb_norm + 0.3 * pw_norm
    return [
        max(0.0, min(1.0, social_pref)),
        max(0.0, min(1.0, morning_pref)),
        max(0.0, min(1.0, work_pref)),
    ]
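
# Worked example (computed from the hardcoded profiles above, approximate):
# PROFILES[0] ("introvert_morning") has social_vitality_multiplier=3.0 (max
# social drain, so social_drain_norm=0.0) and morning_cognition_bonus=2.0
# (top of the range, so morning_pref=1.0); with no work-side modifiers and a
# 0.20 progress weight it maps to roughly [0.0, 1.0, 0.07].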


class RhythmEnvironment(Environment):
    """
    Life Simulator RL Environment.

    The agent manages 5 life meters (Vitality, Cognition, Progress, Serenity,
    Connection) across a 7-day week. Hidden personality profiles control how
    actions affect meters and how reward is computed. The agent must discover
    these hidden dynamics through experience.

    Every observation includes:
    - Current meter values and temporal context
    - Last step's per-meter deltas as first-class fields
    - Anomaly signals: actual delta minus the neutral-profile expectation
    - Rolling step_history (last 7 steps) with actions, rewards, deltas

    The final grade rewards profile-appropriate strategy via adaptation_score
    (25% of the grade): late-half mean per-step reward minus early-half mean.
    """

    SUPPORTS_CONCURRENT_SESSIONS: bool = True

    def __init__(self) -> None:
        super().__init__()
        self._state = RhythmState()
        self._rng = random.Random(0)
        self._profile: Dict[str, Any] = PROFILES[0]
        # Meters
        self._vitality: float = 0.8
        self._cognition: float = 0.7
        self._progress: float = 0.0
        self._serenity: float = 0.7
        self._connection: float = 0.5
        # Tracking
        self._timestep: int = 0
        self._crash_count: int = 0
        self._total_reward: float = 0.0
        self._step_history: list = []
        self._step_rewards: list = []  # per-step rewards (for adaptation_score in grader)
        # Latest emitted belief vector: set by callers via record_belief() and
        # consumed by _grade_episode. Stays None if the agent never emits a
        # belief (e.g. a heuristic baseline); that case scores 0 on the belief
        # component.
        self._final_belief: Optional[List[float]] = None
        # Lazy-built composed Rubric for episode grading. None until the first
        # `done=True` step; built once per env instance and reused across
        # episodes.
        self._grade_rubric: Optional[Any] = None

    def get_metadata(self) -> EnvironmentMetadata:
        return EnvironmentMetadata(
            name="RhythmEnv",
            description=(
                "Life Simulator: a holistic resource management RL environment "
                "where an agent balances 5 life meters across a 7-day week "
                "with hidden personality profiles."
            ),
            version="0.3.0",
        )

    # ------------------------------------------------------------------
    # reset
    # ------------------------------------------------------------------
    def reset(
        self,
        seed: Optional[int] = None,
        episode_id: Optional[str] = None,
        **kwargs: Any,
    ) -> RhythmObservation:
        # Determine seed
        if seed is not None:
            effective_seed = seed
        else:
            effective_seed = hash(episode_id or str(uuid4())) & 0x7FFFFFFF
        self._rng = random.Random(effective_seed)
        # Profile selection, two modes:
        #   1. Explicit hardcoded profile name -> one of the 3 reference profiles
        #      (used by tests + the legacy 3-profile eval condition)
        #   2. Default -> sampled continuous profile (meta-RL training distribution)
        profile_name = kwargs.get("profile")
        if profile_name and profile_name in PROFILE_MAP:
            self._profile = deepcopy(PROFILE_MAP[profile_name])
        else:
            self._profile = sample_profile(effective_seed)
        # Initialize meters from profile defaults
        initial = self._profile["initial_meters"]
        self._vitality = initial["vitality"]
        self._cognition = initial["cognition"]
        self._progress = initial["progress"]
        self._serenity = initial["serenity"]
        self._connection = initial["connection"]
        # Reset tracking
        self._timestep = 0
        self._crash_count = 0
        self._total_reward = 0.0
        self._step_history = []
        self._step_rewards = []
        self._final_belief = None
        self._state = RhythmState(
            episode_id=episode_id or str(uuid4()),
            step_count=0,
            profile_name=self._profile["name"],
            timestep=0,
            day=0,
            slot=0,
            vitality=self._vitality,
            cognition=self._cognition,
            progress=self._progress,
            serenity=self._serenity,
            connection=self._connection,
        )
        return self._make_observation(reward=0.0, done=False, active_event=None)

    # ------------------------------------------------------------------
    # step
    # ------------------------------------------------------------------
    def step(
        self,
        action: RhythmAction,
        timeout_s: Optional[float] = None,
        **kwargs: Any,
    ) -> RhythmObservation:
        # Save step number before incrementing (used for the history record)
        current_step = self._timestep
        slot = self._timestep % SLOTS_PER_DAY
        day = self._timestep // SLOTS_PER_DAY
        action_name = action.action_type.value
        # --- 1. Roll and apply event ---
        active_event = self._roll_event()
        if active_event:
            self._apply_event(active_event)
        # --- 2. Get base action effects ---
        effects = dict(ACTION_EFFECTS[action_name])
        # --- 2b. Repetition dampening ---
        recent3 = [h["action"] for h in self._step_history[-3:]]
        repeat_count = recent3.count(action_name)
        if repeat_count > 0:
            dampening = 1.0 - 0.25 * repeat_count  # 0.75, 0.50, 0.25
            for meter in METERS:
                if effects[meter] > 0:
                    effects[meter] *= dampening
        # --- 3. Apply time-of-day multipliers (SLEEP bypasses) ---
        if action_name != "sleep":
            effects = self._apply_time_multipliers(effects, slot)
        # Snapshot expected effects here, after time/dampening but BEFORE
        # profile modifiers. The anomaly (actual_delta - expected) gives the
        # agent a direct per-step fingerprint of the hidden profile modifier.
        expected_no_profile = dict(effects)
        # --- 4. Apply profile modifiers ---
        effects = self._apply_profile_modifiers(effects, action_name, slot)
        # --- 5. Apply global vitality factor (low vitality reduces positive effects) ---
        vitality_factor = 0.5 + 0.5 * self._vitality
        for meter in METERS:
            if meter != "vitality" and effects[meter] > 0:
                effects[meter] *= vitality_factor
        # Apply the same vitality factor to the expectation for a fair anomaly comparison
        for meter in METERS:
            if meter != "vitality" and expected_no_profile[meter] > 0:
                expected_no_profile[meter] *= vitality_factor
        # --- 6. Apply passive decays ---
        self._apply_passive_decays()
        # --- 7. Update meters and track deltas ---
        deltas: Dict[str, float] = {}
        for meter in METERS:
            old_val = getattr(self, f"_{meter}")
            new_val = max(0.0, min(1.0, old_val + effects[meter]))
            deltas[meter] = new_val - old_val
            setattr(self, f"_{meter}", new_val)
        # --- 8. Compute reward ---
        reward = self._compute_reward(deltas)
        # --- 9. Check critical thresholds ---
        for meter in METERS:
            if getattr(self, f"_{meter}") < CRITICAL_THRESHOLD:
                reward += CRITICAL_PENALTY
                self._crash_count += 1
        # Clamp reward
        reward = max(-3.0, min(3.0, round(reward, 4)))
        self._total_reward += reward
        self._step_rewards.append(reward)
        # --- 10. Advance timestep ---
        self._timestep += 1
        new_day = self._timestep // SLOTS_PER_DAY
        new_slot = self._timestep % SLOTS_PER_DAY
        # --- 11. Check done ---
        done = self._timestep >= MAX_STEPS
        # --- 12. Build reward breakdown ---
        # Includes: per-meter deltas, per-meter anomalies (actual - expected),
        # event flag, and final_score on the last step.
        reward_breakdown: Dict[str, float] = {}
        for meter in METERS:
            reward_breakdown[f"{meter}_delta"] = round(deltas[meter], 4)
            reward_breakdown[f"{meter}_anomaly"] = round(
                deltas[meter] - expected_no_profile[meter], 4
            )
        if active_event:
            reward_breakdown["event"] = 1.0
        # --- 13. Grade if done ---
        if done:
            final_score = self._grade_episode()
            reward_breakdown["final_score"] = round(final_score, 4)
            # Sparse terminal reward: directly supervise on the grader's
            # final_score. Centered on 0.5 (the "average" episode), scaled by
            # 5x to give a range of [-2.5, +2.5]: strong enough to dominate
            # any local reward-hack the agent might find on per-step shaping
            # alone.
            terminal_bonus = (final_score - 0.5) * 5.0
            reward = max(-3.0, min(3.0, reward + terminal_bonus))
            self._total_reward += terminal_bonus  # update tracking too
            reward_breakdown["terminal_bonus"] = round(terminal_bonus, 4)
        # --- 14. Update state ---
        self._state.step_count = self._timestep
        self._state.timestep = self._timestep
        self._state.day = new_day
        self._state.slot = new_slot
        self._state.vitality = round(self._vitality, 4)
        self._state.cognition = round(self._cognition, 4)
        self._state.progress = round(self._progress, 4)
        self._state.serenity = round(self._serenity, 4)
        self._state.connection = round(self._connection, 4)
        self._state.active_event = active_event
        # --- 15. Append completed step to rolling history ---
        # History entries carry per-meter anomalies (actual - expected_under_neutral).
        # The prompt builder reads these directly to surface the agent's clearest
        # profile-inference signal.
        self._step_history.append({
            "step": current_step,
            "action": action_name,
            "reward": reward,
            "vitality_delta": round(deltas["vitality"], 4),
            "cognition_delta": round(deltas["cognition"], 4),
            "progress_delta": round(deltas["progress"], 4),
            "serenity_delta": round(deltas["serenity"], 4),
            "connection_delta": round(deltas["connection"], 4),
            "vitality_anomaly": round(deltas["vitality"] - expected_no_profile["vitality"], 4),
            "cognition_anomaly": round(deltas["cognition"] - expected_no_profile["cognition"], 4),
            "progress_anomaly": round(deltas["progress"] - expected_no_profile["progress"], 4),
            "serenity_anomaly": round(deltas["serenity"] - expected_no_profile["serenity"], 4),
            "connection_anomaly": round(deltas["connection"] - expected_no_profile["connection"], 4),
        })
        if len(self._step_history) > HISTORY_LENGTH:
            self._step_history.pop(0)
        return self._make_observation(
            reward=reward,
            done=done,
            active_event=active_event,
            reward_breakdown=reward_breakdown,
            deltas=deltas,
            last_action=action_name,
        )

    # ------------------------------------------------------------------
    # state property
    # ------------------------------------------------------------------
    @property
    def state(self) -> RhythmState:
        return self._state

    def get_belief_target(self) -> List[float]:
        """Return the 3-dim ground-truth belief vector for the active profile.

        Used during training to compute the belief-accuracy reward (Phase 3).
        Privileged information; not exposed via the observation.
        """
        return profile_to_belief_vector(self._profile)

    def record_belief(self, belief: List[float]) -> None:
        """Record the agent's emitted belief for the current step.

        The grader (`_grade_episode`) uses the LAST recorded belief to compute
        the belief_accuracy component of final_score. Callers should invoke
        this once per step after parsing the agent's completion. Heuristic /
        random baselines that don't emit a belief never call this, and the
        belief component scores 0 for them. That's intentional: the meta-RL
        skill is INFERENCE, and only agents that actually try get credit.
        """
        if len(belief) != 3:
            raise ValueError(f"belief must have 3 elements, got {len(belief)}")
        self._final_belief = [max(0.0, min(1.0, float(b))) for b in belief]
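
    # Illustrative call pattern for record_belief (hypothetical caller, not
    # part of this module):
    #     belief = parse_belief(completion_text)  # hypothetical parser -> e.g. [0.2, 0.9, 0.4]
    #     env.record_belief(belief)               # clamped into [0, 1] above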

    def get_profile_hint(self) -> Dict[str, float]:
        """Return a coarse profile hint usable in the observation during curriculum.

        Returns the 3-dim belief vector with descriptive keys. The dataset
        generator passes this into the prompt for the fraction of samples
        with show_profile_hint=True (the curriculum's "visible" warmup phase).
        """
        b = profile_to_belief_vector(self._profile)
        return {"social_pref": round(b[0], 3), "morning_pref": round(b[1], 3), "work_pref": round(b[2], 3)}

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    def _roll_event(self) -> Optional[str]:
        """Roll for a random event this step."""
        if self._rng.random() < EVENT_PROBABILITY:
            return self._rng.choice(EVENT_NAMES)
        return None

    def _apply_event(self, event_name: str) -> None:
        """Apply event effects to meters, modified by the profile."""
        effects = EVENT_EFFECTS[event_name]
        multiplier = self._profile["event_impact_multiplier"]
        for meter in METERS:
            delta = effects[meter]
            # Only apply the multiplier to negative effects
            if delta < 0:
                delta *= multiplier
            old_val = getattr(self, f"_{meter}")
            new_val = max(0.0, min(1.0, old_val + delta))
            setattr(self, f"_{meter}", new_val)

    def _apply_time_multipliers(
        self, effects: Dict[str, float], slot: int
    ) -> Dict[str, float]:
        """Apply time-of-day multipliers to action effects."""
        multipliers = TIME_MULTIPLIERS[slot]
        for meter in effects:
            if meter == "cognition" and effects[meter] > 0:
                effects[meter] *= multipliers["cognition_gain"]
            elif meter == "vitality" and effects[meter] < 0:
                effects[meter] *= multipliers["vitality_drain"]
        return effects

    def _apply_profile_modifiers(
        self, effects: Dict[str, float], action_name: str, slot: int
    ) -> Dict[str, float]:
        """Apply hidden profile modifiers to action effects."""
        profile = self._profile
        # Social vitality drain multiplier
        if action_name in SOCIAL_ACTIONS and effects["vitality"] < 0:
            effects["vitality"] *= profile["social_vitality_multiplier"]
        # Introvert morning cognition bonus
        bonus = profile.get("morning_cognition_bonus")
        if bonus and slot == 0:
            if effects["cognition"] > 0:
                effects["cognition"] *= bonus
            if effects["progress"] > 0:
                effects["progress"] *= bonus
        # Extrovert evening/night cognition bonus
        bonus = profile.get("evening_night_cognition_bonus")
        if bonus and slot in (2, 3):
            if effects["cognition"] > 0:
                effects["cognition"] *= bonus
            if effects["progress"] > 0:
                effects["progress"] *= bonus
        # Extrovert morning penalty
        penalty = profile.get("morning_penalty")
        if penalty and slot == 0:
            if effects["cognition"] > 0:
                effects["cognition"] *= penalty
            if effects["progress"] > 0:
                effects["progress"] *= penalty
        # Binge shame spiral
        if profile.get("binge_shame") and action_name == "binge_watch":
            effects["serenity"] -= 0.15
            effects["cognition"] -= 0.06
        # Workaholic: progress-producing actions give a serenity bonus
        psb = profile.get("progress_serenity_bonus", 0.0)
        if psb > 0 and effects["progress"] > 0:
            effects["serenity"] += psb
        # Workaholic: idle actions drain serenity
        isd = profile.get("idle_serenity_decay", 0.0)
        if isd > 0 and action_name in IDLE_ACTIONS:
            effects["serenity"] -= isd
        # Solo recharge: introvert gets serenity from alone time
        ssb = profile.get("solo_serenity_bonus", 0.0)
        if ssb > 0 and action_name in ("me_time", "meditate"):
            effects["serenity"] += ssb
        # Social connection multiplier: extrovert gets more connection from socializing
        scm = profile.get("social_connection_multiplier", 1.0)
        if scm != 1.0 and action_name in SOCIAL_ACTIONS and effects["connection"] > 0:
            effects["connection"] *= scm
        # Social serenity bonus: extrovert gets serenity from socializing
        ssrb = profile.get("social_serenity_bonus", 0.0)
        if ssrb > 0 and action_name in SOCIAL_ACTIONS:
            effects["serenity"] += ssrb
        # Work vitality recovery: workaholic gets vitality from productive work
        wvr = profile.get("work_vitality_recovery", 0.0)
        if wvr > 0 and action_name in WORK_ACTIONS:
            effects["vitality"] += wvr
        # Low serenity amplification (stress spiral)
        if self._serenity < profile.get("stress_tolerance", 0.3):
            for meter in effects:
                if effects[meter] < 0:
                    effects[meter] *= 1.3
        return effects

    def _apply_passive_decays(self) -> None:
        """Apply per-step passive meter decays."""
        # Connection always decays if not actively maintained
        decay = self._profile["connection_decay_rate"]
        self._connection = max(0.0, self._connection - decay)
        # Workaholic extra vitality decay
        vd = self._profile.get("vitality_decay_rate", 0.0)
        if vd > 0:
            self._vitality = max(0.0, self._vitality - vd)

    def _compute_reward(self, deltas: Dict[str, float]) -> float:
        """Pure profile-weighted per-step reward.

        Deliberately uncontaminated: the grader-aligned bias (progress +
        connection deltas) lives in the TRAINING reward function in
        reward_functions.py, not here. Keeping the env's per-step reward
        pure means (1) the agent's inference signal stays a clean function
        of the hidden reward_weights, (2) the grader's adaptation_score
        isn't computed on biased rewards, and (3) the env's reward matches
        what an honest deployment would surface to the agent.
        """
        weights = self._profile["reward_weights"]
        return sum(deltas[m] * weights[m] for m in METERS) * REWARD_SCALE
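
    # Worked example for _compute_reward (illustrative): with reward_weights
    # giving progress 0.70 and a step where progress rises by +0.05 while the
    # other meters are flat, the step reward is 0.05 * 0.70 * 15.0 = 0.525.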

    def _grade_episode(self) -> float:
        """
        Compute the final episode score in [0, 1].

        Components (meta-learning aligned):
            0.15 - crash_free_ratio: no critical meter drops
            0.20 - progress: career/skill growth
            0.10 - connection: relationship maintained
            0.25 - adaptation_score: agent got better as it learned the user
            0.10 - efficiency: bounded normalized average reward
            0.20 - belief_accuracy: how close the last-emitted belief is to
                   the true profile

        DESIGN NOTE (acknowledged conformance gap with OpenEnv):
        This grader composes `Rubric` subclasses with explicit weights,
        but it does not honor OpenEnv's per-step contract: it reads
        aggregated episode-end state (the per-step rewards buffer,
        crash_count, the terminal belief) while OpenEnv's `Rubric.forward`
        expects per-(action, observation) inputs. A clean refactor would
        use `TrajectoryRubric` for the cumulative components and the
        per-step `Rubric` for crash_free / belief_accuracy. Tracked as
        a v2 cleanup task; not blocking on the meta-RL skill we're
        evaluating.

        Implementation: composes 6 `Rubric` subclasses via OpenEnv's
        `WeightedSum` (see `server/rubrics.py`). Each sub-rubric reads
        the aggregated episode state (`_step_rewards`, `_crash_count`,
        `_final_belief`, `_profile`) of the env it was built with, which
        is RFC 004's recommended pattern for trajectory-summary scoring
        on top of the per-(action, observation) Rubric ABC.

        belief_accuracy is the explicit meta-RL inference signal: an
        agent that doesn't emit a belief scores 0 here; an agent emitting
        a belief close to the hidden profile vector scores up to 1.
        Without this term, agents that play heuristic-style "keep meters
        healthy" score the same as agents that actually infer the profile,
        since the other components don't differentiate inference from
        reflex.
        """
        from server.rubrics import make_grade_rubric

        # Build (or reuse) the composed rubric. The Rubric subclasses are
        # stateless once built (they read live env state at forward() time),
        # so caching is safe.
        if self._grade_rubric is None:
            self._grade_rubric = make_grade_rubric(self)
        # forward(action, observation): the args are unused for episode-end
        # scoring; the rubric reads from `self`.
        score = self._grade_rubric(action=None, observation=None)
        return max(0.0, min(1.0, float(score)))

    def _make_observation(
        self,
        reward: float,
        done: bool,
        active_event: Optional[str],
        reward_breakdown: Optional[Dict[str, float]] = None,
        deltas: Optional[Dict[str, float]] = None,
        last_action: Optional[str] = None,
    ) -> RhythmObservation:
        """Build the observation returned to the agent (hides the profile)."""
        step_records = [
            StepRecord(
                step=h["step"],
                action=h["action"],
                reward=h["reward"],
                vitality_delta=h["vitality_delta"],
                cognition_delta=h["cognition_delta"],
                progress_delta=h["progress_delta"],
                serenity_delta=h["serenity_delta"],
                connection_delta=h["connection_delta"],
                vitality_anomaly=h.get("vitality_anomaly", 0.0),
                cognition_anomaly=h.get("cognition_anomaly", 0.0),
                progress_anomaly=h.get("progress_anomaly", 0.0),
                serenity_anomaly=h.get("serenity_anomaly", 0.0),
                connection_anomaly=h.get("connection_anomaly", 0.0),
            )
            for h in self._step_history
        ]
        return RhythmObservation(
            timestep=self._timestep,
            day=self._timestep // SLOTS_PER_DAY,
            slot=self._timestep % SLOTS_PER_DAY,
            vitality=round(self._vitality, 4),
            cognition=round(self._cognition, 4),
            progress=round(self._progress, 4),
            serenity=round(self._serenity, 4),
            connection=round(self._connection, 4),
            active_event=active_event,
            remaining_steps=MAX_STEPS - self._timestep,
            reward_breakdown=reward_breakdown or {},
            reward=reward,
            done=done,
            # First-class delta fields (from this step; zero on reset)
            vitality_delta=round(deltas["vitality"], 4) if deltas else 0.0,
            cognition_delta=round(deltas["cognition"], 4) if deltas else 0.0,
            progress_delta=round(deltas["progress"], 4) if deltas else 0.0,
            serenity_delta=round(deltas["serenity"], 4) if deltas else 0.0,
            connection_delta=round(deltas["connection"], 4) if deltas else 0.0,
            last_action=last_action,
            # Rolling history of the last HISTORY_LENGTH completed steps
            step_history=step_records,
        )
|