# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

"""
RhythmEnv Life Simulator -- Environment Implementation.

A holistic life resource management RL environment. The agent balances
5 life meters across a 7-day week (28 steps) while hidden personality
profiles secretly control how actions affect meters and how reward is
computed. The agent must discover these hidden dynamics through experience.

1 episode = 1 week, 1 step = 1 time slot (4 per day), 28 steps total.

Key design principles for learnability:
  - step_history: last 7 steps of (action, reward, deltas) are included
    in every observation so the agent can detect personality anomalies
  - *_anomaly fields: per-meter deviation from neutral-profile expectation,
    giving a direct fingerprint of the hidden profile each step
  - adaptation_score: 25% of final grade -- late-half mean per-step reward
    minus early-half mean (gated by absolute late-half quality). Rewards
    the agent for getting better as it learns the user.
  - Profile assignment uses a scrambled seed to prevent memorization
    of seed -> profile mappings during training
"""

import random
from copy import deepcopy
from typing import Any, Dict, List, Optional
from uuid import uuid4

from openenv.core.env_server import Environment
from openenv.core.env_server.types import EnvironmentMetadata

try:
    from ..models import ActionType, RhythmAction, RhythmObservation, RhythmState, StepRecord
except (ImportError, ModuleNotFoundError):
    from models import ActionType, RhythmAction, RhythmObservation, RhythmState, StepRecord

# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------

MAX_STEPS = 28
DAYS = 7
SLOTS_PER_DAY = 4
SLOT_NAMES = ["morning", "afternoon", "evening", "night"]
METERS = ["vitality", "cognition", "progress", "serenity", "connection"]

EVENT_PROBABILITY = 0.08
CRITICAL_THRESHOLD = 0.1
CRITICAL_PENALTY = -0.3
REWARD_SCALE = 15.0
HISTORY_LENGTH = 7  # number of past steps included in every observation

# ---------------------------------------------------------------------------
# Action-Effect Matrix (base deltas per action on each meter)
# ---------------------------------------------------------------------------

ACTION_EFFECTS: Dict[str, Dict[str, float]] = {
    "deep_work":   {"vitality": -0.12, "cognition": -0.10, "progress":  0.18, "serenity": -0.05, "connection":  0.00},
    "admin_work":  {"vitality": -0.06, "cognition": -0.05, "progress":  0.08, "serenity": -0.03, "connection":  0.00},
    "learn":       {"vitality": -0.08, "cognition": -0.08, "progress":  0.12, "serenity":  0.02, "connection":  0.00},
    "sleep":       {"vitality":  0.20, "cognition":  0.10, "progress":  0.00, "serenity":  0.05, "connection":  0.00},
    "exercise":    {"vitality":  0.12, "cognition":  0.05, "progress":  0.00, "serenity":  0.08, "connection":  0.00},
    "meditate":    {"vitality":  0.03, "cognition":  0.08, "progress":  0.00, "serenity":  0.15, "connection":  0.00},
    "family_time": {"vitality": -0.04, "cognition": -0.02, "progress":  0.00, "serenity":  0.06, "connection":  0.15},
    "socialize":   {"vitality": -0.06, "cognition": -0.03, "progress":  0.00, "serenity":  0.04, "connection":  0.12},
    "me_time":     {"vitality":  0.05, "cognition":  0.03, "progress":  0.00, "serenity":  0.10, "connection": -0.02},
    "binge_watch": {"vitality":  0.02, "cognition": -0.05, "progress": -0.02, "serenity":  0.06, "connection": -0.03},
}

# ---------------------------------------------------------------------------
# Time-of-Day Multipliers
# ---------------------------------------------------------------------------

TIME_MULTIPLIERS: Dict[int, Dict[str, float]] = {
    0: {"cognition_gain": 1.2, "vitality_drain": 0.8},   # Morning
    1: {"cognition_gain": 1.0, "vitality_drain": 1.0},   # Afternoon
    2: {"cognition_gain": 0.8, "vitality_drain": 1.1},   # Evening
    3: {"cognition_gain": 0.6, "vitality_drain": 1.3},   # Night
}
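
# Worked example (reading the table above): "deep_work" at night (slot 3)
# has its -0.12 vitality cost scaled by the 1.3 vitality_drain multiplier to
# -0.156, while its +0.18 progress is untouched -- the multipliers only hit
# positive cognition effects and negative vitality effects (see
# _apply_time_multipliers), and "sleep" bypasses them entirely.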

# ---------------------------------------------------------------------------
# Random Events
# ---------------------------------------------------------------------------

EVENT_EFFECTS: Dict[str, Dict[str, float]] = {
    "prod_crash":        {"vitality": -0.08, "cognition": -0.10, "progress": -0.10, "serenity": -0.15, "connection":  0.00},
    "family_emergency":  {"vitality": -0.05, "cognition": -0.08, "progress":  0.00, "serenity": -0.12, "connection": -0.10},
    "illness":           {"vitality": -0.20, "cognition": -0.10, "progress":  0.00, "serenity": -0.05, "connection":  0.00},
    "good_news":         {"vitality":  0.05, "cognition":  0.03, "progress":  0.00, "serenity":  0.10, "connection":  0.05},
}

EVENT_NAMES: List[str] = list(EVENT_EFFECTS.keys())

# ---------------------------------------------------------------------------
# Hidden Personality Profiles
# ---------------------------------------------------------------------------

PROFILES: List[Dict[str, Any]] = [
    {
        "name": "introvert_morning",
        "social_vitality_multiplier": 3.0,
        "morning_cognition_bonus": 2.0,
        "evening_night_cognition_bonus": None,
        "morning_penalty": None,
        "binge_shame": True,
        "progress_serenity_bonus": 0.0,
        "idle_serenity_decay": 0.0,
        "vitality_decay_rate": 0.0,
        "stress_tolerance": 0.3,
        "event_impact_multiplier": 1.0,
        "connection_decay_rate": 0.01,
        "solo_serenity_bonus": 0.10,
        "social_connection_multiplier": 1.0,
        "social_serenity_bonus": 0.0,
        "work_vitality_recovery": 0.0,
        "initial_meters": {
            "vitality": 0.7, "cognition": 0.7, "progress": 0.0,
            "serenity": 0.7, "connection": 0.5,
        },
        "reward_weights": {
            "vitality": 0.05, "cognition": 0.05, "progress": 0.20,
            "serenity": 0.60, "connection": 0.10,
        },
    },
    {
        "name": "extrovert_night_owl",
        "social_vitality_multiplier": 0.2,
        "morning_cognition_bonus": None,
        "evening_night_cognition_bonus": 1.8,
        "morning_penalty": 0.4,
        "binge_shame": False,
        "progress_serenity_bonus": 0.0,
        "idle_serenity_decay": 0.0,
        "vitality_decay_rate": 0.0,
        "stress_tolerance": 0.2,
        "event_impact_multiplier": 0.8,
        "connection_decay_rate": 0.01,
        "solo_serenity_bonus": 0.0,
        "social_connection_multiplier": 2.0,
        "social_serenity_bonus": 0.06,
        "work_vitality_recovery": 0.0,
        "initial_meters": {
            "vitality": 0.7, "cognition": 0.7, "progress": 0.0,
            "serenity": 0.7, "connection": 0.5,
        },
        "reward_weights": {
            "vitality": 0.05, "cognition": 0.05, "progress": 0.10,
            "serenity": 0.05, "connection": 0.75,
        },
    },
    {
        "name": "workaholic_stoic",
        "social_vitality_multiplier": 1.0,
        "morning_cognition_bonus": None,
        "evening_night_cognition_bonus": None,
        "morning_penalty": None,
        "binge_shame": False,
        "progress_serenity_bonus": 0.10,
        "idle_serenity_decay": 0.10,
        "vitality_decay_rate": 0.04,
        "stress_tolerance": 0.15,
        "event_impact_multiplier": 0.5,
        "connection_decay_rate": 0.02,
        "solo_serenity_bonus": 0.0,
        "social_connection_multiplier": 1.0,
        "social_serenity_bonus": 0.0,
        "work_vitality_recovery": 0.06,
        "initial_meters": {
            "vitality": 0.7, "cognition": 0.7, "progress": 0.0,
            "serenity": 0.7, "connection": 0.5,
        },
        "reward_weights": {
            "vitality": 0.05, "cognition": 0.05, "progress": 0.70,
            "serenity": 0.10, "connection": 0.10,
        },
    },
]

PROFILE_MAP: Dict[str, Dict[str, Any]] = {p["name"]: p for p in PROFILES}
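
# Orientation: each reference profile concentrates reward mass on one meter
# (introvert_morning: serenity 0.60; extrovert_night_owl: connection 0.75;
# workaholic_stoic: progress 0.70), so the optimal weekly schedule differs
# sharply across profiles.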

# Social actions for modifier checks
SOCIAL_ACTIONS = {"family_time", "socialize"}
IDLE_ACTIONS = {"me_time", "binge_watch", "sleep"}
WORK_ACTIONS = {"deep_work", "learn", "admin_work"}

# ---------------------------------------------------------------------------
# Continuous profile sampling (meta-RL training distribution)
# ---------------------------------------------------------------------------
#
# Hardcoded profiles are 3 fixed personalities -- memorizable, classification-like.
# Continuous sampling draws profile parameters from distributions per episode,
# making memorization impossible and forcing the agent to learn the *skill* of
# profile inference. This is the core meta-learning move.
#
# Belief vector dimensions (for Phase 3 cosine-similarity reward):
#   social_pref:  0 = hates social (introvert), 1 = loves social (extrovert)
#   morning_pref: 0 = night owl, 1 = morning person
#   work_pref:    0 = work-averse, 1 = workaholic

def sample_profile(seed: int) -> Dict[str, Any]:
    """Sample a continuous profile deterministically from a seed.

    Reward weights drawn from a Dirichlet biased toward non-infrastructure
    meters (progress, serenity, connection). Per-action modifiers drawn from
    bounded uniforms so any sampled profile is playable.
    """
    rng = random.Random(seed ^ 0xA3C5F729)

    # Reward weights via Dirichlet (alpha < 1 produces sparse weights -> "personalities")
    alphas = [0.5, 0.5, 1.5, 1.5, 1.5]  # [vit, cog, prog, ser, conn]
    raw = [rng.gammavariate(a, 1.0) for a in alphas]
    total = sum(raw)
    weights = [w / total for w in raw]
    # Cap each weight at 0.45 so every sampled profile weights 3+ meters
    # meaningfully. With a 0.80 cap, single-meter-dominant profiles made
    # SLEEP-spam (or any other single recovery action) genuinely optimal --
    # the env wasn't lying; the agent was right to spam. Forcing balance
    # makes belief inference matter for action selection.
    weights = [max(0.05, min(0.45, w)) for w in weights]
    total = sum(weights)
    weights = [w / total for w in weights]

    return {
        "name": f"sampled_{seed}",
        "social_vitality_multiplier": rng.uniform(0.2, 3.0),
        "morning_cognition_bonus": rng.uniform(0.4, 2.0) if rng.random() < 0.5 else None,
        "evening_night_cognition_bonus": rng.uniform(0.6, 1.8) if rng.random() < 0.5 else None,
        "morning_penalty": rng.uniform(0.4, 0.9) if rng.random() < 0.3 else None,
        "binge_shame": rng.random() < 0.5,
        "progress_serenity_bonus": rng.uniform(0.0, 0.10),
        "idle_serenity_decay": rng.uniform(0.0, 0.10),
        "vitality_decay_rate": rng.uniform(0.0, 0.04),
        "stress_tolerance": rng.uniform(0.15, 0.30),
        "event_impact_multiplier": rng.uniform(0.5, 1.0),
        "connection_decay_rate": rng.uniform(0.005, 0.02),
        "solo_serenity_bonus": rng.uniform(0.0, 0.10),
        "social_connection_multiplier": rng.uniform(1.0, 2.0),
        "social_serenity_bonus": rng.uniform(0.0, 0.06),
        "work_vitality_recovery": rng.uniform(0.0, 0.06),
        "initial_meters": {
            "vitality": 0.7, "cognition": 0.7, "progress": 0.0,
            "serenity": 0.7, "connection": 0.5,
        },
        "reward_weights": dict(zip(METERS, weights)),
    }
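
# Determinism sanity sketch (an illustrative helper, not part of the env
# API): the same seed must always yield the same personality, and the
# capped-then-renormalized reward weights must still sum to 1.
def _demo_sample_profile_determinism(seed: int = 42) -> None:
    p1, p2 = sample_profile(seed), sample_profile(seed)
    assert p1 == p2, "sample_profile must be a pure function of the seed"
    assert abs(sum(p1["reward_weights"].values()) - 1.0) < 1e-9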


def profile_to_belief_vector(profile: Dict[str, Any]) -> List[float]:
    """Reduce a profile to a 3-dim trait vector [social, morning, work] in [0, 1].

    Used as the ground-truth target for the agent's belief output (Phase 3).
    Aggregates the most-diagnostic modifiers per trait.
    """
    # social_pref: low if social drains vitality a lot, high if social bonuses are big
    # social_vitality_multiplier in [0.2, 3.0]: lower = more extroverted
    sm = profile.get("social_vitality_multiplier", 1.0)
    social_drain_norm = 1.0 - max(0.0, min(1.0, (sm - 0.2) / 2.8))  # invert
    scm = profile.get("social_connection_multiplier", 1.0)
    social_conn_norm = max(0.0, min(1.0, (scm - 1.0) / 1.0))
    ssb = profile.get("social_serenity_bonus", 0.0)
    social_ser_norm = max(0.0, min(1.0, ssb / 0.06))
    social_pref = 0.5 * social_drain_norm + 0.3 * social_conn_norm + 0.2 * social_ser_norm

    # morning_pref: high if morning bonus exists, low if morning penalty exists
    mcb = profile.get("morning_cognition_bonus")
    mp = profile.get("morning_penalty")
    morning_pref = 0.5
    if mcb is not None:
        morning_pref = 0.5 + 0.5 * max(0.0, min(1.0, (mcb - 0.4) / 1.6))
    if mp is not None:
        morning_pref = min(morning_pref, 0.5 - 0.5 * (1.0 - mp))

    # work_pref: high if work recovers vitality + progress gives serenity + progress weight high
    wvr = profile.get("work_vitality_recovery", 0.0)
    wvr_norm = max(0.0, min(1.0, wvr / 0.06))
    psb = profile.get("progress_serenity_bonus", 0.0)
    psb_norm = max(0.0, min(1.0, psb / 0.10))
    pw = profile.get("reward_weights", {}).get("progress", 0.2)
    pw_norm = max(0.0, min(1.0, (pw - 0.05) / 0.65))
    work_pref = 0.4 * wvr_norm + 0.3 * psb_norm + 0.3 * pw_norm

    return [
        max(0.0, min(1.0, social_pref)),
        max(0.0, min(1.0, morning_pref)),
        max(0.0, min(1.0, work_pref)),
    ]
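
# Worked mapping (derived from the constants above): the hardcoded
# introvert_morning profile reduces to roughly [0.00, 1.00, 0.07]:
# maximally social-averse (social_vitality_multiplier=3.0), fully
# morning-oriented (morning_cognition_bonus=2.0, no morning_penalty), and
# only mildly work-oriented (progress weight 0.20, no work bonuses).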


class RhythmEnvironment(Environment):
    """
    Life Simulator RL Environment.

    The agent manages 5 life meters (Vitality, Cognition, Progress, Serenity,
    Connection) across a 7-day week. Hidden personality profiles secretly
    control how actions affect meters and how reward is computed. The agent
    must discover these hidden dynamics through experience.

    Every observation includes:
      - Current meter values and temporal context
      - Last step's per-meter deltas as first-class fields
      - Anomaly signals: actual delta minus neutral-profile expectation
      - Rolling step_history (last 7 steps) with actions, rewards, deltas

    The final grade rewards profile-appropriate strategy via adaptation_score
    (25% of grade): late-half mean per-step reward minus early-half mean.
    """

    SUPPORTS_CONCURRENT_SESSIONS: bool = True

    def __init__(self) -> None:
        super().__init__()
        self._state = RhythmState()
        self._rng = random.Random(0)
        self._profile: Dict[str, Any] = PROFILES[0]
        # Meters
        self._vitality: float = 0.8
        self._cognition: float = 0.7
        self._progress: float = 0.0
        self._serenity: float = 0.7
        self._connection: float = 0.5
        # Tracking
        self._timestep: int = 0
        self._crash_count: int = 0
        self._total_reward: float = 0.0
        self._step_history: list = []
        self._step_rewards: list = []  # per-step rewards (for adaptation_score in grader)
        # Latest emitted belief vector β€” set by callers via record_belief() and
        # consumed by _grade_episode. Stays None if the agent never emits a belief
        # (e.g. heuristic baseline) β€” that case scores 0 on the belief component.
        self._final_belief: Optional[List[float]] = None
        # Lazy-built composed Rubric for episode grading. None until the first
        # `done=True` step; rebuilt only across env instances, not across episodes.
        self._grade_rubric: Optional[Any] = None

    def get_metadata(self) -> EnvironmentMetadata:
        return EnvironmentMetadata(
            name="RhythmEnv",
            description=(
                "Life Simulator β€” a holistic resource management RL environment "
                "where an agent balances 5 life meters across a 7-day week "
                "with hidden personality profiles."
            ),
            version="0.3.0",
        )

    # ------------------------------------------------------------------
    # reset
    # ------------------------------------------------------------------

    def reset(
        self,
        seed: Optional[int] = None,
        episode_id: Optional[str] = None,
        **kwargs: Any,
    ) -> RhythmObservation:
        # Determine seed. Note: hash() of a str is salted per process
        # (PYTHONHASHSEED), so episode_id-derived seeds are only stable
        # within a single process; pass an explicit seed for reproducibility.
        if seed is not None:
            effective_seed = seed
        else:
            effective_seed = hash(episode_id or str(uuid4())) & 0x7FFFFFFF

        self._rng = random.Random(effective_seed)

        # Profile selection -- two modes:
        #   1. Explicit hardcoded profile name -> one of the 3 reference profiles
        #      (used by tests + the legacy 3-profile eval condition)
        #   2. Default -> sampled continuous profile (meta-RL training distribution)
        profile_name = kwargs.get("profile")
        if profile_name and profile_name in PROFILE_MAP:
            self._profile = deepcopy(PROFILE_MAP[profile_name])
        else:
            self._profile = sample_profile(effective_seed)

        # Initialize meters from profile defaults
        initial = self._profile["initial_meters"]
        self._vitality = initial["vitality"]
        self._cognition = initial["cognition"]
        self._progress = initial["progress"]
        self._serenity = initial["serenity"]
        self._connection = initial["connection"]

        # Reset tracking
        self._timestep = 0
        self._crash_count = 0
        self._total_reward = 0.0
        self._step_history = []
        self._step_rewards = []
        self._final_belief = None

        self._state = RhythmState(
            episode_id=episode_id or str(uuid4()),
            step_count=0,
            profile_name=self._profile["name"],
            timestep=0,
            day=0,
            slot=0,
            vitality=self._vitality,
            cognition=self._cognition,
            progress=self._progress,
            serenity=self._serenity,
            connection=self._connection,
        )

        return self._make_observation(reward=0.0, done=False, active_event=None)

    # ------------------------------------------------------------------
    # step
    # ------------------------------------------------------------------

    def step(
        self,
        action: RhythmAction,
        timeout_s: Optional[float] = None,
        **kwargs: Any,
    ) -> RhythmObservation:
        # Save step number before incrementing (used for history record)
        current_step = self._timestep

        slot = self._timestep % SLOTS_PER_DAY
        day = self._timestep // SLOTS_PER_DAY
        action_name = action.action_type.value

        # --- 1. Roll and apply event ---
        active_event = self._roll_event()
        if active_event:
            self._apply_event(active_event)

        # --- 2. Get base action effects ---
        effects = dict(ACTION_EFFECTS[action_name])

        # --- 2b. Repetition dampening ---
        recent3 = [h["action"] for h in self._step_history[-3:]]
        repeat_count = recent3.count(action_name)
        if repeat_count > 0:
            dampening = 1.0 - 0.25 * repeat_count  # 0.75, 0.50, 0.25
            for meter in METERS:
                if effects[meter] > 0:
                    effects[meter] *= dampening
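        # e.g. by the fourth consecutive "sleep", all three recent history
        # entries match (repeat_count=3), so only 25% of its positive effects
        # survive: +0.20 vitality becomes +0.05. Negative effects are never
        # dampened, so spamming a single action stays costly.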

        # --- 3. Apply time-of-day multipliers (SLEEP bypasses) ---
        if action_name != "sleep":
            effects = self._apply_time_multipliers(effects, slot)

        # Snapshot expected effects here -- after time/dampening but BEFORE profile
        # modifiers. The anomaly = actual_delta - expected gives the agent a direct
        # per-step fingerprint of the hidden profile modifier.
        expected_no_profile = dict(effects)

        # --- 4. Apply profile modifiers ---
        effects = self._apply_profile_modifiers(effects, action_name, slot)

        # --- 5. Apply global vitality factor (low vitality reduces positive effects) ---
        vitality_factor = 0.5 + 0.5 * self._vitality
        for meter in METERS:
            if meter != "vitality" and effects[meter] > 0:
                effects[meter] *= vitality_factor
        # Apply same vitality factor to expected for fair anomaly comparison
        for meter in METERS:
            if meter != "vitality" and expected_no_profile[meter] > 0:
                expected_no_profile[meter] *= vitality_factor

        # --- 6. Apply passive decays ---
        self._apply_passive_decays()

        # --- 7. Update meters and track deltas ---
        deltas: Dict[str, float] = {}
        for meter in METERS:
            old_val = getattr(self, f"_{meter}")
            new_val = max(0.0, min(1.0, old_val + effects[meter]))
            deltas[meter] = new_val - old_val
            setattr(self, f"_{meter}", new_val)

        # --- 8. Compute reward ---
        reward = self._compute_reward(deltas)

        # --- 9. Check critical thresholds ---
        for meter in METERS:
            if getattr(self, f"_{meter}") < CRITICAL_THRESHOLD:
                reward += CRITICAL_PENALTY
                self._crash_count += 1

        # Clamp reward
        reward = max(-3.0, min(3.0, round(reward, 4)))
        self._total_reward += reward
        self._step_rewards.append(reward)

        # --- 10. Advance timestep ---
        self._timestep += 1
        new_day = self._timestep // SLOTS_PER_DAY
        new_slot = self._timestep % SLOTS_PER_DAY

        # --- 11. Check done ---
        done = self._timestep >= MAX_STEPS

        # --- 12. Build reward breakdown ---
        # Includes: per-meter deltas, per-meter anomalies (actual - expected),
        # event flag, and final_score on the last step.
        reward_breakdown: Dict[str, float] = {}
        for meter in METERS:
            reward_breakdown[f"{meter}_delta"] = round(deltas[meter], 4)
            reward_breakdown[f"{meter}_anomaly"] = round(
                deltas[meter] - expected_no_profile[meter], 4
            )
        if active_event:
            reward_breakdown["event"] = 1.0

        # --- 13. Grade if done ---
        if done:
            final_score = self._grade_episode()
            reward_breakdown["final_score"] = round(final_score, 4)
            # Sparse terminal reward: directly supervise on grader final_score.
            # Centered on 0.5 (the "average" episode), scaled by 5x to give a
            # range of [-2.5, +2.5] -- strong enough to dominate any local
            # reward-hack the agent might find on per-step shaping alone.
            terminal_bonus = (final_score - 0.5) * 5.0
            reward = max(-3.0, min(3.0, reward + terminal_bonus))
            self._total_reward += terminal_bonus  # update tracking too
            reward_breakdown["terminal_bonus"] = round(terminal_bonus, 4)

        # --- 14. Update state ---
        self._state.step_count = self._timestep
        self._state.timestep = self._timestep
        self._state.day = new_day
        self._state.slot = new_slot
        self._state.vitality = round(self._vitality, 4)
        self._state.cognition = round(self._cognition, 4)
        self._state.progress = round(self._progress, 4)
        self._state.serenity = round(self._serenity, 4)
        self._state.connection = round(self._connection, 4)
        self._state.active_event = active_event

        # --- 15. Append completed step to rolling history ---
        # History entries carry per-meter anomalies (actual - expected_under_neutral).
        # The prompt builder reads these directly to surface the agent's clearest
        # profile-inference signal.
        self._step_history.append({
            "step": current_step,
            "action": action_name,
            "reward": reward,
            "vitality_delta": round(deltas["vitality"], 4),
            "cognition_delta": round(deltas["cognition"], 4),
            "progress_delta": round(deltas["progress"], 4),
            "serenity_delta": round(deltas["serenity"], 4),
            "connection_delta": round(deltas["connection"], 4),
            "vitality_anomaly": round(deltas["vitality"] - expected_no_profile["vitality"], 4),
            "cognition_anomaly": round(deltas["cognition"] - expected_no_profile["cognition"], 4),
            "progress_anomaly": round(deltas["progress"] - expected_no_profile["progress"], 4),
            "serenity_anomaly": round(deltas["serenity"] - expected_no_profile["serenity"], 4),
            "connection_anomaly": round(deltas["connection"] - expected_no_profile["connection"], 4),
        })
        if len(self._step_history) > HISTORY_LENGTH:
            self._step_history.pop(0)

        return self._make_observation(
            reward=reward,
            done=done,
            active_event=active_event,
            reward_breakdown=reward_breakdown,
            deltas=deltas,
            last_action=action_name,
        )

    # ------------------------------------------------------------------
    # state property
    # ------------------------------------------------------------------

    @property
    def state(self) -> RhythmState:
        return self._state

    def get_belief_target(self) -> List[float]:
        """Return the 3-dim ground-truth belief vector for the active profile.

        Used during training to compute belief-accuracy reward (Phase 3).
        Privileged information β€” not exposed via observation.
        """
        return profile_to_belief_vector(self._profile)

    def record_belief(self, belief: List[float]) -> None:
        """Record the agent's emitted belief for the current step.

        The grader (`_grade_episode`) uses the LAST recorded belief to compute
        the belief_accuracy component of final_score. Callers should invoke
        this once per step after parsing the agent's completion. Heuristic /
        random baselines that don't emit a belief never call this, and the
        belief component scores 0 for them -- that's intentional: the meta-RL
        skill is INFERENCE, and only agents that actually try get credit.
        """
        if len(belief) != 3:
            raise ValueError(f"belief must have 3 elements, got {len(belief)}")
        self._final_belief = [max(0.0, min(1.0, float(b))) for b in belief]
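
    # Usage sketch (hypothetical caller, outside this class): a rollout loop
    # that parses an agent completion such as {"belief": [0.1, 0.9, 0.3]}
    # would call env.record_belief(parsed["belief"]) once per step; only the
    # last recorded value reaches the grader.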

    def get_profile_hint(self) -> Dict[str, float]:
        """Return a coarse profile hint usable in observation during curriculum.

        Returns the 3-dim belief vector with descriptive keys. The dataset
        generator passes this into the prompt for the fraction of samples
        with show_profile_hint=True (the curriculum's "visible" warmup phase).
        """
        b = profile_to_belief_vector(self._profile)
        return {"social_pref": round(b[0], 3), "morning_pref": round(b[1], 3), "work_pref": round(b[2], 3)}

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _roll_event(self) -> Optional[str]:
        """Roll for a random event this step."""
        if self._rng.random() < EVENT_PROBABILITY:
            return self._rng.choice(EVENT_NAMES)
        return None

    def _apply_event(self, event_name: str) -> None:
        """Apply event effects to meters, modified by profile."""
        effects = EVENT_EFFECTS[event_name]
        multiplier = self._profile["event_impact_multiplier"]
        for meter in METERS:
            delta = effects[meter]
            # Only apply multiplier to negative effects
            if delta < 0:
                delta *= multiplier
            old_val = getattr(self, f"_{meter}")
            new_val = max(0.0, min(1.0, old_val + delta))
            setattr(self, f"_{meter}", new_val)

    def _apply_time_multipliers(
        self, effects: Dict[str, float], slot: int
    ) -> Dict[str, float]:
        """Apply time-of-day multipliers to action effects."""
        multipliers = TIME_MULTIPLIERS[slot]

        for meter in effects:
            if meter == "cognition" and effects[meter] > 0:
                effects[meter] *= multipliers["cognition_gain"]
            elif meter == "vitality" and effects[meter] < 0:
                effects[meter] *= multipliers["vitality_drain"]

        return effects

    def _apply_profile_modifiers(
        self, effects: Dict[str, float], action_name: str, slot: int
    ) -> Dict[str, float]:
        """Apply hidden profile modifiers to action effects."""
        profile = self._profile

        # Social vitality drain multiplier
        if action_name in SOCIAL_ACTIONS and effects["vitality"] < 0:
            effects["vitality"] *= profile["social_vitality_multiplier"]

        # Introvert morning cognition bonus
        bonus = profile.get("morning_cognition_bonus")
        if bonus and slot == 0:
            if effects["cognition"] > 0:
                effects["cognition"] *= bonus
            if effects["progress"] > 0:
                effects["progress"] *= bonus

        # Extrovert evening/night cognition bonus
        bonus = profile.get("evening_night_cognition_bonus")
        if bonus and slot in (2, 3):
            if effects["cognition"] > 0:
                effects["cognition"] *= bonus
            if effects["progress"] > 0:
                effects["progress"] *= bonus

        # Extrovert morning penalty
        penalty = profile.get("morning_penalty")
        if penalty and slot == 0:
            if effects["cognition"] > 0:
                effects["cognition"] *= penalty
            if effects["progress"] > 0:
                effects["progress"] *= penalty

        # Binge shame spiral
        if profile.get("binge_shame") and action_name == "binge_watch":
            effects["serenity"] -= 0.15
            effects["cognition"] -= 0.06

        # Workaholic: progress-producing actions give serenity bonus
        psb = profile.get("progress_serenity_bonus", 0.0)
        if psb > 0 and effects["progress"] > 0:
            effects["serenity"] += psb

        # Workaholic: idle actions drain serenity
        isd = profile.get("idle_serenity_decay", 0.0)
        if isd > 0 and action_name in IDLE_ACTIONS:
            effects["serenity"] -= isd

        # Solo recharge: introvert gets serenity from alone time
        ssb = profile.get("solo_serenity_bonus", 0.0)
        if ssb > 0 and action_name in ("me_time", "meditate"):
            effects["serenity"] += ssb

        # Social connection multiplier: extrovert gets more connection from socializing
        scm = profile.get("social_connection_multiplier", 1.0)
        if scm != 1.0 and action_name in SOCIAL_ACTIONS and effects["connection"] > 0:
            effects["connection"] *= scm

        # Social serenity bonus: extrovert gets serenity from socializing
        ssrb = profile.get("social_serenity_bonus", 0.0)
        if ssrb > 0 and action_name in SOCIAL_ACTIONS:
            effects["serenity"] += ssrb

        # Work vitality recovery: workaholic gets vitality from productive work
        wvr = profile.get("work_vitality_recovery", 0.0)
        if wvr > 0 and action_name in WORK_ACTIONS:
            effects["vitality"] += wvr

        # Low serenity amplification (stress spiral)
        if self._serenity < profile.get("stress_tolerance", 0.3):
            for meter in effects:
                if effects[meter] < 0:
                    effects[meter] *= 1.3

        return effects

    def _apply_passive_decays(self) -> None:
        """Apply per-step passive meter decays."""
        # Connection always decays if not actively maintained
        decay = self._profile["connection_decay_rate"]
        self._connection = max(0.0, self._connection - decay)

        # Workaholic extra vitality decay
        vd = self._profile.get("vitality_decay_rate", 0.0)
        if vd > 0:
            self._vitality = max(0.0, self._vitality - vd)

    def _compute_reward(self, deltas: Dict[str, float]) -> float:
        """Pure profile-weighted per-step reward.

        Deliberately uncontaminated: the grader-aligned bias (progress +
        connection deltas) lives in the TRAINING reward function in
        reward_functions.py, not here. Keeping the env's per-step reward
        pure means (1) the agent's inference signal stays a clean function
        of the hidden reward_weights, (2) the grader's adaptation_score
        isn't computed on biased rewards, and (3) the env's reward matches
        what an honest deployment would surface to the agent.
        """
        weights = self._profile["reward_weights"]
        return sum(deltas[m] * weights[m] for m in METERS) * REWARD_SCALE

    def _grade_episode(self) -> float:
        """
        Compute final episode score in [0, 1].

        Components (meta-learning aligned):
          0.15 -- crash_free_ratio: no critical meter drops
          0.20 -- progress: career/skill growth
          0.10 -- connection: relationship maintained
          0.25 -- adaptation_score: agent got better as it learned the user
          0.10 -- efficiency: bounded normalized average reward
          0.20 -- belief_accuracy: how close the last-emitted belief is to the true profile

        DESIGN NOTE -- relation to OpenEnv's Rubric API:
        This grader has the shape of a `WeightedSum` Rubric (from
        `openenv.core.rubrics`) over 6 child Rubrics -- same composability,
        same independent components, same explicit weights -- but it scores
        aggregated episode-end state (the per-step rewards buffer,
        crash_count, terminal belief) rather than the per-(action,
        observation) inputs `Rubric.forward` expects.

        Implementation: composes 6 `Rubric` subclasses via OpenEnv's
        `WeightedSum` (see `server/rubrics.py`). Each sub-rubric reads
        the aggregated episode state (`_step_rewards`, `_crash_count`,
        `_final_belief`, `_profile`) of the env it was built with --
        RFC 004's recommended pattern for trajectory-summary scoring on
        top of the per-(action, observation) Rubric ABC.

        belief_accuracy is the explicit meta-RL inference signal: an
        agent that doesn't emit a belief scores 0 here, an agent emitting
        a belief close to the hidden profile vector scores up to 1.
        Without this term, agents that play heuristic-style "keep meters
        healthy" score the same as agents that actually infer the profile,
        since the other components don't differentiate inference from
        reflex.
        """
        from server.rubrics import make_grade_rubric

        # Build (or reuse) the composed rubric. The Rubric subclasses are
        # stateless once built -- they read live env state at forward()
        # time -- so caching is safe.
        if self._grade_rubric is None:
            self._grade_rubric = make_grade_rubric(self)

        # forward(action, observation) -- args are unused for episode-end
        # scoring; the rubric reads from `self`.
        score = self._grade_rubric(action=None, observation=None)
        return max(0.0, min(1.0, float(score)))

    def _make_observation(
        self,
        reward: float,
        done: bool,
        active_event: Optional[str],
        reward_breakdown: Optional[Dict[str, float]] = None,
        deltas: Optional[Dict[str, float]] = None,
        last_action: Optional[str] = None,
    ) -> RhythmObservation:
        """Build the observation returned to the agent (hides profile)."""
        step_records = [
            StepRecord(
                step=h["step"],
                action=h["action"],
                reward=h["reward"],
                vitality_delta=h["vitality_delta"],
                cognition_delta=h["cognition_delta"],
                progress_delta=h["progress_delta"],
                serenity_delta=h["serenity_delta"],
                connection_delta=h["connection_delta"],
                vitality_anomaly=h.get("vitality_anomaly", 0.0),
                cognition_anomaly=h.get("cognition_anomaly", 0.0),
                progress_anomaly=h.get("progress_anomaly", 0.0),
                serenity_anomaly=h.get("serenity_anomaly", 0.0),
                connection_anomaly=h.get("connection_anomaly", 0.0),
            )
            for h in self._step_history
        ]

        return RhythmObservation(
            timestep=self._timestep,
            day=self._timestep // SLOTS_PER_DAY,
            slot=self._timestep % SLOTS_PER_DAY,
            vitality=round(self._vitality, 4),
            cognition=round(self._cognition, 4),
            progress=round(self._progress, 4),
            serenity=round(self._serenity, 4),
            connection=round(self._connection, 4),
            active_event=active_event,
            remaining_steps=MAX_STEPS - self._timestep,
            reward_breakdown=reward_breakdown or {},
            reward=reward,
            done=done,
            # First-class delta fields (from this step; zero on reset)
            vitality_delta=round(deltas["vitality"], 4) if deltas else 0.0,
            cognition_delta=round(deltas["cognition"], 4) if deltas else 0.0,
            progress_delta=round(deltas["progress"], 4) if deltas else 0.0,
            serenity_delta=round(deltas["serenity"], 4) if deltas else 0.0,
            connection_delta=round(deltas["connection"], 4) if deltas else 0.0,
            last_action=last_action,
            # Rolling history of the last HISTORY_LENGTH completed steps
            step_history=step_records,
        )
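

if __name__ == "__main__":
    # Illustrative smoke rollout -- a sketch, not a test suite. Assumes the
    # ActionType enum exposes a SLEEP member whose .value is the "sleep" key
    # in ACTION_EFFECTS, and that RhythmAction accepts it as `action_type`.
    # Stops well short of MAX_STEPS so the episode-end grader (which imports
    # server.rubrics) is never invoked.
    env = RhythmEnvironment()
    obs = env.reset(seed=7)
    for _ in range(5):
        obs = env.step(RhythmAction(action_type=ActionType.SLEEP))
        print(
            f"t={obs.timestep} reward={obs.reward:+.3f} "
            f"vitality={obs.vitality:.2f} serenity={obs.serenity:.2f}"
        )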