File size: 19,753 Bytes
c07f15e
 
 
 
 
 
cc6473a
c07f15e
 
 
cc6473a
c07f15e
cc6473a
c07f15e
cc6473a
 
c07f15e
 
 
 
 
 
 
 
 
cc6473a
 
 
 
c07f15e
cc6473a
c07f15e
 
cc6473a
c07f15e
cc6473a
 
 
c07f15e
cc6473a
 
 
c07f15e
 
 
cc6473a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c07f15e
 
cc6473a
 
 
 
c07f15e
cc6473a
c07f15e
 
 
cc6473a
c07f15e
 
cc6473a
c07f15e
cc6473a
 
 
c07f15e
 
cc6473a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c07f15e
cc6473a
 
 
 
 
 
 
 
 
 
c07f15e
 
cc6473a
 
 
c07f15e
cc6473a
c07f15e
cc6473a
 
 
 
 
c07f15e
cc6473a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c07f15e
 
 
cc6473a
c07f15e
 
cc6473a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c07f15e
cc6473a
 
c07f15e
cc6473a
 
c07f15e
cc6473a
 
c07f15e
cc6473a
c07f15e
cc6473a
 
 
 
 
 
 
 
 
 
 
 
 
 
c07f15e
cc6473a
 
 
 
 
 
c07f15e
cc6473a
 
 
 
 
 
 
 
 
c07f15e
 
cc6473a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c07f15e
cc6473a
 
 
c07f15e
cc6473a
 
 
 
 
 
 
c07f15e
 
 
cc6473a
c07f15e
 
cc6473a
c07f15e
cc6473a
 
 
 
 
 
 
 
ece0bbe
cc6473a
 
ecbe0d8
 
ece0bbe
ecbe0d8
 
ece0bbe
ecbe0d8
cc6473a
 
 
 
 
 
 
 
ece0bbe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f0ca22d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

"""Tests for the RhythmEnv Life Simulator."""

import pytest

from models import ActionType, RhythmAction, RhythmObservation
from server.rhythm_environment import (
    CRITICAL_THRESHOLD,
    MAX_STEPS,
    METERS,
    PROFILES,
    RhythmEnvironment,
)


@pytest.fixture
def env():
    """Provide a newly constructed ``RhythmEnvironment`` for a test."""
    fresh_environment = RhythmEnvironment()
    return fresh_environment


def make_action(action_type: ActionType) -> RhythmAction:
    """Wrap *action_type* in a ``RhythmAction`` suitable for ``env.step``."""
    action = RhythmAction(action_type=action_type)
    return action


# ---------------------------------------------------------------------------
# TestReset
# ---------------------------------------------------------------------------


class TestReset:
    """Behaviour of ``reset``: initial observation, seeding and state wipe."""

    def test_returns_valid_observation(self, env):
        """Reset returns a ``RhythmObservation`` positioned at episode start."""
        initial = env.reset(seed=0)
        assert isinstance(initial, RhythmObservation)
        # Clock fields start at zero with the whole step budget remaining.
        assert initial.timestep == 0
        assert initial.day == 0
        assert initial.slot == 0
        assert initial.remaining_steps == MAX_STEPS
        # No step has been taken: episode still open, no reward yet.
        assert initial.done is False
        assert initial.reward == 0.0

    def test_meters_initialized(self, env):
        """Starting meters lie in [0, 1]; progress starts at exactly zero."""
        initial = env.reset(seed=0)
        for meter_name in ("vitality", "cognition"):
            assert 0.0 <= getattr(initial, meter_name) <= 1.0
        assert initial.progress == 0.0
        for meter_name in ("serenity", "connection"):
            assert 0.0 <= getattr(initial, meter_name) <= 1.0

    def test_seed_selects_profile(self, env):
        """Seeds 0, 1 and 2 each yield a distinct ``profile_name``."""
        distinct_names = set()
        for seed in (0, 1, 2):
            env.reset(seed=seed)
            distinct_names.add(env.state.profile_name)
        assert len(distinct_names) == 3

    def test_deterministic_with_same_seed(self, env):
        """Two resets with the same seed report identical starting meters."""
        first = env.reset(seed=42)
        second = env.reset(seed=42)
        for meter_name in ("vitality", "cognition", "serenity", "connection"):
            assert getattr(first, meter_name) == getattr(second, meter_name)

    def test_explicit_profile_selection(self, env):
        """Passing ``profile=`` pins the profile recorded in the state."""
        requested = "workaholic_stoic"
        env.reset(seed=0, profile=requested)
        assert env.state.profile_name == requested

    def test_reset_clears_state(self, env):
        """Resetting after several steps rewinds timestep and progress."""
        env.reset(seed=0)
        steps_before_reset = 5
        for _ in range(steps_before_reset):
            env.step(make_action(ActionType.DEEP_WORK))
        restarted = env.reset(seed=0)
        assert restarted.timestep == 0
        assert restarted.progress == 0.0


# ---------------------------------------------------------------------------
# TestStep
# ---------------------------------------------------------------------------


class TestStep:
    """Per-step dynamics: clock advancement, meter effects and termination."""

    def test_timestep_advances(self, env):
        """A single step moves the timestep counter to 1."""
        env.reset(seed=0)
        after_one = env.step(make_action(ActionType.DEEP_WORK))
        assert after_one.timestep == 1

    def test_day_and_slot_correct(self, env):
        """After five steps the observation reports day 1, slot 1."""
        env.reset(seed=0)
        observations = [
            env.step(make_action(ActionType.SLEEP)) for _ in range(5)
        ]
        latest = observations[-1]
        assert latest.day == 1
        assert latest.slot == 1

    def test_deep_work_increases_progress(self, env):
        """One DEEP_WORK step leaves progress strictly above zero."""
        env.reset(seed=0)
        result = env.step(make_action(ActionType.DEEP_WORK))
        assert result.progress > 0.0

    def test_deep_work_drains_vitality(self, env):
        """Vitality after one DEEP_WORK step is below its starting value."""
        env.reset(seed=0)
        starting_vitality = env.state.vitality
        result = env.step(make_action(ActionType.DEEP_WORK))
        assert result.vitality < starting_vitality

    def test_sleep_recovers_vitality(self, env):
        """SLEEP after three DEEP_WORK steps raises vitality."""
        env.reset(seed=0)
        for _ in range(3):
            env.step(make_action(ActionType.DEEP_WORK))
        tired_vitality = env.state.vitality
        rested = env.step(make_action(ActionType.SLEEP))
        assert rested.vitality > tired_vitality

    def test_socialize_builds_connection(self, env):
        """Connection after FAMILY_TIME stays above the start minus 0.05."""
        # NOTE(review): despite the test name the action is FAMILY_TIME, and
        # the 0.05 slack lets this pass even if connection dips slightly --
        # confirm that both are intentional.
        env.reset(seed=0)
        starting_connection = env.state.connection
        result = env.step(make_action(ActionType.FAMILY_TIME))
        lower_bound = starting_connection - 0.05
        assert result.connection > lower_bound

    def test_episode_ends_at_max_steps(self, env):
        """The MAX_STEPS-th step is flagged done and reports that timestep."""
        env.reset(seed=0)
        for _ in range(MAX_STEPS):
            last = env.step(make_action(ActionType.SLEEP))
        assert last.done is True
        assert last.timestep == MAX_STEPS

    def test_not_done_before_max_steps(self, env):
        """One step short of MAX_STEPS the episode is still open."""
        env.reset(seed=0)
        for _ in range(MAX_STEPS - 1):
            last = env.step(make_action(ActionType.SLEEP))
        assert last.done is False

    def test_meters_stay_in_bounds(self, env):
        """Every meter in METERS stays within [0.0, 1.0] under DEEP_WORK only."""
        env.reset(seed=0)
        for _ in range(MAX_STEPS):
            snapshot = env.step(make_action(ActionType.DEEP_WORK))
            for meter in METERS:
                val = getattr(snapshot, meter)
                assert 0.0 <= val <= 1.0, f"{meter}={val} out of bounds"

    def test_low_vitality_reduces_effectiveness(self, env):
        """First DEEP_WORK gains more progress than the seventh in a row."""
        profile = "introvert_morning"

        # Fresh episode: progress starts at 0, so the reading is the gain.
        env.reset(seed=0, profile=profile)
        fresh_gain = env.step(make_action(ActionType.DEEP_WORK)).progress

        # Same episode again, but wear the agent down with six work steps
        # before measuring the gain of the seventh.
        env.reset(seed=0, profile=profile)
        for _ in range(6):
            env.step(make_action(ActionType.DEEP_WORK))
        progress_at_start = env.state.progress
        env.step(make_action(ActionType.DEEP_WORK))
        worn_gain = env.state.progress - progress_at_start

        assert fresh_gain > worn_gain


# ---------------------------------------------------------------------------
# TestProfiles
# ---------------------------------------------------------------------------


class TestProfiles:
    """Profile-specific responses to the same action."""

    @staticmethod
    def _before_after(environment, profile, action_type, meter):
        """Reset to *profile* (seed 0), take one step, return the meter pair.

        Returns ``(before, after)`` read from ``environment.state``.
        """
        environment.reset(seed=0, profile=profile)
        before = getattr(environment.state, meter)
        environment.step(make_action(action_type))
        after = getattr(environment.state, meter)
        return before, after

    def test_introvert_social_drains_more(self, env):
        """SOCIALIZE costs the introvert more vitality than the extrovert."""
        intro_before, intro_after = self._before_after(
            env, "introvert_morning", ActionType.SOCIALIZE, "vitality"
        )
        intro_drain = intro_before - intro_after

        extro_before, extro_after = self._before_after(
            RhythmEnvironment(),
            "extrovert_night_owl",
            ActionType.SOCIALIZE,
            "vitality",
        )
        extro_drain = extro_before - extro_after

        assert intro_drain > extro_drain

    def test_workaholic_progress_gives_serenity(self, env):
        """DEEP_WORK moves serenity more favourably for the workaholic."""
        work_before, work_after = self._before_after(
            env, "workaholic_stoic", ActionType.DEEP_WORK, "serenity"
        )
        workaholic_delta = work_after - work_before

        intro_before, intro_after = self._before_after(
            RhythmEnvironment(),
            "introvert_morning",
            ActionType.DEEP_WORK,
            "serenity",
        )
        introvert_delta = intro_after - intro_before

        assert workaholic_delta > introvert_delta

    def test_binge_shame_introvert(self, env):
        """BINGE_WATCH moves serenity lower for introvert than extrovert."""
        intro_before, intro_after = self._before_after(
            env, "introvert_morning", ActionType.BINGE_WATCH, "serenity"
        )
        introvert_delta = intro_after - intro_before

        extro_before, extro_after = self._before_after(
            RhythmEnvironment(),
            "extrovert_night_owl",
            ActionType.BINGE_WATCH,
            "serenity",
        )
        extrovert_delta = extro_after - extro_before

        assert introvert_delta < extrovert_delta

    def test_different_rewards_same_action(self, env):
        """One DEEP_WORK step does not give every profile the same reward."""
        reward_by_profile = {}
        profile_names = (
            "introvert_morning",
            "extrovert_night_owl",
            "workaholic_stoic",
        )
        for name in profile_names:
            environment = RhythmEnvironment()
            environment.reset(seed=0, profile=name)
            outcome = environment.step(make_action(ActionType.DEEP_WORK))
            reward_by_profile[name] = outcome.reward

        # Compare at three decimals so float noise is not counted as a
        # difference.
        distinct_rewards = {
            round(reward, 3) for reward in reward_by_profile.values()
        }
        assert len(distinct_rewards) > 1

    def test_extrovert_night_cognition_bonus(self, env):
        """MEDITATE raises cognition more on the third step than the first."""
        profile = "extrovert_night_owl"

        # Third slot of the day: burn the first two slots sleeping.
        env.reset(seed=0, profile=profile)
        env.step(make_action(ActionType.SLEEP))  # morning
        env.step(make_action(ActionType.SLEEP))  # afternoon
        cognition_pre_evening = env.state.cognition
        env.step(make_action(ActionType.MEDITATE))  # evening
        evening_gain = env.state.cognition - cognition_pre_evening

        # First slot of the day: meditate straight after reset.
        env.reset(seed=0, profile=profile)
        cognition_pre_morning = env.state.cognition
        env.step(make_action(ActionType.MEDITATE))  # morning
        morning_gain = env.state.cognition - cognition_pre_morning

        assert evening_gain > morning_gain


# ---------------------------------------------------------------------------
# TestEvents
# ---------------------------------------------------------------------------


class TestEvents:
    """Random events surfaced through ``active_event`` on observations."""

    def test_events_deterministic_with_seed(self, env):
        """Two SLEEP-only runs with seed 99 report the same event sequence."""

        def record_events():
            # Replay a full SLEEP-only episode and log each step's event.
            env.reset(seed=99)
            return [
                env.step(make_action(ActionType.SLEEP)).active_event
                for _ in range(MAX_STEPS)
            ]

        first_run = record_events()
        second_run = record_events()
        assert first_run == second_run

    def test_event_visible_in_observation(self, env):
        """Some seed in 0..99 fires an event, and it has a known name."""
        first_event = None
        for seed in range(100):
            env.reset(seed=seed)
            for _ in range(MAX_STEPS):
                snapshot = env.step(make_action(ActionType.SLEEP))
                if snapshot.active_event is not None:
                    first_event = snapshot.active_event
                    break
            # Stop scanning seeds as soon as one event has been captured.
            if first_event is not None:
                break

        if first_event is not None:
            assert first_event in [
                "prod_crash", "family_emergency", "illness", "good_news"
            ]
        assert first_event is not None, "No events triggered in 100 episodes"

    def test_no_event_when_none(self, env):
        """More than 70% of steps in a seed-0 episode carry no event."""
        env.reset(seed=0)
        quiet_steps = sum(
            1
            for _ in range(MAX_STEPS)
            if env.step(make_action(ActionType.SLEEP)).active_event is None
        )
        assert quiet_steps > MAX_STEPS * 0.7


# ---------------------------------------------------------------------------
# TestGrader
# ---------------------------------------------------------------------------


class TestGrader:
    """End-of-episode grading via ``reward_breakdown["final_score"]``.

    Every test plays exactly ``MAX_STEPS`` steps and reads the score from the
    last observation's ``reward_breakdown``.
    """

    def test_final_score_in_range(self, env):
        """A SLEEP-only episode ends with a ``final_score`` inside [0, 1]."""
        env.reset(seed=0)
        for _ in range(MAX_STEPS):
            obs = env.step(make_action(ActionType.SLEEP))
        assert "final_score" in obs.reward_breakdown
        score = obs.reward_breakdown["final_score"]
        assert 0.0 <= score <= 1.0

    def test_balanced_play_beats_all_sleep(self, env):
        """A balanced strategy should score higher than just sleeping."""
        env.reset(seed=0)
        for _ in range(MAX_STEPS):
            obs_sleep = env.step(make_action(ActionType.SLEEP))
        score_sleep = obs_sleep.reward_breakdown["final_score"]

        # Repeat the four-action rotation for exactly MAX_STEPS steps rather
        # than a hard-coded 7 repetitions, so the episode always runs to its
        # end (and never past it) whatever the configured length is.
        rotation = (
            ActionType.DEEP_WORK,
            ActionType.LEARN,
            ActionType.EXERCISE,
            ActionType.FAMILY_TIME,
        )
        balanced_actions = [
            rotation[step % len(rotation)] for step in range(MAX_STEPS)
        ]
        env.reset(seed=0)
        for action_type in balanced_actions:
            obs_balanced = env.step(make_action(action_type))
        score_balanced = obs_balanced.reward_breakdown["final_score"]

        assert score_balanced > score_sleep

    def test_deterministic_grading(self, env):
        """Same actions produce same final score."""
        scores = []
        for _episode in range(2):
            env.reset(seed=42)
            # Distinct loop names keep the inner loop from rebinding the
            # outer loop's variable.
            for _step in range(MAX_STEPS):
                obs = env.step(make_action(ActionType.DEEP_WORK))
            scores.append(obs.reward_breakdown["final_score"])
        assert scores[0] == scores[1]

    def test_all_binge_scores_low(self, env):
        """Binge watching everything should produce a score below 0.5."""
        env.reset(seed=0)
        for _ in range(MAX_STEPS):
            obs = env.step(make_action(ActionType.BINGE_WATCH))
        score = obs.reward_breakdown["final_score"]
        assert score < 0.5


# ---------------------------------------------------------------------------
# TestEdgeCases
# ---------------------------------------------------------------------------


class TestEdgeCases:
    """Profile visibility and action-type coverage."""

    def test_observation_hides_profile(self, env):
        """The dumped observation has no ``profile_name`` key."""
        dumped = env.reset(seed=0).model_dump()
        assert "profile_name" not in dumped

    def test_state_exposes_profile(self, env):
        """``state.profile_name`` is populated for debugging."""
        # Without an explicit profile the name is non-empty and carries the
        # 'sampled_' prefix (e.g. 'sampled_0').
        env.reset(seed=0)
        sampled_name = env.state.profile_name
        assert sampled_name != ""
        assert env.state.profile_name.startswith("sampled_")

        # With an explicit profile the name echoes the request and is one of
        # the reference profiles listed in PROFILES.
        env.reset(seed=0, profile="workaholic_stoic")
        assert env.state.profile_name == "workaholic_stoic"
        reference_names = [entry["name"] for entry in PROFILES]
        assert env.state.profile_name in reference_names

    def test_all_action_types_valid(self, env):
        """Each ``ActionType`` member steps a fresh env without error."""
        env.reset(seed=0)
        for candidate in ActionType:
            # A new environment per action keeps the cases independent.
            sandbox = RhythmEnvironment()
            sandbox.reset(seed=0)
            outcome = sandbox.step(make_action(candidate))
            assert isinstance(outcome, RhythmObservation)


# ---------------------------------------------------------------------------
# Belief-accuracy grader component
# ---------------------------------------------------------------------------


class TestBeliefAccuracyGrader:
    """Belief-accuracy share of the final score.

    These tests expect the belief component to be worth 0.20 of
    ``final_score``: an agent that never records a belief earns nothing from
    it, while a final belief equal to the true profile vector adds about 0.20.
    """

    @staticmethod
    def _begin_episode(seed, profile):
        """Build a fresh env, reset it, and return ``(env, first_obs)``.

        ``profile`` is forwarded to ``reset`` only when it is truthy.
        """
        environment = RhythmEnvironment()
        if profile:
            first_obs = environment.reset(seed=seed, profile=profile)
        else:
            first_obs = environment.reset(seed=seed)
        return environment, first_obs

    def _run_episode_with_belief(self, seed, belief, profile=None):
        """Play a SLEEP-only episode, recording *belief* before each step.

        Returns the last observation's ``final_score`` (0.0 if absent).
        """
        environment, obs = self._begin_episode(seed, profile)
        steps_taken = 0
        while steps_taken < MAX_STEPS and not obs.done:
            environment.record_belief(belief)
            obs = environment.step(make_action(ActionType.SLEEP))
            steps_taken += 1
        return obs.reward_breakdown.get("final_score", 0.0)

    def _run_episode_no_belief(self, seed, profile=None):
        """Play a SLEEP-only episode without ever recording a belief.

        Returns the last observation's ``final_score`` (0.0 if absent).
        """
        environment, obs = self._begin_episode(seed, profile)
        steps_taken = 0
        while steps_taken < MAX_STEPS and not obs.done:
            obs = environment.step(make_action(ActionType.SLEEP))
            steps_taken += 1
        return obs.reward_breakdown.get("final_score", 0.0)

    def test_no_belief_means_zero_belief_component(self, env):
        """Without any recorded belief the score cannot exceed 0.80."""
        belief_free_score = self._run_episode_no_belief(seed=42)
        # 0.80 is the sum of all non-belief weights; a SLEEP-only run is
        # expected to land well below it since it does not max the meters.
        assert belief_free_score <= 0.80

    def test_perfect_belief_lifts_score(self, env):
        """Recording the true belief vector lifts the score by about 0.20."""
        # A named reference profile lets the exact belief vector be derived.
        from server.rhythm_environment import (
            PROFILE_MAP,
            profile_to_belief_vector,
        )
        reference = "workaholic_stoic"
        exact_belief = profile_to_belief_vector(PROFILE_MAP[reference])

        baseline = self._run_episode_no_belief(seed=7, profile=reference)
        with_exact_belief = self._run_episode_with_belief(
            seed=7, belief=exact_belief, profile=reference
        )
        # Identical actions, so the whole gap is the belief component.
        assert with_exact_belief > baseline
        gap = with_exact_belief - baseline
        assert gap == pytest.approx(0.20, abs=0.01)

    def test_wrong_belief_scores_less_than_perfect(self, env):
        """The mirrored (1 - b) belief scores below the true belief."""
        from server.rhythm_environment import (
            PROFILE_MAP,
            profile_to_belief_vector,
        )
        reference = "introvert_morning"
        exact_belief = profile_to_belief_vector(PROFILE_MAP[reference])
        # Flip every component to build the opposite belief.
        mirrored_belief = [1.0 - component for component in exact_belief]

        score_exact = self._run_episode_with_belief(
            seed=7, belief=exact_belief, profile=reference
        )
        score_mirrored = self._run_episode_with_belief(
            seed=7, belief=mirrored_belief, profile=reference
        )
        assert score_exact > score_mirrored

    def test_record_belief_validates_length(self, env):
        """Belief vectors of length 2 or 4 raise ``ValueError``."""
        env.reset(seed=0)
        too_short = [0.5, 0.5]
        too_long = [0.5, 0.5, 0.5, 0.5]
        with pytest.raises(ValueError):
            env.record_belief(too_short)
        with pytest.raises(ValueError):
            env.record_belief(too_long)

    def test_record_belief_clamps_to_unit_interval(self, env):
        """Components outside [0, 1] are clamped rather than rejected."""
        env.reset(seed=0)
        env.record_belief([-0.5, 1.5, 0.5])
        # The stored belief holds the clamped values.
        expected_stored = [0.0, 1.0, 0.5]
        assert env._final_belief == expected_stored

    def test_grader_uses_openenv_weighted_sum_rubric(self, env):
        """The grade rubric is a ``WeightedSum`` over six child rubrics."""
        from openenv.core.rubrics import Rubric, WeightedSum
        from server.rubrics import (
            CrashFreeRubric, ProgressRubric, ConnectionRubric,
            AdaptationRubric, EfficiencyRubric, BeliefAccuracyRubric,
            GRADE_WEIGHTS, make_grade_rubric,
        )

        # Play a whole episode so that grading runs and the rubric is built.
        obs = env.reset(seed=0)
        steps_taken = 0
        while steps_taken < MAX_STEPS and not obs.done:
            obs = env.step(make_action(ActionType.SLEEP))
            steps_taken += 1

        grade_rubric = env._grade_rubric
        assert isinstance(grade_rubric, WeightedSum), "grader must use WeightedSum"
        assert isinstance(grade_rubric, Rubric)

        # One child per scoring component, six in total.
        child_rubrics = list(grade_rubric.children())
        assert len(child_rubrics) == 6
        child_type_names = {type(child).__name__ for child in child_rubrics}
        expected_type_names = {
            "CrashFreeRubric", "ProgressRubric", "ConnectionRubric",
            "AdaptationRubric", "EfficiencyRubric", "BeliefAccuracyRubric",
        }
        assert child_type_names == expected_type_names

        # Sanity check that the configured weights add up to 1.0.
        weight_total = sum(GRADE_WEIGHTS.values())
        assert abs(weight_total - 1.0) < 1e-6

    def test_make_grade_rubric_is_pure_function(self, env):
        """Repeated ``make_grade_rubric`` calls give equal but distinct rubrics."""
        from server.rubrics import make_grade_rubric

        env.reset(seed=42)
        first_rubric = make_grade_rubric(env)
        second_rubric = make_grade_rubric(env)
        # Same number of children in both, but not the same object.
        first_count = len(list(first_rubric.children()))
        second_count = len(list(second_rubric.children()))
        assert first_count == second_count == 6
        assert first_rubric is not second_rubric
        # Weights must match as well.
        assert first_rubric._weights == second_rubric._weights