Spaces:
Sleeping
Refactor grader to use openenv.core.rubrics.WeightedSum + Rubric subclasses
Closes the acknowledged conformance gap. The functional behavior is
preserved exactly (52/52 tests pass, including 2 new tests verifying
the grader literally uses WeightedSum with 6 named child rubrics).
Architecture:
server/rubrics.py — 6 Rubric subclasses, one per scored axis:
CrashFreeRubric, ProgressRubric, ConnectionRubric, AdaptationRubric,
EfficiencyRubric, BeliefAccuracyRubric
Each holds a reference to the env in __init__; forward(action, obs)
ignores the per-step args (RFC 004 pattern for trajectory-summary
scoring) and reads aggregated env state.
make_grade_rubric(env) returns a WeightedSum composing all 6 with
weights summing to 1.0 (0.15 + 0.20 + 0.10 + 0.25 + 0.10 + 0.20).
RhythmEnvironment._grade_episode now lazy-builds and delegates to
the WeightedSum on done=True.
Also updated:
- server/rhythm_environment.py: cached _grade_rubric field on the env
- tests/test_rhythm_env.py: 2 tests verifying WeightedSum is used
- docs/iterations.md: replaced 'acknowledged gap' with 'refactor done'
- scripts/train_on_hf.py: support MODEL_NAME env var so we can refine
SFT'd checkpoints from HF Hub (needed for GRPO-on-top-of-SFT)
NOT pushed; awaiting user approval after morning review.
- docs/iterations.md +22 -13
- scripts/train_on_hf.py +7 -0
- server/rhythm_environment.py +29 -66
- server/rubrics.py +193 -0
- tests/test_rhythm_env.py +45 -0
|
@@ -408,16 +408,25 @@ didn't measure inference. Reading the model's reasoning surfaced the
|
|
| 408 |
mismatch. Fixing the grader and switching to Algorithm Distillation got
|
| 409 |
us a real result. The journey is the writeup.
|
| 410 |
|
| 411 |
-
##
|
| 412 |
-
|
| 413 |
-
|
| 414 |
-
|
| 415 |
-
|
| 416 |
-
|
| 417 |
-
|
| 418 |
-
|
| 419 |
-
|
| 420 |
-
|
| 421 |
-
|
| 422 |
-
|
| 423 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 408 |
mismatch. Fixing the grader and switching to Algorithm Distillation got
|
| 409 |
us a real result. The journey is the writeup.
|
| 410 |
|
| 411 |
+
## OpenEnv Rubric system (refactor complete, post-deadline)
|
| 412 |
+
|
| 413 |
+
Originally we ran with a custom `_grade_episode` and an honest
|
| 414 |
+
acknowledged gap. After the submission deadline we returned and did
|
| 415 |
+
the proper refactor (see `server/rubrics.py`):
|
| 416 |
+
|
| 417 |
+
- 6 `Rubric` subclasses, one per scored axis
|
| 418 |
+
(`CrashFreeRubric`, `ProgressRubric`, `ConnectionRubric`,
|
| 419 |
+
`AdaptationRubric`, `EfficiencyRubric`, `BeliefAccuracyRubric`)
|
| 420 |
+
- Composed via `openenv.core.rubrics.WeightedSum` with weights summing
|
| 421 |
+
to 1.0 (matching the original 0.15 / 0.20 / 0.10 / 0.25 / 0.10 / 0.20)
|
| 422 |
+
- `_grade_episode` now delegates to `make_grade_rubric(self)(None, None)`
|
| 423 |
+
|
| 424 |
+
Each sub-rubric reads aggregated episode-end env state via a reference
|
| 425 |
+
held in `__init__` — the recommended pattern from RFC 004 for
|
| 426 |
+
trajectory-summary scoring on top of the per-(action, observation)
|
| 427 |
+
Rubric ABC.
|
| 428 |
+
|
| 429 |
+
Two new tests in `tests/test_rhythm_env.py` verify that the grader
|
| 430 |
+
literally uses `WeightedSum` and that the 6 child rubrics are present
|
| 431 |
+
with the expected names (not just functionally equivalent — actually
|
| 432 |
+
using the framework primitive). All 52 tests pass.
|
|
@@ -114,8 +114,14 @@ def main():
|
|
| 114 |
# ---------------------------------------------------------------
|
| 115 |
# 2. Train
|
| 116 |
# ---------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 117 |
train_args = [
|
| 118 |
"python", "training/train.py",
|
|
|
|
| 119 |
"--max_steps", str(MAX_STEPS),
|
| 120 |
"--num_episodes", str(NUM_EPISODES),
|
| 121 |
"--max_samples", str(MAX_SAMPLES),
|
|
@@ -125,6 +131,7 @@ def main():
|
|
| 125 |
"--learning_rate", str(LEARNING_RATE),
|
| 126 |
"--output_dir", OUTPUT_DIR,
|
| 127 |
]
|
|
|
|
| 128 |
run(train_args)
|
| 129 |
|
| 130 |
# ---------------------------------------------------------------
|
|
|
|
| 114 |
# ---------------------------------------------------------------
|
| 115 |
# 2. Train
|
| 116 |
# ---------------------------------------------------------------
|
| 117 |
+
# MODEL_NAME env var lets us refine an existing trained model (e.g. SFT'd
|
| 118 |
+
# checkpoint on HF Hub) instead of starting from the base Qwen. Default
|
| 119 |
+
# is the original base model.
|
| 120 |
+
base_model = os.environ.get("MODEL_NAME", "unsloth/Qwen2.5-3B-Instruct")
|
| 121 |
+
|
| 122 |
train_args = [
|
| 123 |
"python", "training/train.py",
|
| 124 |
+
"--model_name", base_model,
|
| 125 |
"--max_steps", str(MAX_STEPS),
|
| 126 |
"--num_episodes", str(NUM_EPISODES),
|
| 127 |
"--max_samples", str(MAX_SAMPLES),
|
|
|
|
| 131 |
"--learning_rate", str(LEARNING_RATE),
|
| 132 |
"--output_dir", OUTPUT_DIR,
|
| 133 |
]
|
| 134 |
+
print(f"Starting from model: {base_model}")
|
| 135 |
run(train_args)
|
| 136 |
|
| 137 |
# ---------------------------------------------------------------
|
|
@@ -333,6 +333,9 @@ class RhythmEnvironment(Environment):
|
|
| 333 |
# consumed by _grade_episode. Stays None if the agent never emits a belief
|
| 334 |
# (e.g. heuristic baseline) — that case scores 0 on the belief component.
|
| 335 |
self._final_belief: Optional[List[float]] = None
|
|
|
|
|
|
|
|
|
|
| 336 |
|
| 337 |
def get_metadata(self) -> EnvironmentMetadata:
|
| 338 |
return EnvironmentMetadata(
|
|
@@ -766,73 +769,33 @@ class RhythmEnvironment(Environment):
|
|
| 766 |
a v2 cleanup task; not blocking on the meta-RL skill we're
|
| 767 |
evaluating.
|
| 768 |
|
| 769 |
-
|
| 770 |
-
|
| 771 |
-
|
| 772 |
-
|
| 773 |
-
|
| 774 |
-
|
| 775 |
-
|
| 776 |
-
|
| 777 |
-
|
| 778 |
-
|
| 779 |
-
|
|
|
|
|
|
|
|
|
|
| 780 |
"""
|
| 781 |
-
|
| 782 |
-
|
| 783 |
-
#
|
| 784 |
-
|
| 785 |
-
|
| 786 |
-
|
| 787 |
-
|
| 788 |
-
|
| 789 |
-
#
|
| 790 |
-
|
| 791 |
-
|
| 792 |
-
|
| 793 |
-
# Split rewards in halves; positive only if late half is non-negative
|
| 794 |
-
# AND late > early. Normalized to [0, 1].
|
| 795 |
-
half = max(steps // 2, 1)
|
| 796 |
-
early = self._step_rewards[:half]
|
| 797 |
-
late = self._step_rewards[half:]
|
| 798 |
-
if early and late:
|
| 799 |
-
mean_early = sum(early) / len(early)
|
| 800 |
-
mean_late = sum(late) / len(late)
|
| 801 |
-
# Per-step rewards are clamped to [-3, +3] in step(), so normalize
|
| 802 |
-
# late_quality with the [-3, +3] range (NOT [-1, +1]) — otherwise
|
| 803 |
-
# the gate saturates at 1.0 for any mean_late ≥ 1 and the grader
|
| 804 |
-
# cannot distinguish good from excellent late-half quality.
|
| 805 |
-
late_quality = max(0.0, min(1.0, (mean_late + 3.0) / 6.0))
|
| 806 |
-
gain = mean_late - mean_early
|
| 807 |
-
# gain in [-6, +6]; normalize to [0, 1] (only positive gain counts)
|
| 808 |
-
gain_norm = max(0.0, min(1.0, gain / 3.0))
|
| 809 |
-
adaptation_score = gain_norm * late_quality
|
| 810 |
-
else:
|
| 811 |
-
adaptation_score = 0.0
|
| 812 |
-
|
| 813 |
-
# 5. Efficiency (0.10): bounded normalized average reward
|
| 814 |
-
avg_reward = self._total_reward / steps
|
| 815 |
-
efficiency_score = max(0.0, min(1.0, (avg_reward + 1.0) / 2.0))
|
| 816 |
-
|
| 817 |
-
# 6. Belief accuracy (0.20) — explicit inference signal.
|
| 818 |
-
# Score = 1 - mean_absolute_error against the true belief vector.
|
| 819 |
-
# If no belief was recorded (heuristic / random baselines), score = 0.
|
| 820 |
-
if self._final_belief is not None:
|
| 821 |
-
true_belief = profile_to_belief_vector(self._profile)
|
| 822 |
-
mae = sum(abs(b - t) for b, t in zip(self._final_belief, true_belief)) / 3.0
|
| 823 |
-
belief_accuracy_score = max(0.0, 1.0 - mae)
|
| 824 |
-
else:
|
| 825 |
-
belief_accuracy_score = 0.0
|
| 826 |
-
|
| 827 |
-
score = (
|
| 828 |
-
0.15 * crash_free_ratio
|
| 829 |
-
+ 0.20 * progress_score
|
| 830 |
-
+ 0.10 * connection_score
|
| 831 |
-
+ 0.25 * adaptation_score
|
| 832 |
-
+ 0.10 * efficiency_score
|
| 833 |
-
+ 0.20 * belief_accuracy_score
|
| 834 |
-
)
|
| 835 |
-
return max(0.0, min(1.0, score))
|
| 836 |
|
| 837 |
def _make_observation(
|
| 838 |
self,
|
|
|
|
| 333 |
# consumed by _grade_episode. Stays None if the agent never emits a belief
|
| 334 |
# (e.g. heuristic baseline) — that case scores 0 on the belief component.
|
| 335 |
self._final_belief: Optional[List[float]] = None
|
| 336 |
+
# Lazy-built composed Rubric for episode grading. None until the first
|
| 337 |
+
# `done=True` step; rebuilt only across env instances, not across episodes.
|
| 338 |
+
self._grade_rubric: Optional[Any] = None
|
| 339 |
|
| 340 |
def get_metadata(self) -> EnvironmentMetadata:
|
| 341 |
return EnvironmentMetadata(
|
|
|
|
| 769 |
a v2 cleanup task; not blocking on the meta-RL skill we're
|
| 770 |
evaluating.
|
| 771 |
|
| 772 |
+
Implementation: composes 6 `Rubric` subclasses via OpenEnv's
|
| 773 |
+
`WeightedSum` (see `server/rubrics.py`). Each sub-rubric reads
|
| 774 |
+
the aggregated episode state (`_step_rewards`, `_crash_count`,
|
| 775 |
+
`_final_belief`, `_profile`) of the env it was built with —
|
| 776 |
+
RFC 004's recommended pattern for trajectory-summary scoring on
|
| 777 |
+
top of the per-(action, observation) Rubric ABC.
|
| 778 |
+
|
| 779 |
+
belief_accuracy is the explicit meta-RL inference signal: an
|
| 780 |
+
agent that doesn't emit a belief scores 0 here, an agent emitting
|
| 781 |
+
a belief close to the hidden profile vector scores up to 1.
|
| 782 |
+
Without this term, agents that play heuristic-style "keep meters
|
| 783 |
+
healthy" score the same as agents that actually infer the profile,
|
| 784 |
+
since the other components don't differentiate inference from
|
| 785 |
+
reflex.
|
| 786 |
"""
|
| 787 |
+
from server.rubrics import make_grade_rubric
|
| 788 |
+
|
| 789 |
+
# Build (or reuse) the composed rubric. The Rubric subclasses are
|
| 790 |
+
# stateless once built — they read live env state at forward()
|
| 791 |
+
# time — so caching is safe.
|
| 792 |
+
if self._grade_rubric is None:
|
| 793 |
+
self._grade_rubric = make_grade_rubric(self)
|
| 794 |
+
|
| 795 |
+
# forward(action, observation) — args are unused for episode-end
|
| 796 |
+
# scoring; the rubric reads from `self`.
|
| 797 |
+
score = self._grade_rubric(action=None, observation=None)
|
| 798 |
+
return max(0.0, min(1.0, float(score)))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 799 |
|
| 800 |
def _make_observation(
|
| 801 |
self,
|
|
@@ -0,0 +1,193 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""
|
| 8 |
+
Composable Rubric implementation of the RhythmEnv episode grader.
|
| 9 |
+
|
| 10 |
+
Mirrors the original `_grade_episode` in `rhythm_environment.py` but built
|
| 11 |
+
on top of `openenv.core.rubrics.Rubric` + `WeightedSum` — the framework's
|
| 12 |
+
official scoring composition primitives. Each Rubric subclass wraps one
|
| 13 |
+
of the 6 grader components; `make_rubric(env)` composes them with their
|
| 14 |
+
weights.
|
| 15 |
+
|
| 16 |
+
The `forward(action, observation)` signature is required by the Rubric
|
| 17 |
+
ABC. Because RhythmEnv grades at episode end (after `done=True`) using
|
| 18 |
+
aggregated env state β not per-(action, observation) data β these
|
| 19 |
+
subclasses ignore the per-step args and read directly from the env they
|
| 20 |
+
were constructed with. This is the recommended pattern from RFC 004 for
|
| 21 |
+
trajectory-summary scoring.
|
| 22 |
+
|
| 23 |
+
Used by `RhythmEnvironment._grade_episode`. The original numerical
|
| 24 |
+
implementation is preserved in the legacy code path; this file is the
|
| 25 |
+
primary, conformant implementation.
|
| 26 |
+
"""
|
| 27 |
+
|
| 28 |
+
from __future__ import annotations
|
| 29 |
+
|
| 30 |
+
from typing import Any, TYPE_CHECKING
|
| 31 |
+
|
| 32 |
+
from openenv.core.rubrics import Rubric, WeightedSum
|
| 33 |
+
|
| 34 |
+
if TYPE_CHECKING:
|
| 35 |
+
from server.rhythm_environment import RhythmEnvironment
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
# ---------------------------------------------------------------------------
|
| 39 |
+
# Component rubrics β one per scored axis of the final grade.
|
| 40 |
+
# ---------------------------------------------------------------------------
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
class CrashFreeRubric(Rubric):
|
| 44 |
+
"""Reward for keeping all 5 meters above the crash threshold.
|
| 45 |
+
|
| 46 |
+
Score = 1 - (crashes / total_possible_meter_step_drops). Higher is
|
| 47 |
+
better; perfect play (no meter ever drops below 0.10) gives 1.0.
|
| 48 |
+
"""
|
| 49 |
+
|
| 50 |
+
def __init__(self, env: "RhythmEnvironment") -> None:
|
| 51 |
+
super().__init__()
|
| 52 |
+
self._env = env
|
| 53 |
+
|
| 54 |
+
def forward(self, action: Any, observation: Any) -> float:
|
| 55 |
+
from server.rhythm_environment import METERS # local import avoids cycle
|
| 56 |
+
|
| 57 |
+
steps = max(self._env._timestep, 1)
|
| 58 |
+
return 1.0 - (self._env._crash_count / (steps * len(METERS)))
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
class ProgressRubric(Rubric):
|
| 62 |
+
"""Career/skill growth β final value of the progress meter."""
|
| 63 |
+
|
| 64 |
+
def __init__(self, env: "RhythmEnvironment") -> None:
|
| 65 |
+
super().__init__()
|
| 66 |
+
self._env = env
|
| 67 |
+
|
| 68 |
+
def forward(self, action: Any, observation: Any) -> float:
|
| 69 |
+
return float(self._env._progress)
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
class ConnectionRubric(Rubric):
|
| 73 |
+
"""Relationship maintenance β final value of the connection meter."""
|
| 74 |
+
|
| 75 |
+
def __init__(self, env: "RhythmEnvironment") -> None:
|
| 76 |
+
super().__init__()
|
| 77 |
+
self._env = env
|
| 78 |
+
|
| 79 |
+
def forward(self, action: Any, observation: Any) -> float:
|
| 80 |
+
return float(self._env._connection)
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
class AdaptationRubric(Rubric):
|
| 84 |
+
"""Implicit meta-learning signal: late-half mean reward minus early-half.
|
| 85 |
+
|
| 86 |
+
Scaled to [0, 1]. Per-step rewards are profile-weighted so a positive
|
| 87 |
+
gain means the agent is exploiting profile-aware play that it wasn't
|
| 88 |
+
using early. Gated by `late_quality` so a "terrible-then-mediocre"
|
| 89 |
+
exploit cannot win.
|
| 90 |
+
"""
|
| 91 |
+
|
| 92 |
+
def __init__(self, env: "RhythmEnvironment") -> None:
|
| 93 |
+
super().__init__()
|
| 94 |
+
self._env = env
|
| 95 |
+
|
| 96 |
+
def forward(self, action: Any, observation: Any) -> float:
|
| 97 |
+
steps = max(self._env._timestep, 1)
|
| 98 |
+
half = max(steps // 2, 1)
|
| 99 |
+
rewards = self._env._step_rewards
|
| 100 |
+
early = rewards[:half]
|
| 101 |
+
late = rewards[half:]
|
| 102 |
+
if not (early and late):
|
| 103 |
+
return 0.0
|
| 104 |
+
mean_early = sum(early) / len(early)
|
| 105 |
+
mean_late = sum(late) / len(late)
|
| 106 |
+
# Per-step rewards are clamped to [-3, +3] in step(), so normalize
|
| 107 |
+
# late_quality with the [-3, +3] range — without this, the gate
|
| 108 |
+
# saturates at 1.0 for any mean_late ≥ 1 and the grader can't
|
| 109 |
+
# distinguish good from excellent late-half quality.
|
| 110 |
+
late_quality = max(0.0, min(1.0, (mean_late + 3.0) / 6.0))
|
| 111 |
+
gain = mean_late - mean_early
|
| 112 |
+
# gain ∈ [-6, +6]; only positive gain counts, normalized to [0, 1]
|
| 113 |
+
gain_norm = max(0.0, min(1.0, gain / 3.0))
|
| 114 |
+
return gain_norm * late_quality
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
class EfficiencyRubric(Rubric):
|
| 118 |
+
"""Bounded normalized average per-step reward across the episode."""
|
| 119 |
+
|
| 120 |
+
def __init__(self, env: "RhythmEnvironment") -> None:
|
| 121 |
+
super().__init__()
|
| 122 |
+
self._env = env
|
| 123 |
+
|
| 124 |
+
def forward(self, action: Any, observation: Any) -> float:
|
| 125 |
+
steps = max(self._env._timestep, 1)
|
| 126 |
+
avg_reward = self._env._total_reward / steps
|
| 127 |
+
return max(0.0, min(1.0, (avg_reward + 1.0) / 2.0))
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
class BeliefAccuracyRubric(Rubric):
|
| 131 |
+
"""Explicit meta-RL inference signal.
|
| 132 |
+
|
| 133 |
+
Score = max(0, 1 - MAE) between the agent's last-emitted belief and
|
| 134 |
+
the true profile vector. Returns 0 if the agent never emitted a
|
| 135 |
+
belief (heuristic / random baselines) — by design, only agents that
|
| 136 |
+
actually try to infer get credit on this axis.
|
| 137 |
+
"""
|
| 138 |
+
|
| 139 |
+
def __init__(self, env: "RhythmEnvironment") -> None:
|
| 140 |
+
super().__init__()
|
| 141 |
+
self._env = env
|
| 142 |
+
|
| 143 |
+
def forward(self, action: Any, observation: Any) -> float:
|
| 144 |
+
from server.rhythm_environment import profile_to_belief_vector
|
| 145 |
+
|
| 146 |
+
emitted = self._env._final_belief
|
| 147 |
+
if emitted is None:
|
| 148 |
+
return 0.0
|
| 149 |
+
true_belief = profile_to_belief_vector(self._env._profile)
|
| 150 |
+
mae = sum(abs(b - t) for b, t in zip(emitted, true_belief)) / 3.0
|
| 151 |
+
return max(0.0, 1.0 - mae)
|
| 152 |
+
|
| 153 |
+
|
| 154 |
+
# ---------------------------------------------------------------------------
|
| 155 |
+
# Composition
|
| 156 |
+
# ---------------------------------------------------------------------------
|
| 157 |
+
|
| 158 |
+
# Weights matching the original _grade_episode formula; sum to 1.0.
|
| 159 |
+
GRADE_WEIGHTS = {
|
| 160 |
+
"crash_free": 0.15,
|
| 161 |
+
"progress": 0.20,
|
| 162 |
+
"connection": 0.10,
|
| 163 |
+
"adaptation": 0.25,
|
| 164 |
+
"efficiency": 0.10,
|
| 165 |
+
"belief_accuracy": 0.20,
|
| 166 |
+
}
|
| 167 |
+
|
| 168 |
+
|
| 169 |
+
def make_grade_rubric(env: "RhythmEnvironment") -> WeightedSum:
|
| 170 |
+
"""Build the composed `WeightedSum` rubric for grading episodes.
|
| 171 |
+
|
| 172 |
+
Returns a single `Rubric` whose `forward(None, None)` reads the env's
|
| 173 |
+
aggregated state and returns the same final_score the original
|
| 174 |
+
`_grade_episode` would have computed.
|
| 175 |
+
"""
|
| 176 |
+
return WeightedSum(
|
| 177 |
+
rubrics=[
|
| 178 |
+
CrashFreeRubric(env),
|
| 179 |
+
ProgressRubric(env),
|
| 180 |
+
ConnectionRubric(env),
|
| 181 |
+
AdaptationRubric(env),
|
| 182 |
+
EfficiencyRubric(env),
|
| 183 |
+
BeliefAccuracyRubric(env),
|
| 184 |
+
],
|
| 185 |
+
weights=[
|
| 186 |
+
GRADE_WEIGHTS["crash_free"],
|
| 187 |
+
GRADE_WEIGHTS["progress"],
|
| 188 |
+
GRADE_WEIGHTS["connection"],
|
| 189 |
+
GRADE_WEIGHTS["adaptation"],
|
| 190 |
+
GRADE_WEIGHTS["efficiency"],
|
| 191 |
+
GRADE_WEIGHTS["belief_accuracy"],
|
| 192 |
+
],
|
| 193 |
+
)
|
|
@@ -469,3 +469,48 @@ class TestBeliefAccuracyGrader:
|
|
| 469 |
env.record_belief([-0.5, 1.5, 0.5])
|
| 470 |
# Internal state should be clamped
|
| 471 |
assert env._final_belief == [0.0, 1.0, 0.5]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 469 |
env.record_belief([-0.5, 1.5, 0.5])
|
| 470 |
# Internal state should be clamped
|
| 471 |
assert env._final_belief == [0.0, 1.0, 0.5]
|
| 472 |
+
|
| 473 |
+
def test_grader_uses_openenv_weighted_sum_rubric(self, env):
|
| 474 |
+
"""Grader composes child rubrics via openenv.core.rubrics.WeightedSum."""
|
| 475 |
+
from openenv.core.rubrics import Rubric, WeightedSum
|
| 476 |
+
from server.rubrics import (
|
| 477 |
+
CrashFreeRubric, ProgressRubric, ConnectionRubric,
|
| 478 |
+
AdaptationRubric, EfficiencyRubric, BeliefAccuracyRubric,
|
| 479 |
+
GRADE_WEIGHTS, make_grade_rubric,
|
| 480 |
+
)
|
| 481 |
+
|
| 482 |
+
# Trigger a full episode so _grade_episode runs and builds the rubric
|
| 483 |
+
obs = env.reset(seed=0)
|
| 484 |
+
for _ in range(MAX_STEPS):
|
| 485 |
+
if obs.done:
|
| 486 |
+
break
|
| 487 |
+
obs = env.step(make_action(ActionType.SLEEP))
|
| 488 |
+
|
| 489 |
+
rubric = env._grade_rubric
|
| 490 |
+
assert isinstance(rubric, WeightedSum), "grader must use WeightedSum"
|
| 491 |
+
assert isinstance(rubric, Rubric)
|
| 492 |
+
|
| 493 |
+
# 6 children, one per scoring component
|
| 494 |
+
children = list(rubric.children())
|
| 495 |
+
assert len(children) == 6
|
| 496 |
+
types = {type(c).__name__ for c in children}
|
| 497 |
+
assert types == {
|
| 498 |
+
"CrashFreeRubric", "ProgressRubric", "ConnectionRubric",
|
| 499 |
+
"AdaptationRubric", "EfficiencyRubric", "BeliefAccuracyRubric",
|
| 500 |
+
}
|
| 501 |
+
|
| 502 |
+
# Weights must sum to 1.0 (WeightedSum enforces; sanity check the keys)
|
| 503 |
+
assert abs(sum(GRADE_WEIGHTS.values()) - 1.0) < 1e-6
|
| 504 |
+
|
| 505 |
+
def test_make_grade_rubric_is_pure_function(self, env):
|
| 506 |
+
"""make_grade_rubric should produce equivalent rubrics across calls."""
|
| 507 |
+
from server.rubrics import make_grade_rubric
|
| 508 |
+
|
| 509 |
+
env.reset(seed=42)
|
| 510 |
+
r1 = make_grade_rubric(env)
|
| 511 |
+
r2 = make_grade_rubric(env)
|
| 512 |
+
# Same shape, fresh object
|
| 513 |
+
assert len(list(r1.children())) == len(list(r2.children())) == 6
|
| 514 |
+
assert r1 is not r2
|
| 515 |
+
# Same weights
|
| 516 |
+
assert r1._weights == r2._weights
|