# rhythm_env/models.py
# Commit ece0bbe ("Algorithm Distillation: grader v2 with belief_accuracy + SFT pipeline")
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
"""
Data models for RhythmEnv Life Simulator.
A holistic life resource management RL environment where an agent balances
5 life meters (Vitality, Cognition, Progress, Serenity, Connection) across
a 7-day week with hidden personality profiles.
"""
from __future__ import annotations
from enum import Enum
from typing import Dict, List, Optional
from openenv.core.env_server import Action, Observation, State
from pydantic import BaseModel, Field
class ActionType(str, Enum):
    """The 10 activities the agent can choose from each step.

    Members are grouped into four life categories (productivity, recovery,
    social, leisure). Each member's value is its lowercase snake_case name,
    and because the enum subclasses ``str`` the members compare equal to
    their string values and serialize transparently.
    """

    # -- Productivity --------------------------------------------------
    DEEP_WORK = "deep_work"
    ADMIN_WORK = "admin_work"
    LEARN = "learn"
    # -- Recovery ------------------------------------------------------
    SLEEP = "sleep"
    EXERCISE = "exercise"
    MEDITATE = "meditate"
    # -- Social --------------------------------------------------------
    FAMILY_TIME = "family_time"
    SOCIALIZE = "socialize"
    # -- Leisure -------------------------------------------------------
    ME_TIME = "me_time"
    BINGE_WATCH = "binge_watch"
class RhythmAction(Action):
    """Action for the Life Simulator. Agent chooses one of 10 activity types."""

    # The single activity to perform this step; see ActionType for the
    # full catalogue (productivity / recovery / social / leisure).
    action_type: ActionType
class StepRecord(BaseModel):
    """
    Record of one completed step included in step_history.
    Contains the action taken, the reward received, per-meter deltas, AND
    per-meter ANOMALIES (actual_delta - expected_delta_under_neutral_profile).
    The anomalies are the cleanest profile-inference signal — they tell the
    agent how much THIS person's response deviates from the average person.
    Without them, the agent has to back out the profile from raw deltas
    without a baseline to compare against (much harder).
    """

    # Timestep at which this step was taken.
    step: int
    # String value of the ActionType chosen (e.g. "deep_work").
    action: str
    # Scalar reward received for this step.
    reward: float
    # Observed change in each of the 5 life meters caused by this step.
    vitality_delta: float = 0.0
    cognition_delta: float = 0.0
    progress_delta: float = 0.0
    serenity_delta: float = 0.0
    connection_delta: float = 0.0
    # Per-meter anomalies: actual_delta minus expected_delta_under_neutral_profile.
    # Surfaced to the agent in the prompt — the cleanest profile-inference signal.
    vitality_anomaly: float = 0.0
    cognition_anomaly: float = 0.0
    progress_anomaly: float = 0.0
    serenity_anomaly: float = 0.0
    connection_anomaly: float = 0.0
class RhythmObservation(Observation):
    """
    Observation returned to the agent each step.
    The agent sees all 5 meters, temporal context, last-step deltas,
    anomaly signals (actual vs expected meter changes), and a rolling
    history of the last 7 steps. The hidden personality profile and
    reward weight decomposition are NOT included.
    The step_history and *_anomaly fields in reward_breakdown together
    give the agent everything it needs to infer the hidden profile:
    - step_history: raw action/reward/delta trajectory for pattern matching
    - *_anomaly: how much each meter deviated from neutral-profile expectation
    """

    # --- Temporal context (defaults describe the start of an episode) ---
    timestep: int = 0
    day: int = 0
    slot: int = 0
    # --- The 5 life meters with their initial values ---
    vitality: float = 0.8
    cognition: float = 0.7
    progress: float = 0.0
    serenity: float = 0.7
    connection: float = 0.5
    # Name of the currently active event, if any (None when no event is active).
    active_event: Optional[str] = None
    # Steps left in the episode; starts at 28 (7 days x 4 slots — TODO confirm slot count).
    remaining_steps: int = 28
    reward: float = 0.0
    done: bool = False
    # Named reward components; also carries the *_anomaly signals (see class docstring).
    reward_breakdown: Dict[str, float] = Field(default_factory=dict)
    # Last step's per-meter deltas as first-class fields (not just buried in reward_breakdown)
    vitality_delta: float = 0.0
    cognition_delta: float = 0.0
    progress_delta: float = 0.0
    serenity_delta: float = 0.0
    connection_delta: float = 0.0
    # String value of the previous ActionType, or None before the first step.
    last_action: Optional[str] = None
    # Rolling history of the last HISTORY_LENGTH steps
    step_history: List[StepRecord] = Field(default_factory=list)
class RhythmState(State):
    """
    Internal state for the Life Simulator.
    Includes profile_name which is hidden from the observation.
    """

    # --- Temporal context ---
    timestep: int = 0
    day: int = 0
    slot: int = 0
    # Hidden personality profile identifier — intentionally NOT exposed
    # in RhythmObservation; the agent must infer it from anomaly signals.
    profile_name: str = ""
    # --- The 5 life meters (same initial values as RhythmObservation) ---
    vitality: float = 0.8
    cognition: float = 0.7
    progress: float = 0.0
    serenity: float = 0.7
    connection: float = 0.5
    # Name of the currently active event, if any.
    active_event: Optional[str] = None