"""
Reward functions for RhythmEnv GRPO training (meta-RL version).

Four-layer reward stack:
1. format_valid    - does the LLM output have a parseable belief + action format?
2. action_legal    - is the action one of the 10 valid actions?
3. env_reward      - actual environment reward (seed-replay) for the chosen action,
                     plus grader-aligned bias and diversity/exploration shaping
4. belief_accuracy - how close is the belief vector to the hidden profile's true vector?

Output format: "S M W ACTION_NAME" (belief first)
  - S, M, W: single digits 0-9 representing the agent's belief about the user
    S = social preference (0=hates social, 9=loves social)
    M = morning preference (0=night owl, 9=morning person)
    W = work preference  (0=avoids work, 9=workaholic)
  - ACTION_NAME: one of 10 valid actions

Example: "3 8 7 DEEP_WORK"  ->  belief=[0.33, 0.89, 0.78], action=DEEP_WORK

Belief-first matters because tokens generated earlier condition tokens generated
later in causal LMs - the action ends up causally conditioned on the belief, so
the belief is functionally useful for action selection rather than a post-hoc
afterthought. The parser also accepts action-only output (including the legacy
"ACTION S M W" ordering) as a fallback, with the belief defaulting to neutral.

Each function returns a list of floats (one per completion).
"""

import os
import re
import sys

sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))

from models import ActionType, RhythmAction
from server.rhythm_environment import RhythmEnvironment

# All valid action names (uppercase for matching)
VALID_ACTIONS = {at.value.upper(): at for at in ActionType}

# Default belief returned when the LLM doesn't provide one (neutral)
DEFAULT_BELIEF = [0.5, 0.5, 0.5]


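# Canonical answer-line pattern: "S M W ACTION_NAME" (e.g. "3 8 7 DEEP_WORK").
# The alternation is built from ActionType values, so newly added actions are
# matched automatically; IGNORECASE also accepts lowercase action names.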
_ANSWER_PATTERN = re.compile(
    r"(\d)\s+(\d)\s+(\d)\s+(" + "|".join(at.value.upper() for at in ActionType) + r")\b",
    re.IGNORECASE,
)


def extract_action_and_belief(text: str) -> tuple[ActionType | None, list[float], bool]:
    """Parse the agent's output and extract (action, belief, belief_provided).

    Supports both formats:
      - Plain answer line: "S M W ACTION_NAME"
      - With CoT prefix:   "<reasoning>...</reasoning>\nS M W ACTION_NAME"

    Strategy: search the entire response for the LAST occurrence of the
    "<digit> <digit> <digit> <ACTION>" pattern. Taking the last match handles
    cases where the model mentions an example mid-reasoning then commits to
    a different answer at the end.

    Returns:
        (action, belief, belief_provided):
            action: parsed ActionType or None if no valid action anywhere
            belief: 3-dim vector in [0, 1], DEFAULT_BELIEF if no belief parsed
            belief_provided: True iff a belief was extracted (full S M W ACTION pattern)
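
    Illustrative doctest (assumes DEEP_WORK and MEDITATE are among the 10
    actions, as the module example and the coupling sets below suggest):

        >>> a, b, ok = extract_action_and_belief(
        ...     "Mid-reasoning example: 5 5 5 MEDITATE. Final: 3 8 7 DEEP_WORK")
        >>> a == ActionType("deep_work"), [round(x, 2) for x in b], ok
        (True, [0.33, 0.89, 0.78], True)
        >>> extract_action_and_belief("DEEP_WORK")[1:]
        ([0.5, 0.5, 0.5], False)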
    """
    if not text:
        return None, list(DEFAULT_BELIEF), False

    # Primary path: full belief+action pattern, take the LAST occurrence.
    matches = list(_ANSWER_PATTERN.finditer(text))
    if matches:
        last = matches[-1]
        s, m, w, action_name = last.groups()
        try:
            belief = [int(s) / 9.0, int(m) / 9.0, int(w) / 9.0]
            action = ActionType(action_name.lower())
            return action, belief, True
        except (ValueError, KeyError):
            pass  # fall through to action-only fallback

    # Fallback: action-only output (no belief digits). Search for any valid
    # ACTION_NAME token in the response and return that with default belief.
    for line in text.strip().split("\n"):
        for token in line.upper().replace(",", " ").split():
            token = token.strip(".")
            if token in VALID_ACTIONS:
                return VALID_ACTIONS[token], list(DEFAULT_BELIEF), False
            for name, at in VALID_ACTIONS.items():
                if name in token:
                    return at, list(DEFAULT_BELIEF), False

    return None, list(DEFAULT_BELIEF), False


def extract_action(text: str) -> ActionType | None:
    """Backward-compatible action-only extractor (used by env_reward replay)."""
    action, _, _ = extract_action_and_belief(text)
    return action


def format_valid(completions, **kwargs) -> list[float]:
    """
    Layer 1: Is the output a parseable action + (optionally) belief digits?

    +1.0 if action + 3 belief digits, +0.5 if action only, -2.0 if no action.
    Graduated reward pushes the model toward emitting belief without harshly
    punishing action-only output during early training.
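
    Illustrative scoring (assumes DEEP_WORK is a valid action, per the module
    example):

        >>> format_valid(["3 8 7 DEEP_WORK", "DEEP_WORK", "???"])
        [1.0, 0.5, -2.0]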
    """
    scores = []
    for completion in completions:
        response = completion[0]["content"] if isinstance(completion, list) else completion
        action, _, belief_provided = extract_action_and_belief(response)
        if action is None:
            scores.append(-2.0)
        elif belief_provided:
            scores.append(1.0)
        else:
            scores.append(0.5)
    return scores


def action_legal(completions, **kwargs) -> list[float]:
    """
    Layer 2: Is the parsed action one of the 10 valid actions?

    All 10 actions are always legal in this env (no state-dependent validity),
    so this layer is a pure penalty: 0 for any valid action, -1 for unparseable
    output. Returning 0 (rather than a positive constant) is intentional:
    a constant reward across all completions in a GRPO group contributes
    exactly zero to the advantage and was a major contributor to mode collapse.
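
    Illustrative scoring (assumes DEEP_WORK is a valid action, per the module
    example):

        >>> action_legal(["3 8 7 DEEP_WORK", "deep_work please", "???"])
        [0.0, 0.0, -1.0]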
    """
    scores = []
    for completion in completions:
        response = completion[0]["content"] if isinstance(completion, list) else completion
        action = extract_action(response)
        scores.append(0.0 if action is not None else -1.0)
    return scores


def _replay_env(seed: int, history: list, profile_mode: str = "continuous") -> RhythmEnvironment:
    """Build a replayed env at the given seed/history/mode."""
    env = RhythmEnvironment()
    env.reset(seed=seed, profile_mode=profile_mode)
    for past_action_name in history:
        env.step(RhythmAction(action_type=ActionType(past_action_name)))
    return env


def env_reward(
    completions,
    prompts=None,
    seed=None,
    step_index=None,
    action_history=None,
    profile_mode=None,
    **kwargs,
) -> list[float]:
    """
    Layer 3: Actual environment reward from stepping with the chosen action.

    Replays the episode to the correct state, then steps with the model's action.
    Returns the env reward (already scaled by REWARD_SCALE) plus the grader-aligned
    bias, diversity shaping, and belief-action coupling terms applied below.

    seed/step_index/action_history are passed as dataset columns by GRPOTrainer.
    profile_mode column controls whether replay uses continuous or discrete profile.
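
    Illustrative call shape (argument values hypothetical; actual rewards depend
    on RhythmEnvironment):

        env_reward(
            completions=["3 8 7 DEEP_WORK", "5 5 5 MEDITATE"],
            seed=[1234, 1234],
            step_index=[4, 4],
            action_history=[["meditate", "learn"], ["meditate", "learn"]],
            profile_mode=["continuous", "continuous"],
        )  # -> [<env + shaping reward>, <env + shaping reward>]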
    """
    scores = []

    for i, completion in enumerate(completions):
        response = completion[0]["content"] if isinstance(completion, list) else completion
        action_type = extract_action(response)

        if action_type is None:
            scores.append(-3.0)
            continue

        # Per-row dataset values (preferred path)
        if seed is not None and i < len(seed):
            ep_seed = seed[i]
            ep_history = action_history[i] if action_history is not None else []
            ep_mode = profile_mode[i] if (profile_mode is not None and i < len(profile_mode)) else "continuous"
        else:
            prompt_data = prompts[i] if prompts and i < len(prompts) else None
            if prompt_data and isinstance(prompt_data, dict) and "seed" in prompt_data:
                ep_seed = prompt_data["seed"]
                ep_history = prompt_data.get("action_history", [])
                ep_mode = prompt_data.get("profile_mode", "continuous")
            else:
                # Fallback seed mixes index with a prime to break deterministic
                # seed clusters - without it, all completions in a position-class
                # land on the same env_reward and contribute zero to GRPO advantage.
                ep_seed = (i * 17) ^ 0xBEEF
                ep_history = []
                ep_mode = "continuous"

        try:
            env = _replay_env(ep_seed, ep_history, ep_mode)
            # Capture pre-step meters so we can compute deltas for the bias
            pre_progress = env._progress
            pre_connection = env._connection
            obs = env.step(RhythmAction(action_type=action_type))
            reward = obs.reward
            chosen = action_type.value

            # Grader-aligned bias (progress + connection deltas): shapes only
            # the GRPO-visible training reward, not the env's per-step reward.
            # Lives here rather than in env.step() so that the env's per-step
            # reward (used by adaptation_score in the grader) stays pure.
            progress_delta = env._progress - pre_progress
            connection_delta = env._connection - pre_connection
            reward += 0.5 * progress_delta + 0.4 * connection_delta

            # Diversity shaping: small nudges (~1/3 the magnitude of the env signal)
            # so they don't dominate it. Three terms:
            #   - repetition penalty (action appears >=2x in the last 3)
            #   - low-entropy window penalty (last 6 actions have <=2 unique)
            #   - new-action exploration bonus (until 6 distinct actions tried)
            if ep_history and len(ep_history) >= 2:
                recent3 = ep_history[-3:]
                if recent3.count(chosen) >= 2:
                    reward -= 0.10

            if ep_history and len(ep_history) >= 5:
                last6 = ep_history[-5:] + [chosen]
                if len(set(last6)) <= 2:
                    reward -= 0.15

            if ep_history is not None:
                seen = set(ep_history)
                if chosen not in seen and len(seen) < 6:
                    reward += 0.07

            # Belief-action coupling: rewards consistency between the agent's
            # emitted belief and its chosen action. Without this term, the
            # belief-first format only enforces consistency via causal attention
            # (weak signal); this provides an explicit gradient.
            _, b, b_provided = extract_action_and_belief(response)
            if b_provided:
                s_pref, m_pref, w_pref = b
                # High social -> social actions; low social -> solo actions
                if s_pref > 0.65 and chosen in {"socialize", "family_time"}:
                    reward += 0.15
                elif s_pref < 0.35 and chosen in {"meditate", "me_time"}:
                    reward += 0.15
                elif s_pref > 0.65 and chosen in {"meditate", "me_time"}:
                    reward -= 0.10  # belief says extrovert, action says solo
                elif s_pref < 0.35 and chosen in {"socialize", "family_time"}:
                    reward -= 0.10  # belief says introvert, action says social

                # High morning + morning slot + work -> bonus
                slot = obs.slot if hasattr(obs, "slot") else 0
                if m_pref > 0.65 and slot == 0 and chosen in {"deep_work", "learn"}:
                    reward += 0.15
                elif m_pref < 0.35 and slot in (2, 3) and chosen in {"deep_work", "learn"}:
                    reward += 0.15

                # High work -> work actions
                if w_pref > 0.65 and chosen in {"deep_work", "learn", "admin_work"}:
                    reward += 0.15

            scores.append(reward)
        except Exception:
            scores.append(-3.0)

    return scores


def belief_accuracy(
    completions,
    seed=None,
    step_index=None,
    action_history=None,
    profile_mode=None,
    **kwargs,
) -> list[float]:
    """
    Layer 4: Belief-vector accuracy reward (the meta-learning signal).

    Reward = (1 - belief_mae) - constant_baseline_similarity, where the
    constant baseline is what a neutral [0.5, 0.5, 0.5] emission (roughly a
    "5 5 5" answer) would score for THIS profile. Subtracting the baseline
    matters: without it, a constant neutral emission earns positive reward on
    every step (~+0.34 x the layer weight) for zero actual learning, which
    silently re-creates the iter-1 mode-collapse pull. With it: a constant
    emission scores ~0, better-than-baseline > 0, worse < 0.

    Belief reward is also skipped at step 0 - the agent has no information yet,
    so penalizing belief-vs-target there just biases toward a constant prior.
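
    Worked example (profile vector hypothetical): with a true belief target of
    [0.8, 0.2, 0.6], emitting "7 3 6" gives belief ~[0.78, 0.33, 0.67],
    mae ~0.074, similarity ~0.926; baseline_mae = 0.233, baseline_similarity
    = 0.767, so reward ~+0.16. A constant "5 5 5" emission (~[0.56]*3) scores
    ~+0.02, i.e. essentially the baseline.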
    """
    scores = []
    for i, completion in enumerate(completions):
        response = completion[0]["content"] if isinstance(completion, list) else completion
        _, belief, belief_provided = extract_action_and_belief(response)

        if not belief_provided:
            scores.append(-0.1)  # weak push toward emitting belief
            continue

        # Resolve seed/mode/step for replay
        if seed is not None and i < len(seed):
            ep_seed = seed[i]
            ep_history = action_history[i] if action_history is not None else []
            ep_mode = profile_mode[i] if (profile_mode is not None and i < len(profile_mode)) else "continuous"
            ep_step = step_index[i] if (step_index is not None and i < len(step_index)) else 0
        else:
            scores.append(0.0)
            continue

        # Step 0: agent has no information yet; committing a belief here
        # would just pull the policy toward whatever constant prior the
        # base model emits. Skip the reward for that step.
        if ep_step == 0:
            scores.append(0.0)
            continue

        try:
            env = _replay_env(ep_seed, ep_history, ep_mode)
            true_belief = env.get_belief_target()
            mae = sum(abs(b - t) for b, t in zip(belief, true_belief)) / 3.0
            similarity = 1.0 - mae
            baseline_mae = sum(abs(0.5 - t) for t in true_belief) / 3.0
            baseline_similarity = 1.0 - baseline_mae
            # Reward = how much better than the constant-emit baseline
            scores.append(similarity - baseline_similarity)
        except Exception:
            scores.append(0.0)

    return scores
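

# Typical wiring sketch (illustrative; assumes TRL's GRPOTrainer interface and a
# training dataset carrying the seed / step_index / action_history / profile_mode
# columns that GRPOTrainer forwards to each reward function as kwargs):
#
#   from trl import GRPOConfig, GRPOTrainer
#
#   trainer = GRPOTrainer(
#       model=model_name,
#       reward_funcs=[format_valid, action_legal, env_reward, belief_accuracy],
#       args=GRPOConfig(output_dir="rhythm-grpo"),
#       train_dataset=dataset,
#   )
#   trainer.train()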