Spaces:

InosLihka
/

rhythm_env

Sleeping

App Files Files Community

rhythm_env / training /reward_functions.py

InosLihka

Algorithm Distillation: grader v2 with belief_accuracy + SFT pipeline

ece0bbe 12 days ago

raw

history blame contribute delete

14 kB

	"""
	Reward functions for RhythmEnv GRPO training (meta-RL version).

	Four-layer reward stack:
	1. format_valid — does the LLM output have a parseable belief + action format?
	2. action_legal — is the action one of the 10 valid actions?
	3. env_reward — actual environment reward (seed-replay) for the chosen action,
	plus grader-aligned bias and diversity/exploration shaping
	4. belief_accuracy — how close is the belief vector to the hidden profile's true vector?

	Output format: "S M W ACTION_NAME" (belief first)
	- S, M, W: single digits 0-9 representing the agent's belief about the user
	S = social preference (0=hates social, 9=loves social)
	M = morning preference (0=night owl, 9=morning person)
	W = work preference (0=avoids work, 9=workaholic)
	- ACTION_NAME: one of 10 valid actions

	Example: "3 8 7 DEEP_WORK" → belief=[0.33, 0.89, 0.78], action=DEEP_WORK

	Belief-first matters because tokens generated earlier condition tokens generated
	later in causal LMs — the action ends up causally conditioned on the belief, so
	the belief is functionally useful for action selection rather than a post-hoc
	afterthought. As a fallback, the parser also accepts action-only output (which
	includes the legacy "ACTION S M W" ordering); in that case the belief digits are
	NOT parsed and the neutral default belief is used instead.

	Each function returns a list of floats (one per completion).
	"""

	import os
	import re
	import sys

	sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))

	from models import ActionType, RhythmAction
	from server.rhythm_environment import RhythmEnvironment

	# Lookup table: upper-cased action name -> ActionType member (used for matching)
	VALID_ACTIONS = dict((member.value.upper(), member) for member in ActionType)

	# Neutral belief handed back whenever the LLM output carries no belief digits
	DEFAULT_BELIEF = [0.5] * 3


	# Matches the full answer "<S> <M> <W> <ACTION_NAME>" (case-insensitive).
	# - The action names are joined with a plain "|" so they form a regex
	#   alternation; an escaped pipe would only match a literal "|" character.
	# - Names are passed through re.escape so an enum value containing a regex
	#   metacharacter cannot change the meaning of the pattern.
	# - The (?<!\d) lookbehind rejects multi-digit numbers: without it
	#   "12 3 4 LEARN" would be silently parsed as the belief "2 3 4", although
	#   the format only allows single digits 0-9.
	_ANSWER_PATTERN = re.compile(
	    r"(?<!\d)(\d)\s+(\d)\s+(\d)\s+("
	    + "|".join(re.escape(at.value.upper()) for at in ActionType)
	    + r")\b",
	    re.IGNORECASE,
	)


	def extract_action_and_belief(text: str) -> tuple[ActionType | None, list[float], bool]:
	    """Parse the agent's output into (action, belief, belief_provided).

	    Supported shapes:
	    - Plain answer line: "S M W ACTION_NAME"
	    - With CoT prefix: "<reasoning>...</reasoning>\\nS M W ACTION_NAME"
	    - Action only (no belief digits): any token containing a valid action name

	    Primary path: the LAST "<digit> <digit> <digit> <ACTION>" match in the
	    response wins, so an example quoted mid-reasoning does not override the
	    answer the model commits to at the end.

	    Args:
	        text: raw completion text; empty or None yields "no action".

	    Returns:
	        (action, belief, belief_provided):
	        action: parsed ActionType, or None if no valid action was found
	        belief: three floats, each digit / 9.0; a fresh copy of
	            DEFAULT_BELIEF when no belief was parsed
	        belief_provided: True iff the full "S M W ACTION" pattern matched
	    """
	    if not text:
	        return None, list(DEFAULT_BELIEF), False

	    # Primary path: full belief+action pattern, take the LAST occurrence.
	    matches = list(_ANSWER_PATTERN.finditer(text))
	    if matches:
	        s, m, w, action_name = matches[-1].groups()
	        try:
	            belief = [int(s) / 9.0, int(m) / 9.0, int(w) / 9.0]
	            action = ActionType(action_name.lower())
	            return action, belief, True
	        except (ValueError, KeyError):
	            pass  # fall through to the action-only fallback

	    # Fallback: action-only output. Commas count as separators and
	    # surrounding periods are stripped. str.split() with no argument already
	    # splits on newlines, so one pass over the whole text visits the tokens
	    # in the same order as a line-by-line scan.
	    # NOTE(review): unlike the primary path, this returns the FIRST action
	    # mentioned, not the last — confirm that is intended for CoT outputs.
	    for token in text.upper().replace(",", " ").split():
	        token = token.strip(".")
	        if token in VALID_ACTIONS:
	            return VALID_ACTIONS[token], list(DEFAULT_BELIEF), False
	        # Tolerate decorated tokens such as "(DEEP_WORK)" or "ACTION=LEARN".
	        for name, at in VALID_ACTIONS.items():
	            if name in token:
	                return at, list(DEFAULT_BELIEF), False

	    return None, list(DEFAULT_BELIEF), False


	def extract_action(text: str) -> ActionType | None:
	    """Backward-compatible action-only extractor (used by env_reward replay).

	    Delegates to extract_action_and_belief and discards the belief parts.

	    Returns:
	        The parsed ActionType, or None when no valid action is present.
	    """
	    action, _, _ = extract_action_and_belief(text)
	    return action


	def format_valid(completions, **kwargs) -> list[float]:
	    """
	    Layer 1: score how well each completion follows the output format.

	    Per completion: +1.0 when both an action and the three belief digits were
	    parsed, +0.5 when only an action was parsed, -2.0 when no action was found.
	    The graduated scale nudges the model toward emitting a belief without
	    harshly punishing action-only output during early training.
	    """

	    def _score(completion) -> float:
	        # Chat-style completions arrive as a list of message dicts.
	        text = completion[0]["content"] if isinstance(completion, list) else completion
	        parsed_action, _, has_belief = extract_action_and_belief(text)
	        if parsed_action is None:
	            return -2.0
	        return 1.0 if has_belief else 0.5

	    return [_score(item) for item in completions]


	def action_legal(completions, **kwargs) -> list[float]:
	    """
	    Layer 2: penalise completions from which no valid action can be parsed.

	    Every one of the 10 actions is always legal in this env (validity does not
	    depend on state), so the layer is penalty-only: 0.0 for a parsed action,
	    -1.0 otherwise. Zero rather than a positive constant is deliberate: a
	    constant reward shared by every completion in a GRPO group adds nothing to
	    the advantage and was a major contributor to mode collapse.
	    """
	    results = []
	    for item in completions:
	        if isinstance(item, list):
	            text = item[0]["content"]
	        else:
	            text = item
	        results.append(-1.0 if extract_action(text) is None else 0.0)
	    return results


	def _replay_env(seed: int, history: list, profile_mode: str = "continuous") -> RhythmEnvironment:
	    """Return a fresh environment reset with seed/mode and stepped through ``history``."""
	    replayed = RhythmEnvironment()
	    replayed.reset(seed=seed, profile_mode=profile_mode)
	    # Lazily wrap each recorded action name so construction and stepping
	    # stay interleaved, one past action at a time.
	    past_actions = (RhythmAction(action_type=ActionType(name)) for name in history)
	    for past_action in past_actions:
	        replayed.step(past_action)
	    return replayed


	# Action groups used by the belief-action coupling terms of env_reward.
	_SOCIAL_ACTIONS = frozenset({"socialize", "family_time"})
	_SOLO_ACTIONS = frozenset({"meditate", "me_time"})
	_FOCUS_ACTIONS = frozenset({"deep_work", "learn"})
	_WORK_ACTIONS = frozenset({"deep_work", "learn", "admin_work"})


	def _apply_diversity_shaping(reward: float, history: list, chosen: str) -> float:
	    """Return ``reward`` adjusted by the three diversity/exploration nudges.

	    - repetition penalty: ``chosen`` already appears >= 2 times in the last 3
	    - low-entropy penalty: last 5 past actions plus ``chosen`` hold <= 2 unique
	    - exploration bonus: ``chosen`` is new and fewer than 6 distinct actions
	      have been tried so far
	    """
	    if len(history) >= 2 and history[-3:].count(chosen) >= 2:
	        reward -= 0.10
	    if len(history) >= 5 and len(set(history[-5:] + [chosen])) <= 2:
	        reward -= 0.15
	    seen = set(history)
	    if chosen not in seen and len(seen) < 6:
	        reward += 0.07
	    return reward


	def _apply_belief_coupling(reward: float, belief: list, chosen: str, slot) -> float:
	    """Return ``reward`` adjusted for consistency between belief and action."""
	    s_pref, m_pref, w_pref = belief

	    # High social -> social actions; low social -> solo actions. The
	    # contradictory pairings are penalised.
	    if s_pref > 0.65:
	        if chosen in _SOCIAL_ACTIONS:
	            reward += 0.15
	        elif chosen in _SOLO_ACTIONS:
	            reward -= 0.10  # belief says extrovert, action says solo
	    elif s_pref < 0.35:
	        if chosen in _SOLO_ACTIONS:
	            reward += 0.15
	        elif chosen in _SOCIAL_ACTIONS:
	            reward -= 0.10  # belief says introvert, action says social

	    # Focus work in slot 0 for a high morning belief, or in slot 2/3 for a
	    # low one.
	    if chosen in _FOCUS_ACTIONS:
	        if (m_pref > 0.65 and slot == 0) or (m_pref < 0.35 and slot in (2, 3)):
	            reward += 0.15

	    # High work -> work actions
	    if w_pref > 0.65 and chosen in _WORK_ACTIONS:
	        reward += 0.15
	    return reward


	def env_reward(
	    completions,
	    prompts=None,
	    seed=None,
	    step_index=None,
	    action_history=None,
	    profile_mode=None,
	    **kwargs,
	) -> list[float]:
	    """
	    Layer 3: environment reward for the chosen action, plus training-only shaping.

	    For each completion the episode is replayed to the recorded state and the
	    env is stepped with the parsed action. The score is ``obs.reward`` (per the
	    original author's note already scaled by REWARD_SCALE inside the env) plus:
	    - grader-aligned bias: 0.5 * progress delta + 0.4 * connection delta
	    - diversity / exploration shaping
	    - belief-action coupling, only when a belief was emitted

	    Args:
	        completions: one entry per sample, either a plain string or a list of
	            chat messages whose first item has a "content" key.
	        prompts: optional fallback source; a dict prompt with a "seed" key may
	            also carry "action_history" and "profile_mode".
	        seed, action_history, profile_mode: per-row dataset columns (preferred
	            source of the replay state).
	        step_index: accepted for signature compatibility; not used here.

	    Returns:
	        One float per completion. -3.0 when no action could be parsed or when
	        the replay/step raised.
	    """
	    import logging

	    scores = []

	    for i, completion in enumerate(completions):
	        response = completion[0]["content"] if isinstance(completion, list) else completion
	        # Parse once: the same call yields both the action and the belief.
	        action_type, belief, belief_provided = extract_action_and_belief(response)

	        if action_type is None:
	            scores.append(-3.0)
	            continue

	        # Per-row dataset values (preferred path)
	        if seed is not None and i < len(seed):
	            ep_seed = seed[i]
	            has_history = action_history is not None and i < len(action_history)
	            ep_history = action_history[i] if has_history else []
	            has_mode = profile_mode is not None and i < len(profile_mode)
	            ep_mode = profile_mode[i] if has_mode else "continuous"
	        else:
	            prompt_data = prompts[i] if prompts and i < len(prompts) else None
	            if isinstance(prompt_data, dict) and "seed" in prompt_data:
	                ep_seed = prompt_data["seed"]
	                ep_history = prompt_data.get("action_history", [])
	                ep_mode = prompt_data.get("profile_mode", "continuous")
	            else:
	                # Fallback seed mixes index with a prime to break deterministic
	                # seed clusters — without it, all completions in a position-class
	                # land on the same env_reward and contribute zero to GRPO advantage.
	                ep_seed = (i * 17) ^ 0xBEEF
	                ep_history = []
	                ep_mode = "continuous"

	        try:
	            # A missing (None) history means "no past actions"; copying to a
	            # list keeps the slicing/concatenation in the shaping well-defined.
	            history = list(ep_history) if ep_history is not None else []

	            env = _replay_env(ep_seed, history, ep_mode)
	            # Capture pre-step meters so we can compute deltas for the bias
	            pre_progress = env._progress
	            pre_connection = env._connection
	            obs = env.step(RhythmAction(action_type=action_type))
	            reward = obs.reward
	            chosen = action_type.value

	            # Grader-aligned bias (progress + connection deltas): shapes only
	            # the GRPO-visible training reward. It lives here rather than in
	            # env.step() so the env's own per-step reward stays pure.
	            progress_delta = env._progress - pre_progress
	            connection_delta = env._connection - pre_connection
	            reward += 0.5 * progress_delta + 0.4 * connection_delta

	            reward = _apply_diversity_shaping(reward, history, chosen)

	            if belief_provided:
	                # NOTE(review): obs is the observation returned AFTER the step,
	                # so obs.slot may be the next slot rather than the slot the
	                # action was taken in — confirm against the environment.
	                slot = getattr(obs, "slot", 0)
	                reward = _apply_belief_coupling(reward, belief, chosen, slot)
	        except Exception:
	            # Keep training alive, but surface the failure: otherwise an env or
	            # data bug looks exactly like a bad completion.
	            logging.getLogger(__name__).warning(
	                "env_reward: replay failed for completion %d (seed=%r); scoring -3.0",
	                i,
	                ep_seed,
	                exc_info=True,
	            )
	            scores.append(-3.0)
	        else:
	            scores.append(reward)

	    return scores


	def belief_accuracy(
	    completions,
	    seed=None,
	    step_index=None,
	    action_history=None,
	    profile_mode=None,
	    **kwargs,
	) -> list[float]:
	    """
	    Layer 4: belief-vector accuracy reward (the meta-learning signal).

	    Reward = (1 - belief_mae) - (1 - baseline_mae), where baseline_mae is the
	    error a constant [0.5, 0.5, 0.5] belief would have against THIS profile's
	    target. Subtracting the baseline matters: without it a constant neutral
	    emission would earn positive reward on every step for zero actual
	    learning. With it: constant emission ~ 0, better than baseline > 0,
	    worse < 0.

	    Args:
	        completions: one entry per sample, either a plain string or a list of
	            chat messages whose first item has a "content" key.
	        seed, step_index, action_history, profile_mode: per-row dataset
	            columns describing the state to replay.

	    Returns:
	        One float per completion:
	        -0.1 when no belief was emitted (weak push toward emitting one);
	        0.0 when no seed is available for the row, at step 0 (a missing
	        step_index is treated as step 0), or when the replay raised;
	        otherwise the baseline-relative similarity described above.
	    """
	    import logging

	    scores = []
	    for i, completion in enumerate(completions):
	        response = completion[0]["content"] if isinstance(completion, list) else completion
	        _, belief, belief_provided = extract_action_and_belief(response)

	        if not belief_provided:
	            scores.append(-0.1)  # weak push toward emitting belief
	            continue

	        # Without a per-row seed there is nothing to replay against.
	        if seed is None or i >= len(seed):
	            scores.append(0.0)
	            continue

	        # Step 0: the agent has no information yet — rewarding a belief here
	        # would just pull the policy toward whatever constant prior the base
	        # model emits. Skip the reward for that step.
	        has_step = step_index is not None and i < len(step_index)
	        ep_step = step_index[i] if has_step else 0
	        if ep_step == 0:
	            scores.append(0.0)
	            continue

	        ep_seed = seed[i]
	        has_history = action_history is not None and i < len(action_history)
	        ep_history = action_history[i] if has_history else []
	        has_mode = profile_mode is not None and i < len(profile_mode)
	        ep_mode = profile_mode[i] if has_mode else "continuous"

	        try:
	            # A missing (None) history means "no past actions".
	            history = list(ep_history) if ep_history is not None else []
	            env = _replay_env(ep_seed, history, ep_mode)
	            true_belief = env.get_belief_target()
	            mae = sum(abs(b - t) for b, t in zip(belief, true_belief)) / 3.0
	            similarity = 1.0 - mae
	            baseline_mae = sum(abs(0.5 - t) for t in true_belief) / 3.0
	            baseline_similarity = 1.0 - baseline_mae
	            # Reward = how much better than the constant-emit baseline
	            score = similarity - baseline_similarity
	        except Exception:
	            # Keep training alive, but surface the failure instead of hiding
	            # it behind a neutral score.
	            logging.getLogger(__name__).warning(
	                "belief_accuracy: replay failed for completion %d (seed=%r); scoring 0.0",
	                i,
	                ep_seed,
	                exc_info=True,
	            )
	            scores.append(0.0)
	        else:
	            scores.append(score)

	    return scores