| """ | |
| Dataset generator for RhythmEnv GRPO training (meta-RL version). | |
| Plays episodes under a continuously-sampled profile per seed and emits | |
| observation prompts at each step, paired with the replay metadata | |
| (seed, step_index, action_history) the reward functions need to | |
| reconstruct env state deterministically. | |
| The system prompt asks for "S M W ACTION_NAME" — three belief digits then | |
| the action. A `hint_fraction` slice of episodes carries a true-belief hint | |
| in the prompt as a curriculum warmup; the rest force pure inference. | |
| """ | |
| import sys | |
| import os | |
| import random | |
| sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) | |
| from models import ActionType, RhythmAction | |
| from server.rhythm_environment import RhythmEnvironment, MAX_STEPS, METERS | |
| SLOT_NAMES = ["Morning", "Afternoon", "Evening", "Night"] | |
| DAY_NAMES = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"] | |
| SYSTEM_PROMPT = """You are a life-management agent helping a person whose preferences are HIDDEN. | |
| You see 5 life meters and a rolling history of recent steps. The same action | |
| affects different people differently — you must INFER who you're helping from | |
| rewards, meter changes, and per-meter ANOMALY signals. | |
| Each step, do TWO things: | |
| 1. Reason briefly about what the observations imply about the person. | |
| Focus on: | |
| - Anomalies (actual delta vs neutral-profile expectation): big positive | |
| social_serenity / connection responses → high S; big morning cognition | |
| gains → high M; productive work giving vitality back → high W | |
| - Current meter state: any meter under 0.15 needs urgent recovery | |
| - What action best fits BOTH the inferred profile and the current state | |
| 2. Output your final answer on the LAST line in this exact format: | |
| S M W ACTION_NAME | |
| where S, M, W are belief digits 0-9 (0=low, 9=high) representing your best | |
| estimate of social_pref, morning_pref, work_pref. ACTION_NAME is one of: | |
| DEEP_WORK, ADMIN_WORK, LEARN, SLEEP, EXERCISE, MEDITATE, FAMILY_TIME, | |
| SOCIALIZE, ME_TIME, BINGE_WATCH | |
| Wrap your reasoning in <reasoning>...</reasoning> tags. Keep reasoning under | |
| 120 tokens. The final answer line MUST be the last line of your response. | |
| Belief→action quick reference: | |
| - High S (extrovert): SOCIALIZE, FAMILY_TIME boost connection cheaply | |
| - High M (morning person): DEEP_WORK / LEARN in early slots gets bonus cognition | |
| - High W (workaholic): DEEP_WORK, LEARN drive progress and may energize | |
| - Low S (introvert): MEDITATE, ME_TIME for solo recharge; avoid SOCIALIZE | |
| - Low M (night owl): DEEP_WORK / LEARN in evening/night slots | |
| - Watch crashes: any meter under 0.10 = -0.30 penalty per crashed meter | |
| - Connection decays passively — actively maintain via SOCIALIZE/FAMILY_TIME | |
| - Don't repeat the same action 3+ times in a row — repetition penalty applies | |
| Strategy: probe varied actions in the first ~5 steps to gather profile evidence, | |
| then exploit your sharpened belief by picking actions that match the inferred | |
| profile + current meter state. | |
| Example output: | |
| <reasoning> | |
| Last step's socialize gave V-0.12 (anom -0.06, much worse than neutral) — high | |
| social drain, suggests low S. Morning DEEP_WORK earlier gave bonus cognition | |
| (anom +0.04) → high M. Vitality at 0.6 still ok, serenity dropping. With low S + | |
| high M, MEDITATE is the recovery play that fits. | |
| </reasoning> | |
| 2 8 5 MEDITATE""" | |
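

# A minimal sketch of how a reward function might parse a completion against
# the "S M W ACTION_NAME" contract above. It assumes only what the system
# prompt states (three single belief digits plus an action name on the last
# line); `parse_final_answer` is an illustrative helper, not an existing API,
# and the ActionType[...] lookup assumes the enum member names match the
# ACTION_NAME strings listed in the prompt.
def parse_final_answer(completion: str):
    """Return ([S, M, W], ActionType) parsed from the last non-empty line, or None."""
    lines = [ln.strip() for ln in completion.strip().splitlines() if ln.strip()]
    if not lines:
        return None
    parts = lines[-1].split()
    # Expect exactly: three single digits, then the action name.
    if len(parts) != 4 or not all(p.isdigit() and len(p) == 1 for p in parts[:3]):
        return None
    try:
        action = ActionType[parts[3]]
    except KeyError:
        return None
    return [int(p) for p in parts[:3]], action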


def format_observation_prompt(obs, profile_hint: dict | None = None) -> str:
    """Format an observation into a user prompt for the LLM.

    If profile_hint is provided (curriculum's "visible" phase), include it in
    the prompt so the agent learns the *skill* of using profile signals
    before having to infer them from scratch.
    """
    day_name = DAY_NAMES[obs.day] if obs.day < 7 else f"Day {obs.day}"
    slot_name = SLOT_NAMES[obs.slot] if obs.slot < 4 else f"Slot {obs.slot}"
    event_str = f"\nActive event: {obs.active_event}" if obs.active_event else ""

    history_lines = []
    for h in (obs.step_history or [])[-5:]:  # last 5 only, to fit the prompt budget
        # Per-meter anomalies (actual_delta − expected_under_neutral_profile)
        # are the cleanest profile-inference signal — they show how this
        # person's response DEVIATES from the average person's. Surfacing them
        # here in the prompt is what gives the agent a fingerprint to learn from.
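        # Worked example with hypothetical numbers: if SOCIALIZE moves serenity
        # +0.10 for this person while the neutral profile expects +0.04, the
        # serenity anomaly is +0.06, a positive fingerprint for high social_pref.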
        anom_str = (
            f" [anom V{h.vitality_anomaly:+.2f} C{h.cognition_anomaly:+.2f} "
            f"P{h.progress_anomaly:+.2f} S{h.serenity_anomaly:+.2f} "
            f"Cn{h.connection_anomaly:+.2f}]"
        )
        history_lines.append(
            f"  step {h.step}: {h.action} -> reward {h.reward:+.2f} "
            f"(V{h.vitality_delta:+.2f} C{h.cognition_delta:+.2f} "
            f"P{h.progress_delta:+.2f} S{h.serenity_delta:+.2f} Cn{h.connection_delta:+.2f})"
            f"{anom_str}"
        )

    history_str = ""
    if history_lines:
        history_str = (
            "\n\nRecent history (anom = how this person deviated from neutral baseline):\n"
            + "\n".join(history_lines)
        )

    hint_str = ""
    if profile_hint is not None:
        hint_str = (
            "\n\nKnown about this person (training hint):\n"
            f"  social_pref={profile_hint['social_pref']:.2f}, "
            f"morning_pref={profile_hint['morning_pref']:.2f}, "
            f"work_pref={profile_hint['work_pref']:.2f}"
        )

    return (
        f"Step: {obs.timestep}/{MAX_STEPS} ({day_name} {slot_name})\n"
        f"Remaining steps: {obs.remaining_steps}\n\n"
        "Meters:\n"
        f"  Vitality: {obs.vitality:.2f}\n"
        f"  Cognition: {obs.cognition:.2f}\n"
        f"  Progress: {obs.progress:.2f}\n"
        f"  Serenity: {obs.serenity:.2f}\n"
        f"  Connection: {obs.connection:.2f}"
        f"{event_str}"
        f"{history_str}"
        f"{hint_str}\n\n"
        "Output your belief, then your action (format: S M W ACTION_NAME):"
    )


def generate_episode_samples(
    seed: int,
    strategy: str = "random",
    profile_mode: str = "continuous",
    show_profile_hint: bool = False,
) -> list:
    """Play one episode and return a list of training samples.

    Each sample includes the prompt + replay metadata (seed, step_index,
    action_history, profile_mode) so reward functions can deterministically
    reconstruct the env state.

    Args:
        seed: Episode seed (also determines profile when profile_mode="continuous").
        strategy: "random" or "heuristic" — used to roll out the episode for
            state diversity. The agent's training generations replace these
            actions; we only need the prefix history for replay.
        profile_mode: "continuous" (sampled per seed) or "discrete" (1 of 3
            hardcoded profiles).
        show_profile_hint: If True, include the true belief vector in the prompt.
            Use during the curriculum's "visible" warmup phase.
    """
    env = RhythmEnvironment()
    obs = env.reset(seed=seed, profile_mode=profile_mode)
    profile_hint = env.get_profile_hint() if show_profile_hint else None
    rng = random.Random(seed + 1000)

    actions_taken = []
    samples = []
    all_actions = list(ActionType)

    for step in range(MAX_STEPS):
        if obs.done:
            break
        prompt = format_observation_prompt(obs, profile_hint=profile_hint)
        samples.append({
            "prompt": [
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": prompt},
            ],
            "seed": seed,
            "step_index": step,
            "action_history": list(actions_taken),
            "profile_mode": profile_mode,
            "show_profile_hint": show_profile_hint,
        })
        if strategy == "random":
            action_type = rng.choice(all_actions)
        elif strategy == "heuristic":
            action_type = heuristic_action(obs)
        else:
            action_type = rng.choice(all_actions)
        action = RhythmAction(action_type=action_type)
        actions_taken.append(action_type.value)
        obs = env.step(action)

    return samples
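

# A minimal sketch of the replay a reward function can run from a sample's
# metadata (seed, step_index, action_history, profile_mode) to reconstruct the
# exact env state the prompt was built from. It reuses the reset/step API
# called above and assumes ActionType values round-trip through ActionType(v);
# `replay_to_step` is an illustrative helper, not part of the environment.
def replay_to_step(seed: int, step_index: int, action_history: list,
                   profile_mode: str = "continuous"):
    """Re-run the recorded action prefix; return (env, obs) at step_index."""
    env = RhythmEnvironment()
    obs = env.reset(seed=seed, profile_mode=profile_mode)
    # Same seed + same action prefix = same deterministic state.
    for value in action_history[:step_index]:
        obs = env.step(RhythmAction(action_type=ActionType(value)))
    return env, obs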


def heuristic_action(obs) -> ActionType:
    """Priority-based heuristic baseline (profile-blind).

    Used both during dataset generation (to roll out diverse states) and
    by inference_eval as the heuristic baseline strategy.
    """
    slot = obs.slot
    v, c, p, s, cn = obs.vitality, obs.cognition, obs.progress, obs.serenity, obs.connection

    # Crash prevention first: recover any meter near the 0.10 crash threshold.
    if v < 0.15:
        return ActionType.SLEEP
    if s < 0.15:
        return ActionType.MEDITATE
    if cn < 0.15:
        return ActionType.FAMILY_TIME

    # Otherwise follow a slot-based routine (slot indices follow SLOT_NAMES).
    if slot == 3:  # Night
        return ActionType.SLEEP
    if slot == 0:  # Morning
        return ActionType.DEEP_WORK if (v > 0.4 and c > 0.3) else ActionType.EXERCISE
    if slot == 1:  # Afternoon
        if cn < 0.3:
            return ActionType.FAMILY_TIME
        if p < 0.3 and v > 0.3:
            return ActionType.LEARN
        return ActionType.ADMIN_WORK

    # Evening (slot 2): maintain connection and serenity.
    if cn < 0.4:
        return ActionType.SOCIALIZE
    if s < 0.5:
        return ActionType.ME_TIME
    return ActionType.MEDITATE


def generate_dataset(
    num_episodes: int = 200,
    strategy: str = "mixed",
    max_samples: int = 2000,
    profile_mode: str = "continuous",
    hint_fraction: float = 0.2,
) -> list:
    """Generate a training dataset by playing multiple episodes.

    Curriculum is baked into the dataset: hint_fraction of episodes have the
    true profile visible (visible-phase warmup). After the shuffle, GRPOTrainer
    sees a mix early on; sort hint samples first if a strict warmup is needed
    (see the sketch after this function).

    Args:
        num_episodes: Number of episodes to play.
        strategy: "random", "heuristic", or "mixed" (alternating).
        max_samples: Maximum samples to return.
        profile_mode: "continuous" (default, meta-RL) or "discrete" (3 profiles).
        hint_fraction: Fraction of episodes to play with the profile hint visible.
    """
    all_samples = []
    n_hint_episodes = int(num_episodes * hint_fraction)
    for i in range(num_episodes):
        seed = i
        if strategy == "mixed":
            s = "heuristic" if i % 2 == 0 else "random"
        else:
            s = strategy
        show_hint = i < n_hint_episodes
        episode_samples = generate_episode_samples(
            seed=seed,
            strategy=s,
            profile_mode=profile_mode,
            show_profile_hint=show_hint,
        )
        all_samples.extend(episode_samples)
        if len(all_samples) >= max_samples:
            break

    # Shuffle (curriculum is per-sample via the show_profile_hint flag, not order).
    random.shuffle(all_samples)
    all_samples = all_samples[:max_samples]

    n_hint = sum(1 for s in all_samples if s["show_profile_hint"])
    print(
        f"Generated {len(all_samples)} samples from {min(i + 1, num_episodes)} episodes "
        f"({n_hint} with profile hint, {len(all_samples) - n_hint} without)"
    )
    return all_samples
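

# The docstring above mentions sorting hint samples first for a strict
# visible-first warmup. A minimal sketch: sorted() is stable, so the shuffled
# order is preserved within each group. `sort_hints_first` is an illustrative
# helper, not used elsewhere in this file.
def sort_hints_first(samples: list) -> list:
    """Order samples so hinted (visible-phase) ones precede unhinted ones."""
    # False (hinted) sorts before True (unhinted).
    return sorted(samples, key=lambda s: not s["show_profile_hint"])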


if __name__ == "__main__":
    samples = generate_dataset(num_episodes=20, strategy="mixed", max_samples=80, hint_fraction=0.5)

    print("\nFirst sample (with hint):")
    hinted = next((s for s in samples if s["show_profile_hint"]), None)
    if hinted:
        print(hinted["prompt"][1]["content"])
        print(f"\nseed={hinted['seed']}, step={hinted['step_index']}, mode={hinted['profile_mode']}")

    print("\nFirst sample (without hint):")
    plain = next((s for s in samples if not s["show_profile_hint"]), None)
    if plain:
        print(plain["prompt"][1]["content"])