"""Reward computation for OptCoder and LandscapeForge.
Matches §9 of LANDSCAPEFORGE_DESIGN.md (v0.2).
"""
from __future__ import annotations

from dataclasses import dataclass

import numpy as np

try:
    from .arena import ArenaResult
except ImportError:  # flat layout (HF Space container)
    from arena import ArenaResult  # type: ignore

# Default weights (§9.1)
W_REGRET = 1.0
W_CONV = 0.3
W_ROBUST = 0.3
W_NOVELTY = 0.1
W_BUDGET = 0.05
W_EVAL_FAIL = 0.5
NOVELTY_GATE = 0.5  # novelty only applied when r_regret > this


@dataclass
class OptCoderReward:
    r_total: float
    breakdown: dict[str, float]


def compute_optcoder_reward(
    arena: ArenaResult,
    adam_arena: ArenaResult,
    actions_used_cost: int,  # sum of per-action costs, not count
    budget_total: int,
    novelty_score: float,  # AST edit distance / len, clamped to [0, 1]
    convergence_step: int | None,
    arena_steps: int,
) -> OptCoderReward:
    """Compute OptCoder's terminal reward (no `f_min` dependency).

    r_regret is driven by Adam-relative descent:

        my_progress   = mean(f_initial - f_final) across seeds for the draft
        adam_progress = same for Adam
        r_regret      = clamp(my_progress / max(adam_progress, floor) - 1, -1, +1)

    Interpretation:

        r_regret = +1  →  descended ≥ 2× as far as Adam
        r_regret =  0  →  matched Adam's descent
        r_regret = -1  →  descended ≤ 0 while Adam descended normally
    """
    my_progress = arena.mean_progress
    adam_progress = adam_arena.mean_progress

    # Denominator floor: if Adam barely descended (e.g. plateau landscape),
    # use ~1% of initial |f| to avoid a tiny denominator exploding the ratio.
    denom_floor = 0.01 * adam_arena.mean_initial_scale + 1e-6
    denom = max(adam_progress, denom_floor)
    r_regret_raw = my_progress / denom - 1.0
    r_regret = float(np.clip(r_regret_raw, -1.0, 1.0))

    # Convergence speed: linear in how early the run hit 1% of initial f
    # (1.0 = immediately, 0.0 = never within the arena horizon).
    if convergence_step is None or convergence_step >= arena_steps:
        r_conv = 0.0
    else:
        r_conv = float(np.clip(1.0 - convergence_step / arena_steps, 0.0, 1.0))

    r_robust = arena.robustness

    # Novelty gated on regret performance
    r_novelty = float(novelty_score) if r_regret > NOVELTY_GATE else 0.0

    r_budget = float(np.clip(actions_used_cost / max(budget_total, 1), 0.0, 1.0))
    r_eval_fail = arena.crash_fraction

    total = (
        W_REGRET * r_regret
        + W_CONV * r_conv
        + W_ROBUST * r_robust
        + W_NOVELTY * r_novelty
        - W_BUDGET * r_budget
        - W_EVAL_FAIL * r_eval_fail
    )
    return OptCoderReward(
        r_total=float(total),
        breakdown={
            "r_regret": r_regret,
            "r_convergence": r_conv,
            "r_robustness": r_robust,
            "r_novelty": r_novelty,
            "r_budget": r_budget,
            "r_eval_failures": r_eval_fail,
            "my_progress": float(my_progress),
            "adam_progress": float(adam_progress),
            "speedup_vs_adam": float(my_progress / denom),
        },
    )
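

# Worked example (illustrative numbers only, not from any real run): if the
# draft descends by 4.0 on average while Adam descends by 2.0 (floor
# negligible), r_regret_raw = 4.0 / 2.0 - 1 = 1.0, so r_regret saturates at +1.
# Matching Adam's 2.0 gives 0.0, and a draft that never descends gives -1.0.
# With convergence_step=20 and arena_steps=100, r_conv = 1 - 20/100 = 0.8.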


def ast_novelty_score(committed_source: str, reference_sources: list[str]) -> float:
    """Coarse AST-diff score vs a set of reference optimizers.

    Returns min over references of (edit distance / len of committed), clamped
    to [0, 1]. Near-zero means heavy copy. For v1 we use a simple char-level
    Levenshtein-ish ratio; AST diffing is deferred.
    """
    if not committed_source:
        return 0.0
    best = 1.0
    for ref in reference_sources:
        ratio = _diff_ratio(committed_source, ref)
        if ratio < best:
            best = ratio
    return float(np.clip(best, 0.0, 1.0))


def _diff_ratio(a: str, b: str) -> float:
    """difflib-based ratio: 1 - similarity. Cheap, roughly AST-order-insensitive."""
    import difflib

    sim = difflib.SequenceMatcher(None, a, b).ratio()
    return 1.0 - sim
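

# Illustrative property (hypothetical inputs): ast_novelty_score(src, [src]) is
# 0.0 for a verbatim copy of any single reference, since the score is the
# minimum over all references; a structurally different update rule should land
# closer to 1.0, which is what the gated r_novelty term above rewards.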


# ---------- Stepwise FEEDBACK (not reward) ----------
# These signals are exposed to the LLM through the observation so it can
# course-correct mid-episode. They are NOT summed into the training reward;
# terminal arena reward is the only GRPO signal, to preserve robustness.
COMPILE_PENALTY_SIGNAL = -0.1
PHI_SCALE = 10.0  # normalizer for best_draft_f to keep potential in ~[-1, 0]


def _draft_potential(draft_history: list[dict]) -> float:
    """Potential: higher = better. No drafts yet → -1.0 (worst) so that the
    first valid draft always emits a positive phi_delta proportional to its
    quality. Clamped to [-1, 0].
    """
    finals = [
        d["summary"]["final_f"]
        for d in draft_history
        if d.get("summary") and d["summary"].get("final_f") is not None
    ]
    if not finals:
        return -1.0  # no valid draft yet → worst-case potential
    best = min(finals)
    normed = min(max(best, 0.0), PHI_SCALE) / PHI_SCALE
    return -normed  # lower best_f → higher potential
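

# Example mapping (assuming PHI_SCALE keeps its default of 10.0): a best
# final_f of 0.0 gives the maximal potential 0.0, final_f = 5.0 gives -0.5,
# and anything at or above 10.0 is clamped to the worst valid-draft potential
# of -1.0; an empty history also reports -1.0.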


def compute_step_reward(
    prev_draft_history: list[dict],
    new_draft_history: list[dict],
    action_kind: str,
    action_result: dict,
) -> dict:
    """Stepwise FEEDBACK signals for one REPL turn.

    Despite the name (kept for API compatibility), this does NOT produce
    training reward. The returned dict is structured for surfacing to the
    LLM's next prompt as part of `last_action_result.feedback`:

    - `phi_delta`: improvement in the best-draft-so-far potential. Positive =
      the latest action moved the best known draft closer to the minimum.
    - `compile_penalty`: -0.1 marker indicating the last draft failed to
      compile. Helps the agent notice structural bugs immediately.

    Caller should NOT add the numeric values to the training reward.
    """
    breakdown: dict[str, float] = {}

    phi_prev = _draft_potential(prev_draft_history)
    phi_new = _draft_potential(new_draft_history)
    phi_delta = phi_new - phi_prev
    if abs(phi_delta) > 1e-9:
        breakdown["phi_delta"] = float(phi_delta)

    if action_kind == "draft" and action_result.get("compile_error"):
        breakdown["compile_penalty"] = COMPILE_PENALTY_SIGNAL

    # r_step retained for backwards compatibility but caller must ignore it
    # for training purposes (see docstring).
    r_step = float(sum(breakdown.values()))
    return {"r_step": r_step, "breakdown": breakdown}
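

# ---------- Illustrative usage (sketch, not part of the reward API) ----------
# Minimal smoke demo assuming only the attributes this module itself reads from
# an ArenaResult (mean_progress, mean_initial_scale, robustness,
# crash_fraction). The SimpleNamespace stand-ins below are hypothetical
# duck-typed substitutes for illustration, NOT the real arena objects.
if __name__ == "__main__":
    from types import SimpleNamespace

    draft_arena = SimpleNamespace(
        mean_progress=4.0, mean_initial_scale=10.0, robustness=0.8, crash_fraction=0.0
    )
    adam_arena = SimpleNamespace(
        mean_progress=2.0, mean_initial_scale=10.0, robustness=1.0, crash_fraction=0.0
    )
    reward = compute_optcoder_reward(
        arena=draft_arena,  # type: ignore[arg-type]  # duck-typed stand-in
        adam_arena=adam_arena,  # type: ignore[arg-type]
        actions_used_cost=12,
        budget_total=40,
        novelty_score=0.7,
        convergence_step=20,
        arena_steps=100,
    )
    print("terminal reward:", round(reward.r_total, 3), reward.breakdown)

    # A first valid draft (final_f=2.0) against an empty history should surface
    # a positive phi_delta in the feedback dict.
    feedback = compute_step_reward(
        prev_draft_history=[],
        new_draft_history=[{"summary": {"final_f": 2.0}}],
        action_kind="draft",
        action_result={},
    )
    print("step feedback:", feedback)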