# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
"""
Reward rubric for Prompt Golf.
Episodes are single-step: the agent's one action (a prompt) is scored, the
episode terminates, and the reward is a composition of four components:
1. raw_task_score — target LLM's accuracy on held-out test inputs
when prompted with the submitted prompt, in [0, 1].
2. length_factor — 1.0 while the prompt is within budget; decays
exponentially as it exceeds the budget.
3. leakage_penalty — 1.0 when the prompt contains no held-out test-input
n-grams; scales toward 0 when the agent tries to
paste answers into its prompt.
4. baseline_bonus — extra credit (weight 0.3) for beating the
target's zero-shot score on this task with any
meaningful prompt.
Final reward:
base = raw_task_score * length_factor * leakage_penalty
bonus = max(0, raw_task_score - baseline_zero_shot_score) * length_factor
reward = clip(base + 0.3 * bonus, 0.0, 1.3)
We return a dict with all four components so that training code can log
them separately and compose rubrics if desired.
"""
from __future__ import annotations

import math
import re
from dataclasses import dataclass
from typing import Any, Dict, List
# ---------------------------------------------------------------------------
# Component calculators
# ---------------------------------------------------------------------------
def length_factor(tokens: int, budget: int, decay_k: int = 20) -> float:
"""Length multiplier that rewards short prompts AND penalizes overshoot.
- tokens == 0 -> 1.30 (max short-prompt bonus)
- tokens == budget -> 1.00 (neutral)
- tokens > budget -> exp(-(tokens - budget) / decay_k) (decays fast)
The >1.0 region inside budget is what makes "shorter is better" a real
gradient signal; otherwise truncation alone removes the incentive to
compress once you fit.
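
    Illustrative values (doctest-style, using the defaults above; rounded
    to avoid float noise):

    >>> round(length_factor(0, 100), 2)
    1.3
    >>> round(length_factor(100, 100), 2)
    1.0
    >>> round(length_factor(120, 100), 4)
    0.3679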
"""
if budget <= 0:
budget = 1
if tokens <= budget:
# Linear from 1.30 at 0 tokens -> 1.00 at budget.
return 1.0 + 0.30 * (1.0 - tokens / budget)
    over = tokens - budget
    return float(math.exp(-over / max(1, decay_k)))
def ngram_overlap(prompt: str, held_out_inputs: List[str], n: int = 4) -> float:
"""Fraction of 4-grams in held-out inputs that appear in the prompt.
Returns 0.0 when the prompt carries no leakage, up to 1.0 when every
4-gram from every held-out input is present in the prompt. This is
what the leakage_penalty multiplier is built from.
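
    Illustrative doctest-style examples (inputs chosen for clarity):

    >>> ngram_overlap("the quick brown fox jumps", ["the quick brown fox"])
    1.0
    >>> ngram_overlap("summarize the input text", ["the quick brown fox"])
    0.0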
"""
prompt_norm = _normalize_for_ngrams(prompt)
prompt_grams = set(_ngrams(prompt_norm.split(), n))
if not prompt_grams:
return 0.0
total = 0
hits = 0
for x in held_out_inputs:
x_norm = _normalize_for_ngrams(x)
for gram in _ngrams(x_norm.split(), n):
total += 1
if gram in prompt_grams:
hits += 1
if total == 0:
return 0.0
return hits / total
def leakage_penalty(prompt: str, held_out_inputs: List[str]) -> float:
"""Convert n-gram overlap to a multiplier in [0, 1].
1.0 == no overlap; 0.0 == perfect leak. Scales quadratically so small
accidental overlaps aren't harshly punished but systematic copying is.
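
    Illustrative: one of the two held-out 4-grams leaks, so overlap = 0.5
    and the multiplier is 1 - 0.5**2 = 0.75.

    >>> leakage_penalty("a b c d x", ["a b c d e"])
    0.75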
"""
overlap = ngram_overlap(prompt, held_out_inputs, n=4)
penalty = max(0.0, 1.0 - overlap * overlap) # 0 leak=>1, full leak=>0
return penalty
def _normalize_for_ngrams(s: str) -> str:
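    """Lowercase, map punctuation to spaces, and collapse whitespace so
    n-gram matching is robust to casing and formatting differences."""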
s = s.lower()
s = re.sub(r"[^a-z0-9\s]", " ", s)
s = re.sub(r"\s+", " ", s).strip()
return s
def _ngrams(tokens: List[str], n: int) -> List[tuple]:
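    """Return consecutive n-grams over `tokens`; empty if fewer than n tokens."""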
if len(tokens) < n:
return []
return [tuple(tokens[i : i + n]) for i in range(len(tokens) - n + 1)]
# ---------------------------------------------------------------------------
# Top-level rubric
# ---------------------------------------------------------------------------
@dataclass
class RubricResult:
reward: float
raw_task_score: float
length_factor: float
leakage_penalty: float
gain_over_baseline: float
baseline_bonus_component: float
submitted_tokens: int
prompt_budget: int
class PromptGolfRubric:
"""Pure-python rubric for Prompt Golf.
    ADDITIVE formulation (v2):

        reward = success_score
                 - length_cost   (LAMBDA_LEN * tokens, plus a flat
                                  short-prompt penalty below the floor)
                 - LAMBDA_LEAK * leakage_overlap ** 2

    where success_score = raw_task_score - BASELINE_SUBTRACT * baseline.

    Tuning rationale:
    - LAMBDA_LEN = 0.002 (softened from the original 0.005) → with baseline
      tokens ~50 and raw_score ~0.25 (an untrained policy scores about the
      zero-shot baseline), the baseline reward sits near 0.0
      (0.25 - 0.25*0.5 - 0.002*50 = 0.025), giving smooth gradients in
      both directions.
    - LAMBDA_LEAK = 1.0 → a fully-leaked prompt (all 4-grams present)
      loses the whole raw_score contribution.
    - BASELINE_SUBTRACT = 0.5 → partially normalize against the target's
      zero-shot ability, so easy-for-target tasks don't saturate reward.
Old fields kept on RubricResult (length_factor / leakage_penalty) for
backward-compat logging; they're now derived rather than multiplicative.
"""
LAMBDA_LEN: float = 0.002 # softer than v2.0 (was 0.005)
LAMBDA_LEAK: float = 1.0
BASELINE_SUBTRACT: float = 0.5
MIN_TOKENS_FLOOR: int = 5 # prompts below this get a flat penalty
MIN_TOKENS_PENALTY: float = 0.25 # ← large enough to overcome length_cost savings
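    # e.g. a 2-token prompt pays 0.25 * (1 - 2/5) = 0.15 on top of its
    # linear length cost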
# Keep old clip boundaries so downstream plots don't break
REWARD_CLIP_LOW: float = -0.5
REWARD_CLIP_HIGH: float = 1.3
def grade(
self,
*,
raw_task_score: float,
baseline_zero_shot_score: float,
submitted_tokens: int,
prompt_budget: int,
prompt_text: str,
held_out_inputs: List[str],
) -> RubricResult:
overlap = ngram_overlap(prompt_text, held_out_inputs, n=4)
        # Quadratic leak penalty so small accidental overlap is ~free while
        # systematic copying is punished hard.
leak_cost = self.LAMBDA_LEAK * (overlap ** 2)
# Length cost: linear for reasonable-length prompts; hard floor
# below MIN_TOKENS_FLOOR to prevent degenerate policy collapse
# to 1-token outputs on tasks where the target can't be steered.
tokens = max(0, submitted_tokens)
length_cost = self.LAMBDA_LEN * float(tokens)
if tokens < self.MIN_TOKENS_FLOOR:
# Flat penalty shrinks linearly from MIN_TOKENS_PENALTY at 0 tokens
# to 0 at MIN_TOKENS_FLOOR tokens. Guarantees a >1-token prompt
# beats a 1-token prompt at equal raw_score.
short_penalty = self.MIN_TOKENS_PENALTY * (
1.0 - tokens / max(1, self.MIN_TOKENS_FLOOR)
)
length_cost += short_penalty
success = raw_task_score - self.BASELINE_SUBTRACT * baseline_zero_shot_score
gain = raw_task_score - baseline_zero_shot_score
reward = success - length_cost - leak_cost
reward = float(max(self.REWARD_CLIP_LOW, min(self.REWARD_CLIP_HIGH, reward)))
# Derived legacy fields (for log continuity with v1 metrics jsonl)
        lf_legacy = length_factor(tokens, prompt_budget)
lp_legacy = 1.0 - overlap * overlap # 1.0 == clean, 0.0 == leaked
return RubricResult(
reward=reward,
raw_task_score=float(raw_task_score),
length_factor=float(lf_legacy),
leakage_penalty=float(lp_legacy),
gain_over_baseline=float(gain),
baseline_bonus_component=float(length_cost), # repurposed: log length_cost
submitted_tokens=int(submitted_tokens),
prompt_budget=int(prompt_budget),
)
def grade_details_dict(result: RubricResult, task_id: str, passed_threshold: float = 0.5) -> Dict[str, Any]:
"""Shape the rubric result into the metadata dict the observation exposes."""
return {
"task": task_id,
"reward": round(result.reward, 4),
"raw_task_score": round(result.raw_task_score, 4),
"length_factor": round(result.length_factor, 4),
"leakage_penalty": round(result.leakage_penalty, 4),
"gain_over_baseline": round(result.gain_over_baseline, 4),
"baseline_bonus_component": round(result.baseline_bonus_component, 4),
"submitted_tokens": result.submitted_tokens,
"prompt_budget": result.prompt_budget,
"passed": result.reward >= passed_threshold,
}
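

if __name__ == "__main__":
    # Minimal smoke-test sketch with illustrative, made-up numbers (this
    # block is not part of the scoring pipeline; "demo" is a fake task id).
    rubric = PromptGolfRubric()
    result = rubric.grade(
        raw_task_score=0.6,
        baseline_zero_shot_score=0.3,
        submitted_tokens=40,
        prompt_budget=100,
        prompt_text="Answer with the sum of the two numbers.",
        held_out_inputs=["12 + 30", "7 + 5"],
    )
    # success = 0.6 - 0.5 * 0.3 = 0.45; length_cost = 0.002 * 40 = 0.08;
    # no 4-gram leakage, so reward = 0.45 - 0.08 = 0.37.
    print(grade_details_dict(result, task_id="demo"))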