# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
"""
Reward rubric for Prompt Golf.
Episodes are single-step: the agent's one action (a prompt) is scored, the
episode terminates, and the reward is a composition of four components:
1. raw_task_score — target LLM's accuracy on held-out test inputs
when prompted with the submitted prompt, in [0, 1].
2. length_factor — 1.0 while the prompt is within budget; decays
exponentially as it exceeds the budget.
3. leakage_penalty — 1.0 when the prompt contains no held-out test-input
n-grams; scales toward 0 when the agent tries to
paste answers into its prompt.
4. baseline_bonus — extra credit (weight 0.3) for beating the
target's zero-shot score on this task with any
meaningful prompt.
Final reward:
base = raw_task_score * length_factor * leakage_penalty
bonus = max(0, raw_task_score - baseline_zero_shot_score) * length_factor
reward = clip(base + 0.3 * bonus, 0.0, 1.3)
We return a dict with all four components so that training code can log
them separately and compose rubrics if desired.
"""
from __future__ import annotations

import math
import re
from dataclasses import dataclass
from typing import Any, Dict, List
# ---------------------------------------------------------------------------
# Component calculators
# ---------------------------------------------------------------------------
def length_factor(tokens: int, budget: int, decay_k: int = 20) -> float:
"""Length multiplier that rewards short prompts AND penalizes overshoot.
- tokens == 0 -> 1.30 (max short-prompt bonus)
- tokens == budget -> 1.00 (neutral)
- tokens > budget -> exp(-(tokens - budget) / decay_k) (decays fast)
The >1.0 region inside budget is what makes "shorter is better" a real
gradient signal; otherwise truncation alone removes the incentive to
compress once you fit.
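
    Illustrative values (doctest-style, using the defaults above; rounded
    to avoid float noise):

    >>> round(length_factor(0, 100), 2)
    1.3
    >>> round(length_factor(100, 100), 2)
    1.0
    >>> round(length_factor(120, 100), 4)
    0.3679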
"""
if budget <= 0:
budget = 1
if tokens <= budget:
# Linear from 1.30 at 0 tokens -> 1.00 at budget.
return 1.0 + 0.30 * (1.0 - tokens / budget)
    over = tokens - budget
    return float(math.exp(-over / max(1, decay_k)))
def ngram_overlap(prompt: str, held_out_inputs: List[str], n: int = 4) -> float:
"""Fraction of 4-grams in held-out inputs that appear in the prompt.
Returns 0.0 when the prompt carries no leakage, up to 1.0 when every
4-gram from every held-out input is present in the prompt. This is
what the leakage_penalty multiplier is built from.
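
    Illustrative doctest-style examples (inputs chosen for clarity):

    >>> ngram_overlap("the quick brown fox jumps", ["the quick brown fox"])
    1.0
    >>> ngram_overlap("summarize the input text", ["the quick brown fox"])
    0.0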
"""
prompt_norm = _normalize_for_ngrams(prompt)
prompt_grams = set(_ngrams(prompt_norm.split(), n))
if not prompt_grams:
return 0.0
total = 0
hits = 0
for x in held_out_inputs:
x_norm = _normalize_for_ngrams(x)
for gram in _ngrams(x_norm.split(), n):
total += 1
if gram in prompt_grams:
hits += 1
if total == 0:
return 0.0
return hits / total
def leakage_penalty(prompt: str, held_out_inputs: List[str]) -> float:
"""Convert n-gram overlap to a multiplier in [0, 1].
1.0 == no overlap; 0.0 == perfect leak. Scales quadratically so small
accidental overlaps aren't harshly punished but systematic copying is.
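
    Illustrative: one of the two held-out 4-grams leaks, so overlap = 0.5
    and the multiplier is 1 - 0.5**2 = 0.75.

    >>> leakage_penalty("a b c d x", ["a b c d e"])
    0.75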
"""
overlap = ngram_overlap(prompt, held_out_inputs, n=4)
penalty = max(0.0, 1.0 - overlap * overlap) # 0 leak=>1, full leak=>0
return penalty
def _normalize_for_ngrams(s: str) -> str:
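    """Lowercase, map punctuation to spaces, and collapse whitespace so
    n-gram matching is robust to casing and formatting differences."""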
s = s.lower()
s = re.sub(r"[^a-z0-9\s]", " ", s)
s = re.sub(r"\s+", " ", s).strip()
return s
def _ngrams(tokens: List[str], n: int) -> List[tuple]:
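    """Return consecutive n-grams over `tokens`; empty if fewer than n tokens."""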
if len(tokens) < n:
return []
return [tuple(tokens[i : i + n]) for i in range(len(tokens) - n + 1)]
# ---------------------------------------------------------------------------
# Top-level rubric
# ---------------------------------------------------------------------------
@dataclass
class RubricResult:
reward: float
raw_task_score: float
length_factor: float
leakage_penalty: float
gain_over_baseline: float
baseline_bonus_component: float
submitted_tokens: int
prompt_budget: int
class PromptGolfRubric:
"""Pure-python rubric for Prompt Golf.
    ADDITIVE formulation (v2):

        reward = success_score
                 - length_cost   (LAMBDA_LEN * tokens, plus a flat
                                  short-prompt penalty below the floor)
                 - LAMBDA_LEAK * leakage_overlap ** 2

    where success_score = raw_task_score - BASELINE_SUBTRACT * baseline.

    Tuning rationale:
    - LAMBDA_LEN = 0.002 (softened from the original 0.005) → with baseline
      tokens ~50 and raw_score ~0.25 (an untrained policy scores about the
      zero-shot baseline), the baseline reward sits near 0.0
      (0.25 - 0.25*0.5 - 0.002*50 = 0.025), giving smooth gradients in
      both directions.
    - LAMBDA_LEAK = 1.0 → a fully-leaked prompt (all 4-grams present)
      loses the whole raw_score contribution.
    - BASELINE_SUBTRACT = 0.5 → partially normalize against the target's
      zero-shot ability, so easy-for-target tasks don't saturate reward.
Old fields kept on RubricResult (length_factor / leakage_penalty) for
backward-compat logging; they're now derived rather than multiplicative.
"""
LAMBDA_LEN: float = 0.002 # softer than v2.0 (was 0.005)
LAMBDA_LEAK: float = 1.0
BASELINE_SUBTRACT: float = 0.5
MIN_TOKENS_FLOOR: int = 5 # prompts below this get a flat penalty
MIN_TOKENS_PENALTY: float = 0.25 # ← large enough to overcome length_cost savings
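    # e.g. a 2-token prompt pays 0.25 * (1 - 2/5) = 0.15 on top of its
    # linear length cost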
# Keep old clip boundaries so downstream plots don't break
REWARD_CLIP_LOW: float = -0.5
REWARD_CLIP_HIGH: float = 1.3
def grade(
self,
*,
raw_task_score: float,
baseline_zero_shot_score: float,
submitted_tokens: int,
prompt_budget: int,
prompt_text: str,
held_out_inputs: List[str],
) -> RubricResult:
overlap = ngram_overlap(prompt_text, held_out_inputs, n=4)
        # Quadratic leak penalty so small accidental overlap is ~free while
        # systematic copying is punished hard.
leak_cost = self.LAMBDA_LEAK * (overlap ** 2)
# Length cost: linear for reasonable-length prompts; hard floor
# below MIN_TOKENS_FLOOR to prevent degenerate policy collapse
# to 1-token outputs on tasks where the target can't be steered.
tokens = max(0, submitted_tokens)
length_cost = self.LAMBDA_LEN * float(tokens)
if tokens < self.MIN_TOKENS_FLOOR:
# Flat penalty shrinks linearly from MIN_TOKENS_PENALTY at 0 tokens
# to 0 at MIN_TOKENS_FLOOR tokens. Guarantees a >1-token prompt
# beats a 1-token prompt at equal raw_score.
short_penalty = self.MIN_TOKENS_PENALTY * (
1.0 - tokens / max(1, self.MIN_TOKENS_FLOOR)
)
length_cost += short_penalty
success = raw_task_score - self.BASELINE_SUBTRACT * baseline_zero_shot_score
gain = raw_task_score - baseline_zero_shot_score
reward = success - length_cost - leak_cost
reward = float(max(self.REWARD_CLIP_LOW, min(self.REWARD_CLIP_HIGH, reward)))
# Derived legacy fields (for log continuity with v1 metrics jsonl)
        lf_legacy = length_factor(tokens, prompt_budget)
lp_legacy = 1.0 - overlap * overlap # 1.0 == clean, 0.0 == leaked
return RubricResult(
reward=reward,
raw_task_score=float(raw_task_score),
length_factor=float(lf_legacy),
leakage_penalty=float(lp_legacy),
gain_over_baseline=float(gain),
baseline_bonus_component=float(length_cost), # repurposed: log length_cost
submitted_tokens=int(submitted_tokens),
prompt_budget=int(prompt_budget),
)
def grade_details_dict(result: RubricResult, task_id: str, passed_threshold: float = 0.5) -> Dict[str, Any]:
"""Shape the rubric result into the metadata dict the observation exposes."""
return {
"task": task_id,
"reward": round(result.reward, 4),
"raw_task_score": round(result.raw_task_score, 4),
"length_factor": round(result.length_factor, 4),
"leakage_penalty": round(result.leakage_penalty, 4),
"gain_over_baseline": round(result.gain_over_baseline, 4),
"baseline_bonus_component": round(result.baseline_bonus_component, 4),
"submitted_tokens": result.submitted_tokens,
"prompt_budget": result.prompt_budget,
"passed": result.reward >= passed_threshold,
}
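

if __name__ == "__main__":
    # Minimal smoke-test sketch with illustrative, made-up numbers (this
    # block is not part of the scoring pipeline; "demo" is a fake task id).
    rubric = PromptGolfRubric()
    result = rubric.grade(
        raw_task_score=0.6,
        baseline_zero_shot_score=0.3,
        submitted_tokens=40,
        prompt_budget=100,
        prompt_text="Answer with the sum of the two numbers.",
        held_out_inputs=["12 + 30", "7 + 5"],
    )
    # success = 0.6 - 0.5 * 0.3 = 0.45; length_cost = 0.002 * 40 = 0.08;
    # no 4-gram leakage, so reward = 0.45 - 0.08 = 0.37.
    print(grade_details_dict(result, task_id="demo"))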