# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
"""
Reward rubric for Prompt Golf.
Episodes are single-step: the agent's one action (a prompt) is scored, the
episode terminates, and the reward is a composition of four components:
1. raw_task_score — target LLM's accuracy on held-out test inputs
when prompted with the submitted prompt, in [0, 1].
2. length_factor — 1.0 while the prompt is within budget; decays
exponentially as it exceeds the budget.
3. leakage_penalty — 1.0 when the prompt contains no held-out test-input
n-grams; scales toward 0 when the agent tries to
paste answers into its prompt.
4. baseline_bonus — extra credit (weight 0.3) for beating the
target's zero-shot score on this task with any
meaningful prompt.
Final reward:
base = raw_task_score * length_factor * leakage_penalty
bonus = max(0, raw_task_score - baseline_zero_shot_score) * length_factor
reward = clip(base + 0.3 * bonus, 0.0, 1.3)
We return a dict with all four components so that training code can log
them separately and compose rubrics if desired.
"""
from __future__ import annotations

import math
import re
from dataclasses import dataclass
from typing import Any, Dict, List
# ---------------------------------------------------------------------------
# Component calculators
# ---------------------------------------------------------------------------
def length_factor(tokens: int, budget: int, decay_k: int = 20) -> float:
"""Length multiplier that rewards short prompts AND penalizes overshoot.
- tokens == 0 -> 1.30 (max short-prompt bonus)
- tokens == budget -> 1.00 (neutral)
- tokens > budget -> exp(-(tokens - budget) / decay_k) (decays fast)
The >1.0 region inside budget is what makes "shorter is better" a real
gradient signal; otherwise truncation alone removes the incentive to
compress once you fit.
"""
if budget <= 0:
budget = 1
if tokens <= budget:
# Linear from 1.30 at 0 tokens -> 1.00 at budget.
return 1.0 + 0.30 * (1.0 - tokens / budget)
    over = tokens - budget
    return float(math.exp(-over / max(1, decay_k)))
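
# Illustrative values for length_factor (a quick sketch derived directly
# from the formula above; budget=40 and decay_k=20 are hypothetical):
#   length_factor(0, 40)  -> 1.30            max short-prompt bonus
#   length_factor(20, 40) -> 1.15            halfway to budget
#   length_factor(40, 40) -> 1.00            exactly on budget
#   length_factor(60, 40) -> exp(-1) ~ 0.37  20 tokens over budget
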
def ngram_overlap(prompt: str, held_out_inputs: List[str], n: int = 4) -> float:
"""Fraction of 4-grams in held-out inputs that appear in the prompt.
Returns 0.0 when the prompt carries no leakage, up to 1.0 when every
4-gram from every held-out input is present in the prompt. This is
what the leakage_penalty multiplier is built from.
"""
prompt_norm = _normalize_for_ngrams(prompt)
prompt_grams = set(_ngrams(prompt_norm.split(), n))
if not prompt_grams:
return 0.0
total = 0
hits = 0
for x in held_out_inputs:
x_norm = _normalize_for_ngrams(x)
for gram in _ngrams(x_norm.split(), n):
total += 1
if gram in prompt_grams:
hits += 1
if total == 0:
return 0.0
return hits / total
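
# Worked example (hypothetical strings, for illustration only):
#   ngram_overlap("translate to french the cat sat on the mat",
#                 ["the cat sat on the mat"], n=4)
# The held-out input normalizes to six tokens and yields three 4-grams,
# all of which occur contiguously in the prompt, so the overlap is
# 3/3 == 1.0. A prompt sharing no 4-gram with any held-out input scores 0.0.
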
def leakage_penalty(prompt: str, held_out_inputs: List[str]) -> float:
"""Convert n-gram overlap to a multiplier in [0, 1].
1.0 == no overlap; 0.0 == perfect leak. Scales quadratically so small
accidental overlaps aren't harshly punished but systematic copying is.
"""
overlap = ngram_overlap(prompt, held_out_inputs, n=4)
penalty = max(0.0, 1.0 - overlap * overlap) # 0 leak=>1, full leak=>0
return penalty
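
# The quadratic mapping keeps small overlaps cheap: overlap 0.1 leaves a
# 0.99 multiplier, 0.5 leaves 0.75, and a full leak (1.0) zeroes the score.
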
def _normalize_for_ngrams(s: str) -> str:
s = s.lower()
s = re.sub(r"[^a-z0-9\s]", " ", s)
s = re.sub(r"\s+", " ", s).strip()
return s
def _ngrams(tokens: List[str], n: int) -> List[tuple]:
if len(tokens) < n:
return []
return [tuple(tokens[i : i + n]) for i in range(len(tokens) - n + 1)]
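
# e.g. _ngrams(["a", "b", "c", "d", "e"], 4)
#      -> [("a", "b", "c", "d"), ("b", "c", "d", "e")]
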
# ---------------------------------------------------------------------------
# Top-level rubric
# ---------------------------------------------------------------------------
@dataclass
class RubricResult:
reward: float
raw_task_score: float
length_factor: float
leakage_penalty: float
gain_over_baseline: float
baseline_bonus_component: float
submitted_tokens: int
prompt_budget: int
class PromptGolfRubric:
"""Pure-python rubric for Prompt Golf.
    ADDITIVE formulation (v2):

        reward = success_score
                 - length_cost   (LAMBDA_LEN * tokens, plus a flat penalty
                                  below MIN_TOKENS_FLOOR)
                 - leak_cost     (LAMBDA_LEAK * leakage_overlap ** 2)

    where success_score = raw_task_score - BASELINE_SUBTRACT * baseline.

    Tuning rationale:
      - LAMBDA_LEN = 0.002 → with baseline prompts of ~50 tokens and both
        raw_score and baseline near 0.25, the untrained baseline reward
        sits near 0.0 (0.25 - 0.5 * 0.25 - 0.002 * 50 = 0.025), giving
        smooth gradients in both directions.
      - LAMBDA_LEAK = 1.0 → a fully-leaked prompt (all 4-grams present)
        loses its whole raw_score contribution.
      - BASELINE_SUBTRACT = 0.5 → partially normalize against the target's
        zero-shot ability, so easy-for-target tasks don't saturate reward.

    Old fields kept on RubricResult (length_factor / leakage_penalty) for
    backward-compat logging; they're now derived rather than multiplicative.
    """
LAMBDA_LEN: float = 0.002 # softer than v2.0 (was 0.005)
LAMBDA_LEAK: float = 1.0
BASELINE_SUBTRACT: float = 0.5
MIN_TOKENS_FLOOR: int = 5 # prompts below this get a flat penalty
MIN_TOKENS_PENALTY: float = 0.25 # ← large enough to overcome length_cost savings
# Keep old clip boundaries so downstream plots don't break
REWARD_CLIP_LOW: float = -0.5
REWARD_CLIP_HIGH: float = 1.3
def grade(
self,
*,
raw_task_score: float,
baseline_zero_shot_score: float,
submitted_tokens: int,
prompt_budget: int,
prompt_text: str,
held_out_inputs: List[str],
) -> RubricResult:
overlap = ngram_overlap(prompt_text, held_out_inputs, n=4)
        # Quadratic leak penalty: small accidental overlap is nearly free,
        # while systematic copying is punished hard.
        leak_cost = self.LAMBDA_LEAK * (overlap ** 2)
# Length cost: linear for reasonable-length prompts; hard floor
# below MIN_TOKENS_FLOOR to prevent degenerate policy collapse
# to 1-token outputs on tasks where the target can't be steered.
tokens = max(0, submitted_tokens)
length_cost = self.LAMBDA_LEN * float(tokens)
if tokens < self.MIN_TOKENS_FLOOR:
# Flat penalty shrinks linearly from MIN_TOKENS_PENALTY at 0 tokens
# to 0 at MIN_TOKENS_FLOOR tokens. Guarantees a >1-token prompt
# beats a 1-token prompt at equal raw_score.
short_penalty = self.MIN_TOKENS_PENALTY * (
1.0 - tokens / max(1, self.MIN_TOKENS_FLOOR)
)
length_cost += short_penalty
success = raw_task_score - self.BASELINE_SUBTRACT * baseline_zero_shot_score
gain = raw_task_score - baseline_zero_shot_score
reward = success - length_cost - leak_cost
reward = float(max(self.REWARD_CLIP_LOW, min(self.REWARD_CLIP_HIGH, reward)))
        # Derived legacy fields (for log continuity with v1 metrics jsonl).
        # Use the clamped token count so negative inputs can't inflate the
        # legacy multiplier past its 1.30 ceiling.
        lf_legacy = length_factor(tokens, prompt_budget)
        lp_legacy = 1.0 - overlap * overlap  # 1.0 == clean, 0.0 == leaked
return RubricResult(
reward=reward,
raw_task_score=float(raw_task_score),
length_factor=float(lf_legacy),
leakage_penalty=float(lp_legacy),
gain_over_baseline=float(gain),
baseline_bonus_component=float(length_cost), # repurposed: log length_cost
submitted_tokens=int(submitted_tokens),
prompt_budget=int(prompt_budget),
)
def grade_details_dict(result: RubricResult, task_id: str, passed_threshold: float = 0.5) -> Dict[str, Any]:
"""Shape the rubric result into the metadata dict the observation exposes."""
return {
"task": task_id,
"reward": round(result.reward, 4),
"raw_task_score": round(result.raw_task_score, 4),
"length_factor": round(result.length_factor, 4),
"leakage_penalty": round(result.leakage_penalty, 4),
"gain_over_baseline": round(result.gain_over_baseline, 4),
"baseline_bonus_component": round(result.baseline_bonus_component, 4),
"submitted_tokens": result.submitted_tokens,
"prompt_budget": result.prompt_budget,
"passed": result.reward >= passed_threshold,
}
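
if __name__ == "__main__":
    # Minimal smoke test. All numbers and strings below are hypothetical,
    # not from a real evaluation run.
    rubric = PromptGolfRubric()
    result = rubric.grade(
        raw_task_score=0.6,
        baseline_zero_shot_score=0.25,
        submitted_tokens=30,
        prompt_budget=40,
        prompt_text="Answer with the single word YES or NO.",
        held_out_inputs=["is the sky blue", "is fire cold"],
    )
    # success = 0.6 - 0.5 * 0.25 = 0.475; length_cost = 0.002 * 30 = 0.06;
    # no 4-gram overlap, so leak_cost = 0 and reward = 0.415.
    print(grade_details_dict(result, task_id="demo-task"))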