# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
| """ | |
| Reward rubric for Prompt Golf. | |
| Episodes are single-step: the agent's one action (a prompt) is scored, the | |
| episode terminates, and the reward is a composition of four components: | |
| 1. raw_task_score — target LLM's accuracy on held-out test inputs | |
| when prompted with the submitted prompt, in [0, 1]. | |
| 2. length_factor — 1.0 while the prompt is within budget; decays | |
| exponentially as it exceeds the budget. | |
| 3. leakage_penalty — 1.0 when the prompt contains no held-out test-input | |
| n-grams; scales toward 0 when the agent tries to | |
| paste answers into its prompt. | |
| 4. baseline_bonus — extra credit (weight 0.3) for beating the | |
| target's zero-shot score on this task with any | |
| meaningful prompt. | |
| Final reward: | |
| base = raw_task_score * length_factor * leakage_penalty | |
| bonus = max(0, raw_task_score - baseline_zero_shot_score) * length_factor | |
| reward = clip(base + 0.3 * bonus, 0.0, 1.3) | |
| We return a dict with all four components so that training code can log | |
| them separately and compose rubrics if desired. | |
| """ | |
from __future__ import annotations

import math
import re
from dataclasses import dataclass
from typing import Any, Dict, List

# ---------------------------------------------------------------------------
# Component calculators
# ---------------------------------------------------------------------------


def length_factor(tokens: int, budget: int, decay_k: int = 20) -> float:
    """Length multiplier that rewards short prompts AND penalizes overshoot.

    - tokens == 0      -> 1.30 (max short-prompt bonus)
    - tokens == budget -> 1.00 (neutral)
    - tokens > budget  -> exp(-(tokens - budget) / decay_k) (decays fast)

    The >1.0 region inside budget is what makes "shorter is better" a real
    gradient signal; otherwise truncation alone removes the incentive to
    compress once you fit.
    """
    if budget <= 0:
        budget = 1
    if tokens <= budget:
        # Linear from 1.30 at 0 tokens -> 1.00 at budget.
        return 1.0 + 0.30 * (1.0 - tokens / budget)
    over = tokens - budget
    return float(math.exp(-over / max(1, decay_k)))
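

# Illustrative values (hypothetical budget of 40 tokens, default decay_k=20):
#   length_factor(0, 40)  -> 1.30            (max short-prompt bonus)
#   length_factor(40, 40) -> 1.00            (neutral at budget)
#   length_factor(60, 40) -> exp(-1) ~ 0.37  (20 tokens over budget)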


def ngram_overlap(prompt: str, held_out_inputs: List[str], n: int = 4) -> float:
    """Fraction of n-grams (default 4-grams) in held-out inputs that appear
    in the prompt.

    Returns 0.0 when the prompt carries no leakage, up to 1.0 when every
    n-gram from every held-out input is present in the prompt. This is
    what the leakage_penalty multiplier is built from.
    """
    prompt_norm = _normalize_for_ngrams(prompt)
    prompt_grams = set(_ngrams(prompt_norm.split(), n))
    if not prompt_grams:
        return 0.0
    total = 0
    hits = 0
    for x in held_out_inputs:
        x_norm = _normalize_for_ngrams(x)
        for gram in _ngrams(x_norm.split(), n):
            total += 1
            if gram in prompt_grams:
                hits += 1
    if total == 0:
        return 0.0
    return hits / total
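
# Worked example (illustrative strings, not from any task set): with
# held_out_inputs = ["the cat sat on the mat"] the held-out 4-grams are
# (the,cat,sat,on), (cat,sat,on,the), (sat,on,the,mat). A prompt that quotes
# "the cat sat on the mat" contains all three, so ngram_overlap == 1.0,
# while an instruction-only prompt shares none of them and scores 0.0.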


def leakage_penalty(prompt: str, held_out_inputs: List[str]) -> float:
    """Convert n-gram overlap to a multiplier in [0, 1].

    1.0 == no overlap; 0.0 == perfect leak. Scales quadratically so small
    accidental overlaps aren't harshly punished but systematic copying is.
    """
    overlap = ngram_overlap(prompt, held_out_inputs, n=4)
    penalty = max(0.0, 1.0 - overlap * overlap)  # 0 leak => 1, full leak => 0
    return penalty
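
# The quadratic mapping in numbers: overlap 0.1 -> penalty 0.99,
# overlap 0.5 -> 0.75, overlap 1.0 -> 0.0, so incidental phrase reuse
# costs almost nothing while wholesale copying forfeits the multiplier.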


def _normalize_for_ngrams(s: str) -> str:
    # Lowercase, strip punctuation, and collapse whitespace so n-gram
    # matching is robust to trivial formatting differences.
    s = s.lower()
    s = re.sub(r"[^a-z0-9\s]", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s


def _ngrams(tokens: List[str], n: int) -> List[tuple]:
    if len(tokens) < n:
        return []
    return [tuple(tokens[i : i + n]) for i in range(len(tokens) - n + 1)]
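
# Helper behavior at a glance (illustrative):
#   _normalize_for_ngrams("The cat, sat!") -> "the cat sat"
#   _ngrams(["a", "b", "c"], 2)            -> [("a", "b"), ("b", "c")]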


# ---------------------------------------------------------------------------
# Top-level rubric
# ---------------------------------------------------------------------------


@dataclass
class RubricResult:
    reward: float
    raw_task_score: float
    length_factor: float
    leakage_penalty: float
    gain_over_baseline: float
    baseline_bonus_component: float
    submitted_tokens: int
    prompt_budget: int


class PromptGolfRubric:
    """Pure-python rubric for Prompt Golf.

    ADDITIVE formulation (v2):

        reward = success_score
                 - LAMBDA_LEN * tokens
                 - LAMBDA_LEAK * leakage_overlap**2

    where success_score = raw_task_score - BASELINE_SUBTRACT * baseline.

    Tuning rationale:
    - LAMBDA_LEN = 0.002 -> with baseline tokens ~50 and raw_score ~0.25,
      the untrained baseline reward sits near 0.0
      (0.25 - 0.25*0.5 - 0.002*50 = 0.025), giving smooth gradients in
      both directions.
    - LAMBDA_LEAK = 1.0 -> a fully-leaked prompt (all 4-grams present)
      loses the whole raw_score contribution.
    - BASELINE_SUBTRACT = 0.5 -> partially normalize against the target's
      zero-shot ability, so easy-for-target tasks don't saturate reward.

    Old fields kept on RubricResult (length_factor / leakage_penalty) for
    backward-compat logging; they're now derived rather than multiplicative.
    """

    LAMBDA_LEN: float = 0.002  # softer than v2.0 (was 0.005)
    LAMBDA_LEAK: float = 1.0
    BASELINE_SUBTRACT: float = 0.5
    MIN_TOKENS_FLOOR: int = 5  # prompts below this get a flat penalty
    MIN_TOKENS_PENALTY: float = 0.25  # large enough to overcome length_cost savings

    # Keep old clip boundaries so downstream plots don't break.
    REWARD_CLIP_LOW: float = -0.5
    REWARD_CLIP_HIGH: float = 1.3
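
    # Worked example with made-up numbers: raw_task_score = 0.6,
    # baseline = 0.3, 30 tokens, no leakage:
    #   success     = 0.6 - 0.5 * 0.3   = 0.45
    #   length_cost = 0.002 * 30        = 0.06
    #   reward      = 0.45 - 0.06 - 0.0 = 0.39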

    def grade(
        self,
        *,
        raw_task_score: float,
        baseline_zero_shot_score: float,
        submitted_tokens: int,
        prompt_budget: int,
        prompt_text: str,
        held_out_inputs: List[str],
    ) -> RubricResult:
        overlap = ngram_overlap(prompt_text, held_out_inputs, n=4)
        # Quadratic leak penalty: small accidental overlap is nearly free,
        # systematic copying is punished hard.
        leak_cost = self.LAMBDA_LEAK * (overlap ** 2)

        # Length cost: linear for reasonable-length prompts; hard floor
        # below MIN_TOKENS_FLOOR to prevent degenerate policy collapse
        # to 1-token outputs on tasks where the target can't be steered.
        tokens = max(0, submitted_tokens)
        length_cost = self.LAMBDA_LEN * float(tokens)
        if tokens < self.MIN_TOKENS_FLOOR:
            # Flat penalty shrinks linearly from MIN_TOKENS_PENALTY at 0 tokens
            # to 0 at MIN_TOKENS_FLOOR tokens. Guarantees a >1-token prompt
            # beats a 1-token prompt at equal raw_score.
            short_penalty = self.MIN_TOKENS_PENALTY * (
                1.0 - tokens / max(1, self.MIN_TOKENS_FLOOR)
            )
            length_cost += short_penalty

        success = raw_task_score - self.BASELINE_SUBTRACT * baseline_zero_shot_score
        gain = raw_task_score - baseline_zero_shot_score
        reward = success - length_cost - leak_cost
        reward = float(max(self.REWARD_CLIP_LOW, min(self.REWARD_CLIP_HIGH, reward)))

        # Derived legacy fields (for log continuity with the v1 metrics jsonl).
        lf_legacy = length_factor(submitted_tokens, prompt_budget)
        lp_legacy = 1.0 - overlap * overlap  # 1.0 == clean, 0.0 == leaked
        return RubricResult(
            reward=reward,
            raw_task_score=float(raw_task_score),
            length_factor=float(lf_legacy),
            leakage_penalty=float(lp_legacy),
            gain_over_baseline=float(gain),
            baseline_bonus_component=float(length_cost),  # repurposed: logs length_cost
            submitted_tokens=int(submitted_tokens),
            prompt_budget=int(prompt_budget),
        )


def grade_details_dict(
    result: RubricResult, task_id: str, passed_threshold: float = 0.5
) -> Dict[str, Any]:
    """Shape the rubric result into the metadata dict the observation exposes."""
    return {
        "task": task_id,
        "reward": round(result.reward, 4),
        "raw_task_score": round(result.raw_task_score, 4),
        "length_factor": round(result.length_factor, 4),
        "leakage_penalty": round(result.leakage_penalty, 4),
        "gain_over_baseline": round(result.gain_over_baseline, 4),
        "baseline_bonus_component": round(result.baseline_bonus_component, 4),
        "submitted_tokens": result.submitted_tokens,
        "prompt_budget": result.prompt_budget,
        "passed": result.reward >= passed_threshold,
    }
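

# Minimal smoke test with made-up values (illustrative only; not part of the
# training pipeline). The prompt, held-out inputs, and task id are hypothetical.
if __name__ == "__main__":
    rubric = PromptGolfRubric()
    result = rubric.grade(
        raw_task_score=0.6,
        baseline_zero_shot_score=0.3,
        submitted_tokens=30,
        prompt_budget=40,
        prompt_text="Reply with only the antonym of the given word.",
        held_out_inputs=["hot", "tall", "early"],
    )
    # Expected per the worked example above: reward = 0.45 - 0.06 - 0.0 = 0.39.
    print(grade_details_dict(result, task_id="antonyms-demo"))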