| |
| """ |
| Static reward-surface audit for Parlay (read-only, no env rollouts). |
| Analytical notes derived from parlay_env/grader.py, parlay_env/reward.py, game/scenarios.py. |
| """ |
| from __future__ import annotations |
|
|
| import argparse |
| import sys |
| from pathlib import Path |
|
|
|
|
_RULE_WIDTH = 72


def _print_banner(title: str, *, leading_blank: bool = True) -> None:
    """Print a section heading: '=' rule, *title*, then '-' rule."""
    if leading_blank:
        print()
    print("=" * _RULE_WIDTH)
    print(title)
    print("-" * _RULE_WIDTH)


def main() -> None:
    """Print the static reward-surface audit and a one-line-per-check summary.

    Parses ``--repo-root`` (defaulting to the grandparent directory of this
    file), checks that ``parlay_env/`` and ``game/`` exist under it, puts the
    root on ``sys.path``, imports the project modules, and prints six
    analytical sections followed by a summary.

    The verdict text is hand-written; the only values read from the imported
    modules are the per-scenario drift schedule and ``reward.GAMMA``.

    Raises:
        SystemExit: with a message (exit status 1) when an expected project
            directory is missing, so callers and CI do not mistake an audit
            that never ran for a successful one.
    """
    parser = argparse.ArgumentParser(
        description="Analytical Parlay reward-hacking and alignment audit (static, no rollouts)"
    )
    parser.add_argument(
        "--repo-root",
        type=Path,
        default=None,
        help="Project root (default: parent of scripts/)",
    )
    args = parser.parse_args()

    root = (args.repo_root or Path(__file__).resolve().parent.parent).resolve()
    missing = [sub for sub in (root / "parlay_env", root / "game") if not sub.is_dir()]
    if missing:
        # sys.exit(str) writes the message to stderr and exits with status 1.
        sys.exit("\n".join(f"Expected directory missing: {sub}" for sub in missing))
    if str(root) not in sys.path:
        sys.path.insert(0, str(root))

    # Imported here, not at module top, because they only resolve once the
    # repo root has been placed on sys.path above.
    from parlay_env import grader as grader_mod
    from parlay_env import reward as reward_mod
    from game import scenarios as scenarios_mod

    # Attribute access fails fast with AttributeError if the grader no longer
    # exposes the functions the notes below describe.
    _ = (grader_mod.compute_step_reward, grader_mod.detect_bluff_challenge)

    results: list[tuple[str, str, str]] = []

    _print_banner("1. NOISE TERM (THETA * noise_t)", leading_blank=False)
    print(
        "In compute_step_reward, noise_t = 1.0 when cosine_sim(utterance, prior offer text) < 0.3, "
        "else 0.0. The total applies -THETA*noise (penalty on low similarity, not a bonus)."
    )
    print(
        "Trivial *positive* side-channel from the noise term does not exist: noise can only add "
        "a penalty, never increase reward. Avoiding the penalty means keeping utterance "
        "overlapping the token history of prior offers (e.g. echoing offer-like numbers), not "
        "necessarily any arbitrary small talk (which can score low overlap and be penalized)."
    )
    print("NOISE TERM: Low hacking risk - the term is a unilateral penalty, not a reward. OK.")
    results.append(("NOISE TERM (THETA*noise)", "PASS", "Penalty only; no positive exploit"))

    _print_banner("2. TOM TERM (BETA * ToM)")
    print(
        "ToM in compute_step_reward uses the latest belief in next_state.belief_history against "
        "next_state.hidden_state. The agent's utterance does not directly author beliefs; in the "
        "runner/server path, beliefs update from observed opponent behavior."
    )
    print("TOM TERM: Not hackable by agent. OK.")
    results.append(("ToM (BETA*ToM)", "PASS", "Beliefs from observation path, not direct agent edit"))

    _print_banner("3. BLUFF BONUS (PSI)")
    print("detect_bluff_challenge() is structured as: (1) if stated/true are None -> False; (2) compute")
    print(" bluff_threshold = 15% of |true| and require |stated-true| > threshold; (3) only then check")
    print(" skepticism phrases. There is no partial credit for phrases alone if (2) fails.")
    print(
        "In compute_step_reward, bluff_bonus = PSI only when: tactical_move is None, "
        "state.hidden_state.last_stated_batna is not None, AND detect_bluff_challenge(...)=True "
        "(which already requires the >15% gap AND a skepticism phrase)."
    )
    print("All conditions are ANDed; there is no independent partial PSI for skepticism only.")
    print("BLUFF BONUS: Gated correctly. OK.")
    results.append(("BLUFF BONUS (PSI)", "PASS", "All conjuncts required; no partial PSI"))

    _print_banner("4. MEV (MU * MEV) - drift + adaptation")
    print("MEV in compute_step_reward uses drift_event or next_state.drift_event; mev_bonus = MU if a drift")
    print("marker is present AND the utterance contains an adaptation subphrase (see grader for tokens).")
    print("The agent does not set drift_event; game/scenarios.py defines trigger_turn per scenario.\n")
    # List each scenario's drift schedule, sorted by scenario id for stable output.
    for sid, sc in sorted(scenarios_mod.SCENARIOS.items()):
        if sc.drift_events:
            schedule = ", ".join(
                f"turn {e.trigger_turn}: {e.event!r}" for e in sc.drift_events
            )
            print(f" {sid}: {schedule}")
        else:
            print(f" {sid}: (no drift_events)")
    print()
    print("MEV TERM: Not hackable. OK.")
    results.append(("MEV (MU*drift adapt)", "PASS", "Drift is scenario-time-gated, not agent-triggered"))

    _print_banner("5. DELTA CONCESSION - offer_amount = None")
    print(
        "In compute_step_reward: delta_v only updates when action.offer_amount is not None. "
        "concession_t only runs when state.offer_history and action.offer_amount is not None."
    )
    print(
        "If offer_amount is always None, delta_v=0 and concession_t=0, so the agent forgoes both "
        "alpha*deltaV upside and any delta*concession penalty in those terms."
    )
    print(
        "CONCESSION HACK RISK: Agent can set offer_amount=None every turn to avoid both deltaV reward "
        "AND concession penalty. Net effect: misses upside but avoids downside. "
        "Document as known limitation."
    )
    results.append(
        (
            "Concession (DELTA) / offer=None",
            "WARN",
            "offer_amount=None zeroes both deltaV and concession terms",
        )
    )

    _print_banner("6. TERMINAL vs STEP REWARD alignment")
    print("Step: emphasizes offer improvement (ALPHA), ToM (BETA), penalties and bonuses as shaped in grader.")
    print(
        "Terminal (compute_terminal_reward): deal_efficiency, speed, drift bonus; GAMMA = "
        f"{reward_mod.GAMMA} on efficiency."
    )
    print(
        "Tension: an agent can chase high per-step terms (e.g. anchoring, offer deltas) and still miss "
        "agreement, yielding low terminal efficiency if no deal closes or final price is poor."
    )
    print(
        "This is a mis-alignment by design: it pressures closing unless step weights drown the signal - "
        "monitor in training, not a pure bug."
    )
    print("STEP vs TERMINAL: WARN - intentional tension; monitor in training, not a pure logic bug.")
    results.append(
        (
            "Step vs terminal alignment",
            "WARN",
            "Dense step and terminal E can pull apart without a deal",
        )
    )

    # The summary header is framed by '=' rules on both sides, unlike the
    # numbered sections, so it is printed directly rather than via the helper.
    print()
    print("=" * _RULE_WIDTH)
    # Count comes from the collected results so the header cannot drift out of
    # sync when a check is added or removed.
    print(f"SUMMARY ({len(results)} checks)")
    print("=" * _RULE_WIDTH)
    for label, level, note in results:
        print(f" [{level:4s}] {label} - {note}")
|
|
|
|
# Run the audit only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|