#!/usr/bin/env python3
"""
Static reward-surface audit for Parlay (read-only, no env rollouts).
Analytical notes derived from parlay_env/grader.py, parlay_env/reward.py, game/scenarios.py.
"""
from __future__ import annotations

import argparse
import sys
from pathlib import Path


def main() -> None:
    parser = argparse.ArgumentParser(
        description="Analytical Parlay reward-hacking and alignment audit (static, no rollouts)"
    )
    parser.add_argument(
        "--repo-root",
        type=Path,
        default=None,
        help="Project root (default: parent of scripts/)",
    )
    args = parser.parse_args()

    root = (args.repo_root or Path(__file__).resolve().parent.parent).resolve()
    for sub in (root / "parlay_env", root / "game"):
        if not sub.is_dir():
            print(f"Expected directory missing: {sub}")
            return
    if str(root) not in sys.path:
        sys.path.insert(0, str(root))

    from parlay_env import grader as grader_mod
    from parlay_env import reward as reward_mod
    from game import scenarios as scenarios_mod

    # Ensure grader symbols resolve (import side effects only)
    _ = (grader_mod.compute_step_reward, grader_mod.detect_bluff_challenge)
    results: list[tuple[str, str, str]] = []

    print("=" * 72)
    print("1. NOISE TERM (THETA * noise_t)")
    print("-" * 72)
    print(
        "In compute_step_reward, noise_t = 1.0 when cosine_sim(utterance, prior offer text) < 0.3, "
        "else 0.0. The total applies -THETA*noise (a penalty on low similarity, not a bonus)."
    )
    print(
        "There is no trivial *positive* side-channel in the noise term: it can only subtract "
        "reward, never add it. Avoiding the penalty means keeping the utterance overlapping "
        "the token history of prior offers (e.g. echoing offer-like numbers); arbitrary small "
        "talk can score low overlap and be penalized."
    )
    print("NOISE TERM: Low hacking risk - the term is a unilateral penalty, not a reward. OK.")
    results.append(("NOISE TERM (THETA*noise)", "PASS", "Penalty only; no positive exploit"))
    print()
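    # Hedged sketch of the gate described above (assumption: the 0.3 threshold and
    # penalty-only sign mirror grader.compute_step_reward; `sim` stands in for the
    # grader's cosine similarity and is illustrative only):
    def _noise_penalty_sketch(sim: float, theta: float) -> float:
        noise_t = 1.0 if sim < 0.3 else 0.0  # fires only on low overlap with prior offers
        return -theta * noise_t  # contribution is always <= 0, never a bonus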
print("=" * 72)
print("2. TOM TERM (BETA * ToM)")
print("-" * 72)
print(
"ToM in compute_step_reward uses the latest belief in next_state.belief_history against "
"next_state.hidden_state. The agent's utterance does not directly author beliefs; in the "
"runner/server path, beliefs update from observed opponent behavior."
)
print("TOM TERM: Not hackable by agent. OK.")
results.append(("ToM (BETA*ToM)", "PASS", "Beliefs from observation path, not direct agent edit"))
print()
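    # Hedged sketch of the comparison described above (assumption: belief_history
    # holds scalar estimates scored against a true hidden value; the real scoring
    # lives in grader.compute_step_reward and may differ in form):
    def _tom_score_sketch(belief_history: list[float], true_value: float) -> float:
        if not belief_history:
            return 0.0
        latest = belief_history[-1]  # grader uses the most recent belief
        # Closer belief -> higher score. The agent cannot author belief_history
        # via its utterance, so this term cannot be inflated directly.
        return max(0.0, 1.0 - abs(latest - true_value) / max(abs(true_value), 1e-9))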
print("=" * 72)
print("3. BLUFF BONUS (PSI)")
print("-" * 72)
print("detect_bluff_challenge() is structured as: (1) if stated/true are None -> False; (2) compute")
print(" bluff_threshold = 15% of |true| and require |stated-true| > threshold; (3) only then check")
print(" skepticism phrases. There is no partial credit for phrases alone if (2) fails.")
print(
"In compute_step_reward, bluff_bonus = PSI only when: tactical_move is None, "
"state.hidden_state.last_stated_batna is not None, AND detect_bluff_challenge(...)=True "
"(which already requires the >15% gap AND a skepticism phrase)."
)
print("All conditions are ANDed; there is no independent partial PSI for skepticism only.")
print("BLUFF BONUS: Gated correctly. OK.")
results.append(("BLUFF BONUS (PSI)", "PASS", "All conjuncts required; no partial PSI"))
print()
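    # Hedged sketch of the AND-gate described above (assumption: the 15% gap rule
    # mirrors detect_bluff_challenge; the phrase list here is hypothetical - the
    # real skepticism tokens live in grader.py):
    def _bluff_gate_sketch(stated: float | None, true: float | None, utterance: str) -> bool:
        if stated is None or true is None:
            return False  # gate (1)
        if abs(stated - true) <= 0.15 * abs(true):
            return False  # gate (2): no partial credit for phrases alone
        skepticism = ("i doubt", "prove it", "that seems off")  # hypothetical tokens
        return any(p in utterance.lower() for p in skepticism)  # gate (3)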
print("=" * 72)
print("4. MEV (MU * MEV) - drift + adaptation")
print("-" * 72)
print("MEV in compute_step_reward uses drift_event or next_state.drift_event; mev_bonus = MU if a drift")
print("marker is present AND the utterance contains an adaptation subphrase (see grader for tokens).")
print("The agent does not set drift_event; game/scenarios.py defines trigger_turn per scenario.\n")
for sid, sc in sorted(scenarios_mod.SCENARIOS.items()):
if not sc.drift_events:
print(f" {sid}: (no drift_events)")
else:
turns = [f"turn {e.trigger_turn}: {e.event!r}" for e in sc.drift_events]
print(f" {sid}: {', '.join(turns)}")
print()
print("MEV TERM: Not hackable. OK.")
results.append(("MEV (MU*drift adapt)", "PASS", "Drift is scenario-time-gated, not agent-triggered"))
print()
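    # Hedged sketch of the gating described above (assumption: the MU gating mirrors
    # compute_step_reward; adapt_tokens is hypothetical - the real subphrases live
    # in grader.py, and drift_event is set by the scenario clock, not the agent):
    def _mev_bonus_sketch(drift_event: object | None, utterance: str, mu: float) -> float:
        if drift_event is None:
            return 0.0  # no drift marker -> no bonus, whatever the utterance says
        adapt_tokens = ("adjust", "in light of", "given the change")  # hypothetical
        return mu if any(t in utterance.lower() for t in adapt_tokens) else 0.0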
print("=" * 72)
print("5. DELTA CONCESSION - offer_amount = None")
print("-" * 72)
print(
"In compute_step_reward: delta_v only updates when action.offer_amount is not None. "
"concession_t only runs when state.offer_history and action.offer_amount is not None."
)
print(
"If offer_amount is always None, delta_v=0 and concession_t=0, so the agent forgoes both "
"alpha*deltaV upside and any delta*concession penalty in those terms."
)
print(
"CONCESSION HACK RISK: Agent can set offer_amount=None every turn to avoid both deltaV reward "
"AND concession penalty. Net effect: misses upside but avoids downside. "
"Document as known limitation."
)
results.append(
(
"Concession (DELTA) / offer=None",
"WARN",
"offer_amount=None zeroes both deltaV and concession terms",
)
)
print()
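    # Hedged sketch of the degenerate policy flagged above (assumption: the
    # alpha/delta gating mirrors compute_step_reward; names are illustrative):
    def _offer_terms_sketch(offer_amount: float | None, delta_v: float,
                            concession_t: float, alpha: float, delta: float) -> float:
        if offer_amount is None:
            return 0.0  # zeroes both terms: no deltaV upside, no concession penalty
        return alpha * delta_v - delta * concession_t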
print("=" * 72)
print("6. TERMINAL vs STEP REWARD alignment")
print("-" * 72)
print("Step: emphasizes offer improvement (ALPHA), ToM (BETA), penalties and bonuses as shaped in grader.")
print(
"Terminal (compute_terminal_reward): deal_efficiency, speed, drift bonus; GAMMA = "
f"{reward_mod.GAMMA} on efficiency."
)
print(
"Tension: an agent can chase high per-step terms (e.g. anchoring, offer deltas) and still miss "
"agreement, yielding low terminal efficiency if no deal closes or final price is poor."
)
print(
"This is a mis-alignment by design: it pressures closing unless step weights drown the signal - "
"monitor in training, not a pure bug."
)
print("STEP vs TERMINAL: WARN - intentional tension; monitor in training, not a pure logic bug.")
results.append(
(
"Step vs terminal alignment",
"WARN",
"Dense step and terminal E can pull apart without a deal",
)
)
print()
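    # Hedged sketch of the tension described above (assumption: terminal reward
    # scales deal_efficiency by GAMMA per reward.py; the speed and drift bonuses
    # are omitted here for brevity):
    def _terminal_sketch(deal_closed: bool, deal_efficiency: float, gamma: float) -> float:
        # A no-deal episode yields zero efficiency regardless of how much dense
        # step reward the agent accumulated along the way.
        return gamma * deal_efficiency if deal_closed else 0.0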
print("=" * 72)
print("SUMMARY (6 checks)")
print("=" * 72)
for label, level, note in results:
print(f" [{level:4s}] {label} - {note}")
if __name__ == "__main__":
main()