#!/usr/bin/env python3
"""
Static reward-surface audit for Parlay (read-only, no env rollouts).
Analytical notes derived from parlay_env/grader.py, parlay_env/reward.py, game/scenarios.py.
"""
from __future__ import annotations

import argparse
import sys
from pathlib import Path


def main() -> None:
    parser = argparse.ArgumentParser(
        description="Analytical Parlay reward-hacking and alignment audit (static, no rollouts)"
    )
    parser.add_argument(
        "--repo-root",
        type=Path,
        default=None,
        help="Project root (default: parent of scripts/)",
    )
    args = parser.parse_args()

    root = (args.repo_root or Path(__file__).resolve().parent.parent).resolve()
    for sub in (root / "parlay_env", root / "game"):
        if not sub.is_dir():
            print(f"Expected directory missing: {sub}")
            return
    if str(root) not in sys.path:
        sys.path.insert(0, str(root))

    from parlay_env import grader as grader_mod
    from parlay_env import reward as reward_mod
    from game import scenarios as scenarios_mod

    # Ensure grader symbols resolve (import side effects only)
    _ = (grader_mod.compute_step_reward, grader_mod.detect_bluff_challenge)
    results: list[tuple[str, str, str]] = []

    print("=" * 72)
    print("1. NOISE TERM (THETA * noise_t)")
    print("-" * 72)
    print(
        "In compute_step_reward, noise_t = 1.0 when cosine_sim(utterance, prior offer text) < 0.3, "
        "else 0.0. The total applies -THETA*noise (a penalty on low similarity, not a bonus)."
    )
    print(
        "There is no trivial *positive* side-channel in the noise term: it can only subtract "
        "reward, never add it. Avoiding the penalty means keeping the utterance overlapping "
        "the token history of prior offers (e.g. echoing offer-like numbers); arbitrary small "
        "talk can score low overlap and be penalized."
    )
    print("NOISE TERM: Low hacking risk - the term is a unilateral penalty, not a reward. OK.")
    results.append(("NOISE TERM (THETA*noise)", "PASS", "Penalty only; no positive exploit"))
    print()
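    # Hedged sketch of the gate described above (assumption: the 0.3 threshold and
    # penalty-only sign mirror grader.compute_step_reward; `sim` stands in for the
    # grader's cosine similarity and is illustrative only):
    def _noise_penalty_sketch(sim: float, theta: float) -> float:
        noise_t = 1.0 if sim < 0.3 else 0.0  # fires only on low overlap with prior offers
        return -theta * noise_t  # contribution is always <= 0, never a bonus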
print("=" * 72)
print("2. TOM TERM (BETA * ToM)")
print("-" * 72)
print(
"ToM in compute_step_reward uses the latest belief in next_state.belief_history against "
"next_state.hidden_state. The agent's utterance does not directly author beliefs; in the "
"runner/server path, beliefs update from observed opponent behavior."
)
print("TOM TERM: Not hackable by agent. OK.")
results.append(("ToM (BETA*ToM)", "PASS", "Beliefs from observation path, not direct agent edit"))
print()
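    # Hedged sketch of the comparison described above (assumption: belief_history
    # holds scalar estimates scored against a true hidden value; the real scoring
    # lives in grader.compute_step_reward and may differ in form):
    def _tom_score_sketch(belief_history: list[float], true_value: float) -> float:
        if not belief_history:
            return 0.0
        latest = belief_history[-1]  # grader uses the most recent belief
        # Closer belief -> higher score. The agent cannot author belief_history
        # via its utterance, so this term cannot be inflated directly.
        return max(0.0, 1.0 - abs(latest - true_value) / max(abs(true_value), 1e-9))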
print("=" * 72)
print("3. BLUFF BONUS (PSI)")
print("-" * 72)
print("detect_bluff_challenge() is structured as: (1) if stated/true are None -> False; (2) compute")
print(" bluff_threshold = 15% of |true| and require |stated-true| > threshold; (3) only then check")
print(" skepticism phrases. There is no partial credit for phrases alone if (2) fails.")
print(
"In compute_step_reward, bluff_bonus = PSI only when: tactical_move is None, "
"state.hidden_state.last_stated_batna is not None, AND detect_bluff_challenge(...)=True "
"(which already requires the >15% gap AND a skepticism phrase)."
)
print("All conditions are ANDed; there is no independent partial PSI for skepticism only.")
print("BLUFF BONUS: Gated correctly. OK.")
results.append(("BLUFF BONUS (PSI)", "PASS", "All conjuncts required; no partial PSI"))
print()
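    # Hedged sketch of the AND-gate described above (assumption: the 15% gap rule
    # mirrors detect_bluff_challenge; the phrase list here is hypothetical - the
    # real skepticism tokens live in grader.py):
    def _bluff_gate_sketch(stated: float | None, true: float | None, utterance: str) -> bool:
        if stated is None or true is None:
            return False  # gate (1)
        if abs(stated - true) <= 0.15 * abs(true):
            return False  # gate (2): no partial credit for phrases alone
        skepticism = ("i doubt", "prove it", "that seems off")  # hypothetical tokens
        return any(p in utterance.lower() for p in skepticism)  # gate (3)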
print("=" * 72)
print("4. MEV (MU * MEV) - drift + adaptation")
print("-" * 72)
print("MEV in compute_step_reward uses drift_event or next_state.drift_event; mev_bonus = MU if a drift")
print("marker is present AND the utterance contains an adaptation subphrase (see grader for tokens).")
print("The agent does not set drift_event; game/scenarios.py defines trigger_turn per scenario.\n")
for sid, sc in sorted(scenarios_mod.SCENARIOS.items()):
if not sc.drift_events:
print(f" {sid}: (no drift_events)")
else:
turns = [f"turn {e.trigger_turn}: {e.event!r}" for e in sc.drift_events]
print(f" {sid}: {', '.join(turns)}")
print()
print("MEV TERM: Not hackable. OK.")
results.append(("MEV (MU*drift adapt)", "PASS", "Drift is scenario-time-gated, not agent-triggered"))
print()
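    # Hedged sketch of the gating described above (assumption: the MU gating mirrors
    # compute_step_reward; adapt_tokens is hypothetical - the real subphrases live
    # in grader.py, and drift_event is set by the scenario clock, not the agent):
    def _mev_bonus_sketch(drift_event: object | None, utterance: str, mu: float) -> float:
        if drift_event is None:
            return 0.0  # no drift marker -> no bonus, whatever the utterance says
        adapt_tokens = ("adjust", "in light of", "given the change")  # hypothetical
        return mu if any(t in utterance.lower() for t in adapt_tokens) else 0.0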
print("=" * 72)
print("5. DELTA CONCESSION - offer_amount = None")
print("-" * 72)
print(
"In compute_step_reward: delta_v only updates when action.offer_amount is not None. "
"concession_t only runs when state.offer_history and action.offer_amount is not None."
)
print(
"If offer_amount is always None, delta_v=0 and concession_t=0, so the agent forgoes both "
"alpha*deltaV upside and any delta*concession penalty in those terms."
)
print(
"CONCESSION HACK RISK: Agent can set offer_amount=None every turn to avoid both deltaV reward "
"AND concession penalty. Net effect: misses upside but avoids downside. "
"Document as known limitation."
)
results.append(
(
"Concession (DELTA) / offer=None",
"WARN",
"offer_amount=None zeroes both deltaV and concession terms",
)
)
print()
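    # Hedged sketch of the degenerate policy flagged above (assumption: the
    # alpha/delta gating mirrors compute_step_reward; names are illustrative):
    def _offer_terms_sketch(offer_amount: float | None, delta_v: float,
                            concession_t: float, alpha: float, delta: float) -> float:
        if offer_amount is None:
            return 0.0  # zeroes both terms: no deltaV upside, no concession penalty
        return alpha * delta_v - delta * concession_t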
print("=" * 72)
print("6. TERMINAL vs STEP REWARD alignment")
print("-" * 72)
print("Step: emphasizes offer improvement (ALPHA), ToM (BETA), penalties and bonuses as shaped in grader.")
print(
"Terminal (compute_terminal_reward): deal_efficiency, speed, drift bonus; GAMMA = "
f"{reward_mod.GAMMA} on efficiency."
)
print(
"Tension: an agent can chase high per-step terms (e.g. anchoring, offer deltas) and still miss "
"agreement, yielding low terminal efficiency if no deal closes or final price is poor."
)
print(
"This is a mis-alignment by design: it pressures closing unless step weights drown the signal - "
"monitor in training, not a pure bug."
)
print("STEP vs TERMINAL: WARN - intentional tension; monitor in training, not a pure logic bug.")
results.append(
(
"Step vs terminal alignment",
"WARN",
"Dense step and terminal E can pull apart without a deal",
)
)
print()
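    # Hedged sketch of the tension described above (assumption: terminal reward
    # scales deal_efficiency by GAMMA per reward.py; the speed and drift bonuses
    # are omitted here for brevity):
    def _terminal_sketch(deal_closed: bool, deal_efficiency: float, gamma: float) -> float:
        # A no-deal episode yields zero efficiency regardless of how much dense
        # step reward the agent accumulated along the way.
        return gamma * deal_efficiency if deal_closed else 0.0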
print("=" * 72)
print("SUMMARY (6 checks)")
print("=" * 72)
for label, level, note in results:
print(f" [{level:4s}] {label} - {note}")
if __name__ == "__main__":
main()