#!/usr/bin/env python3
"""
Static reward-surface audit for Parlay (read-only, no env rollouts).
Analytical notes derived from parlay_env/grader.py, parlay_env/reward.py, game/scenarios.py.
"""
from __future__ import annotations
import argparse
import sys
from pathlib import Path
def _load_modules(root: Path) -> tuple[object, object]:
    """Import the audited project modules from *root* and return (reward, scenarios).

    Extends ``sys.path`` with *root* if needed so ``parlay_env`` and ``game``
    resolve.  The grader module is imported purely for its side effects and to
    confirm the audited symbols exist.
    """
    if str(root) not in sys.path:
        sys.path.insert(0, str(root))
    from parlay_env import grader as grader_mod
    from parlay_env import reward as reward_mod
    from game import scenarios as scenarios_mod
    # Ensure grader symbols resolve (import side effects only)
    _ = (grader_mod.compute_step_reward, grader_mod.detect_bluff_challenge)
    return reward_mod, scenarios_mod


def _check_noise(results: list[tuple[str, str, str]]) -> None:
    """Check 1: the THETA*noise term is penalty-only and offers no positive exploit."""
    print("=" * 72)
    print("1. NOISE TERM (THETA * noise_t)")
    print("-" * 72)
    print(
        "In compute_step_reward, noise_t = 1.0 when cosine_sim(utterance, prior offer text) < 0.3, "
        "else 0.0. The total applies -THETA*noise (penalty on low similarity, not a bonus)."
    )
    print(
        "Trivial *positive* side-channel from the noise term does not exist: noise can only add "
        "a penalty, never increase reward. Avoiding the penalty means keeping utterance "
        "overlapping the token history of prior offers (e.g. echoing offer-like numbers), not "
        "necessarily any arbitrary small talk (which can score low overlap and be penalized)."
    )
    print("NOISE TERM: Low hacking risk - the term is a unilateral penalty, not a reward. OK.")
    results.append(("NOISE TERM (THETA*noise)", "PASS", "Penalty only; no positive exploit"))
    print()


def _check_tom(results: list[tuple[str, str, str]]) -> None:
    """Check 2: the BETA*ToM term derives from observed beliefs, not agent-authored ones."""
    print("=" * 72)
    print("2. TOM TERM (BETA * ToM)")
    print("-" * 72)
    print(
        "ToM in compute_step_reward uses the latest belief in next_state.belief_history against "
        "next_state.hidden_state. The agent's utterance does not directly author beliefs; in the "
        "runner/server path, beliefs update from observed opponent behavior."
    )
    print("TOM TERM: Not hackable by agent. OK.")
    results.append(("ToM (BETA*ToM)", "PASS", "Beliefs from observation path, not direct agent edit"))
    print()


def _check_bluff(results: list[tuple[str, str, str]]) -> None:
    """Check 3: the PSI bluff bonus requires every conjunct; no partial credit exists."""
    print("=" * 72)
    print("3. BLUFF BONUS (PSI)")
    print("-" * 72)
    print("detect_bluff_challenge() is structured as: (1) if stated/true are None -> False; (2) compute")
    print(" bluff_threshold = 15% of |true| and require |stated-true| > threshold; (3) only then check")
    print(" skepticism phrases. There is no partial credit for phrases alone if (2) fails.")
    print(
        "In compute_step_reward, bluff_bonus = PSI only when: tactical_move is None, "
        "state.hidden_state.last_stated_batna is not None, AND detect_bluff_challenge(...)=True "
        "(which already requires the >15% gap AND a skepticism phrase)."
    )
    print("All conditions are ANDed; there is no independent partial PSI for skepticism only.")
    print("BLUFF BONUS: Gated correctly. OK.")
    results.append(("BLUFF BONUS (PSI)", "PASS", "All conjuncts required; no partial PSI"))
    print()


def _check_mev(results: list[tuple[str, str, str]], scenarios_mod) -> None:
    """Check 4: the MU*MEV term is scenario-time-gated; prints each scenario's drift schedule."""
    print("=" * 72)
    print("4. MEV (MU * MEV) - drift + adaptation")
    print("-" * 72)
    print("MEV in compute_step_reward uses drift_event or next_state.drift_event; mev_bonus = MU if a drift")
    print("marker is present AND the utterance contains an adaptation subphrase (see grader for tokens).")
    print("The agent does not set drift_event; game/scenarios.py defines trigger_turn per scenario.\n")
    for sid, sc in sorted(scenarios_mod.SCENARIOS.items()):
        if not sc.drift_events:
            print(f" {sid}: (no drift_events)")
        else:
            turns = [f"turn {e.trigger_turn}: {e.event!r}" for e in sc.drift_events]
            print(f" {sid}: {', '.join(turns)}")
    print()
    print("MEV TERM: Not hackable. OK.")
    results.append(("MEV (MU*drift adapt)", "PASS", "Drift is scenario-time-gated, not agent-triggered"))
    print()


def _check_concession(results: list[tuple[str, str, str]]) -> None:
    """Check 5 (WARN): offer_amount=None zeroes both the deltaV and concession terms."""
    print("=" * 72)
    print("5. DELTA CONCESSION - offer_amount = None")
    print("-" * 72)
    print(
        "In compute_step_reward: delta_v only updates when action.offer_amount is not None. "
        "concession_t only runs when state.offer_history and action.offer_amount is not None."
    )
    print(
        "If offer_amount is always None, delta_v=0 and concession_t=0, so the agent forgoes both "
        "alpha*deltaV upside and any delta*concession penalty in those terms."
    )
    print(
        "CONCESSION HACK RISK: Agent can set offer_amount=None every turn to avoid both deltaV reward "
        "AND concession penalty. Net effect: misses upside but avoids downside. "
        "Document as known limitation."
    )
    results.append(
        (
            "Concession (DELTA) / offer=None",
            "WARN",
            "offer_amount=None zeroes both deltaV and concession terms",
        )
    )
    print()


def _check_terminal_alignment(results: list[tuple[str, str, str]], reward_mod) -> None:
    """Check 6 (WARN): dense step reward and terminal efficiency can pull apart."""
    print("=" * 72)
    print("6. TERMINAL vs STEP REWARD alignment")
    print("-" * 72)
    print("Step: emphasizes offer improvement (ALPHA), ToM (BETA), penalties and bonuses as shaped in grader.")
    print(
        "Terminal (compute_terminal_reward): deal_efficiency, speed, drift bonus; GAMMA = "
        f"{reward_mod.GAMMA} on efficiency."
    )
    print(
        "Tension: an agent can chase high per-step terms (e.g. anchoring, offer deltas) and still miss "
        "agreement, yielding low terminal efficiency if no deal closes or final price is poor."
    )
    print(
        "This is a mis-alignment by design: it pressures closing unless step weights drown the signal - "
        "monitor in training, not a pure bug."
    )
    print("STEP vs TERMINAL: WARN - intentional tension; monitor in training, not a pure logic bug.")
    results.append(
        (
            "Step vs terminal alignment",
            "WARN",
            "Dense step and terminal E can pull apart without a deal",
        )
    )
    print()


def main() -> None:
    """Run the six static reward-surface checks against a Parlay checkout.

    Parses ``--repo-root`` (default: parent of the directory containing this
    script), validates that the expected package directories exist, imports
    the audited modules, runs each analytical check, and prints a summary
    table of PASS/WARN verdicts.

    Raises:
        SystemExit: with status 1 when an expected directory is missing, so
            CI and shell callers see a nonzero exit instead of a silent
            success (the previous behavior returned normally, exiting 0).
    """
    parser = argparse.ArgumentParser(
        description="Analytical Parlay reward-hacking and alignment audit (static, no rollouts)"
    )
    parser.add_argument(
        "--repo-root",
        type=Path,
        default=None,
        help="Project root (default: parent of scripts/)",
    )
    args = parser.parse_args()
    root = (args.repo_root or Path(__file__).resolve().parent.parent).resolve()
    for sub in (root / "parlay_env", root / "game"):
        if not sub.is_dir():
            print(f"Expected directory missing: {sub}")
            # Bug fix: exit nonzero so a failed audit cannot look like success.
            raise SystemExit(1)

    reward_mod, scenarios_mod = _load_modules(root)

    results: list[tuple[str, str, str]] = []
    _check_noise(results)
    _check_tom(results)
    _check_bluff(results)
    _check_mev(results, scenarios_mod)
    _check_concession(results)
    _check_terminal_alignment(results, reward_mod)

    print("=" * 72)
    print("SUMMARY (6 checks)")
    print("=" * 72)
    for label, level, note in results:
        print(f" [{level:4s}] {label} - {note}")


if __name__ == "__main__":
    main()