#!/usr/bin/env python3
"""
Static reward-surface audit for Parlay (read-only, no env rollouts).
Analytical notes derived from parlay_env/grader.py, parlay_env/reward.py, game/scenarios.py.
"""
from __future__ import annotations

import argparse
import sys
from pathlib import Path


def main() -> None:
    parser = argparse.ArgumentParser(
        description="Analytical Parlay reward-hacking and alignment audit (static, no rollouts)"
    )
    parser.add_argument(
        "--repo-root",
        type=Path,
        default=None,
        help="Project root (default: parent of scripts/)",
    )
    args = parser.parse_args()

    root = (args.repo_root or Path(__file__).resolve().parent.parent).resolve()
    for sub in (root / "parlay_env", root / "game"):
        if not sub.is_dir():
            print(f"Expected directory missing: {sub}")
            return
    if str(root) not in sys.path:
        sys.path.insert(0, str(root))

    from parlay_env import grader as grader_mod
    from parlay_env import reward as reward_mod
    from game import scenarios as scenarios_mod

    # Fail fast if the expected grader symbols are missing
    _ = (grader_mod.compute_step_reward, grader_mod.detect_bluff_challenge)

    results: list[tuple[str, str, str]] = []

    print("=" * 72)
    print("1. NOISE TERM (THETA * noise_t)")
    print("-" * 72)
    print(
        "In compute_step_reward, noise_t = 1.0 when cosine_sim(utterance, prior offer text) < 0.3, "
        "else 0.0. The total applies -THETA*noise (penalty on low similarity, not a bonus)."
    )
    print(
        "There is no trivial *positive* side-channel here: noise_t can only add a penalty, "
        "never increase reward. Avoiding the penalty means keeping the utterance overlapping "
        "the token history of prior offers (e.g. echoing offer-like numbers); arbitrary small "
        "talk can score low overlap and draw the penalty."
    )
    print("NOISE TERM: Low hacking risk - the term is a one-sided penalty, never a bonus. OK.")
    results.append(("NOISE TERM (THETA*noise)", "PASS", "Penalty only; no positive exploit"))

    print()
    print("=" * 72)
    print("2. TOM TERM (BETA * ToM)")
    print("-" * 72)
    print(
        "ToM in compute_step_reward scores the latest belief in next_state.belief_history "
        "against next_state.hidden_state. The agent's utterance does not directly author "
        "beliefs; in the runner/server path, beliefs are updated from observed opponent behavior."
    )
    print("TOM TERM: Not hackable by agent. OK.")
    results.append(("ToM (BETA*ToM)", "PASS", "Beliefs from observation path, not direct agent edit"))

    print()
    print("=" * 72)
    print("3. BLUFF BONUS (PSI)")
    print("-" * 72)
    print("detect_bluff_challenge() is structured as: (1) if stated/true are None -> False; (2) compute")
    print("  bluff_threshold = 15% of |true| and require |stated-true| > threshold; (3) only then check")
    print("  skepticism phrases. There is no partial credit for phrases alone if (2) fails.")
    print(
        "In compute_step_reward, bluff_bonus = PSI only when: tactical_move is None, "
        "state.hidden_state.last_stated_batna is not None, AND detect_bluff_challenge(...)=True "
        "(which already requires the >15% gap AND a skepticism phrase)."
    )
    print("All conditions are ANDed; there is no independent partial PSI for skepticism only.")
    print("BLUFF BONUS: Gated correctly. OK.")
    results.append(("BLUFF BONUS (PSI)", "PASS", "All conjuncts required; no partial PSI"))

    print()
    print("=" * 72)
    print("4. MEV (MU * MEV) - drift + adaptation")
    print("-" * 72)
    print("MEV in compute_step_reward uses drift_event or next_state.drift_event; mev_bonus = MU if a drift")
    print("marker is present AND the utterance contains an adaptation subphrase (see grader for tokens).")
    print("The agent does not set drift_event; game/scenarios.py defines trigger_turn per scenario.\n")
    for sid, sc in sorted(scenarios_mod.SCENARIOS.items()):
        if not sc.drift_events:
            print(f"  {sid}: (no drift_events)")
        else:
            turns = [f"turn {e.trigger_turn}: {e.event!r}" for e in sc.drift_events]
            print(f"  {sid}: {', '.join(turns)}")
    print()
    print("MEV TERM: Not hackable. OK.")
    results.append(("MEV (MU*drift adapt)", "PASS", "Drift is scenario-time-gated, not agent-triggered"))

    print()
    print("=" * 72)
    print("5. DELTA CONCESSION - offer_amount = None")
    print("-" * 72)
    print(
        "In compute_step_reward: delta_v only updates when action.offer_amount is not None. "
        "concession_t only runs when state.offer_history and action.offer_amount is not None."
    )
    print(
        "If offer_amount is always None, delta_v=0 and concession_t=0, so the agent forgoes both "
        "alpha*deltaV upside and any delta*concession penalty in those terms."
    )
    print(
        "CONCESSION HACK RISK: An agent can set offer_amount=None every turn to avoid both the "
        "deltaV reward AND the concession penalty. Net effect: it forgoes upside but dodges "
        "downside. Document as a known limitation."
    )
    results.append(
        (
            "Concession (DELTA) / offer=None",
            "WARN",
            "offer_amount=None zeroes both deltaV and concession terms",
        )
    )
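
    # Hedged sketch of the None-offer loophole (the term shapes are simplified
    # and ALPHA/DELTA values hypothetical; only the None gate matters here):
    def _offer_terms_sketch(offer_amount: float | None, prev_offer: float,
                            alpha: float = 1.0, delta: float = 1.0) -> float:
        if offer_amount is None:
            return 0.0  # deltaV and concession vanish together
        delta_v = offer_amount - prev_offer
        concession = max(0.0, prev_offer - offer_amount)
        return alpha * delta_v - delta * concession

    assert _offer_terms_sketch(None, 100.0) == 0.0  # no upside, no downside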

    print()
    print("=" * 72)
    print("6. TERMINAL vs STEP REWARD alignment")
    print("-" * 72)
    print("Step: emphasizes offer improvement (ALPHA), ToM (BETA), penalties and bonuses as shaped in grader.")
    print(
        "Terminal (compute_terminal_reward): deal_efficiency, speed, drift bonus; GAMMA = "
        f"{reward_mod.GAMMA} on efficiency."
    )
    print(
        "Tension: an agent can chase high per-step terms (e.g. anchoring, offer deltas) and still miss "
        "agreement, yielding low terminal efficiency if no deal closes or final price is poor."
    )
    print(
        "This tension is partly by design: the terminal term pressures deal-closing unless "
        "the step weights drown its signal."
    )
    print("STEP vs TERMINAL: WARN - intentional tension; monitor in training, not a pure logic bug.")
    results.append(
        (
            "Step vs terminal alignment",
            "WARN",
            "Dense step and terminal E can pull apart without a deal",
        )
    )
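
    # Hedged numeric illustration (all numbers hypothetical): five turns of
    # dense step reward accumulate even though no deal closes, so the terminal
    # efficiency term contributes nothing.
    _step_total = sum(0.8 for _ in range(5))  # farmed per-step shaping
    _terminal_efficiency = 0.0                # no deal -> no efficiency payout
    assert _step_total > _terminal_efficiency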

    print()
    print("=" * 72)
    print("SUMMARY (6 checks)")
    print("=" * 72)
    for label, level, note in results:
        print(f"  [{level:4s}] {label} - {note}")


if __name__ == "__main__":
    main()