---
# Parlay OpenEnv Environment Manifest
# openenv-core v0.2.3 compatible
#
# Describes the Parlay negotiation environment: identity/metadata, hosted
# artifact URLs, the client package, the WebSocket message protocol, the
# action/observation schemas, reward terms, episode limits, the training
# pipeline, and the scenario/persona catalogue.

env_id: parlay-negotiation-v1
name: "Parlay ◈ — Negotiation MDP"
description: >
  A negotiation MDP with hidden information, Theory-of-Mind belief tracking,
  dynamic ZOPA erosion, tactical moves, and drift events. Three scenarios x
  three personas = nine training combinations. The agent must infer opponent
  constraints from behavior, adapt to exogenous shocks, and close deals
  without breaching its own BATNA floor.
version: "1.0.0"
author: "Shashvat Singh"
contact: "shashvat.k.singh.16@gmail.com"
theme: "multi-agent-interactions"
license: "MIT"

# URLs — judges pull the env from space_url
space_url: "https://huggingface.co/spaces/sh4shv4t/Parlay"
repository: "https://github.com/sh4shv4t/Parlay"
blog: "https://github.com/sh4shv4t/Parlay/blob/main/BLOG.md"
dataset: "https://huggingface.co/datasets/sh4shv4t/parlay-episodes"
sft_model: "https://huggingface.co/sh4shv4t/parlay-sft-1-5b"
grpo_model: "https://huggingface.co/sh4shv4t/parlay-grpo-1-5b"

# OpenEnv client integration
client:
  package: "parlay_env"
  class: "parlay_env.client.ParlayEnvClient"
  action_class: "parlay_env.client.ParlayAction"
  install: "pip install git+https://huggingface.co/spaces/sh4shv4t/Parlay"

# WebSocket API (OpenEnv protocol)
# Each message entry gives an example JSON payload to send and a short
# description of what comes back.
api:
  protocol: websocket
  endpoint: "/env/ws"
  full_url: "wss://sh4shv4t-parlay.hf.space/env/ws"
  messages:
    reset:
      # Folded scalar: the three lines below join with single spaces into one
      # JSON string. The `a|b|c` values enumerate the accepted alternatives.
      send: >-
        {"cmd": "reset",
        "scenario_id": "saas_enterprise|hiring_package|acquisition_term_sheet",
        "persona": "shark|diplomat|veteran"}
      returns: ParlayObservation JSON
    step:
      # NOTE(review): session_id and action are empty strings here —
      # presumably placeholders for a real session id and an action payload;
      # confirm the intended example.
      send: '{"cmd": "step", "session_id": "", "action": ""}'
      returns: ParlayObservation JSON with reward and done
    state:
      # NOTE(review): session_id is an empty string here — presumably a
      # placeholder; confirm the intended example.
      send: '{"cmd": "state", "session_id": ""}'
      returns: ParlayState JSON including hidden state

# Action space
action_space:
  type: dict
  fields:
    - name: utterance
      type: string
      required: true
      description: "Natural language negotiation text"
    - name: offer_amount
      type: "float|null"
      description: "Numeric offer in scenario currency"
    - name: tactical_move
      type: "string|null"
      values: [anchor_high, batna_reveal, silence]
      description: "Tactical card — costs CP points"
    - name: accept_deal
      type: bool
      default: false
    - name: walk_away
      type: bool
      default: false

# Observation space
observation_space:
  type: dict
  fields:
    - name: offers
      type: "list[float]"
      description: "History of offers made this episode"
    - name: zopa_lower
      type: float
      description: "Current lower bound of Zone of Possible Agreement"
    - name: zopa_upper
      type: float
      description: "Current upper bound of ZOPA"
    - name: nash_point
      type: float
      description: "Nash bargaining solution midpoint"
    - name: tension_score
      type: float
      range: [0, 100]
      description: "Current negotiation tension; >75 triggers ZOPA erosion"
    - name: belief_state
      type: dict
      description: "Agent beliefs: est_budget, est_walk_away, est_urgency, est_has_alternative, confidence"
    - name: last_utterance
      type: string
    - name: available_moves
      type: "list[string]"
    - name: cp
      type: int
      description: "Tactical card points remaining"
    - name: drift_event
      type: "string|null"
      description: "Exogenous shock description if triggered this turn"
    - name: zopa_width_pct_remaining
      type: float
      range: [0.0, 1.0]
    - name: reward
      type: float
      description: "Per-step reward"
    - name: done
      type: bool

# Reward design
# `coeff` values are listed with their sign, so penalty terms carry a
# negative coefficient matching the minus sign in the formula.
reward:
  range: [-200, 320]
  per_step:
    formula: "R_t = alpha*dV + beta*ToM - delta*C - theta*noise + psi*bluff + mu*MEV"
    terms:
      - name: "alpha*dV"
        coeff: 2
        description: "ZOPA progress — reward upward offer movement"
      - name: "beta*ToM"
        coeff: 5
        description: "Theory-of-Mind accuracy vs opponent hidden state"
      - name: "-delta*C"
        coeff: -3
        description: "Penalise unnecessary concessions"
      - name: "-theta*noise"
        coeff: -10
        description: "Penalise incoherent utterances"
      - name: "psi*bluff"
        coeff: 12
        description: "Bonus for detecting opponent bluffs"
      - name: "mu*MEV"
        coeff: 8
        description: "Market event valuation — drift adaptation bonus"
  terminal:
    formula: "R_T = gamma*E + epsilon*S + zeta*D or -omega on capitulation"
    terms:
      - name: "gamma*E"
        coeff: 100
        description: "Deal efficiency: (final_price - BATNA) / ZOPA_width"
      - name: "epsilon*S"
        coeff: 20
        description: "Speed bonus: closes early vs max turns"
      - name: "zeta*D"
        coeff: 15
        description: "Drift adaptation bonus"
      - name: "-omega"
        coeff: -200
        description: "Capitulation cliff: final price below own BATNA"

# Episode
episode:
  max_steps: 20
  termination_conditions:
    - accept_deal action
    - walk_away action
    - max_turns reached
    - zopa_collapsed (BATNAs cross after erosion)
    - very_negative_step_reward threshold

# Training
training:
  framework: "HuggingFace TRL (GRPO)"
  notebook: "training/notebooks/parlay_training.ipynb"
  openenv_rollout_notebook: "training/notebooks/openenv_rollout_training.ipynb"
  # Pipeline steps are listed in the order they appear in the original
  # manifest: data generation, then SFT, then GRPO.
  pipeline:
    - step: "Gemini self-play data generation"
      script: "python -m training.generate_data --episodes 140"
    - step: "SFT cold start"
      script: "python -m training.sft_train --data data/episodes_v2.jsonl"
    - step: "GRPO fine-tuning"
      script: "python -m training.grpo_train --data data/episodes_v2.jsonl"
  base_model: "Qwen/Qwen2.5-1.5B-Instruct"
  sft_checkpoint: "sh4shv4t/parlay-sft-1-5b"
  grpo_checkpoint: "sh4shv4t/parlay-grpo-1-5b"
  # Nested under `training` on purpose: a top-level `dataset` key (the URL)
  # already exists, and duplicate keys are invalid YAML.
  dataset: "sh4shv4t/parlay-episodes"

# Scenarios
scenarios:
  - id: saas_enterprise
    description: "B2B SaaS license, $125k-$165k ACV"
    drift_turns: [8, 14]
  - id: hiring_package
    description: "Compensation negotiation, $195k-$230k total comp"
    drift_turns: [8]
  - id: acquisition_term_sheet
    description: "Startup acquisition, $10.5M-$16M valuation"
    drift_turns: [7, 13]

# Personas (opponent agents)
personas:
  - id: shark
    description: "Aggressive anchoring, bluffs, artificial deadlines"
  - id: diplomat
    description: "Win-win framing, reveals constraints after trust builds"
  - id: veteran
    description: "Strategic silence, mirrors language, k=2 ToM reasoning"

# Hidden information (partial observability)
hidden_state:
  - name: budget_ceiling
    description: "Opponent true max budget"
  - name: walk_away_price
    description: "Opponent true BATNA"
  - name: urgency_score
    description: "How time-pressured opponent is [0,1]"
  - name: has_alternative
    description: "Whether opponent has competing offer"