---
# Environment identity and descriptive metadata.
env_id: parlay-negotiation-v1
name: "Parlay ◈ — Negotiation MDP"
# Folded scalar (`>`): the lines below are joined with single spaces into one
# paragraph, and one trailing newline is kept.
description: >
  A negotiation MDP with hidden information, Theory-of-Mind belief tracking,
  dynamic ZOPA erosion, tactical moves, and drift events.
  Three scenarios x three personas = nine training combinations.
  The agent must infer opponent constraints from behavior, adapt to
  exogenous shocks, and close deals without breaching its own BATNA floor.
# Kept quoted so the version is always read as a string.
version: "1.0.0"
author: "Shashvat Singh"
contact: "shashvat.k.singh.16@gmail.com"
theme: "multi-agent-interactions"
license: "MIT"
# External links: hosted Space, source repository, write-up, dataset, and the
# two published models (SFT and GRPO).
space_url: "https://huggingface.co/spaces/sh4shv4t/Parlay"
repository: "https://github.com/sh4shv4t/Parlay"
blog: "https://github.com/sh4shv4t/Parlay/blob/main/BLOG.md"
dataset: "https://huggingface.co/datasets/sh4shv4t/parlay-episodes"
sft_model: "https://huggingface.co/sh4shv4t/parlay-sft-1-5b"
grpo_model: "https://huggingface.co/sh4shv4t/parlay-grpo-1-5b"
# Client library: package name, dotted paths of the client and action classes,
# and the command used to install the package.
client:
  package: "parlay_env"
  class: "parlay_env.client.ParlayEnvClient"
  action_class: "parlay_env.client.ParlayAction"
  install: "pip install git+https://huggingface.co/spaces/sh4shv4t/Parlay"
# Wire protocol: a single WebSocket endpoint that accepts three JSON commands.
api:
  protocol: websocket
  # `endpoint` is the path only; `full_url` is the complete wss:// address.
  endpoint: "/env/ws"
  full_url: "wss://sh4shv4t-parlay.hf.space/env/ws"
  messages:
    # Each entry shows the JSON payload to send and what comes back.
    # `send` values are single-quoted so the embedded double quotes stay
    # literal.
    reset:
      # The `a|b|c` notation lists alternatives; the listed ids match the
      # entries under `scenarios` and `personas` in this file.
      send: '{"cmd": "reset", "scenario_id": "saas_enterprise|hiring_package|acquisition_term_sheet", "persona": "shark|diplomat|veteran"}'
      returns: ParlayObservation JSON
    step:
      send: '{"cmd": "step", "session_id": "<uuid>", "action": "<ParlayAction JSON>"}'
      returns: ParlayObservation JSON with reward and done
    state:
      send: '{"cmd": "state", "session_id": "<uuid>"}'
      returns: ParlayState JSON including hidden state
# Action schema: the fields of one agent action.
action_space:
  type: dict
  fields:
    # Only `utterance` is marked required.
    - name: utterance
      type: string
      required: true
      description: "Natural language negotiation text"
    - name: offer_amount
      type: "float|null"
      description: "Numeric offer in scenario currency"
    # `values` enumerates the tactical moves declared here.
    - name: tactical_move
      type: "string|null"
      values: [anchor_high, batna_reveal, silence]
      description: "Tactical card — costs CP points"
    # The two flags below default to false.
    - name: accept_deal
      type: bool
      default: false
    - name: walk_away
      type: bool
      default: false
# Observation schema: the fields returned to the agent.
observation_space:
  type: dict
  fields:
    - name: offers
      type: "list[float]"
      description: "History of offers made this episode"
    - name: zopa_lower
      type: float
      description: "Current lower bound of Zone of Possible Agreement"
    - name: zopa_upper
      type: float
      description: "Current upper bound of ZOPA"
    - name: nash_point
      type: float
      description: "Nash bargaining solution midpoint"
    - name: tension_score
      type: float
      range: [0, 100]
      description: "Current negotiation tension; >75 triggers ZOPA erosion"
    - name: belief_state
      type: dict
      description: "Agent beliefs: est_budget, est_walk_away, est_urgency, est_has_alternative, confidence"
    - name: last_utterance
      type: string
    - name: available_moves
      type: "list[string]"
    - name: cp
      type: int
      description: "Tactical card points remaining"
    - name: drift_event
      type: "string|null"
      description: "Exogenous shock description if triggered this turn"
    # Despite "pct" in the name, the declared range is the fraction 0.0-1.0.
    - name: zopa_width_pct_remaining
      type: float
      range: [0.0, 1.0]
    - name: reward
      type: float
      description: "Per-step reward"
    - name: done
      type: bool
# Reward specification: a per-step shaping reward plus a terminal reward.
reward:
  # NOTE(review): these bounds cannot be derived from the coefficients listed
  # below alone — confirm against the implementation.
  range: [-200, 320]
  per_step:
    formula: "R_t = alpha*dV + beta*ToM - delta*C - theta*noise + psi*bluff + mu*MEV"
    # NOTE(review): the terms named with a leading '-' also carry a negative
    # `coeff`. Confirm that `coeff` is the signed weight of the whole term and
    # that the sign is not applied twice.
    terms:
      - name: "alpha*dV"
        coeff: 2
        description: "ZOPA progress — reward upward offer movement"
      - name: "beta*ToM"
        coeff: 5
        description: "Theory-of-Mind accuracy vs opponent hidden state"
      - name: "-delta*C"
        coeff: -3
        description: "Penalise unnecessary concessions"
      - name: "-theta*noise"
        coeff: -10
        description: "Penalise incoherent utterances"
      - name: "psi*bluff"
        coeff: 12
        description: "Bonus for detecting opponent bluffs"
      - name: "mu*MEV"
        coeff: 8
        description: "Market event valuation — drift adaptation bonus"
  terminal:
    formula: "R_T = gamma*E + epsilon*S + zeta*D or -omega on capitulation"
    terms:
      - name: "gamma*E"
        coeff: 100
        description: "Deal efficiency: (final_price - BATNA) / ZOPA_width"
      - name: "epsilon*S"
        coeff: 20
        description: "Speed bonus: closes early vs max turns"
      - name: "zeta*D"
        coeff: 15
        description: "Drift adaptation bonus"
      # Same sign-convention caveat as the negative per-step terms.
      - name: "-omega"
        coeff: -200
        description: "Capitulation cliff: final price below own BATNA"
# Episode limits and the conditions that end an episode.
episode:
  max_steps: 20
  termination_conditions:
    - accept_deal action
    - walk_away action
    # NOTE(review): presumably "max_turns" refers to `max_steps` above — confirm.
    - max_turns reached
    - zopa_collapsed (BATNAs cross after erosion)
    # NOTE(review): the threshold value is not stated in this file.
    - very_negative_step_reward threshold
# Training setup: framework, notebooks, the ordered pipeline, and model/data ids.
training:
  framework: "HuggingFace TRL (GRPO)"
  notebook: "training/notebooks/parlay_training.ipynb"
  openenv_rollout_notebook: "training/notebooks/openenv_rollout_training.ipynb"
  # Pipeline stages in listed order; each pairs a label with its command.
  pipeline:
    - step: "Gemini self-play data generation"
      script: "python -m training.generate_data --episodes 140"
    - step: "SFT cold start"
      script: "python -m training.sft_train --data data/episodes_v2.jsonl"
    - step: "GRPO fine-tuning"
      script: "python -m training.grpo_train --data data/episodes_v2.jsonl"
  base_model: "Qwen/Qwen2.5-1.5B-Instruct"
  sft_checkpoint: "sh4shv4t/parlay-sft-1-5b"
  grpo_checkpoint: "sh4shv4t/parlay-grpo-1-5b"
  # Nested under `training`; distinct from the top-level `dataset` URL.
  dataset: "sh4shv4t/parlay-episodes"
# Scenarios: each has an id, a description, and the turns listed for drift.
# The ids match the `scenario_id` alternatives in the `reset` message.
scenarios:
  - id: saas_enterprise
    description: "B2B SaaS license, $125k-$165k ACV"
    drift_turns: [8, 14]
  - id: hiring_package
    description: "Compensation negotiation, $195k-$230k total comp"
    drift_turns: [8]
  - id: acquisition_term_sheet
    description: "Startup acquisition, $10.5M-$16M valuation"
    drift_turns: [7, 13]
# Opponent personas; the ids match the `persona` alternatives in the `reset`
# message.
personas:
  - id: shark
    description: "Aggressive anchoring, bluffs, artificial deadlines"
  - id: diplomat
    description: "Win-win framing, reveals constraints after trust builds"
  - id: veteran
    description: "Strategic silence, mirrors language, k=2 ToM reasoning"
# Hidden opponent state. The `state` API message is documented as returning
# ParlayState "including hidden state".
hidden_state:
  - name: budget_ceiling
    description: "Opponent true max budget"
  - name: walk_away_price
    description: "Opponent true BATNA"
  - name: urgency_score
    description: "How time-pressured opponent is [0,1]"
  - name: has_alternative
    description: "Whether opponent has competing offer"