# Parlay OpenEnv Environment Manifest
# openenv-core v0.2.3 compatible
# Identity and listing metadata for the environment.
env_id: parlay-negotiation-v1
name: "Parlay ◈ — Negotiation MDP"
# Folded scalar (`>`): the line breaks below are joined with single spaces when
# parsed, and one trailing newline is kept.
description: >
  A negotiation MDP with hidden information, Theory-of-Mind belief tracking,
  dynamic ZOPA erosion, tactical moves, and drift events.
  Three scenarios x three personas = nine training combinations.
  The agent must infer opponent constraints from behavior, adapt to
  exogenous shocks, and close deals without breaching its own BATNA floor.
version: "1.0.0"
author: "Shashvat Singh"
contact: "shashvat.k.singh.16@gmail.com"
theme: "multi-agent-interactions"
license: "MIT"

# URLs — judges pull the env from space_url
# sft_model, grpo_model and dataset are the full-URL forms of the repo ids
# listed under training.sft_checkpoint, training.grpo_checkpoint and
# training.dataset; keep the two places in sync when renaming a repo.
space_url: "https://huggingface.co/spaces/sh4shv4t/Parlay"
repository: "https://github.com/sh4shv4t/Parlay"
blog: "https://github.com/sh4shv4t/Parlay/blob/main/BLOG.md"
dataset: "https://huggingface.co/datasets/sh4shv4t/parlay-episodes"
sft_model: "https://huggingface.co/sh4shv4t/parlay-sft-1-5b"
grpo_model: "https://huggingface.co/sh4shv4t/parlay-grpo-1-5b"

# OpenEnv client integration
# `class` and `action_class` are dotted paths inside the `parlay_env` package
# named by `package`.
client:
  package: "parlay_env"
  class: "parlay_env.client.ParlayEnvClient"
  action_class: "parlay_env.client.ParlayAction"
  # Installs directly from the git repo behind space_url (same URL).
  install: "pip install git+https://huggingface.co/spaces/sh4shv4t/Parlay"

# WebSocket API (OpenEnv protocol)
# Each entry under `messages` shows an example request (`send`) and a
# free-text description of the reply (`returns`).
api:
  protocol: websocket
  endpoint: "/env/ws"
  # full_url ends with the same path as `endpoint`; change both together.
  full_url: "wss://sh4shv4t-parlay.hf.space/env/ws"
  messages:
    reset:
      # `a|b|c` inside a value lists the alternatives, not a literal string:
      # the scenario_id options match the ids under `scenarios` and the
      # persona options match the ids under `personas`.
      send: '{"cmd": "reset", "scenario_id": "saas_enterprise|hiring_package|acquisition_term_sheet", "persona": "shark|diplomat|veteran"}'
      returns: ParlayObservation JSON
    step:
      # `<uuid>` and `<ParlayAction JSON>` are placeholders.
      # NOTE(review): presumably session_id comes from the reset reply and
      # the action follows `action_space` — confirm against the server.
      send: '{"cmd": "step", "session_id": "<uuid>", "action": "<ParlayAction JSON>"}'
      returns: ParlayObservation JSON with reward and done
    state:
      # NOTE(review): this reply includes the hidden state; presumably meant
      # for evaluation/debugging rather than for the acting agent — confirm
      # that training rollouts do not call it.
      send: '{"cmd": "state", "session_id": "<uuid>"}'
      returns: ParlayState JSON including hidden state

# Action space
# One dict per agent turn. Only `utterance` declares `required: true`; the
# two boolean flags declare `default: false`; the nullable fields declare
# neither a default nor a `required` flag.
action_space:
  type: dict
  fields:
    - name: utterance
      type: string
      required: true
      description: "Natural language negotiation text"
    - name: offer_amount
      type: "float|null"
      description: "Numeric offer in scenario currency"
    - name: tactical_move
      type: "string|null"
      values: [anchor_high, batna_reveal, silence]
      description: "Tactical card — costs CP points"
    - name: accept_deal
      type: bool
      default: false
    - name: walk_away
      type: bool
      default: false

# Observation space
# Fields returned to the agent after reset/step.
# NOTE(review): zopa_lower, zopa_upper and nash_point are exposed here while
# `hidden_state` lists the opponent's walk_away_price and budget_ceiling as
# hidden — confirm these bounds cannot be used to back out the hidden values.
observation_space:
  type: dict
  fields:
    - name: offers
      type: "list[float]"
      description: "History of offers made this episode"
    - name: zopa_lower
      type: float
      description: "Current lower bound of Zone of Possible Agreement"
    - name: zopa_upper
      type: float
      description: "Current upper bound of ZOPA"
    - name: nash_point
      type: float
      description: "Nash bargaining solution midpoint"
    - name: tension_score
      type: float
      range: [0, 100]
      description: "Current negotiation tension; >75 triggers ZOPA erosion"
    - name: belief_state
      type: dict
      description: "Agent beliefs: est_budget, est_walk_away, est_urgency, est_has_alternative, confidence"
    - name: last_utterance
      type: string
    - name: available_moves
      type: "list[string]"
    - name: cp
      type: int
      description: "Tactical card points remaining"
    - name: drift_event
      type: "string|null"
      description: "Exogenous shock description if triggered this turn"
    - name: zopa_width_pct_remaining
      type: float
      range: [0.0, 1.0]
    - name: reward
      type: float
      description: "Per-step reward"
    - name: done
      type: bool

# Reward design
# Each entry under `terms` pairs a label taken from the formula with its
# coefficient and a short description.
# NOTE(review): the penalty labels ("-delta*C", "-theta*noise", "-omega")
# already carry a minus sign AND have a negative coeff; presumably `coeff` is
# the signed weight applied to the raw term — confirm against the reward code.
# NOTE(review): it is not stated whether `range` bounds a single step reward
# or the whole-episode return — confirm.
reward:
  range: [-200, 320]
  per_step:
    formula: "R_t = alpha*dV + beta*ToM - delta*C - theta*noise + psi*bluff + mu*MEV"
    terms:
      - name: "alpha*dV"
        coeff: 2
        description: "ZOPA progress — reward upward offer movement"
      - name: "beta*ToM"
        coeff: 5
        description: "Theory-of-Mind accuracy vs opponent hidden state"
      - name: "-delta*C"
        coeff: -3
        description: "Penalise unnecessary concessions"
      - name: "-theta*noise"
        coeff: -10
        description: "Penalise incoherent utterances"
      - name: "psi*bluff"
        coeff: 12
        description: "Bonus for detecting opponent bluffs"
      - name: "mu*MEV"
        coeff: 8
        description: "Market event valuation — drift adaptation bonus"
  terminal:
    formula: "R_T = gamma*E + epsilon*S + zeta*D  or  -omega on capitulation"
    terms:
      - name: "gamma*E"
        coeff: 100
        description: "Deal efficiency: (final_price - BATNA) / ZOPA_width"
      - name: "epsilon*S"
        coeff: 20
        description: "Speed bonus: closes early vs max turns"
      - name: "zeta*D"
        coeff: 15
        description: "Drift adaptation bonus"
      - name: "-omega"
        coeff: -200
        description: "Capitulation cliff: final price below own BATNA"

# Episode
# Length cap and the conditions that end an episode.
episode:
  max_steps: 20
  # Free-text descriptions, one per way an episode can end.
  # NOTE(review): the cap is named `max_steps` above but "max_turns" below;
  # presumably one step equals one turn — confirm and align the wording.
  # NOTE(review): the value of the very_negative_step_reward threshold is
  # not given anywhere in this manifest.
  termination_conditions:
    - accept_deal action
    - walk_away action
    - max_turns reached
    - zopa_collapsed (BATNAs cross after erosion)
    - very_negative_step_reward threshold

# Training
# Framework, notebooks, and the three-stage pipeline, listed in the order
# data generation -> SFT -> GRPO.
training:
  framework: "HuggingFace TRL (GRPO)"
  notebook: "training/notebooks/parlay_training.ipynb"
  openenv_rollout_notebook: "training/notebooks/openenv_rollout_training.ipynb"
  pipeline:
    # NOTE(review): this command shows no output path; presumably it writes
    # the data/episodes_v2.jsonl file that the next two stages read — confirm.
    - step: "Gemini self-play data generation"
      script: "python -m training.generate_data --episodes 140"
    - step: "SFT cold start"
      script: "python -m training.sft_train --data data/episodes_v2.jsonl"
    # Reads the same JSONL file as the SFT stage.
    - step: "GRPO fine-tuning"
      script: "python -m training.grpo_train --data data/episodes_v2.jsonl"
  base_model: "Qwen/Qwen2.5-1.5B-Instruct"
  # Repo ids below are the short forms of the sft_model, grpo_model and
  # dataset URLs at the top of this file; keep them in sync.
  sft_checkpoint: "sh4shv4t/parlay-sft-1-5b"
  grpo_checkpoint: "sh4shv4t/parlay-grpo-1-5b"
  dataset: "sh4shv4t/parlay-episodes"

# Scenarios
# The ids are the values accepted for `scenario_id` in the reset message.
# Every drift_turns value listed is below episode.max_steps (20).
# NOTE(review): whether drift_turns are 0- or 1-indexed is not stated — confirm.
scenarios:
  - id: saas_enterprise
    description: "B2B SaaS license, $125k-$165k ACV"
    drift_turns: [8, 14]
  - id: hiring_package
    description: "Compensation negotiation, $195k-$230k total comp"
    drift_turns: [8]
  - id: acquisition_term_sheet
    description: "Startup acquisition, $10.5M-$16M valuation"
    drift_turns: [7, 13]

# Personas (opponent agents)
# The ids are the values accepted for `persona` in the reset message; three
# personas combined with the three scenarios give the nine training
# combinations mentioned in the top-level description.
personas:
  - id: shark
    description: "Aggressive anchoring, bluffs, artificial deadlines"
  - id: diplomat
    description: "Win-win framing, reveals constraints after trust builds"
  - id: veteran
    description: "Strategic silence, mirrors language, k=2 ToM reasoning"

# Hidden information (partial observability)
# Opponent-side values that are not part of `observation_space`; the `state`
# API message is described as returning them.
# NOTE(review): these appear to be the targets of the belief_state estimates
# (est_budget, est_walk_away, est_urgency, est_has_alternative) — confirm.
hidden_state:
  - name: budget_ceiling
    description: "Opponent true max budget"
  - name: walk_away_price
    description: "Opponent true BATNA"
  - name: urgency_score
    description: "How time-pressured opponent is [0,1]"
  - name: has_alternative
    description: "Whether opponent has competing offer"