# Parlay / openenv.yaml
# Commit 00a2188 (sh4shv4t): Relocate training notebooks, add BLOG and
# Google Colab links (SFT + GRPO HF Job), dashboard updates, and eval artifacts
# Parlay OpenEnv Environment Manifest
# openenv-core v0.2.3 compatible
---
# Top-level identity and provenance of the environment.
env_id: parlay-negotiation-v1
name: "Parlay ◈ — Negotiation MDP"
# Folded scalar (>-): newlines become spaces, trailing newline stripped.
description: >-
  A negotiation MDP with hidden information, Theory-of-Mind belief tracking,
  dynamic ZOPA erosion, tactical moves, and drift events.
  Three scenarios x three personas = nine training combinations.
  The agent must infer opponent constraints from behavior, adapt to
  exogenous shocks, and close deals without breaching its own BATNA floor.
version: "1.0.0"  # quoted — keeps it a string, not a float
author: "Shashvat Singh"
contact: "shashvat.k.singh.16@gmail.com"
theme: "multi-agent-interactions"
license: "MIT"
# URLs — judges pull the env from space_url
space_url: "https://huggingface.co/spaces/sh4shv4t/Parlay"  # hosted environment (primary entry point)
repository: "https://github.com/sh4shv4t/Parlay"  # source code
blog: "https://github.com/sh4shv4t/Parlay/blob/main/BLOG.md"  # write-up
dataset: "https://huggingface.co/datasets/sh4shv4t/parlay-episodes"  # self-play episodes
sft_model: "https://huggingface.co/sh4shv4t/parlay-sft-1-5b"  # SFT checkpoint
grpo_model: "https://huggingface.co/sh4shv4t/parlay-grpo-1-5b"  # GRPO checkpoint
# OpenEnv client integration — import paths for the Python client package.
client:
  package: "parlay_env"
  class: "parlay_env.client.ParlayEnvClient"
  action_class: "parlay_env.client.ParlayAction"
  install: "pip install git+https://huggingface.co/spaces/sh4shv4t/Parlay"
# WebSocket API (OpenEnv protocol)
# Each message entry documents the JSON payload to send and what comes back.
api:
  protocol: websocket
  endpoint: "/env/ws"
  full_url: "wss://sh4shv4t-parlay.hf.space/env/ws"
  messages:
    reset:
      # scenario_id / persona take one of the listed pipe-separated options
      send: '{"cmd": "reset", "scenario_id": "saas_enterprise|hiring_package|acquisition_term_sheet", "persona": "shark|diplomat|veteran"}'
      returns: ParlayObservation JSON
    step:
      send: '{"cmd": "step", "session_id": "<uuid>", "action": "<ParlayAction JSON>"}'
      returns: ParlayObservation JSON with reward and done
    state:
      send: '{"cmd": "state", "session_id": "<uuid>"}'
      returns: ParlayState JSON including hidden state
# Action space — one action dict per agent turn.
action_space:
  type: dict
  fields:
    - name: utterance
      type: string
      required: true
      description: "Natural language negotiation text"
    - name: offer_amount
      type: "float|null"
      description: "Numeric offer in scenario currency"
    - name: tactical_move
      type: "string|null"
      values: [anchor_high, batna_reveal, silence]
      description: "Tactical card — costs CP points"
    - name: accept_deal
      type: bool
      default: false
    - name: walk_away
      type: bool
      default: false
# Observation space — what the agent sees each turn.
observation_space:
  type: dict
  fields:
    - name: offers
      type: "list[float]"
      description: "History of offers made this episode"
    - name: zopa_lower
      type: float
      description: "Current lower bound of Zone of Possible Agreement"
    - name: zopa_upper
      type: float
      description: "Current upper bound of ZOPA"
    - name: nash_point
      type: float
      description: "Nash bargaining solution midpoint"
    - name: tension_score
      type: float
      range: [0, 100]
      description: "Current negotiation tension; >75 triggers ZOPA erosion"
    - name: belief_state
      type: dict
      description: "Agent beliefs: est_budget, est_walk_away, est_urgency, est_has_alternative, confidence"
    - name: last_utterance
      type: string
    - name: available_moves
      type: "list[string]"
    - name: cp
      type: int
      description: "Tactical card points remaining"
    - name: drift_event
      type: "string|null"
      description: "Exogenous shock description if triggered this turn"
    - name: zopa_width_pct_remaining
      type: float
      range: [0.0, 1.0]
    - name: reward
      type: float
      description: "Per-step reward"
    - name: done
      type: bool
# Reward design — per-step shaping terms plus terminal payout/penalty.
reward:
  range: [-200, 320]
  per_step:
    formula: "R_t = alpha*dV + beta*ToM - delta*C - theta*noise + psi*bluff + mu*MEV"
    terms:
      - name: "alpha*dV"
        coeff: 2
        description: "ZOPA progress — reward upward offer movement"
      - name: "beta*ToM"
        coeff: 5
        description: "Theory-of-Mind accuracy vs opponent hidden state"
      - name: "-delta*C"
        coeff: -3
        description: "Penalise unnecessary concessions"
      - name: "-theta*noise"
        coeff: -10
        description: "Penalise incoherent utterances"
      - name: "psi*bluff"
        coeff: 12
        description: "Bonus for detecting opponent bluffs"
      - name: "mu*MEV"
        coeff: 8
        description: "Market event valuation — drift adaptation bonus"
  terminal:
    formula: "R_T = gamma*E + epsilon*S + zeta*D or -omega on capitulation"
    terms:
      - name: "gamma*E"
        coeff: 100
        description: "Deal efficiency: (final_price - BATNA) / ZOPA_width"
      - name: "epsilon*S"
        coeff: 20
        description: "Speed bonus: closes early vs max turns"
      - name: "zeta*D"
        coeff: 15
        description: "Drift adaptation bonus"
      - name: "-omega"
        coeff: -200
        description: "Capitulation cliff: final price below own BATNA"
# Episode — length cap and the conditions that end an episode.
episode:
  max_steps: 20
  termination_conditions:
    - accept_deal action
    - walk_away action
    - max_turns reached
    - zopa_collapsed (BATNAs cross after erosion)
    - very_negative_step_reward threshold
# Training — data generation → SFT → GRPO pipeline and resulting artifacts.
training:
  framework: "HuggingFace TRL (GRPO)"
  notebook: "training/notebooks/parlay_training.ipynb"
  openenv_rollout_notebook: "training/notebooks/openenv_rollout_training.ipynb"
  pipeline:
    - step: "Gemini self-play data generation"
      script: "python -m training.generate_data --episodes 140"
    - step: "SFT cold start"
      script: "python -m training.sft_train --data data/episodes_v2.jsonl"
    - step: "GRPO fine-tuning"
      script: "python -m training.grpo_train --data data/episodes_v2.jsonl"
  base_model: "Qwen/Qwen2.5-1.5B-Instruct"
  sft_checkpoint: "sh4shv4t/parlay-sft-1-5b"
  grpo_checkpoint: "sh4shv4t/parlay-grpo-1-5b"
  dataset: "sh4shv4t/parlay-episodes"
# Scenarios — drift_turns are the turns on which exogenous shocks can fire.
scenarios:
  - id: saas_enterprise
    description: "B2B SaaS license, $125k-$165k ACV"
    drift_turns: [8, 14]
  - id: hiring_package
    description: "Compensation negotiation, $195k-$230k total comp"
    drift_turns: [8]
  - id: acquisition_term_sheet
    description: "Startup acquisition, $10.5M-$16M valuation"
    drift_turns: [7, 13]
# Personas (opponent agents) — the three opponent behavior profiles.
personas:
  - id: shark
    description: "Aggressive anchoring, bluffs, artificial deadlines"
  - id: diplomat
    description: "Win-win framing, reveals constraints after trust builds"
  - id: veteran
    description: "Strategic silence, mirrors language, k=2 ToM reasoning"
# Hidden information (partial observability) — opponent state the agent
# never observes directly and must infer via belief_state.
hidden_state:
  - name: budget_ceiling
    description: "Opponent true max budget"
  - name: walk_away_price
    description: "Opponent true BATNA"
  - name: urgency_score
    description: "How time-pressured opponent is [0,1]"
  - name: has_alternative
    description: "Whether opponent has competing offer"