# salespath_env/server/prospect_simulator.py
import hashlib
import random

from ..models import SalesPathAction, SalesPathState

RESPONSE_TEXT = {
    "open:positive_signal": "That sounds interesting. Tell me more about how this works.",
    "open:neutral_signal": "I see. We're evaluating a few options at the moment.",
    "objection:price": "The pricing seems higher than what we budgeted for.",
    "objection:timing": "The timing isn't ideal — we're in the middle of a quarter close.",
    "objection:premature_pitch": (
        "I'm not sure we're ready to discuss solutions yet. "
        "What do you know about our current situation?"
    ),
    "deflect:budget_not_discussed": (
        "We haven't really talked about what we're looking for yet."
    ),
    "deflect:stall": (
        "Let me get back to you on this. A lot is happening on our end."
    ),
    "accept:demo_scheduled": (
        "Yes, let's set up a demo. What time works next week?"
    ),
    "accept:close_success": (
        "Alright, I think we can move forward with this. "
        "Send over the paperwork."
    ),
    "reject:close_failed": (
        "I don't think we're ready to commit at this point."
    ),
    "silence": "",
    "exit:disqualified": (
        "I think we're done here. This isn't the right fit."
    ),
}

def _seeded_random(state: SalesPathState, action: SalesPathAction) -> random.Random:
    """
    Build a deterministic RNG keyed on (episode_id, turn_number, action_type).

    Why: GRPO training restores environment state from snapshots and re-applies
    actions in a separate process / thread. If the prospect's response depends
    on an unseeded `random.random()` call, the reward computed during the
    gradient update can disagree with the rollout-time reward, breaking the
    snapshot trick and silently corrupting the gradient.
    """
    key = f"{state.episode_id}|{state.turn_number}|{action.action_type}"
    # First 12 hex digits of the SHA-1 digest -> a stable 48-bit seed.
    seed = int(hashlib.sha1(key.encode("utf-8")).hexdigest()[:12], 16)
    return random.Random(seed)
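
# Illustrative check (a sketch; assumes `state` and `action` carry the fields
# used above): replaying the same (episode_id, turn_number, action_type)
# triple reproduces the rollout-time draw exactly, which is what the GRPO
# snapshot replay relies on.
#
#     rng_a = _seeded_random(state, action)
#     rng_b = _seeded_random(state, action)
#     assert rng_a.random() == rng_b.random()  # identical seed, identical draw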

# Prefix injected into QUALIFY response to reveal budget signal
# without mutating prospect_profile (immutable prospect state).
BUDGET_REVEAL_TEXT = {
    "high": "We do have solid budget allocated for this initiative. ",
    "medium": "We have some budget set aside, though flexibility is limited. ",
    "low": "Our budget is quite constrained right now. ",
}
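
# Example composition on a QUALIFY turn with revealed_budget == "high"
# (this is exactly what _build_text below produces):
#   "We do have solid budget allocated for this initiative. "
#   + "I see. We're evaluating a few options at the moment."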

class ProspectSimulator:
    """
    Pure rule-based simulator. No LLM. No transformers.
    Deterministic per action type.

    Immutability guarantee:
        This class NEVER mutates state.prospect_profile.
        Budget reveal is surfaced via the response *text* only.
        The environment (salespath_environment.py) owns all other state
        writes; the single exception here is the objections_handled
        counter, incremented in the HANDLE_OBJECTION branch of _get_token.
    """

    def respond(
        self,
        action: SalesPathAction,
        state: SalesPathState,
    ) -> tuple[str, str]:
        """
        Returns:
            (response_token, response_text)
        """
        token = self._get_token(action, state)
        text = self._build_text(token, action, state)
        return token, text

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    def _build_text(
        self,
        token: str,
        action: SalesPathAction,
        state: SalesPathState,
    ) -> str:
        base = RESPONSE_TEXT[token]

        # Inject the budget reveal into the QUALIFY response text.
        # We read from hidden_state, not prospect_profile, so no mutation
        # is needed; the reveal only fires while the profile still reports
        # the budget signal as unknown.
        if action.action_type == "QUALIFY":
            budget_signal = state.prospect_profile.get("budget_signal", "unknown")
            if budget_signal == "unknown":
                revealed = state.hidden_state.get("revealed_budget", "medium")
                prefix = BUDGET_REVEAL_TEXT.get(revealed, "")
                return prefix + base
        return base

    def _get_token(
        self,
        action: SalesPathAction,
        state: SalesPathState,
    ) -> str:
        atype = action.action_type
        difficulty = state.difficulty
        turn = state.turn_number
        profile = state.prospect_profile
        hidden = state.hidden_state
        objections = state.objections_handled  # snapshot BEFORE any increment below

        # --------------------------------------------------
        # 1. Rule-violation responses (highest priority)
        # --------------------------------------------------
        if state.constraints_violated:
            latest = state.constraints_violated[-1]
            if latest == "R01":  # premature-pitch rule
                return "objection:premature_pitch"
            if latest == "R03":  # budget-discussion rule
                return "deflect:budget_not_discussed"

        # --------------------------------------------------
        # 2. Stall injection for difficulty 3+
        #    Uses a state-seeded RNG so the response is
        #    deterministic given (episode_id, turn, action).
        #    Required for GRPO state-snapshot consistency.
        # --------------------------------------------------
        if difficulty >= 3 and turn >= 5:
            stall_prob = hidden.get("stall_probability", 0.0)
            if stall_prob > 0.0:
                rng = _seeded_random(state, action)
                if rng.random() < stall_prob:
                    return "deflect:stall"

        # --------------------------------------------------
        # 3. Action-based deterministic responses
        # --------------------------------------------------
        if atype == "PROSPECT":
            return "open:positive_signal"

        if atype == "QUALIFY":
            return "open:neutral_signal"

        if atype == "PRESENT":
            if difficulty >= 2 and objections == 0:
                return "objection:price"
            return "open:positive_signal"

        if atype == "HANDLE_OBJECTION":
            # The only state write in this class (see the class docstring):
            # track how many objections the agent has handled so far.
            state.objections_handled += 1
            required = hidden.get("num_objections", 1)
            if state.objections_handled >= required:
                return "open:positive_signal"
            # `objections` is the pre-increment count, so this branch only
            # fires on the first handling attempt: the prospect raises a
            # follow-up timing objection.
            if objections == 0:
                return "objection:timing"
            return "open:positive_signal"

        if atype == "OFFER_DEMO":
            return "accept:demo_scheduled"

        if atype == "NEGOTIATE":
            return "open:neutral_signal"

        if atype == "CLOSE":
            true_budget = hidden.get("true_budget", 0.7)
            close_threshold = hidden.get("close_threshold", 0.5)
            decision_maker = profile.get("decision_maker", True)
            if true_budget >= close_threshold and decision_maker:
                return "accept:close_success"
            return "reject:close_failed"

        if atype == "FOLLOW_UP":
            return "open:neutral_signal"

        if atype == "DISQUALIFY":
            return "exit:disqualified"

        return "open:neutral_signal"