# salespath_env/server/prospect_simulator.py
import hashlib
import random

from ..models import SalesPathAction, SalesPathState

RESPONSE_TEXT = {
    "open:positive_signal": "That sounds interesting. Tell me more about how this works.",
    "open:neutral_signal": "I see. We're evaluating a few options at the moment.",
    "objection:price": "The pricing seems higher than what we budgeted for.",
    "objection:timing": "The timing isn't ideal — we're in the middle of a quarter close.",
    "objection:premature_pitch": (
        "I'm not sure we're ready to discuss solutions yet. "
        "What do you know about our current situation?"
    ),
    "deflect:budget_not_discussed": (
        "We haven't really talked about what we're looking for yet."
    ),
    "deflect:stall": (
        "Let me get back to you on this. A lot is happening on our end."
    ),
    "accept:demo_scheduled": (
        "Yes, let's set up a demo. What time works next week?"
    ),
    "accept:close_success": (
        "Alright, I think we can move forward with this. "
        "Send over the paperwork."
    ),
    "reject:close_failed": (
        "I don't think we're ready to commit at this point."
    ),
    "silence": "",
    "exit:disqualified": (
        "I think we're done here. This isn't the right fit."
    ),
}

def _seeded_random(state: SalesPathState, action: SalesPathAction) -> random.Random:
    """
    Build a deterministic RNG keyed on (episode_id, turn_number, action_type).

    Why: GRPO training restores environment state from snapshots and re-applies
    actions in a separate process / thread. If the prospect's response depends
    on an unseeded `random.random()` call, the reward computed during the
    gradient update can disagree with the rollout-time reward, breaking the
    snapshot trick and silently corrupting the gradient.
    """
    key = f"{state.episode_id}|{state.turn_number}|{action.action_type}"
    # First 12 hex digits of the SHA-1 digest -> a stable 48-bit seed.
    seed = int(hashlib.sha1(key.encode("utf-8")).hexdigest()[:12], 16)
    return random.Random(seed)
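
# Illustrative check (a sketch; assumes `state` and `action` carry the fields
# used above): replaying the same (episode_id, turn_number, action_type)
# triple reproduces the rollout-time draw exactly, which is what the GRPO
# snapshot replay relies on.
#
#     rng_a = _seeded_random(state, action)
#     rng_b = _seeded_random(state, action)
#     assert rng_a.random() == rng_b.random()  # identical seed, identical draw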

# Prefix injected into QUALIFY response to reveal budget signal
# without mutating prospect_profile (immutable prospect state).
BUDGET_REVEAL_TEXT = {
    "high": "We do have solid budget allocated for this initiative. ",
    "medium": "We have some budget set aside, though flexibility is limited. ",
    "low": "Our budget is quite constrained right now. ",
}
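
# Example composition on a QUALIFY turn with revealed_budget == "high"
# (this is exactly what _build_text below produces):
#   "We do have solid budget allocated for this initiative. "
#   + "I see. We're evaluating a few options at the moment."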

class ProspectSimulator:
    """
    Pure rule-based simulator. No LLM. No transformers.
    Deterministic per action type.

    Immutability guarantee:
        This class NEVER mutates state.prospect_profile.
        Budget reveal is surfaced via the response *text* only.
        The environment (salespath_environment.py) owns all other state
        writes; the single exception here is the objections_handled
        counter, incremented in the HANDLE_OBJECTION branch of _get_token.
    """

    def respond(
        self,
        action: SalesPathAction,
        state: SalesPathState,
    ) -> tuple[str, str]:
        """
        Returns:
            (response_token, response_text)
        """
        token = self._get_token(action, state)
        text = self._build_text(token, action, state)
        return token, text

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    def _build_text(
        self,
        token: str,
        action: SalesPathAction,
        state: SalesPathState,
    ) -> str:
        base = RESPONSE_TEXT[token]

        # Inject the budget reveal into the QUALIFY response text.
        # We read from hidden_state, not prospect_profile, so no mutation
        # is needed; the reveal only fires while the profile still reports
        # the budget signal as unknown.
        if action.action_type == "QUALIFY":
            budget_signal = state.prospect_profile.get("budget_signal", "unknown")
            if budget_signal == "unknown":
                revealed = state.hidden_state.get("revealed_budget", "medium")
                prefix = BUDGET_REVEAL_TEXT.get(revealed, "")
                return prefix + base
        return base

    def _get_token(
        self,
        action: SalesPathAction,
        state: SalesPathState,
    ) -> str:
        atype = action.action_type
        difficulty = state.difficulty
        turn = state.turn_number
        profile = state.prospect_profile
        hidden = state.hidden_state
        objections = state.objections_handled  # snapshot BEFORE any increment below

        # --------------------------------------------------
        # 1. Rule-violation responses (highest priority)
        # --------------------------------------------------
        if state.constraints_violated:
            latest = state.constraints_violated[-1]
            if latest == "R01":  # premature-pitch rule
                return "objection:premature_pitch"
            if latest == "R03":  # budget-discussion rule
                return "deflect:budget_not_discussed"

        # --------------------------------------------------
        # 2. Stall injection for difficulty 3+
        #    Uses a state-seeded RNG so the response is
        #    deterministic given (episode_id, turn, action).
        #    Required for GRPO state-snapshot consistency.
        # --------------------------------------------------
        if difficulty >= 3 and turn >= 5:
            stall_prob = hidden.get("stall_probability", 0.0)
            if stall_prob > 0.0:
                rng = _seeded_random(state, action)
                if rng.random() < stall_prob:
                    return "deflect:stall"

        # --------------------------------------------------
        # 3. Action-based deterministic responses
        # --------------------------------------------------
        if atype == "PROSPECT":
            return "open:positive_signal"

        if atype == "QUALIFY":
            return "open:neutral_signal"

        if atype == "PRESENT":
            if difficulty >= 2 and objections == 0:
                return "objection:price"
            return "open:positive_signal"

        if atype == "HANDLE_OBJECTION":
            # The only state write in this class (see the class docstring):
            # track how many objections the agent has handled so far.
            state.objections_handled += 1
            required = hidden.get("num_objections", 1)
            if state.objections_handled >= required:
                return "open:positive_signal"
            # `objections` is the pre-increment count, so this branch only
            # fires on the first handling attempt: the prospect raises a
            # follow-up timing objection.
            if objections == 0:
                return "objection:timing"
            return "open:positive_signal"

        if atype == "OFFER_DEMO":
            return "accept:demo_scheduled"

        if atype == "NEGOTIATE":
            return "open:neutral_signal"

        if atype == "CLOSE":
            true_budget = hidden.get("true_budget", 0.7)
            close_threshold = hidden.get("close_threshold", 0.5)
            decision_maker = profile.get("decision_maker", True)
            if true_budget >= close_threshold and decision_maker:
                return "accept:close_success"
            return "reject:close_failed"

        if atype == "FOLLOW_UP":
            return "open:neutral_signal"

        if atype == "DISQUALIFY":
            return "exit:disqualified"

        return "open:neutral_signal"