# salespath_env/server/prospect_simulator.py
import hashlib
import random

from ..models import SalesPathAction, SalesPathState

# Canonical response text for every token `_get_token` can emit.
# These strings are part of the environment's observable behavior —
# do not edit them casually.
RESPONSE_TEXT = {
    "open:positive_signal": "That sounds interesting. Tell me more about how this works.",
    "open:neutral_signal": "I see. We're evaluating a few options at the moment.",
    "objection:price": "The pricing seems higher than what we budgeted for.",
    "objection:timing": "The timing isn't ideal — we're in the middle of a quarter close.",
    "objection:premature_pitch": (
        "I'm not sure we're ready to discuss solutions yet. "
        "What do you know about our current situation?"
    ),
    "deflect:budget_not_discussed": (
        "We haven't really talked about what we're looking for yet."
    ),
    "deflect:stall": (
        "Let me get back to you on this. A lot is happening on our end."
    ),
    "accept:demo_scheduled": (
        "Yes, let's set up a demo. What time works next week?"
    ),
    "accept:close_success": (
        "Alright, I think we can move forward with this. "
        "Send over the paperwork."
    ),
    "reject:close_failed": (
        "I don't think we're ready to commit at this point."
    ),
    "silence": "",
    "exit:disqualified": (
        "I think we're done here. This isn't the right fit."
    ),
}


def _seeded_random(state: SalesPathState, action: SalesPathAction) -> random.Random:
    """
    Build a deterministic RNG keyed on (episode_id, turn_number, action_type).

    Why: GRPO training restores environment state from snapshots and
    re-applies actions in a separate process / thread.  If the prospect's
    response depends on an unseeded ``random.random()`` call, the reward
    computed during gradient update can disagree with the rollout-time
    reward, breaking the snapshot trick and silently corrupting the
    gradient.
    """
    fingerprint = f"{state.episode_id}|{state.turn_number}|{action.action_type}"
    digest = hashlib.sha1(fingerprint.encode("utf-8")).digest()
    # First 6 digest bytes, big-endian == int(hexdigest[:12], 16): the hex
    # digest is just the byte digest spelled out two chars per byte.
    return random.Random(int.from_bytes(digest[:6], "big"))


# Prefix injected into QUALIFY response to reveal budget signal
# without mutating prospect_profile (immutable prospect state).
BUDGET_REVEAL_TEXT = {
    "high": "We do have solid budget allocated for this initiative. ",
    "medium": "We have some budget set aside, though flexibility is limited. ",
    "low": "Our budget is quite constrained right now. ",
}


class ProspectSimulator:
    """
    Pure rule-based simulator.  No LLM.  No transformers.
    Deterministic per action type.

    Immutability guarantee:
        This class NEVER mutates state.prospect_profile.  Budget reveal is
        surfaced via the response *text* only.  The environment
        (salespath_environment.py) owns all state writes.
    """

    def respond(
        self,
        action: SalesPathAction,
        state: SalesPathState,
    ) -> tuple[str, str]:
        """
        Produce the prospect's reaction to a single sales action.

        Returns:
            (response_token, response_text)
        """
        token = self._get_token(action, state)
        return token, self._build_text(token, action, state)

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    def _build_text(
        self,
        token: str,
        action: SalesPathAction,
        state: SalesPathState,
    ) -> str:
        """
        Render the response text for ``token``.

        On a QUALIFY turn where the profile carries no budget signal yet,
        a budget-reveal prefix (read from hidden_state, never written to
        prospect_profile) is prepended to the base text.
        """
        text = RESPONSE_TEXT[token]
        if action.action_type != "QUALIFY":
            return text
        if state.prospect_profile.get("budget_signal", "unknown") != "unknown":
            return text
        # Budget is still unknown to the rep: surface it through text only.
        reveal_level = state.hidden_state.get("revealed_budget", "medium")
        return BUDGET_REVEAL_TEXT.get(reveal_level, "") + text

    def _get_token(
        self,
        action: SalesPathAction,
        state: SalesPathState,
    ) -> str:
        """
        Select the response token, honoring the fixed priority order:
        rule violations > stall injection > action-specific rules.
        """
        action_type = action.action_type
        hidden = state.hidden_state

        # 1. Rule-violation responses (highest priority).
        if state.constraints_violated:
            newest_violation = state.constraints_violated[-1]
            if newest_violation == "R01":
                return "objection:premature_pitch"
            if newest_violation == "R03":
                return "deflect:budget_not_discussed"

        # 2. Stall injection for difficulty 3+ late in the conversation.
        #    The RNG is seeded from (episode_id, turn, action) so replays
        #    under GRPO state snapshots stay deterministic.
        if state.difficulty >= 3 and state.turn_number >= 5:
            stall_prob = hidden.get("stall_probability", 0.0)
            if stall_prob > 0.0 and _seeded_random(state, action).random() < stall_prob:
                return "deflect:stall"

        # 3. Action-based deterministic responses.
        if action_type == "PRESENT":
            if state.difficulty >= 2 and state.objections_handled == 0:
                return "objection:price"
            return "open:positive_signal"

        if action_type == "HANDLE_OBJECTION":
            handled_before = state.objections_handled
            state.objections_handled = handled_before + 1  # only non-profile mutation
            if state.objections_handled >= hidden.get("num_objections", 1):
                return "open:positive_signal"
            if handled_before == 0:
                return "objection:timing"
            return "open:positive_signal"

        if action_type == "CLOSE":
            enough_budget = (
                hidden.get("true_budget", 0.7) >= hidden.get("close_threshold", 0.5)
            )
            if enough_budget and state.prospect_profile.get("decision_maker", True):
                return "accept:close_success"
            return "reject:close_failed"

        # Unconditional action -> token mapping; anything unrecognized
        # falls back to a neutral reply.
        fixed_tokens = {
            "PROSPECT": "open:positive_signal",
            "QUALIFY": "open:neutral_signal",
            "OFFER_DEMO": "accept:demo_scheduled",
            "NEGOTIATE": "open:neutral_signal",
            "FOLLOW_UP": "open:neutral_signal",
            "DISQUALIFY": "exit:disqualified",
        }
        return fixed_tokens.get(action_type, "open:neutral_signal")