File size: 6,999 Bytes
57eab70
 
e6a02dd
57eab70
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e6a02dd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57eab70
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e6a02dd
 
 
57eab70
 
 
e6a02dd
 
 
 
57eab70
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fbf5bf6
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
# salespath_env/server/prospect_simulator.py

import hashlib
import random

from ..models import SalesPathAction, SalesPathState


# Canonical response text for every token the simulator can emit.
# Keys follow a "<category>:<detail>" scheme; values are the literal
# prospect utterances surfaced to the agent.
RESPONSE_TEXT = {
    "open:positive_signal": "That sounds interesting. Tell me more about how this works.",
    "open:neutral_signal": "I see. We're evaluating a few options at the moment.",
    "objection:price": "The pricing seems higher than what we budgeted for.",
    "objection:timing": "The timing isn't ideal — we're in the middle of a quarter close.",
    "objection:premature_pitch": (
        "I'm not sure we're ready to discuss solutions yet. "
        "What do you know about our current situation?"
    ),
    "deflect:budget_not_discussed": "We haven't really talked about what we're looking for yet.",
    "deflect:stall": "Let me get back to you on this. A lot is happening on our end.",
    "accept:demo_scheduled": "Yes, let's set up a demo. What time works next week?",
    "accept:close_success": (
        "Alright, I think we can move forward with this. "
        "Send over the paperwork."
    ),
    "reject:close_failed": "I don't think we're ready to commit at this point.",
    "silence": "",  # the prospect says nothing
    "exit:disqualified": "I think we're done here. This isn't the right fit.",
}


def _seeded_random(state: SalesPathState, action: SalesPathAction) -> random.Random:
    """
    Return an RNG deterministically seeded from
    (episode_id, turn_number, action_type).

    Why: GRPO training restores environment state from snapshots and
    re-applies actions in a separate process / thread.  If the prospect's
    response depended on an unseeded `random.random()` call, the reward
    computed during the gradient update could disagree with the
    rollout-time reward, breaking the snapshot trick and silently
    corrupting the gradient.  Hashing the episode/turn/action triple pins
    every draw.
    """
    fingerprint = "|".join(
        (str(state.episode_id), str(state.turn_number), str(action.action_type))
    )
    digest = hashlib.sha1(fingerprint.encode("utf-8")).hexdigest()
    return random.Random(int(digest[:12], 16))

# Sentence prepended to the QUALIFY response so the prospect reveals a
# budget signal through dialogue alone — prospect_profile (immutable
# prospect state) is never written to achieve this.
BUDGET_REVEAL_TEXT = {
    "high": "We do have solid budget allocated for this initiative. ",
    "medium": "We have some budget set aside, though flexibility is limited. ",
    "low": "Our budget is quite constrained right now. ",
}


class ProspectSimulator:
    """
    Deterministic, rule-based prospect model — no LLM, no transformers.

    Every action type maps to a fixed response; the only randomness is the
    stall branch, which draws from a state-seeded RNG and is therefore
    reproducible as well.

    Immutability guarantee:
        `state.prospect_profile` is never written here.  Budget information
        is revealed only through the response *text*; state writes belong to
        the environment (salespath_environment.py), with the single exception
        of the `objections_handled` counter bumped in `_get_token`.
    """

    def respond(
        self,
        action: SalesPathAction,
        state: SalesPathState,
    ) -> tuple[str, str]:
        """
        Produce the prospect's reaction to `action`.

        Returns:
            (response_token, response_text)
        """
        token = self._get_token(action, state)
        return token, self._build_text(token, action, state)

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    def _build_text(
        self,
        token: str,
        action: SalesPathAction,
        state: SalesPathState,
    ) -> str:
        """Resolve `token` to text, prefixing a budget reveal on QUALIFY."""
        text = RESPONSE_TEXT[token]

        if action.action_type != "QUALIFY":
            return text

        # Only reveal while the profile still reports "unknown"; the reveal
        # itself is read from hidden_state, so no profile mutation occurs.
        if state.prospect_profile.get("budget_signal", "unknown") != "unknown":
            return text

        revealed = state.hidden_state.get("revealed_budget", "medium")
        return BUDGET_REVEAL_TEXT.get(revealed, "") + text

    def _get_token(
        self,
        action: SalesPathAction,
        state: SalesPathState,
    ) -> str:
        """Pick the response token; priority: violations > stall > action."""
        atype = action.action_type
        hidden = state.hidden_state

        # 1. Rule-violation responses trump everything else.
        if state.constraints_violated:
            newest = state.constraints_violated[-1]
            if newest == "R01":
                return "objection:premature_pitch"
            if newest == "R03":
                return "deflect:budget_not_discussed"

        # 2. Stall injection on hard episodes (difficulty 3+, turn 5+).
        #    Seeded on (episode_id, turn, action) so GRPO snapshot replays
        #    reproduce the identical draw.
        if state.difficulty >= 3 and state.turn_number >= 5:
            stall_prob = hidden.get("stall_probability", 0.0)
            if stall_prob > 0.0:
                if _seeded_random(state, action).random() < stall_prob:
                    return "deflect:stall"

        # 3. Deterministic action-specific branches.
        if atype == "PRESENT":
            if state.difficulty >= 2 and state.objections_handled == 0:
                return "objection:price"
            return "open:positive_signal"

        if atype == "HANDLE_OBJECTION":
            handled_before = state.objections_handled
            state.objections_handled = handled_before + 1  # sole non-profile write
            if state.objections_handled >= hidden.get("num_objections", 1):
                return "open:positive_signal"
            # First objection of the episode stalls on timing; later ones pass.
            return "objection:timing" if handled_before == 0 else "open:positive_signal"

        if atype == "CLOSE":
            budget_ok = (
                hidden.get("true_budget", 0.7) >= hidden.get("close_threshold", 0.5)
            )
            if budget_ok and state.prospect_profile.get("decision_maker", True):
                return "accept:close_success"
            return "reject:close_failed"

        if atype == "PROSPECT":
            return "open:positive_signal"
        if atype == "OFFER_DEMO":
            return "accept:demo_scheduled"
        if atype == "DISQUALIFY":
            return "exit:disqualified"

        # QUALIFY, NEGOTIATE, FOLLOW_UP, and anything unrecognized.
        return "open:neutral_signal"