File size: 6,999 Bytes
57eab70
 
e6a02dd
57eab70
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e6a02dd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57eab70
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e6a02dd
 
 
57eab70
 
 
e6a02dd
 
 
 
57eab70
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fbf5bf6
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
# salespath_env/server/prospect_simulator.py

import hashlib
import random

from ..models import SalesPathAction, SalesPathState


# Canonical response text for every token the simulator can emit.
# Keys follow a "<category>:<detail>" scheme; values are the literal
# prospect utterances surfaced to the agent.
RESPONSE_TEXT = {
    "open:positive_signal": "That sounds interesting. Tell me more about how this works.",
    "open:neutral_signal": "I see. We're evaluating a few options at the moment.",
    "objection:price": "The pricing seems higher than what we budgeted for.",
    "objection:timing": "The timing isn't ideal — we're in the middle of a quarter close.",
    "objection:premature_pitch": (
        "I'm not sure we're ready to discuss solutions yet. "
        "What do you know about our current situation?"
    ),
    "deflect:budget_not_discussed": "We haven't really talked about what we're looking for yet.",
    "deflect:stall": "Let me get back to you on this. A lot is happening on our end.",
    "accept:demo_scheduled": "Yes, let's set up a demo. What time works next week?",
    "accept:close_success": (
        "Alright, I think we can move forward with this. "
        "Send over the paperwork."
    ),
    "reject:close_failed": "I don't think we're ready to commit at this point.",
    "silence": "",  # the prospect says nothing
    "exit:disqualified": "I think we're done here. This isn't the right fit.",
}


def _seeded_random(state: SalesPathState, action: SalesPathAction) -> random.Random:
    """
    Return an RNG deterministically seeded from
    (episode_id, turn_number, action_type).

    Why: GRPO training restores environment state from snapshots and
    re-applies actions in a separate process / thread.  If the prospect's
    response depended on an unseeded `random.random()` call, the reward
    computed during the gradient update could disagree with the
    rollout-time reward, breaking the snapshot trick and silently
    corrupting the gradient.  Hashing the episode/turn/action triple pins
    every draw.
    """
    fingerprint = "|".join(
        (str(state.episode_id), str(state.turn_number), str(action.action_type))
    )
    digest = hashlib.sha1(fingerprint.encode("utf-8")).hexdigest()
    return random.Random(int(digest[:12], 16))

# Sentence prepended to the QUALIFY response so the prospect reveals a
# budget signal through dialogue alone — prospect_profile (immutable
# prospect state) is never written to achieve this.
BUDGET_REVEAL_TEXT = {
    "high": "We do have solid budget allocated for this initiative. ",
    "medium": "We have some budget set aside, though flexibility is limited. ",
    "low": "Our budget is quite constrained right now. ",
}


class ProspectSimulator:
    """
    Deterministic, rule-based prospect model — no LLM, no transformers.

    Every action type maps to a fixed response; the only randomness is the
    stall branch, which draws from a state-seeded RNG and is therefore
    reproducible as well.

    Immutability guarantee:
        `state.prospect_profile` is never written here.  Budget information
        is revealed only through the response *text*; state writes belong to
        the environment (salespath_environment.py), with the single exception
        of the `objections_handled` counter bumped in `_get_token`.
    """

    def respond(
        self,
        action: SalesPathAction,
        state: SalesPathState,
    ) -> tuple[str, str]:
        """
        Produce the prospect's reaction to `action`.

        Returns:
            (response_token, response_text)
        """
        token = self._get_token(action, state)
        return token, self._build_text(token, action, state)

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    def _build_text(
        self,
        token: str,
        action: SalesPathAction,
        state: SalesPathState,
    ) -> str:
        """Resolve `token` to text, prefixing a budget reveal on QUALIFY."""
        text = RESPONSE_TEXT[token]

        if action.action_type != "QUALIFY":
            return text

        # Only reveal while the profile still reports "unknown"; the reveal
        # itself is read from hidden_state, so no profile mutation occurs.
        if state.prospect_profile.get("budget_signal", "unknown") != "unknown":
            return text

        revealed = state.hidden_state.get("revealed_budget", "medium")
        return BUDGET_REVEAL_TEXT.get(revealed, "") + text

    def _get_token(
        self,
        action: SalesPathAction,
        state: SalesPathState,
    ) -> str:
        """Pick the response token; priority: violations > stall > action."""
        atype = action.action_type
        hidden = state.hidden_state

        # 1. Rule-violation responses trump everything else.
        if state.constraints_violated:
            newest = state.constraints_violated[-1]
            if newest == "R01":
                return "objection:premature_pitch"
            if newest == "R03":
                return "deflect:budget_not_discussed"

        # 2. Stall injection on hard episodes (difficulty 3+, turn 5+).
        #    Seeded on (episode_id, turn, action) so GRPO snapshot replays
        #    reproduce the identical draw.
        if state.difficulty >= 3 and state.turn_number >= 5:
            stall_prob = hidden.get("stall_probability", 0.0)
            if stall_prob > 0.0:
                if _seeded_random(state, action).random() < stall_prob:
                    return "deflect:stall"

        # 3. Deterministic action-specific branches.
        if atype == "PRESENT":
            if state.difficulty >= 2 and state.objections_handled == 0:
                return "objection:price"
            return "open:positive_signal"

        if atype == "HANDLE_OBJECTION":
            handled_before = state.objections_handled
            state.objections_handled = handled_before + 1  # sole non-profile write
            if state.objections_handled >= hidden.get("num_objections", 1):
                return "open:positive_signal"
            # First objection of the episode stalls on timing; later ones pass.
            return "objection:timing" if handled_before == 0 else "open:positive_signal"

        if atype == "CLOSE":
            budget_ok = (
                hidden.get("true_budget", 0.7) >= hidden.get("close_threshold", 0.5)
            )
            if budget_ok and state.prospect_profile.get("decision_maker", True):
                return "accept:close_success"
            return "reject:close_failed"

        if atype == "PROSPECT":
            return "open:positive_signal"
        if atype == "OFFER_DEMO":
            return "accept:demo_scheduled"
        if atype == "DISQUALIFY":
            return "exit:disqualified"

        # QUALIFY, NEGOTIATE, FOLLOW_UP, and anything unrecognized.
        return "open:neutral_signal"