"""
Trajectory scorers for the Pharmacovigilance Signal Detector.

These functions are intentionally pharmacovigilance-specific rather than
generic "reward bucket" adapters. The scoring rubric emphasizes:

1. Signal sensitivity: missing a true novel safety signal is costly.
2. Operational judgment: escalation/log/dismiss choices matter independently.
3. Causal calibration: high scores should reflect not just suspicion, but
   identifying the right drug or interaction.

All public grader outputs are forced into the judge-safe interval (0.01, 0.99).
"""

from typing import Any, Iterable, List


STRICT_MIN = 0.01
STRICT_MAX = 0.99


def _bounded(value: float) -> float:
    return min(max(round(value, 4), STRICT_MIN), STRICT_MAX)


def _as_reward_list(trajectory: dict | None) -> List[float]:
    payload = trajectory or {}

    rewards = payload.get("rewards")
    if isinstance(rewards, list) and rewards:
        return [float(item) for item in rewards]

    if "score" in payload:
        return [float(payload["score"])]

    reward = payload.get("reward")
    if isinstance(reward, dict) and "total" in reward:
        return [float(reward["total"])]
    if reward is not None:
        return [float(reward)]

    return []


def _reward_profile(reward: float) -> str:
    """
    Translate a step reward into a pharmacovigilance interpretation bucket.

    This keeps the grader coupled to the meaning of the environment rather than
    to borrowed labels from a different domain.
    """
    if reward <= 0.05:
        return "unsafe_miss"
    if reward <= 0.20:
        return "bad_call"
    if reward < 0.50:
        return "weak_triage"
    if reward < 0.80:
        return "workable_triage"
    if reward < 0.95:
        return "strong_triage"
    return "expert_triage"


def _mean(values: Iterable[float]) -> float:
    items = list(values)
    if not items:
        return 0.5
    return sum(items) / len(items)


def _score_episode(
    rewards: List[float],
    *,
    miss_cost: float,
    overcall_cost: float,
    stability_gain: float,
    expertise_gain: float,
) -> float:
    if not rewards:
        return 0.5

    labels = [_reward_profile(reward) for reward in rewards]
    mean_reward = _mean(rewards)
    total_steps = len(rewards)

    unsafe_miss_count = labels.count("unsafe_miss")
    bad_call_count = labels.count("bad_call")
    weak_count = labels.count("weak_triage")
    strong_count = labels.count("strong_triage") + labels.count("expert_triage")
    expert_count = labels.count("expert_triage")

    downward_pressure = (
        min(unsafe_miss_count * miss_cost, 0.35)
        + min(bad_call_count * overcall_cost, 0.15)
        + min(weak_count * 0.015, 0.06)
    )

    upward_pressure = 0.0
    if strong_count / total_steps >= 0.80:
        upward_pressure += stability_gain
    if expert_count / total_steps >= 0.60:
        upward_pressure += expertise_gain

    return _bounded(mean_reward - downward_pressure + upward_pressure)


def easy_grader(trajectory: dict = None) -> float:
    """
    Easy tier: obvious known-signal recognition and straightforward handling.

    The scorer expects high reliability here. Weak or missed judgments are
    penalized more sharply because these are the least ambiguous cases.
    """
    rewards = _as_reward_list(trajectory)
    return _score_episode(
        rewards,
        miss_cost=0.12,
        overcall_cost=0.03,
        stability_gain=0.05,
        expertise_gain=0.01,
    )


def medium_grader(trajectory: dict = None) -> float:
    """
    Medium tier: cluster recognition and escalation readiness.

    These cases reward agents that can move from single-case thinking to
    population-level signal interpretation.
    """
    rewards = _as_reward_list(trajectory)
    return _score_episode(
        rewards,
        miss_cost=0.09,
        overcall_cost=0.04,
        stability_gain=0.03,
        expertise_gain=0.02,
    )


def hard_grader(trajectory: dict = None) -> float:
    """
    Hard tier: confounding, blame reassignment, and interaction reasoning.

    The hard scorer gives extra value to near-expert trajectories because this
    tier is specifically designed to separate shallow pattern matching from
    mechanistic causal reasoning.
    """
    rewards = _as_reward_list(trajectory)
    return _score_episode(
        rewards,
        miss_cost=0.07,
        overcall_cost=0.03,
        stability_gain=0.02,
        expertise_gain=0.04,
    )


def known_signal_easy_grader(trajectory: dict = None) -> float:
    return easy_grader(trajectory)


def cluster_signal_medium_grader(trajectory: dict = None) -> float:
    return medium_grader(trajectory)


def confounded_hard_grader(trajectory: dict = None) -> float:
    return hard_grader(trajectory)


__all__ = [
    "easy_grader",
    "medium_grader",
    "hard_grader",
    "known_signal_easy_grader",
    "cluster_signal_medium_grader",
    "confounded_hard_grader",
]