"""
heuristic_agent.py
------------------

A high-performance heuristic agent for the UIEnv environment.

Architecture
============
The agent uses a **multi-stage decision pipeline** that evaluates conditions
in priority order.  The first stage to produce an action wins.

    Stage 1 → Risk Mitigation     (prevent imminent drop)
    Stage 2 → Feedback Adaptation (react to distrust / drop signals)
    Stage 3 → Layout Optimization (converge toward ideal layout)
    Stage 4 → Exploration         (controlled randomness in safe states)
    Stage 5 → Fallback            (safe default when layout is near-optimal)

Internal state (outcome history, action history, noop streak) is used to
make context-aware decisions and avoid oscillation.

Includes a full evaluation harness that benchmarks the heuristic agent
against a random baseline.
"""

from __future__ import annotations

import random
from collections import deque
from typing import Optional

from env import UIEnv, Action, Observation

# ──────────────────────────────────────────────────────────────────────
# Optimal layout targets (derived from reward shaping in env.py)
# ──────────────────────────────────────────────────────────────────────
BUTTON_SWEET_LOW: float = 0.9
BUTTON_SWEET_HIGH: float = 1.3
BUTTON_SWEET_MID: float = 1.1      # centre of the sweet spot, used for direct jumps

TARGET_STEPS: int = 2              # at or below → shaping bonus
TARGET_FORM_LENGTH: int = 4        # at or below → progress bonus
SAFE_FORM_FLOOR: int = 3           # do NOT reduce below this (careful-user trap)

DROP_STEPS_THRESHOLD: int = 3      # steps above this → impatient drop
DROP_FORM_THRESHOLD: int = 5       # form_length above this → impatient drop

EXPLORE_PROBABILITY: float = 0.07  # 7 % exploration rate
NOOP_SAFE_LIMIT: int = 1           # max consecutive noops before forcing action

# Inverse action pairs — used for oscillation detection.
# NOTE(review): "increase_button" → "set_button_size" is only a *conceptual*
# inverse (there is no decrease_button action); the other two are exact.
_INVERSE_ACTIONS: dict[str, str] = {
    "increase_button": "set_button_size",  # conceptual inverse
    "increase_steps": "decrease_steps",
    "decrease_steps": "increase_steps",
}

# ──────────────────────────────────────────────────────────────────────
────────────────────────────────────────────────────────────────────── # Heuristic Agent # ────────────────────────────────────────────────────────────────────── class HeuristicAgent: """ Structured, multi-stage heuristic agent for UIEnv. The agent maintains internal state that is updated every step via `update(info)`, and selects actions via `act(obs)` using a priority-ordered decision pipeline. """ def __init__(self, seed: int = 99) -> None: self._rng = random.Random(seed) # ── internal tracking ── self.last_outcome: Optional[str] = None self.noop_streak: int = 0 self.action_history: deque[str] = deque(maxlen=5) self.distrust_count: int = 0 self.drop_count: int = 0 self.step_number: int = 0 # ──────────────────────── public API ────────────────────────── def reset(self) -> None: """Clear per-episode state at the start of a new episode.""" self.last_outcome = None self.noop_streak = 0 self.action_history.clear() self.distrust_count = 0 self.drop_count = 0 self.step_number = 0 def act(self, obs: Observation) -> Action: """ Select the next action by running the decision pipeline. Stages are evaluated in priority order; the first stage to return a non-None action wins. This guarantees that safety-critical adjustments always take precedence over optimisation moves. 
""" self.step_number += 1 action = ( self._risk_mitigation(obs) or self._adaptation(obs) or self._optimize_layout(obs) or self._explore(obs) or self._fallback(obs) ) # Record for oscillation detection self.action_history.append(action.type) # Track noop streak if action.type == "noop": self.noop_streak += 1 else: self.noop_streak = 0 return action def update(self, info: dict) -> None: """Ingest environment info dict to update internal beliefs.""" outcome = info.get("outcome", "continue") self.last_outcome = outcome if outcome == "distrust": self.distrust_count += 1 elif outcome == "drop": self.drop_count += 1 # ──────────────────────── helpers ───────────────────────────── def _would_oscillate(self, candidate: str) -> bool: """ Return True if `candidate` would undo the most recent action, creating a pointless back-and-forth oscillation. """ if not self.action_history: return False last = self.action_history[-1] inv = _INVERSE_ACTIONS.get(candidate) return last == inv or _INVERSE_ACTIONS.get(last) == candidate @staticmethod def _make(action_type: str, value: float | None = None) -> Action: """Shorthand to construct an Action.""" return Action(type=action_type, value=value) # ──────────── Stage 1: Risk Mitigation ──────────────────────── def _risk_mitigation(self, obs: Observation) -> Optional[Action]: """ Immediately neutralise conditions that lead to user drop. Priority: 1. steps > 3 → decrease_steps (impatient-drop rule) 2. form_length > 5 → decrease_form (impatient-drop rule) Steps are prioritised because the impatient drop threshold for steps (> 3) is stricter and more common than form (> 5). """ layout = obs.layout if layout.steps > DROP_STEPS_THRESHOLD: return self._make("decrease_steps") if layout.form_length > DROP_FORM_THRESHOLD: return self._make("decrease_form") return None # ──────────── Stage 2: Feedback Adaptation ──────────────────── def _adaptation(self, obs: Observation) -> Optional[Action]: """ React to the most recent user outcome signal. 
- 'distrust' means the layout is *too minimal* for this user type: • new users distrust when steps < 2 → increase_steps • careful users distrust when form_length < 3 → stop reducing (since there is no increase_form action, we can only prevent future reduction — but if steps are low, raising them is safe) - 'drop' means the layout was *too heavy* → aggressively reduce """ if self.last_outcome == "distrust": layout = obs.layout # New-user distrust: steps too low if layout.steps < 2 and not self._would_oscillate("increase_steps"): return self._make("increase_steps") # Careful-user distrust is likely about form being too short. # We can't increase form, but we can ensure steps stay reasonable # (having decent steps helps overall progress which offsets the # distrust effect on the next simulation round). if layout.steps < 2: return self._make("increase_steps") # If distrust persists but layout looks safe, do nothing drastic # — let the optimiser handle it. return None if self.last_outcome == "drop": layout = obs.layout # Emergency: cut the most expensive dimension first if layout.steps > 2 and not self._would_oscillate("decrease_steps"): return self._make("decrease_steps") if layout.form_length > SAFE_FORM_FLOOR: return self._make("decrease_form") return None return None # ──────────── Stage 3: Layout Optimization ──────────────────── def _optimize_layout(self, obs: Observation) -> Optional[Action]: """ Gradually move the layout toward the ideal configuration: button_size ∈ [0.9, 1.3] steps ≤ 2 form_length ≤ 4 (but ≥ 3 for safety) Optimisation order (by reward impact): 1. steps → biggest reward shaping bonus (+0.1) AND progress bonus 2. form → progress bonus when ≤ 4 3. button → shaping bonus (+0.1) when in sweet spot Each call makes at most ONE change to avoid compounding effects in a single step. 
""" layout = obs.layout # ── Steps: aim for TARGET_STEPS (2) ── if layout.steps > TARGET_STEPS and not self._would_oscillate("decrease_steps"): # Don't reduce below 2 if we've seen distrust (new-user guard) if not (self.distrust_count > 0 and layout.steps <= 2): return self._make("decrease_steps") # ── Form: aim for TARGET_FORM_LENGTH (4) but never below SAFE_FORM_FLOOR (3) ── if layout.form_length > TARGET_FORM_LENGTH and layout.form_length > SAFE_FORM_FLOOR: return self._make("decrease_form") # ── Button size: steer into sweet spot ── bs = layout.button_size if bs < BUTTON_SWEET_LOW: if not self._would_oscillate("increase_button"): return self._make("increase_button") if bs > BUTTON_SWEET_HIGH: # Use set_button_size to jump directly into the sweet zone # rather than slowly decrementing (no decrease_button action exists) return self._make("set_button_size", BUTTON_SWEET_MID) return None # ──────────── Stage 4: Exploration ──────────────────────────── def _explore(self, obs: Observation) -> Optional[Action]: """ Small controlled randomness to discover micro-improvements. Only fires when: - RNG says so (7 % chance) - Last outcome was NOT negative (don't explore under stress) - Layout is already reasonably safe Exploration action: try a random button_size within the sweet spot. This is the safest dimension to explore because it has no drop or distrust rules tied to it. """ if self.last_outcome in ("drop", "distrust"): return None if self._rng.random() < EXPLORE_PROBABILITY: target = self._rng.uniform(BUTTON_SWEET_LOW, BUTTON_SWEET_HIGH) target = round(target, 2) return self._make("set_button_size", target) return None # ──────────── Stage 5: Fallback ─────────────────────────────── def _fallback(self, obs: Observation) -> Action: """ Default action when the layout is already near-optimal. - If noop streak is still safe → noop (preserves a good layout) - Otherwise → a tiny, safe micro-adjustment to break the streak while keeping the layout in the sweet spot. 
""" if self.noop_streak < NOOP_SAFE_LIMIT: return self._make("noop") # Break the noop streak with a harmless move bs = obs.layout.button_size if bs <= BUTTON_SWEET_MID: target = min(BUTTON_SWEET_HIGH, bs + 0.05) else: target = max(BUTTON_SWEET_LOW, bs - 0.05) return self._make("set_button_size", round(target, 2)) # ────────────────────────────────────────────────────────────────────── # Random Agent (Baseline) # ────────────────────────────────────────────────────────────────────── class RandomAgent: """Uniformly random discrete-action agent for baseline comparison.""" _ACTIONS = [ "increase_button", "decrease_form", "increase_steps", "decrease_steps", "reorder_sections", "noop", ] def __init__(self, seed: int = 99) -> None: self._rng = random.Random(seed) def reset(self) -> None: pass def act(self, obs: Observation) -> Action: return Action(type=self._rng.choice(self._ACTIONS), value=None) def update(self, info: dict) -> None: pass # ────────────────────────────────────────────────────────────────────── # Evaluation Harness # ────────────────────────────────────────────────────────────────────── def run_evaluation( agent, n_episodes: int = 200, env_seed: int = 42, verbose: bool = False, ) -> dict: """ Run *n_episodes* in UIEnv with the given agent and collect metrics. 
Returns ------- dict with keys: avg_reward, completion_rate, drop_rate, avg_steps """ env = UIEnv(seed=env_seed) total_reward: float = 0.0 completions: int = 0 drops: int = 0 total_steps: int = 0 for ep in range(n_episodes): obs = env.reset() agent.reset() ep_reward: float = 0.0 done = False while not done: action = agent.act(obs) obs, reward, done, info = env.step(action) agent.update(info) ep_reward += reward total_reward += ep_reward total_steps += info["step_count"] if info["outcome"] == "complete": completions += 1 elif info["outcome"] == "drop": drops += 1 if verbose and ep < 10: print( f" ep={ep:03d} outcome={info['outcome']:<10s} " f"reward={ep_reward:+.3f} steps={info['step_count']}" ) return { "avg_reward": total_reward / n_episodes, "completion_rate": completions / n_episodes, "drop_rate": drops / n_episodes, "avg_steps": total_steps / n_episodes, } def _fmt_pct(v: float) -> str: return f"{v * 100:.1f}%" # ────────────────────────────────────────────────────────────────────── # Main — run benchmark # ────────────────────────────────────────────────────────────────────── if __name__ == "__main__": N_EPISODES = 200 print("=" * 64) print(" UIEnv Heuristic Agent -- Benchmark Suite") print("=" * 64) # -- Heuristic Agent -- print("\n> Running Heuristic Agent ...") h_agent = HeuristicAgent(seed=99) h_metrics = run_evaluation(h_agent, n_episodes=N_EPISODES, verbose=True) # -- Random Baseline -- print("\n> Running Random Agent ...") r_agent = RandomAgent(seed=99) r_metrics = run_evaluation(r_agent, n_episodes=N_EPISODES, verbose=True) # -- Comparison Table -- print("\n" + "-" * 64) print(f" {'Metric':<22s} {'Heuristic':>12s} {'Random':>12s} {'Delta':>12s}") print("-" * 64) for key, label in [ ("avg_reward", "Avg Reward"), ("completion_rate", "Completion Rate"), ("drop_rate", "Drop Rate"), ("avg_steps", "Avg Steps"), ]: h_val = h_metrics[key] r_val = r_metrics[key] delta = h_val - r_val if "rate" in key: h_str = _fmt_pct(h_val) r_str = _fmt_pct(r_val) d_str = 
f"{delta * 100:+.1f}pp" elif "step" in key: h_str = f"{h_val:.1f}" r_str = f"{r_val:.1f}" d_str = f"{delta:+.1f}" else: h_str = f"{h_val:+.4f}" r_str = f"{r_val:+.4f}" d_str = f"{delta:+.4f}" print(f" {label:<22s} {h_str:>12s} {r_str:>12s} {d_str:>12s}") print("-" * 64) # -- Verdict -- lift = h_metrics["avg_reward"] - r_metrics["avg_reward"] if lift > 0.2: verdict = "[PASS] STRONG improvement over random baseline" elif lift > 0.05: verdict = "[WARN] Moderate improvement -- consider tuning" else: verdict = "[FAIL] Marginal -- agent needs rework" print(f"\n Verdict: {verdict}") print(f" Reward lift: {lift:+.4f}\n")