"""
heuristic_agent.py
------------------

A high-performance heuristic agent for the UIEnv environment.

Architecture
============
The agent uses a **multi-stage decision pipeline** that evaluates conditions
in priority order.  The first stage to produce an action wins.

    Stage 1 → Risk Mitigation     (prevent imminent drop)
    Stage 2 → Feedback Adaptation (react to distrust / drop signals)
    Stage 3 → Layout Optimization (converge toward ideal layout)
    Stage 4 → Exploration         (controlled randomness in safe states)
    Stage 5 → Fallback            (safe default when layout is near-optimal)

Internal state (outcome history, action history, noop streak) is used to
make context-aware decisions and avoid oscillation.

Includes a full evaluation harness that benchmarks the heuristic agent
against a random baseline.
"""

from __future__ import annotations

import random
from collections import deque
from typing import Optional

from env import UIEnv, Action, Observation

# ──────────────────────────────────────────────────────────────────────
# Optimal layout targets (derived from reward shaping in env.py)
# ──────────────────────────────────────────────────────────────────────
BUTTON_SWEET_LOW: float = 0.9
BUTTON_SWEET_HIGH: float = 1.3
BUTTON_SWEET_MID: float = 1.1      # centre of the sweet spot, used for direct jumps

TARGET_STEPS: int = 2              # at or below → shaping bonus
TARGET_FORM_LENGTH: int = 4        # at or below → progress bonus
SAFE_FORM_FLOOR: int = 3           # do NOT reduce below this (careful-user trap)

DROP_STEPS_THRESHOLD: int = 3      # steps above this → impatient drop
DROP_FORM_THRESHOLD: int = 5       # form_length above this → impatient drop

EXPLORE_PROBABILITY: float = 0.07  # 7 % exploration rate
NOOP_SAFE_LIMIT: int = 1           # max consecutive noops before forcing action

# Inverse action pairs — used for oscillation detection.
# NOTE(review): "increase_button" → "set_button_size" is only a *conceptual*
# inverse (there is no decrease_button action); the other two are exact.
_INVERSE_ACTIONS: dict[str, str] = {
    "increase_button": "set_button_size",  # conceptual inverse
    "increase_steps": "decrease_steps",
    "decrease_steps": "increase_steps",
}

# ──────────────────────────────────────────────────────────────────────
────────────────────────────────────────────────────────────────────── # Heuristic Agent # ────────────────────────────────────────────────────────────────────── class HeuristicAgent: """ Structured, multi-stage heuristic agent for UIEnv. The agent maintains internal state that is updated every step via `update(info)`, and selects actions via `act(obs)` using a priority-ordered decision pipeline. """ def __init__(self, seed: int = 99) -> None: self._rng = random.Random(seed) # ── internal tracking ── self.last_outcome: Optional[str] = None self.noop_streak: int = 0 self.action_history: deque[str] = deque(maxlen=5) self.distrust_count: int = 0 self.drop_count: int = 0 self.step_number: int = 0 # ──────────────────────── public API ────────────────────────── def reset(self) -> None: """Clear per-episode state at the start of a new episode.""" self.last_outcome = None self.noop_streak = 0 self.action_history.clear() self.distrust_count = 0 self.drop_count = 0 self.step_number = 0 def act(self, obs: Observation) -> Action: """ Select the next action by running the decision pipeline. Stages are evaluated in priority order; the first stage to return a non-None action wins. This guarantees that safety-critical adjustments always take precedence over optimisation moves. 
""" self.step_number += 1 action = ( self._risk_mitigation(obs) or self._adaptation(obs) or self._optimize_layout(obs) or self._explore(obs) or self._fallback(obs) ) # Record for oscillation detection self.action_history.append(action.type) # Track noop streak if action.type == "noop": self.noop_streak += 1 else: self.noop_streak = 0 return action def update(self, info: dict) -> None: """Ingest environment info dict to update internal beliefs.""" outcome = info.get("outcome", "continue") self.last_outcome = outcome if outcome == "distrust": self.distrust_count += 1 elif outcome == "drop": self.drop_count += 1 # ──────────────────────── helpers ───────────────────────────── def _would_oscillate(self, candidate: str) -> bool: """ Return True if `candidate` would undo the most recent action, creating a pointless back-and-forth oscillation. """ if not self.action_history: return False last = self.action_history[-1] inv = _INVERSE_ACTIONS.get(candidate) return last == inv or _INVERSE_ACTIONS.get(last) == candidate @staticmethod def _make(action_type: str, value: float | None = None) -> Action: """Shorthand to construct an Action.""" return Action(type=action_type, value=value) # ──────────── Stage 1: Risk Mitigation ──────────────────────── def _risk_mitigation(self, obs: Observation) -> Optional[Action]: """ Immediately neutralise conditions that lead to user drop. Priority: 1. steps > 3 → decrease_steps (impatient-drop rule) 2. form_length > 5 → decrease_form (impatient-drop rule) Steps are prioritised because the impatient drop threshold for steps (> 3) is stricter and more common than form (> 5). """ layout = obs.layout if layout.steps > DROP_STEPS_THRESHOLD: return self._make("decrease_steps") if layout.form_length > DROP_FORM_THRESHOLD: return self._make("decrease_form") return None # ──────────── Stage 2: Feedback Adaptation ──────────────────── def _adaptation(self, obs: Observation) -> Optional[Action]: """ React to the most recent user outcome signal. 
- 'distrust' means the layout is *too minimal* for this user type: • new users distrust when steps < 2 → increase_steps • careful users distrust when form_length < 3 → stop reducing (since there is no increase_form action, we can only prevent future reduction — but if steps are low, raising them is safe) - 'drop' means the layout was *too heavy* → aggressively reduce """ if self.last_outcome == "distrust": layout = obs.layout # New-user distrust: steps too low if layout.steps < 2 and not self._would_oscillate("increase_steps"): return self._make("increase_steps") # Careful-user distrust is likely about form being too short. # We can't increase form, but we can ensure steps stay reasonable # (having decent steps helps overall progress which offsets the # distrust effect on the next simulation round). if layout.steps < 2: return self._make("increase_steps") # If distrust persists but layout looks safe, do nothing drastic # — let the optimiser handle it. return None if self.last_outcome == "drop": layout = obs.layout # Emergency: cut the most expensive dimension first if layout.steps > 2 and not self._would_oscillate("decrease_steps"): return self._make("decrease_steps") if layout.form_length > SAFE_FORM_FLOOR: return self._make("decrease_form") return None return None # ──────────── Stage 3: Layout Optimization ──────────────────── def _optimize_layout(self, obs: Observation) -> Optional[Action]: """ Gradually move the layout toward the ideal configuration: button_size ∈ [0.9, 1.3] steps ≤ 2 form_length ≤ 4 (but ≥ 3 for safety) Optimisation order (by reward impact): 1. steps → biggest reward shaping bonus (+0.1) AND progress bonus 2. form → progress bonus when ≤ 4 3. button → shaping bonus (+0.1) when in sweet spot Each call makes at most ONE change to avoid compounding effects in a single step. 
""" layout = obs.layout # ── Steps: aim for TARGET_STEPS (2) ── if layout.steps > TARGET_STEPS and not self._would_oscillate("decrease_steps"): # Don't reduce below 2 if we've seen distrust (new-user guard) if not (self.distrust_count > 0 and layout.steps <= 2): return self._make("decrease_steps") # ── Form: aim for TARGET_FORM_LENGTH (4) but never below SAFE_FORM_FLOOR (3) ── if layout.form_length > TARGET_FORM_LENGTH and layout.form_length > SAFE_FORM_FLOOR: return self._make("decrease_form") # ── Button size: steer into sweet spot ── bs = layout.button_size if bs < BUTTON_SWEET_LOW: if not self._would_oscillate("increase_button"): return self._make("increase_button") if bs > BUTTON_SWEET_HIGH: # Use set_button_size to jump directly into the sweet zone # rather than slowly decrementing (no decrease_button action exists) return self._make("set_button_size", BUTTON_SWEET_MID) return None # ──────────── Stage 4: Exploration ──────────────────────────── def _explore(self, obs: Observation) -> Optional[Action]: """ Small controlled randomness to discover micro-improvements. Only fires when: - RNG says so (7 % chance) - Last outcome was NOT negative (don't explore under stress) - Layout is already reasonably safe Exploration action: try a random button_size within the sweet spot. This is the safest dimension to explore because it has no drop or distrust rules tied to it. """ if self.last_outcome in ("drop", "distrust"): return None if self._rng.random() < EXPLORE_PROBABILITY: target = self._rng.uniform(BUTTON_SWEET_LOW, BUTTON_SWEET_HIGH) target = round(target, 2) return self._make("set_button_size", target) return None # ──────────── Stage 5: Fallback ─────────────────────────────── def _fallback(self, obs: Observation) -> Action: """ Default action when the layout is already near-optimal. - If noop streak is still safe → noop (preserves a good layout) - Otherwise → a tiny, safe micro-adjustment to break the streak while keeping the layout in the sweet spot. 
""" if self.noop_streak < NOOP_SAFE_LIMIT: return self._make("noop") # Break the noop streak with a harmless move bs = obs.layout.button_size if bs <= BUTTON_SWEET_MID: target = min(BUTTON_SWEET_HIGH, bs + 0.05) else: target = max(BUTTON_SWEET_LOW, bs - 0.05) return self._make("set_button_size", round(target, 2)) # ────────────────────────────────────────────────────────────────────── # Random Agent (Baseline) # ────────────────────────────────────────────────────────────────────── class RandomAgent: """Uniformly random discrete-action agent for baseline comparison.""" _ACTIONS = [ "increase_button", "decrease_form", "increase_steps", "decrease_steps", "reorder_sections", "noop", ] def __init__(self, seed: int = 99) -> None: self._rng = random.Random(seed) def reset(self) -> None: pass def act(self, obs: Observation) -> Action: return Action(type=self._rng.choice(self._ACTIONS), value=None) def update(self, info: dict) -> None: pass # ────────────────────────────────────────────────────────────────────── # Evaluation Harness # ────────────────────────────────────────────────────────────────────── def run_evaluation( agent, n_episodes: int = 200, env_seed: int = 42, verbose: bool = False, ) -> dict: """ Run *n_episodes* in UIEnv with the given agent and collect metrics. 
Returns ------- dict with keys: avg_reward, completion_rate, drop_rate, avg_steps """ env = UIEnv(seed=env_seed) total_reward: float = 0.0 completions: int = 0 drops: int = 0 total_steps: int = 0 for ep in range(n_episodes): obs = env.reset() agent.reset() ep_reward: float = 0.0 done = False while not done: action = agent.act(obs) obs, reward, done, info = env.step(action) agent.update(info) ep_reward += reward total_reward += ep_reward total_steps += info["step_count"] if info["outcome"] == "complete": completions += 1 elif info["outcome"] == "drop": drops += 1 if verbose and ep < 10: print( f" ep={ep:03d} outcome={info['outcome']:<10s} " f"reward={ep_reward:+.3f} steps={info['step_count']}" ) return { "avg_reward": total_reward / n_episodes, "completion_rate": completions / n_episodes, "drop_rate": drops / n_episodes, "avg_steps": total_steps / n_episodes, } def _fmt_pct(v: float) -> str: return f"{v * 100:.1f}%" # ────────────────────────────────────────────────────────────────────── # Main — run benchmark # ────────────────────────────────────────────────────────────────────── if __name__ == "__main__": N_EPISODES = 200 print("=" * 64) print(" UIEnv Heuristic Agent -- Benchmark Suite") print("=" * 64) # -- Heuristic Agent -- print("\n> Running Heuristic Agent ...") h_agent = HeuristicAgent(seed=99) h_metrics = run_evaluation(h_agent, n_episodes=N_EPISODES, verbose=True) # -- Random Baseline -- print("\n> Running Random Agent ...") r_agent = RandomAgent(seed=99) r_metrics = run_evaluation(r_agent, n_episodes=N_EPISODES, verbose=True) # -- Comparison Table -- print("\n" + "-" * 64) print(f" {'Metric':<22s} {'Heuristic':>12s} {'Random':>12s} {'Delta':>12s}") print("-" * 64) for key, label in [ ("avg_reward", "Avg Reward"), ("completion_rate", "Completion Rate"), ("drop_rate", "Drop Rate"), ("avg_steps", "Avg Steps"), ]: h_val = h_metrics[key] r_val = r_metrics[key] delta = h_val - r_val if "rate" in key: h_str = _fmt_pct(h_val) r_str = _fmt_pct(r_val) d_str = 
f"{delta * 100:+.1f}pp" elif "step" in key: h_str = f"{h_val:.1f}" r_str = f"{r_val:.1f}" d_str = f"{delta:+.1f}" else: h_str = f"{h_val:+.4f}" r_str = f"{r_val:+.4f}" d_str = f"{delta:+.4f}" print(f" {label:<22s} {h_str:>12s} {r_str:>12s} {d_str:>12s}") print("-" * 64) # -- Verdict -- lift = h_metrics["avg_reward"] - r_metrics["avg_reward"] if lift > 0.2: verdict = "[PASS] STRONG improvement over random baseline" elif lift > 0.05: verdict = "[WARN] Moderate improvement -- consider tuning" else: verdict = "[FAIL] Marginal -- agent needs rework" print(f"\n Verdict: {verdict}") print(f" Reward lift: {lift:+.4f}\n")