"""
heuristic_agent.py
------------------

A high-performance heuristic agent for the UIEnv environment.

Architecture
============
The agent uses a **multi-stage decision pipeline** that evaluates conditions
in priority order. The first stage to produce an action wins.

    Stage 1 -- Risk Mitigation      (prevent imminent drop)
    Stage 2 -- Feedback Adaptation  (react to distrust / drop signals)
    Stage 3 -- Layout Optimization  (converge toward ideal layout)
    Stage 4 -- Exploration          (controlled randomness in safe states)
    Stage 5 -- Fallback             (safe default when layout is near-optimal)

Internal state (outcome history, action history, noop streak) is used to
make context-aware decisions and avoid oscillation.

Includes a full evaluation harness that benchmarks the heuristic agent
against a random baseline.
"""
| from __future__ import annotations | |
| import random | |
| from collections import deque | |
| from typing import Optional | |
| from env import UIEnv, Action, Observation | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # Optimal layout targets (derived from reward shaping in env.py) | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| BUTTON_SWEET_LOW: float = 0.9 | |
| BUTTON_SWEET_HIGH: float = 1.3 | |
| BUTTON_SWEET_MID: float = 1.1 # centre of the sweet spot for jumps | |
| TARGET_STEPS: int = 2 # at or below β shaping bonus | |
| TARGET_FORM_LENGTH: int = 4 # at or below β progress bonus | |
| SAFE_FORM_FLOOR: int = 3 # do NOT reduce below this (careful-user trap) | |
| DROP_STEPS_THRESHOLD: int = 3 # steps above this β impatient drop | |
| DROP_FORM_THRESHOLD: int = 5 # form_length above this β impatient drop | |
| EXPLORE_PROBABILITY: float = 0.07 # 7 % exploration rate | |
| NOOP_SAFE_LIMIT: int = 1 # max consecutive noops before forcing action | |
| # Inverse action pairs β used for oscillation detection | |
| _INVERSE_ACTIONS: dict[str, str] = { | |
| "increase_button": "set_button_size", # conceptual inverse | |
| "increase_steps": "decrease_steps", | |
| "decrease_steps": "increase_steps", | |
| } | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # Heuristic Agent | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
# ----------------------------------------------------------------------
# Heuristic Agent
# ----------------------------------------------------------------------
class HeuristicAgent:
    """
    Structured, multi-stage heuristic agent for UIEnv.

    The agent maintains internal state that is updated every step via
    `update(info)`, and selects actions via `act(obs)` using a
    priority-ordered decision pipeline.
    """

    def __init__(self, seed: int = 99) -> None:
        self._rng = random.Random(seed)
        # -- internal tracking --
        self.last_outcome: Optional[str] = None            # most recent outcome signal
        self.noop_streak: int = 0                          # consecutive noop actions emitted
        self.action_history: deque[str] = deque(maxlen=5)  # recent action types
        self.distrust_count: int = 0                       # distrust outcomes this episode
        self.drop_count: int = 0                           # drop outcomes this episode
        self.step_number: int = 0                          # agent-side step counter

    # ------------------------ public API --------------------------
    def reset(self) -> None:
        """Clear per-episode state at the start of a new episode."""
        self.last_outcome = None
        self.noop_streak = 0
        self.action_history.clear()
        self.distrust_count = 0
        self.drop_count = 0
        self.step_number = 0

    def act(self, obs: Observation) -> Action:
        """
        Select the next action by running the decision pipeline.

        Stages are evaluated in priority order; the first stage to return
        a non-None action wins. This guarantees that safety-critical
        adjustments always take precedence over optimisation moves.
        """
        self.step_number += 1
        action = (
            self._risk_mitigation(obs)
            or self._adaptation(obs)
            or self._optimize_layout(obs)
            or self._explore(obs)
            or self._fallback(obs)
        )
        # Record for oscillation detection
        self.action_history.append(action.type)
        # Track noop streak
        if action.type == "noop":
            self.noop_streak += 1
        else:
            self.noop_streak = 0
        return action

    def update(self, info: dict) -> None:
        """Ingest environment info dict to update internal beliefs."""
        outcome = info.get("outcome", "continue")
        self.last_outcome = outcome
        if outcome == "distrust":
            self.distrust_count += 1
        elif outcome == "drop":
            self.drop_count += 1

    # ------------------------- helpers ----------------------------
    def _would_oscillate(self, candidate: str) -> bool:
        """
        Return True if `candidate` would undo the most recent action,
        creating a pointless back-and-forth oscillation.
        """
        if not self.action_history:
            return False
        last = self.action_history[-1]
        inv = _INVERSE_ACTIONS.get(candidate)
        return last == inv or _INVERSE_ACTIONS.get(last) == candidate

    def _make(self, action_type: str, value: float | None = None) -> Action:
        """
        Shorthand to construct an Action.

        BUG FIX: the original definition omitted ``self``, so every
        ``self._make(...)`` call bound the instance to ``action_type``
        and the intended action-type string to ``value``, producing
        malformed actions on every call.
        """
        return Action(type=action_type, value=value)

    # ------------ Stage 1: Risk Mitigation ------------------------
    def _risk_mitigation(self, obs: Observation) -> Optional[Action]:
        """
        Immediately neutralise conditions that lead to user drop.

        Priority:
            1. steps > 3       -> decrease_steps (impatient-drop rule)
            2. form_length > 5 -> decrease_form  (impatient-drop rule)

        Steps are prioritised because the impatient drop threshold for
        steps (> 3) is stricter and more common than form (> 5).
        """
        layout = obs.layout
        if layout.steps > DROP_STEPS_THRESHOLD:
            return self._make("decrease_steps")
        if layout.form_length > DROP_FORM_THRESHOLD:
            return self._make("decrease_form")
        return None

    # ------------ Stage 2: Feedback Adaptation --------------------
    def _adaptation(self, obs: Observation) -> Optional[Action]:
        """
        React to the most recent user outcome signal.

        - 'distrust' means the layout is *too minimal* for this user type:
            * new users distrust when steps < 2      -> increase_steps
            * careful users distrust when form < 3   -> stop reducing
              (since there is no increase_form action, we can only prevent
              future reduction -- but if steps are low, raising them is safe)
        - 'drop' means the layout was *too heavy* -> aggressively reduce
        """
        if self.last_outcome == "distrust":
            layout = obs.layout
            # New-user distrust: steps too low
            if layout.steps < 2 and not self._would_oscillate("increase_steps"):
                return self._make("increase_steps")
            # Careful-user distrust is likely about form being too short.
            # We can't increase form, but we can ensure steps stay reasonable
            # (having decent steps helps overall progress which offsets the
            # distrust effect on the next simulation round).
            if layout.steps < 2:
                return self._make("increase_steps")
            # If distrust persists but layout looks safe, do nothing drastic
            # -- let the optimiser handle it.
            return None
        if self.last_outcome == "drop":
            layout = obs.layout
            # Emergency: cut the most expensive dimension first
            if layout.steps > 2 and not self._would_oscillate("decrease_steps"):
                return self._make("decrease_steps")
            if layout.form_length > SAFE_FORM_FLOOR:
                return self._make("decrease_form")
            return None
        return None

    # ------------ Stage 3: Layout Optimization --------------------
    def _optimize_layout(self, obs: Observation) -> Optional[Action]:
        """
        Gradually move the layout toward the ideal configuration:

            button_size in [0.9, 1.3]
            steps       <= 2
            form_length <= 4 (but >= 3 for safety)

        Optimisation order (by reward impact):
            1. steps  -> biggest reward shaping bonus (+0.1) AND progress bonus
            2. form   -> progress bonus when <= 4
            3. button -> shaping bonus (+0.1) when in sweet spot

        Each call makes at most ONE change to avoid compounding effects
        in a single step.
        """
        layout = obs.layout
        # -- Steps: aim for TARGET_STEPS (2) --
        if layout.steps > TARGET_STEPS and not self._would_oscillate("decrease_steps"):
            # Don't reduce below 2 if we've seen distrust (new-user guard).
            # NOTE(review): within this branch layout.steps > 2, so the
            # inner condition is always True -- a single decrement can never
            # go below 2 here. Guard kept as a defensive invariant.
            if not (self.distrust_count > 0 and layout.steps <= 2):
                return self._make("decrease_steps")
        # -- Form: aim for TARGET_FORM_LENGTH (4) but never below SAFE_FORM_FLOOR (3) --
        if layout.form_length > TARGET_FORM_LENGTH and layout.form_length > SAFE_FORM_FLOOR:
            return self._make("decrease_form")
        # -- Button size: steer into sweet spot --
        bs = layout.button_size
        if bs < BUTTON_SWEET_LOW:
            if not self._would_oscillate("increase_button"):
                return self._make("increase_button")
        if bs > BUTTON_SWEET_HIGH:
            # Use set_button_size to jump directly into the sweet zone
            # rather than slowly decrementing (no decrease_button action exists)
            return self._make("set_button_size", BUTTON_SWEET_MID)
        return None

    # ------------ Stage 4: Exploration ----------------------------
    def _explore(self, obs: Observation) -> Optional[Action]:
        """
        Small controlled randomness to discover micro-improvements.

        Only fires when:
            - RNG says so (7 % chance)
            - Last outcome was NOT negative (don't explore under stress)
            - Layout is already reasonably safe

        Exploration action: try a random button_size within the sweet spot.
        This is the safest dimension to explore because it has no drop or
        distrust rules tied to it.
        """
        if self.last_outcome in ("drop", "distrust"):
            return None
        if self._rng.random() < EXPLORE_PROBABILITY:
            target = self._rng.uniform(BUTTON_SWEET_LOW, BUTTON_SWEET_HIGH)
            target = round(target, 2)
            return self._make("set_button_size", target)
        return None

    # ------------ Stage 5: Fallback -------------------------------
    def _fallback(self, obs: Observation) -> Action:
        """
        Default action when the layout is already near-optimal.

        - If noop streak is still safe -> noop (preserves a good layout)
        - Otherwise -> a tiny, safe micro-adjustment to break the streak
          while keeping the layout in the sweet spot.
        """
        if self.noop_streak < NOOP_SAFE_LIMIT:
            return self._make("noop")
        # Break the noop streak with a harmless move
        bs = obs.layout.button_size
        if bs <= BUTTON_SWEET_MID:
            target = min(BUTTON_SWEET_HIGH, bs + 0.05)
        else:
            target = max(BUTTON_SWEET_LOW, bs - 0.05)
        return self._make("set_button_size", round(target, 2))
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # Random Agent (Baseline) | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
# ----------------------------------------------------------------------
# Random Agent (Baseline)
# ----------------------------------------------------------------------
class RandomAgent:
    """Uniformly random discrete-action agent for baseline comparison."""

    # Discrete action types sampled with equal probability.
    _ACTIONS = [
        "increase_button",
        "decrease_form",
        "increase_steps",
        "decrease_steps",
        "reorder_sections",
        "noop",
    ]

    def __init__(self, seed: int = 99) -> None:
        self._rng = random.Random(seed)

    def reset(self) -> None:
        """No per-episode state to clear."""
        pass

    def act(self, obs: Observation) -> Action:
        """Ignore the observation and pick an action type uniformly."""
        chosen = self._rng.choice(self._ACTIONS)
        return Action(type=chosen, value=None)

    def update(self, info: dict) -> None:
        """Environment feedback is deliberately ignored."""
        pass
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # Evaluation Harness | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
def run_evaluation(
    agent,
    n_episodes: int = 200,
    env_seed: int = 42,
    verbose: bool = False,
) -> dict:
    """
    Run *n_episodes* in UIEnv with the given agent and collect metrics.

    Returns
    -------
    dict with keys:
        avg_reward, completion_rate, drop_rate, avg_steps
    """
    env = UIEnv(seed=env_seed)
    episode_rewards: list[float] = []
    episode_steps: list[int] = []
    outcome_counts = {"complete": 0, "drop": 0}

    for ep in range(n_episodes):
        obs = env.reset()
        agent.reset()
        ep_reward = 0.0
        while True:
            obs, reward, done, info = env.step(agent.act(obs))
            agent.update(info)
            ep_reward += reward
            if done:
                break
        # `info` holds the terminal step's outcome and step count.
        episode_rewards.append(ep_reward)
        episode_steps.append(info["step_count"])
        if info["outcome"] in outcome_counts:
            outcome_counts[info["outcome"]] += 1
        if verbose and ep < 10:
            print(
                f" ep={ep:03d} outcome={info['outcome']:<10s} "
                f"reward={ep_reward:+.3f} steps={info['step_count']}"
            )

    return {
        "avg_reward": sum(episode_rewards) / n_episodes,
        "completion_rate": outcome_counts["complete"] / n_episodes,
        "drop_rate": outcome_counts["drop"] / n_episodes,
        "avg_steps": sum(episode_steps) / n_episodes,
    }
| def _fmt_pct(v: float) -> str: | |
| return f"{v * 100:.1f}%" | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # Main β run benchmark | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
# ----------------------------------------------------------------------
# Main -- run benchmark
# ----------------------------------------------------------------------
if __name__ == "__main__":
    N_EPISODES = 200

    banner = "=" * 64
    print(banner)
    print(" UIEnv Heuristic Agent -- Benchmark Suite")
    print(banner)

    # -- Heuristic Agent --
    print("\n> Running Heuristic Agent ...")
    h_metrics = run_evaluation(
        HeuristicAgent(seed=99), n_episodes=N_EPISODES, verbose=True
    )

    # -- Random Baseline --
    print("\n> Running Random Agent ...")
    r_metrics = run_evaluation(
        RandomAgent(seed=99), n_episodes=N_EPISODES, verbose=True
    )

    # -- Comparison Table --
    rule = "-" * 64
    print("\n" + rule)
    print(f" {'Metric':<22s} {'Heuristic':>12s} {'Random':>12s} {'Delta':>12s}")
    print(rule)

    table_rows = (
        ("avg_reward", "Avg Reward"),
        ("completion_rate", "Completion Rate"),
        ("drop_rate", "Drop Rate"),
        ("avg_steps", "Avg Steps"),
    )
    for key, label in table_rows:
        h_val, r_val = h_metrics[key], r_metrics[key]
        delta = h_val - r_val
        # Pick a formatter per metric family: rates as percentages
        # (delta in percentage points), steps with one decimal,
        # rewards with signed four-decimal precision.
        if "rate" in key:
            cells = (_fmt_pct(h_val), _fmt_pct(r_val), f"{delta * 100:+.1f}pp")
        elif "step" in key:
            cells = (f"{h_val:.1f}", f"{r_val:.1f}", f"{delta:+.1f}")
        else:
            cells = (f"{h_val:+.4f}", f"{r_val:+.4f}", f"{delta:+.4f}")
        h_str, r_str, d_str = cells
        print(f" {label:<22s} {h_str:>12s} {r_str:>12s} {d_str:>12s}")
    print(rule)

    # -- Verdict --
    lift = h_metrics["avg_reward"] - r_metrics["avg_reward"]
    if lift > 0.2:
        verdict = "[PASS] STRONG improvement over random baseline"
    elif lift > 0.05:
        verdict = "[WARN] Moderate improvement -- consider tuning"
    else:
        verdict = "[FAIL] Marginal -- agent needs rework"
    print(f"\n Verdict: {verdict}")
    print(f" Reward lift: {lift:+.4f}\n")