# UI-layout-optimizer / heuristic_agent.py
# Author: ChaitanyaRasane
# Commit: deploy: clean initial commit (f582a68)
"""
heuristic_agent.py
------------------
A high-performance heuristic agent for the UIEnv environment.
Architecture
============
The agent uses a **multi-stage decision pipeline** that evaluates conditions
in priority order. The first stage to produce an action wins.
Stage 1 → Risk Mitigation (prevent imminent drop)
Stage 2 → Feedback Adaptation (react to distrust / drop signals)
Stage 3 → Layout Optimization (converge toward ideal layout)
Stage 4 → Exploration (controlled randomness in safe states)
Stage 5 → Fallback (safe default when layout is near-optimal)
Internal state (outcome history, action history, noop streak) is used to
make context-aware decisions and avoid oscillation.
Includes a full evaluation harness that benchmarks the heuristic agent
against a random baseline.
"""
from __future__ import annotations
import random
from collections import deque
from typing import Optional
from env import UIEnv, Action, Observation
# ──────────────────────────────────────────────────────────────────────
# Optimal layout targets (derived from reward shaping in env.py)
# ──────────────────────────────────────────────────────────────────────
BUTTON_SWEET_LOW: float = 0.9     # lower edge of the button-size sweet spot
BUTTON_SWEET_HIGH: float = 1.3    # upper edge of the button-size sweet spot
BUTTON_SWEET_MID: float = 1.1    # centre of the sweet spot for jumps
TARGET_STEPS: int = 2            # at or below -> shaping bonus
TARGET_FORM_LENGTH: int = 4      # at or below -> progress bonus
SAFE_FORM_FLOOR: int = 3         # do NOT reduce below this (careful-user trap)
DROP_STEPS_THRESHOLD: int = 3    # steps above this -> impatient drop
DROP_FORM_THRESHOLD: int = 5     # form_length above this -> impatient drop
EXPLORE_PROBABILITY: float = 0.07  # 7% exploration rate
NOOP_SAFE_LIMIT: int = 1         # max consecutive noops before forcing action
# Inverse action pairs — used for oscillation detection.
# NOTE(review): "set_button_size" is treated as the conceptual inverse of
# "increase_button" (a direct size jump can undo an increment); the action
# set has no exact decrease_button counterpart — confirm against env.py.
_INVERSE_ACTIONS: dict[str, str] = {
    "increase_button": "set_button_size",  # conceptual inverse
    "increase_steps": "decrease_steps",
    "decrease_steps": "increase_steps",
}
# ──────────────────────────────────────────────────────────────────────
# Heuristic Agent
# ──────────────────────────────────────────────────────────────────────
class HeuristicAgent:
    """
    Structured, multi-stage heuristic agent for UIEnv.

    The agent maintains internal state that is updated every step via
    `update(info)`, and selects actions via `act(obs)` using a
    priority-ordered decision pipeline; the first stage to produce an
    action wins:

        1. Risk mitigation     - neutralise imminent-drop conditions
        2. Feedback adaptation - react to 'distrust' / 'drop' outcomes
        3. Layout optimisation - converge toward the ideal layout
        4. Exploration         - small controlled randomness in safe states
        5. Fallback            - noop, or a micro-move to break noop streaks
    """

    def __init__(self, seed: int = 99) -> None:
        """Create an agent with a private, seeded RNG for reproducibility."""
        self._rng = random.Random(seed)
        # ── internal tracking ──
        self.last_outcome: Optional[str] = None             # latest env outcome signal
        self.noop_streak: int = 0                           # consecutive noops emitted
        self.action_history: deque[str] = deque(maxlen=5)   # recent action types (oscillation check)
        self.distrust_count: int = 0                        # cumulative distrust outcomes
        self.drop_count: int = 0                            # cumulative drop outcomes
        self.step_number: int = 0                           # actions taken this episode

    # ──────────────────────── public API ──────────────────────────
    def reset(self) -> None:
        """Clear per-episode state at the start of a new episode."""
        self.last_outcome = None
        self.noop_streak = 0
        self.action_history.clear()
        self.distrust_count = 0
        self.drop_count = 0
        self.step_number = 0

    def act(self, obs: Observation) -> Action:
        """
        Select the next action by running the decision pipeline.

        Stages are evaluated in priority order; the first stage to return
        a non-None action wins. `_fallback` always returns an Action, so
        the chain is guaranteed to terminate with a concrete choice.
        """
        self.step_number += 1
        action = (
            self._risk_mitigation(obs)
            or self._adaptation(obs)
            or self._optimize_layout(obs)
            or self._explore(obs)
            or self._fallback(obs)
        )
        # Record for oscillation detection.
        self.action_history.append(action.type)
        # Track how many noops we have emitted in a row.
        if action.type == "noop":
            self.noop_streak += 1
        else:
            self.noop_streak = 0
        return action

    def update(self, info: dict) -> None:
        """Ingest the environment info dict to update internal beliefs.

        Missing 'outcome' is treated as 'continue' (neutral signal).
        """
        outcome = info.get("outcome", "continue")
        self.last_outcome = outcome
        if outcome == "distrust":
            self.distrust_count += 1
        elif outcome == "drop":
            self.drop_count += 1

    # ──────────────────────── helpers ─────────────────────────────
    def _would_oscillate(self, candidate: str) -> bool:
        """
        Return True if `candidate` would undo the most recent action,
        creating a pointless back-and-forth oscillation.
        """
        if not self.action_history:
            return False
        last = self.action_history[-1]
        inv = _INVERSE_ACTIONS.get(candidate)
        # Oscillation either way: candidate undoes last, or last undoes candidate
        # (the inverse map is not symmetric, so both directions are checked).
        return last == inv or _INVERSE_ACTIONS.get(last) == candidate

    @staticmethod
    def _make(action_type: str, value: float | None = None) -> Action:
        """Shorthand to construct an Action."""
        return Action(type=action_type, value=value)

    # ──────────── Stage 1: Risk Mitigation ────────────────────────
    def _risk_mitigation(self, obs: Observation) -> Optional[Action]:
        """
        Immediately neutralise conditions that lead to user drop.

        Priority:
            1. steps > 3       -> decrease_steps (impatient-drop rule)
            2. form_length > 5 -> decrease_form  (impatient-drop rule)

        Steps are prioritised because the impatient drop threshold for
        steps (> 3) is stricter and more common than form (> 5).
        Returns None when neither drop rule is in danger of firing.
        """
        layout = obs.layout
        if layout.steps > DROP_STEPS_THRESHOLD:
            return self._make("decrease_steps")
        if layout.form_length > DROP_FORM_THRESHOLD:
            return self._make("decrease_form")
        return None

    # ──────────── Stage 2: Feedback Adaptation ────────────────────
    def _adaptation(self, obs: Observation) -> Optional[Action]:
        """
        React to the most recent user outcome signal.

        - 'distrust' means the layout is *too minimal* for this user type:
            - new users distrust when steps < 2 -> increase_steps
            - careful users distrust when form_length < 3 -> stop reducing
              (there is no increase_form action, so future reduction can
              only be prevented; raising low steps is the safe response)
        - 'drop' means the layout was *too heavy* -> aggressively reduce
        """
        if self.last_outcome == "distrust":
            layout = obs.layout
            # Steps too low after a distrust signal: always raise them.
            # The oscillation guard is deliberately skipped here — the
            # original code fell through to an unguarded check anyway,
            # so recovery from distrust takes precedence over anti-oscillation.
            if layout.steps < 2:
                return self._make("increase_steps")
            # Distrust with a safe-looking layout: do nothing drastic —
            # let the optimiser handle it.
            return None
        if self.last_outcome == "drop":
            layout = obs.layout
            # Emergency: cut the most expensive dimension first.
            if layout.steps > 2 and not self._would_oscillate("decrease_steps"):
                return self._make("decrease_steps")
            if layout.form_length > SAFE_FORM_FLOOR:
                return self._make("decrease_form")
            return None
        return None

    # ──────────── Stage 3: Layout Optimization ────────────────────
    def _optimize_layout(self, obs: Observation) -> Optional[Action]:
        """
        Gradually move the layout toward the ideal configuration:
            button_size in [0.9, 1.3]
            steps       <= 2
            form_length <= 4 (but >= 3 for safety)

        Optimisation order (by reward impact):
            1. steps  -> biggest reward shaping bonus AND progress bonus
            2. form   -> progress bonus when <= 4
            3. button -> shaping bonus when in sweet spot

        Each call makes at most ONE change to avoid compounding effects
        in a single step.
        """
        layout = obs.layout
        # ── Steps: aim for TARGET_STEPS (2). Decrementing from steps > 2
        # can never land below 2, so this is safe even after a distrust
        # signal (the original "distrust guard" here compared steps <= 2
        # inside a steps > 2 branch and could never fire — removed). ──
        if layout.steps > TARGET_STEPS and not self._would_oscillate("decrease_steps"):
            return self._make("decrease_steps")
        # ── Form: aim for TARGET_FORM_LENGTH (4), never below SAFE_FORM_FLOOR (3) ──
        if layout.form_length > TARGET_FORM_LENGTH and layout.form_length > SAFE_FORM_FLOOR:
            return self._make("decrease_form")
        # ── Button size: steer into the sweet spot ──
        bs = layout.button_size
        if bs < BUTTON_SWEET_LOW:
            if not self._would_oscillate("increase_button"):
                return self._make("increase_button")
        if bs > BUTTON_SWEET_HIGH:
            # Jump directly into the sweet zone via set_button_size rather
            # than slowly decrementing (no decrease_button action exists).
            return self._make("set_button_size", BUTTON_SWEET_MID)
        return None

    # ──────────── Stage 4: Exploration ────────────────────────────
    def _explore(self, obs: Observation) -> Optional[Action]:
        """
        Small controlled randomness to discover micro-improvements.

        Only fires when:
            - RNG says so (EXPLORE_PROBABILITY chance)
            - Last outcome was NOT negative (don't explore under stress)

        Exploration action: a random button_size within the sweet spot.
        This is the safest dimension to explore because it has no drop or
        distrust rules tied to it.
        """
        if self.last_outcome in ("drop", "distrust"):
            return None
        if self._rng.random() < EXPLORE_PROBABILITY:
            target = self._rng.uniform(BUTTON_SWEET_LOW, BUTTON_SWEET_HIGH)
            target = round(target, 2)
            return self._make("set_button_size", target)
        return None

    # ──────────── Stage 5: Fallback ───────────────────────────────
    def _fallback(self, obs: Observation) -> Action:
        """
        Default action when the layout is already near-optimal.

        - If the noop streak is still safe -> noop (preserves a good layout)
        - Otherwise -> a tiny, safe micro-adjustment that breaks the streak
          while keeping button_size inside the sweet spot.

        Always returns an Action (never None) so the pipeline terminates.
        """
        if self.noop_streak < NOOP_SAFE_LIMIT:
            return self._make("noop")
        # Break the noop streak with a harmless move: nudge button_size
        # toward the opposite half of the sweet spot, clamped to its edges.
        bs = obs.layout.button_size
        if bs <= BUTTON_SWEET_MID:
            target = min(BUTTON_SWEET_HIGH, bs + 0.05)
        else:
            target = max(BUTTON_SWEET_LOW, bs - 0.05)
        return self._make("set_button_size", round(target, 2))
# ──────────────────────────────────────────────────────────────────────
# Random Agent (Baseline)
# ──────────────────────────────────────────────────────────────────────
class RandomAgent:
    """Baseline agent: emits a uniformly random discrete action each step."""

    # Discrete action vocabulary sampled at every step.
    _ACTIONS = [
        "increase_button",
        "decrease_form",
        "increase_steps",
        "decrease_steps",
        "reorder_sections",
        "noop",
    ]

    def __init__(self, seed: int = 99) -> None:
        # Private RNG keeps runs reproducible and isolated from global state.
        self._rng = random.Random(seed)

    def reset(self) -> None:
        # Stateless agent: nothing to clear between episodes.
        pass

    def act(self, obs: Observation) -> Action:
        # The observation is ignored by design — the baseline is unconditional.
        chosen = self._rng.choice(self._ACTIONS)
        return Action(type=chosen, value=None)

    def update(self, info: dict) -> None:
        # No beliefs to maintain.
        pass
# ──────────────────────────────────────────────────────────────────────
# Evaluation Harness
# ──────────────────────────────────────────────────────────────────────
def run_evaluation(
    agent,
    n_episodes: int = 200,
    env_seed: int = 42,
    verbose: bool = False,
) -> dict:
    """
    Run *n_episodes* in UIEnv with the given agent and collect metrics.

    Parameters
    ----------
    agent:
        Any object exposing ``reset()``, ``act(obs)`` and ``update(info)``.
    n_episodes:
        Number of episodes to run; must be positive.
    env_seed:
        Seed forwarded to ``UIEnv`` for reproducible episodes.
    verbose:
        If True, print a one-line summary for the first 10 episodes.

    Returns
    -------
    dict with keys:
        avg_reward, completion_rate, drop_rate, avg_steps

    Raises
    ------
    ValueError
        If ``n_episodes`` is not positive (previously this surfaced as a
        ZeroDivisionError only after all work was skipped).
    """
    if n_episodes <= 0:
        raise ValueError(f"n_episodes must be positive, got {n_episodes}")
    env = UIEnv(seed=env_seed)
    total_reward: float = 0.0
    completions: int = 0
    drops: int = 0
    total_steps: int = 0
    for ep in range(n_episodes):
        obs = env.reset()
        agent.reset()
        ep_reward: float = 0.0
        done = False
        # NOTE(review): assumes env.step() is called at least once per
        # episode (done starts False), so `info` is always bound below.
        while not done:
            action = agent.act(obs)
            obs, reward, done, info = env.step(action)
            agent.update(info)
            ep_reward += reward
        total_reward += ep_reward
        total_steps += info["step_count"]
        if info["outcome"] == "complete":
            completions += 1
        elif info["outcome"] == "drop":
            drops += 1
        if verbose and ep < 10:
            print(
                f"  ep={ep:03d}  outcome={info['outcome']:<10s} "
                f"reward={ep_reward:+.3f}  steps={info['step_count']}"
            )
    return {
        "avg_reward": total_reward / n_episodes,
        "completion_rate": completions / n_episodes,
        "drop_rate": drops / n_episodes,
        "avg_steps": total_steps / n_episodes,
    }
def _fmt_pct(v: float) -> str:
return f"{v * 100:.1f}%"
# ──────────────────────────────────────────────────────────────────────
# Main β€” run benchmark
# ──────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    N_EPISODES = 200

    # ── header banner ──
    banner = "=" * 64
    print(banner)
    print(" UIEnv Heuristic Agent -- Benchmark Suite")
    print(banner)

    # ── run both agents under identical seeds for a fair comparison ──
    print("\n> Running Heuristic Agent ...")
    h_metrics = run_evaluation(
        HeuristicAgent(seed=99), n_episodes=N_EPISODES, verbose=True
    )

    print("\n> Running Random Agent ...")
    r_metrics = run_evaluation(
        RandomAgent(seed=99), n_episodes=N_EPISODES, verbose=True
    )

    # ── comparison table ──
    rule = "-" * 64
    print("\n" + rule)
    print(f" {'Metric':<22s} {'Heuristic':>12s} {'Random':>12s} {'Delta':>12s}")
    print(rule)
    rows = [
        ("avg_reward", "Avg Reward"),
        ("completion_rate", "Completion Rate"),
        ("drop_rate", "Drop Rate"),
        ("avg_steps", "Avg Steps"),
    ]
    for metric_key, label in rows:
        h_val = h_metrics[metric_key]
        r_val = r_metrics[metric_key]
        delta = h_val - r_val
        # Pick a formatter per metric family: rates as percentages,
        # step counts with one decimal, rewards signed to 4 decimals.
        if "rate" in metric_key:
            cells = (_fmt_pct(h_val), _fmt_pct(r_val), f"{delta * 100:+.1f}pp")
        elif "step" in metric_key:
            cells = (f"{h_val:.1f}", f"{r_val:.1f}", f"{delta:+.1f}")
        else:
            cells = (f"{h_val:+.4f}", f"{r_val:+.4f}", f"{delta:+.4f}")
        print(f" {label:<22s} {cells[0]:>12s} {cells[1]:>12s} {cells[2]:>12s}")
    print(rule)

    # ── verdict based on average-reward lift over the baseline ──
    lift = h_metrics["avg_reward"] - r_metrics["avg_reward"]
    if lift > 0.2:
        verdict = "[PASS] STRONG improvement over random baseline"
    elif lift > 0.05:
        verdict = "[WARN] Moderate improvement -- consider tuning"
    else:
        verdict = "[FAIL] Marginal -- agent needs rework"
    print(f"\n Verdict: {verdict}")
    print(f" Reward lift: {lift:+.4f}\n")