Spaces:

Kolaps27
/

UI-layout-optimizer

Sleeping

UI-layout-optimizer / heuristic_agent.py

ChaitanyaRasane

deploy: clean initial commit

f582a68 10 days ago

17.8 kB

	"""
	heuristic_agent.py
	------------------
	A high-performance heuristic agent for the UIEnv environment.

	Architecture
	============
	The agent uses a multi-stage decision pipeline that evaluates conditions
	in priority order. The first stage to produce an action wins.

	Stage 1 → Risk Mitigation (prevent imminent drop)
	Stage 2 → Feedback Adaptation (react to distrust / drop signals)
	Stage 3 → Layout Optimization (converge toward ideal layout)
	Stage 4 → Exploration (controlled randomness in safe states)
	Stage 5 → Fallback (safe default when layout is near-optimal)

	Internal state (outcome history, action history, noop streak) is used to
	make context-aware decisions and avoid oscillation.

	Includes a full evaluation harness that benchmarks the heuristic agent
	against a random baseline.
	"""

	from __future__ import annotations

	import random
	from collections import deque
	from typing import Optional

	from env import UIEnv, Action, Observation

	# ──────────────────────────────────────────────────────────────────────
	# Optimal layout targets (derived from reward shaping in env.py)
	# ──────────────────────────────────────────────────────────────────────

	BUTTON_SWEET_LOW: float = 0.9
	BUTTON_SWEET_HIGH: float = 1.3
	BUTTON_SWEET_MID: float = 1.1 # centre of the sweet spot for jumps

	TARGET_STEPS: int = 2 # at or below → shaping bonus
	TARGET_FORM_LENGTH: int = 4 # at or below → progress bonus
	SAFE_FORM_FLOOR: int = 3 # do NOT reduce below this (careful-user trap)

	DROP_STEPS_THRESHOLD: int = 3 # steps above this → impatient drop
	DROP_FORM_THRESHOLD: int = 5 # form_length above this → impatient drop

	EXPLORE_PROBABILITY: float = 0.07 # 7 % exploration rate
	NOOP_SAFE_LIMIT: int = 1 # max consecutive noops before forcing action

	# Inverse action pairs — used for oscillation detection
	_INVERSE_ACTIONS: dict[str, str] = {
	"increase_button": "set_button_size", # conceptual inverse
	"increase_steps": "decrease_steps",
	"decrease_steps": "increase_steps",
	}


	# ──────────────────────────────────────────────────────────────────────
	# Heuristic Agent
	# ──────────────────────────────────────────────────────────────────────

	class HeuristicAgent:
	"""
	Structured, multi-stage heuristic agent for UIEnv.

	The agent maintains internal state that is updated every step via
	`update(info)`, and selects actions via `act(obs)` using a
	priority-ordered decision pipeline.
	"""

	def __init__(self, seed: int = 99) -> None:
	self._rng = random.Random(seed)

	# ── internal tracking ──
	self.last_outcome: Optional[str] = None
	self.noop_streak: int = 0
	self.action_history: deque[str] = deque(maxlen=5)
	self.distrust_count: int = 0
	self.drop_count: int = 0
	self.step_number: int = 0

	# ──────────────────────── public API ──────────────────────────

	def reset(self) -> None:
	"""Clear per-episode state at the start of a new episode."""
	self.last_outcome = None
	self.noop_streak = 0
	self.action_history.clear()
	self.distrust_count = 0
	self.drop_count = 0
	self.step_number = 0

	def act(self, obs: Observation) -> Action:
	"""
	Select the next action by running the decision pipeline.

	Stages are evaluated in priority order; the first stage to return
	a non-None action wins. This guarantees that safety-critical
	adjustments always take precedence over optimisation moves.
	"""
	self.step_number += 1

	action = (
	self._risk_mitigation(obs)
	or self._adaptation(obs)
	or self._optimize_layout(obs)
	or self._explore(obs)
	or self._fallback(obs)
	)

	# Record for oscillation detection
	self.action_history.append(action.type)

	# Track noop streak
	if action.type == "noop":
	self.noop_streak += 1
	else:
	self.noop_streak = 0

	return action

	def update(self, info: dict) -> None:
	"""Ingest environment info dict to update internal beliefs."""
	outcome = info.get("outcome", "continue")
	self.last_outcome = outcome
	if outcome == "distrust":
	self.distrust_count += 1
	elif outcome == "drop":
	self.drop_count += 1

	# ──────────────────────── helpers ─────────────────────────────

	def _would_oscillate(self, candidate: str) -> bool:
	"""
	Return True if `candidate` would undo the most recent action,
	creating a pointless back-and-forth oscillation.
	"""
	if not self.action_history:
	return False
	last = self.action_history[-1]
	inv = _INVERSE_ACTIONS.get(candidate)
	return last == inv or _INVERSE_ACTIONS.get(last) == candidate

	@staticmethod
	def _make(action_type: str, value: float \| None = None) -> Action:
	"""Shorthand to construct an Action."""
	return Action(type=action_type, value=value)

	# ──────────── Stage 1: Risk Mitigation ────────────────────────

	def _risk_mitigation(self, obs: Observation) -> Optional[Action]:
	"""
	Immediately neutralise conditions that lead to user drop.

	Priority:
	1. steps > 3 → decrease_steps (impatient-drop rule)
	2. form_length > 5 → decrease_form (impatient-drop rule)

	Steps are prioritised because the impatient drop threshold for
	steps (> 3) is stricter and more common than form (> 5).
	"""
	layout = obs.layout

	if layout.steps > DROP_STEPS_THRESHOLD:
	return self._make("decrease_steps")

	if layout.form_length > DROP_FORM_THRESHOLD:
	return self._make("decrease_form")

	return None

	# ──────────── Stage 2: Feedback Adaptation ────────────────────

	def _adaptation(self, obs: Observation) -> Optional[Action]:
	"""
	React to the most recent user outcome signal.

	- 'distrust' means the layout is too minimal for this user type:
	• new users distrust when steps < 2 → increase_steps
	• careful users distrust when form_length < 3 → stop reducing
	(since there is no increase_form action, we can only prevent
	future reduction — but if steps are low, raising them is safe)
	- 'drop' means the layout was too heavy → aggressively reduce
	"""
	if self.last_outcome == "distrust":
	layout = obs.layout

	# New-user distrust: steps too low
	if layout.steps < 2 and not self._would_oscillate("increase_steps"):
	return self._make("increase_steps")

	# Careful-user distrust is likely about form being too short.
	# We can't increase form, but we can ensure steps stay reasonable
	# (having decent steps helps overall progress which offsets the
	# distrust effect on the next simulation round).
	if layout.steps < 2:
	return self._make("increase_steps")

	# If distrust persists but layout looks safe, do nothing drastic
	# — let the optimiser handle it.
	return None

	if self.last_outcome == "drop":
	layout = obs.layout

	# Emergency: cut the most expensive dimension first
	if layout.steps > 2 and not self._would_oscillate("decrease_steps"):
	return self._make("decrease_steps")

	if layout.form_length > SAFE_FORM_FLOOR:
	return self._make("decrease_form")

	return None

	return None

	# ──────────── Stage 3: Layout Optimization ────────────────────

	def _optimize_layout(self, obs: Observation) -> Optional[Action]:
	"""
	Gradually move the layout toward the ideal configuration:
	button_size ∈ [0.9, 1.3]
	steps ≤ 2
	form_length ≤ 4 (but ≥ 3 for safety)

	Optimisation order (by reward impact):
	1. steps → biggest reward shaping bonus (+0.1) AND progress bonus
	2. form → progress bonus when ≤ 4
	3. button → shaping bonus (+0.1) when in sweet spot

	Each call makes at most ONE change to avoid compounding effects
	in a single step.
	"""
	layout = obs.layout

	# ── Steps: aim for TARGET_STEPS (2) ──
	if layout.steps > TARGET_STEPS and not self._would_oscillate("decrease_steps"):
	# Don't reduce below 2 if we've seen distrust (new-user guard)
	if not (self.distrust_count > 0 and layout.steps <= 2):
	return self._make("decrease_steps")

	# ── Form: aim for TARGET_FORM_LENGTH (4) but never below SAFE_FORM_FLOOR (3) ──
	if layout.form_length > TARGET_FORM_LENGTH and layout.form_length > SAFE_FORM_FLOOR:
	return self._make("decrease_form")

	# ── Button size: steer into sweet spot ──
	bs = layout.button_size
	if bs < BUTTON_SWEET_LOW:
	if not self._would_oscillate("increase_button"):
	return self._make("increase_button")

	if bs > BUTTON_SWEET_HIGH:
	# Use set_button_size to jump directly into the sweet zone
	# rather than slowly decrementing (no decrease_button action exists)
	return self._make("set_button_size", BUTTON_SWEET_MID)

	return None

	# ──────────── Stage 4: Exploration ────────────────────────────

	def _explore(self, obs: Observation) -> Optional[Action]:
	"""
	Small controlled randomness to discover micro-improvements.

	Only fires when:
	- RNG says so (7 % chance)
	- Last outcome was NOT negative (don't explore under stress)
	- Layout is already reasonably safe

	Exploration action: try a random button_size within the sweet spot.
	This is the safest dimension to explore because it has no drop or
	distrust rules tied to it.
	"""
	if self.last_outcome in ("drop", "distrust"):
	return None

	if self._rng.random() < EXPLORE_PROBABILITY:
	target = self._rng.uniform(BUTTON_SWEET_LOW, BUTTON_SWEET_HIGH)
	target = round(target, 2)
	return self._make("set_button_size", target)

	return None

	# ──────────── Stage 5: Fallback ───────────────────────────────

	def _fallback(self, obs: Observation) -> Action:
	"""
	Default action when the layout is already near-optimal.

	- If noop streak is still safe → noop (preserves a good layout)
	- Otherwise → a tiny, safe micro-adjustment to break the streak
	while keeping the layout in the sweet spot.
	"""
	if self.noop_streak < NOOP_SAFE_LIMIT:
	return self._make("noop")

	# Break the noop streak with a harmless move
	bs = obs.layout.button_size
	if bs <= BUTTON_SWEET_MID:
	target = min(BUTTON_SWEET_HIGH, bs + 0.05)
	else:
	target = max(BUTTON_SWEET_LOW, bs - 0.05)

	return self._make("set_button_size", round(target, 2))


	# ──────────────────────────────────────────────────────────────────────
	# Random Agent (Baseline)
	# ──────────────────────────────────────────────────────────────────────

	class RandomAgent:
	"""Uniformly random discrete-action agent for baseline comparison."""

	_ACTIONS = [
	"increase_button",
	"decrease_form",
	"increase_steps",
	"decrease_steps",
	"reorder_sections",
	"noop",
	]

	def __init__(self, seed: int = 99) -> None:
	self._rng = random.Random(seed)

	def reset(self) -> None:
	pass

	def act(self, obs: Observation) -> Action:
	return Action(type=self._rng.choice(self._ACTIONS), value=None)

	def update(self, info: dict) -> None:
	pass


	# ──────────────────────────────────────────────────────────────────────
	# Evaluation Harness
	# ──────────────────────────────────────────────────────────────────────

	def run_evaluation(
	agent,
	n_episodes: int = 200,
	env_seed: int = 42,
	verbose: bool = False,
	) -> dict:
	"""
	Run n_episodes in UIEnv with the given agent and collect metrics.

	Returns
	-------
	dict with keys:
	avg_reward, completion_rate, drop_rate, avg_steps
	"""
	env = UIEnv(seed=env_seed)

	total_reward: float = 0.0
	completions: int = 0
	drops: int = 0
	total_steps: int = 0

	for ep in range(n_episodes):
	obs = env.reset()
	agent.reset()
	ep_reward: float = 0.0
	done = False

	while not done:
	action = agent.act(obs)
	obs, reward, done, info = env.step(action)
	agent.update(info)
	ep_reward += reward

	total_reward += ep_reward
	total_steps += info["step_count"]

	if info["outcome"] == "complete":
	completions += 1
	elif info["outcome"] == "drop":
	drops += 1

	if verbose and ep < 10:
	print(
	f" ep={ep:03d} outcome={info['outcome']:<10s} "
	f"reward={ep_reward:+.3f} steps={info['step_count']}"
	)

	return {
	"avg_reward": total_reward / n_episodes,
	"completion_rate": completions / n_episodes,
	"drop_rate": drops / n_episodes,
	"avg_steps": total_steps / n_episodes,
	}


	def _fmt_pct(v: float) -> str:
	return f"{v * 100:.1f}%"


	# ──────────────────────────────────────────────────────────────────────
	# Main — run benchmark
	# ──────────────────────────────────────────────────────────────────────

	if __name__ == "__main__":

	N_EPISODES = 200

	print("=" * 64)
	print(" UIEnv Heuristic Agent -- Benchmark Suite")
	print("=" * 64)

	# -- Heuristic Agent --
	print("\n> Running Heuristic Agent ...")
	h_agent = HeuristicAgent(seed=99)
	h_metrics = run_evaluation(h_agent, n_episodes=N_EPISODES, verbose=True)

	# -- Random Baseline --
	print("\n> Running Random Agent ...")
	r_agent = RandomAgent(seed=99)
	r_metrics = run_evaluation(r_agent, n_episodes=N_EPISODES, verbose=True)

	# -- Comparison Table --
	print("\n" + "-" * 64)
	print(f" {'Metric':<22s} {'Heuristic':>12s} {'Random':>12s} {'Delta':>12s}")
	print("-" * 64)

	for key, label in [
	("avg_reward", "Avg Reward"),
	("completion_rate", "Completion Rate"),
	("drop_rate", "Drop Rate"),
	("avg_steps", "Avg Steps"),
	]:
	h_val = h_metrics[key]
	r_val = r_metrics[key]
	delta = h_val - r_val

	if "rate" in key:
	h_str = _fmt_pct(h_val)
	r_str = _fmt_pct(r_val)
	d_str = f"{delta * 100:+.1f}pp"
	elif "step" in key:
	h_str = f"{h_val:.1f}"
	r_str = f"{r_val:.1f}"
	d_str = f"{delta:+.1f}"
	else:
	h_str = f"{h_val:+.4f}"
	r_str = f"{r_val:+.4f}"
	d_str = f"{delta:+.4f}"

	print(f" {label:<22s} {h_str:>12s} {r_str:>12s} {d_str:>12s}")

	print("-" * 64)

	# -- Verdict --
	lift = h_metrics["avg_reward"] - r_metrics["avg_reward"]
	if lift > 0.2:
	verdict = "[PASS] STRONG improvement over random baseline"
	elif lift > 0.05:
	verdict = "[WARN] Moderate improvement -- consider tuning"
	else:
	verdict = "[FAIL] Marginal -- agent needs rework"

	print(f"\n Verdict: {verdict}")
	print(f" Reward lift: {lift:+.4f}\n")