Spaces:

Kolaps27
/

UI-layout-optimizer

Sleeping

App Files Files Community

UI-layout-optimizer / validate_local.py

Kolaps27

chore: align local validator with strict safe_grader None-probe bypass logic

d931737 6 days ago

raw

history blame contribute delete

4.06 kB

	"""
	validate_local.py
	-----------------
	Local self-test that mirrors what the OpenEnv validator checks.

	Run: python validate_local.py

	Checks:
	1. >= 3 tasks exist
	2. Each task has a callable .grader
	3. grader(None), grader(), grader({}) all return float in (0, 1)
	4. Scores are NOT constant (variation across different env states)
	5. Scores are deterministic (same state -> same score)
	6. Difficulty sanity: in a fixed "optimised" state, easy score >= medium score - 0.05 (the hard score is printed but not checked)
	"""

	import sys
	import random

	sys.path.insert(0, ".")
	from env import UIEnv, Layout

	# Running tallies of passed / failed checks: check() increments them and
	# main() reads them to print the summary and choose the exit status.
	PASS = 0
	FAIL = 0


	def check(name: str, condition: bool, detail: str = ""):
	    """Record the outcome of one validation check and print it.

	    Args:
	        name: Label printed after the ``[PASS]`` / ``[FAIL]`` tag.
	        condition: Truthy counts as a pass, falsy as a failure.
	        detail: Extra text appended to the failure line only.

	    Side effects:
	        Increments the module-level ``PASS`` or ``FAIL`` counter.
	    """
	    global PASS, FAIL
	    # Guard clause: handle the failure path first, then fall through to pass.
	    if not condition:
	        FAIL += 1
	        print(f" [FAIL] {name} {detail}")
	        return
	    PASS += 1
	    print(f" [PASS] {name}")


	def main():
	    """Run the local validation steps and exit with a status code.

	    Steps, in order:
	      1. Create ``UIEnv(seed=42)``.
	      2. Check that ``env.tasks`` exists and holds at least 3 tasks.
	      3. Probe every task's grader with ``None``, no argument and ``{}``;
	         each result must be a ``float`` strictly between 0 and 1.
	      4. Check that 50 random states produce at least 10 distinct scores.
	      5. Check that grading the same state twice gives the same score.
	      6. Compare easy and medium scores in a fixed state (0.05 tolerance).

	    A missing ``tasks`` / ``grader`` attribute, a grader that raises in
	    steps 4-5, or a failed lookup in step 6 is recorded as a failed check
	    (under the step's usual check name) so the summary is always printed.

	    Exits via ``sys.exit``: status 1 if the environment cannot be created
	    or any check failed, otherwise 0.
	    """
	    print("=" * 60)
	    print(" OpenEnv Local Validator")
	    print("=" * 60)

	    # ---- Step 1: instantiate env ----
	    print("\n1. Instantiate environment")
	    try:
	        env = UIEnv(seed=42)
	        check("UIEnv created", True)
	    except Exception as e:
	        check("UIEnv created", False, str(e))
	        # Nothing else can be validated without an environment.
	        sys.exit(1)

	    # ---- Step 2: task count ----
	    print("\n2. Task count")
	    has_tasks = hasattr(env, "tasks")
	    check("env.tasks exists", has_tasks)
	    # Fall back to an empty tuple so a missing attribute is reported as
	    # "found 0" instead of raising AttributeError on the next line.
	    tasks = env.tasks if has_tasks else ()
	    check(">= 3 tasks", len(tasks) >= 3, f"found {len(tasks)}")

	    # ---- Step 3: grader callable + return type + bounds ----
	    print("\n3. Grader probes (None, no-arg, dict)")
	    for t in tasks:
	        check(f"{t.name}: has .grader", hasattr(t, "grader"))
	        # getattr with a default: a task without .grader fails this check
	        # rather than raising before the probes below can report it.
	        check(f"{t.name}: grader callable",
	              callable(getattr(t, "grader", None)))

	        for label, args in [("None", (None,)), ("no-arg", ()), ("dict", ({},))]:
	            try:
	                val = t.grader(*args)
	                check(f"{t.name}: grader({label}) returns float",
	                      isinstance(val, float), f"got {type(val).__name__}")
	                check(f"{t.name}: grader({label}) > 0.0",
	                      val > 0.0, f"got {val}")
	                check(f"{t.name}: grader({label}) < 1.0",
	                      val < 1.0, f"got {val}")
	            except Exception as e:
	                check(f"{t.name}: grader({label}) no crash", False, str(e))

	    # ---- Step 4: score variation ----
	    print("\n4. Score variation across states")
	    rng = random.Random(123)
	    for t in tasks:
	        scores = set()
	        failure = None
	        for _ in range(50):
	            env._progress = rng.uniform(0.0, 1.0)
	            env._layout = Layout(
	                button_size=rng.uniform(0.5, 2.0),
	                form_length=rng.randint(1, 10),
	                steps=rng.randint(1, 10),
	            )
	            env._device = rng.choice(["mobile", "desktop"])
	            try:
	                scores.add(round(t.grader({}), 6))
	            except Exception as e:
	                # Stop sampling this task; report below under the usual name.
	                failure = e
	                break

	        if failure is not None:
	            check(f"{t.name}: >= 10 unique scores in 50 trials", False,
	                  f"grader raised {type(failure).__name__}: {failure}")
	            continue
	        check(f"{t.name}: >= 10 unique scores in 50 trials",
	              len(scores) >= 10, f"only {len(scores)} unique")

	    # ---- Step 5: determinism ----
	    print("\n5. Determinism (same state -> same score)")
	    env._progress = 0.5
	    env._layout = Layout(button_size=1.1, form_length=4, steps=3)
	    env._device = "desktop"
	    for t in tasks:
	        try:
	            s1 = t.grader({})
	            s2 = t.grader({})
	        except Exception as e:
	            check(f"{t.name}: deterministic", False,
	                  f"grader raised {type(e).__name__}: {e}")
	            continue
	        check(f"{t.name}: deterministic", s1 == s2,
	              f"{s1} != {s2}")

	    # ---- Step 6: difficulty progression ----
	    print("\n6. Difficulty sanity (optimised state)")
	    env._progress = 0.9
	    env._layout = Layout(button_size=1.1, form_length=3, steps=2)
	    env._device = "desktop"
	    try:
	        se = env.task_dict["easy"].grader()
	        sm = env.task_dict["medium"].grader()
	        sh = env.task_dict["hard"].grader()
	        # Formatting and comparing inside the try also covers graders that
	        # return a non-numeric value.
	        score_line = f" easy={se:.4f} medium={sm:.4f} hard={sh:.4f}"
	        easy_ok = se >= sm - 0.05
	        easy_detail = f"easy={se:.4f} medium={sm:.4f}"
	    except Exception as e:
	        # Missing task_dict / key / grader: report it, keep the summary.
	        check("easy >= medium (optimised)", False,
	              f"{type(e).__name__}: {e}")
	    else:
	        print(score_line)
	        check("easy >= medium (optimised)", easy_ok, easy_detail)

	    # ---- Summary ----
	    print("\n" + "=" * 60)
	    total = PASS + FAIL
	    if FAIL == 0:
	        print(f" ALL {total} CHECKS PASSED OK")
	    else:
	        print(f" {FAIL}/{total} CHECKS FAILED")
	    print("=" * 60)
	    sys.exit(1 if FAIL > 0 else 0)


	# Run the validator only when this file is executed as a script.
	if __name__ == "__main__":
	    main()