# UI-layout-optimizer / validate_local.py
# Kolaps27 - chore: align local validator with strict safe_grader None-probe bypass logic
# (commit d931737)
"""
validate_local.py
-----------------
Local self-test that mirrors what the OpenEnv validator checks.
Run: python validate_local.py
Checks:
1. >= 3 tasks exist
2. Each task has a callable .grader
3. grader(None), grader(), grader({}) all return float in (0, 1)
4. Scores are NOT constant (variation across different env states)
5. Scores are deterministic (same state -> same score)
6. Difficulty sanity: in an optimised state the easy score is >= the medium
   score (0.05 tolerance); the hard score is printed but not checked
"""
import sys
import random
sys.path.insert(0, ".")
from env import UIEnv, Layout
# Running tallies of passed / failed checks; updated only by check().
PASS = 0
FAIL = 0


def check(name: str, condition: bool, detail: str = "") -> bool:
    """Record the outcome of one validation check and print a result line.

    Args:
        name: Label printed next to the PASS/FAIL marker.
        condition: Outcome of the check; any truthy value counts as a pass.
        detail: Extra context appended to the printed line on failure only.

    Returns:
        True if the check passed, False otherwise, so a caller can skip
        follow-up checks that depend on this one.
    """
    global PASS, FAIL
    passed = bool(condition)
    if passed:
        PASS += 1
        print(f" [PASS] {name}")
    else:
        FAIL += 1
        # Append the detail only when there is one, so a bare failure line
        # does not end in trailing whitespace.
        suffix = f" {detail}" if detail else ""
        print(f" [FAIL] {name}{suffix}")
    return passed
def _report_and_exit():
    """Print the pass/fail summary and exit with 0 (no failures) or 1."""
    print("\n" + "=" * 60)
    total = PASS + FAIL
    if FAIL == 0:
        print(f" ALL {total} CHECKS PASSED OK")
    else:
        print(f" {FAIL}/{total} CHECKS FAILED")
    print("=" * 60)
    sys.exit(1 if FAIL > 0 else 0)


def main():
    """Run every validation step against a ``UIEnv(seed=42)`` and exit.

    Each step prints its own PASS/FAIL lines through ``check``. Steps 4-6
    overwrite ``env._progress``, ``env._layout`` and ``env._device`` directly
    before calling the graders (presumably the graders read that state --
    confirm against ``env.py``).

    Exceptions raised by the environment or a grader are recorded as failed
    checks rather than aborting the run, so the summary is always printed.
    The only immediate exit is when ``UIEnv`` cannot be constructed.

    Raises:
        SystemExit: always; code 0 when no check failed, 1 otherwise.
    """
    banner = "=" * 60
    print(banner)
    print(" OpenEnv Local Validator")
    print(banner)

    # ---- Step 1: instantiate env ----
    print("\n1. Instantiate environment")
    try:
        env = UIEnv(seed=42)
    except Exception as e:
        check("UIEnv created", False, str(e))
        sys.exit(1)
    check("UIEnv created", True)

    # ---- Step 2: task count ----
    print("\n2. Task count")
    has_tasks = hasattr(env, "tasks")
    check("env.tasks exists", has_tasks)
    if not has_tasks:
        # Every later step iterates env.tasks; nothing more can be validated.
        _report_and_exit()
    tasks = env.tasks
    check(">= 3 tasks", len(tasks) >= 3, f"found {len(tasks)}")

    # ---- Step 3: grader callable + return type + bounds ----
    print("\n3. Grader probes (None, no-arg, dict)")
    probes = [("None", (None,)), ("no-arg", ()), ("dict", ({},))]
    for t in tasks:
        # getattr with a default: a missing grader becomes failed checks
        # below instead of an AttributeError.
        grader = getattr(t, "grader", None)
        check(f"{t.name}: has .grader", hasattr(t, "grader"))
        check(f"{t.name}: grader callable", callable(grader))
        for label, args in probes:
            try:
                val = grader(*args)
                check(f"{t.name}: grader({label}) returns float",
                      isinstance(val, float), f"got {type(val).__name__}")
                check(f"{t.name}: grader({label}) > 0.0",
                      val > 0.0, f"got {val}")
                check(f"{t.name}: grader({label}) < 1.0",
                      val < 1.0, f"got {val}")
            except Exception as e:
                check(f"{t.name}: grader({label}) no crash", False, str(e))

    # ---- Step 4: score variation ----
    print("\n4. Score variation across states")
    rng = random.Random(123)  # fixed seed: same 50 states on every run
    for t in tasks:
        variation_label = f"{t.name}: >= 10 unique scores in 50 trials"
        scores = set()
        try:
            for _ in range(50):
                env._progress = rng.uniform(0.0, 1.0)
                env._layout = Layout(
                    button_size=rng.uniform(0.5, 2.0),
                    form_length=rng.randint(1, 10),
                    steps=rng.randint(1, 10),
                )
                env._device = rng.choice(["mobile", "desktop"])
                # Round so float noise does not count as a distinct score.
                scores.add(round(t.grader({}), 6))
        except Exception as e:
            check(variation_label, False, f"raised {e!r}")
            continue
        check(variation_label, len(scores) >= 10,
              f"only {len(scores)} unique")

    # ---- Step 5: determinism ----
    print("\n5. Determinism (same state -> same score)")
    env._progress = 0.5
    env._layout = Layout(button_size=1.1, form_length=4, steps=3)
    env._device = "desktop"
    for t in tasks:
        try:
            s1 = t.grader({})
            s2 = t.grader({})
        except Exception as e:
            check(f"{t.name}: deterministic", False, f"raised {e!r}")
            continue
        check(f"{t.name}: deterministic", s1 == s2,
              f"{s1} != {s2}")

    # ---- Step 6: difficulty progression ----
    print("\n6. Difficulty sanity (optimised state)")
    env._progress = 0.9
    env._layout = Layout(button_size=1.1, form_length=3, steps=2)
    env._device = "desktop"
    try:
        se = env.task_dict["easy"].grader()
        sm = env.task_dict["medium"].grader()
        sh = env.task_dict["hard"].grader()
        # The hard score is shown for information only; it is not checked.
        print(f" easy={se:.4f} medium={sm:.4f} hard={sh:.4f}")
        # Easy may trail medium by at most 0.05 and still pass.
        check("easy >= medium (optimised)", se >= sm - 0.05,
              f"easy={se:.4f} medium={sm:.4f}")
    except Exception as e:
        check("easy >= medium (optimised)", False, f"raised {e!r}")

    # ---- Summary ----
    _report_and_exit()


if __name__ == "__main__":
    main()