Spaces:
Sleeping
Sleeping
| """ | |
| validate_local.py | |
| ----------------- | |
| Local self-test that mirrors what the OpenEnv validator checks. | |
| Run: python validate_local.py | |
| Checks: | |
| 1. >= 3 tasks exist | |
| 2. Each task has a callable .grader | |
| 3. grader(None), grader(), grader({}) all return float in (0, 1) | |
| 4. Scores are NOT constant (variation across different env states) | |
| 5. Scores are deterministic (same state -> same score) | |
| 6. Difficulty progression: easy default > hard default (easy is easier) | |
| """ | |
| import sys | |
| import random | |
| sys.path.insert(0, ".") | |
| from env import UIEnv, Layout | |
# Running tallies of check outcomes; check() increments them and main()
# reads them to print the summary and choose the exit code.
PASS: int = 0
FAIL: int = 0
def check(name: str, condition: bool, detail: str = ""):
    """Record the outcome of one named check and print a result line.

    Args:
        name: Label printed after the ``[PASS]`` / ``[FAIL]`` tag.
        condition: Truthy when the check succeeded.
        detail: Extra text appended to the line on failure only.

    Side effects:
        Increments the module-level ``PASS`` or ``FAIL`` counter.
    """
    global PASS, FAIL
    if not condition:
        FAIL += 1
        print(f" [FAIL] {name} {detail}")
        return
    PASS += 1
    print(f" [PASS] {name}")
def main():
    """Run all validator steps against a ``UIEnv`` and exit with a status.

    Steps, each reported through ``check``:
      1. ``UIEnv(seed=42)`` can be constructed (exits with 1 if it cannot).
      2. ``env.tasks`` exists and holds at least three tasks.
      3. Every task has a callable ``.grader`` whose result for ``None``,
         no argument and ``{}`` is a float strictly between 0 and 1.
      4. Each grader yields at least 10 distinct scores over 50 random states.
      5. Each grader returns the same score twice for the same state.
      6. In a fixed "optimised" state the easy score is not lower than the
         medium or hard score by more than 0.05.

    A missing attribute or a raising grader is recorded as a failed check
    instead of aborting the run with a traceback.

    Raises:
        SystemExit: always; code 1 if any check failed, otherwise 0.
    """
    banner = "=" * 60
    print(banner)
    print(" OpenEnv Local Validator")
    print(banner)

    # ---- Step 1: instantiate env ----
    print("\n1. Instantiate environment")
    try:
        env = UIEnv(seed=42)
    except Exception as e:
        check("UIEnv created", False, str(e))
        sys.exit(1)
    check("UIEnv created", True)

    # ---- Step 2: task count ----
    print("\n2. Task count")
    check("env.tasks exists", hasattr(env, "tasks"))
    # Fall back to an empty list so a missing attribute is reported as a
    # failed check rather than raising AttributeError on len().
    tasks = list(getattr(env, "tasks", None) or [])
    check(">= 3 tasks", len(tasks) >= 3, f"found {len(tasks)}")

    # ---- Step 3: grader callable + return type + bounds ----
    print("\n3. Grader probes (None, no-arg, dict)")
    probes = [("None", (None,)), ("no-arg", ()), ("dict", ({},))]
    gradable = []  # tasks whose grader can actually be called in later steps
    for t in tasks:
        has_callable_grader = callable(getattr(t, "grader", None))
        check(f"{t.name}: has .grader", hasattr(t, "grader"))
        check(f"{t.name}: grader callable", has_callable_grader)
        if not has_callable_grader:
            continue
        gradable.append(t)
        for label, args in probes:
            # Only the grader call itself counts as a "crash".
            try:
                val = t.grader(*args)
            except Exception as e:
                check(f"{t.name}: grader({label}) no crash", False, str(e))
                continue
            # A value that cannot be compared to a float fails both bound
            # checks instead of being misreported as a grader crash.
            try:
                above_zero = bool(val > 0.0)
                below_one = bool(val < 1.0)
            except (TypeError, ValueError):
                above_zero = below_one = False
            check(f"{t.name}: grader({label}) returns float",
                  isinstance(val, float), f"got {type(val).__name__}")
            check(f"{t.name}: grader({label}) > 0.0",
                  above_zero, f"got {val}")
            check(f"{t.name}: grader({label}) < 1.0",
                  below_one, f"got {val}")

    # ---- Step 4: score variation ----
    print("\n4. Score variation across states")
    rng = random.Random(123)  # fixed seed keeps the sampled states repeatable
    for t in gradable:
        scores = set()
        try:
            for _ in range(50):
                env._progress = rng.uniform(0.0, 1.0)
                env._layout = Layout(
                    button_size=rng.uniform(0.5, 2.0),
                    form_length=rng.randint(1, 10),
                    steps=rng.randint(1, 10),
                )
                env._device = rng.choice(["mobile", "desktop"])
                scores.add(round(t.grader({}), 6))
        except Exception as e:
            check(f"{t.name}: variation probe no crash", False, str(e))
            continue
        check(f"{t.name}: >= 10 unique scores in 50 trials",
              len(scores) >= 10, f"only {len(scores)} unique")

    # ---- Step 5: determinism ----
    print("\n5. Determinism (same state -> same score)")
    env._progress = 0.5
    env._layout = Layout(button_size=1.1, form_length=4, steps=3)
    env._device = "desktop"
    for t in gradable:
        try:
            s1 = t.grader({})
            s2 = t.grader({})
        except Exception as e:
            check(f"{t.name}: determinism probe no crash", False, str(e))
            continue
        check(f"{t.name}: deterministic", s1 == s2,
              f"{s1} != {s2}")

    # ---- Step 6: difficulty progression ----
    print("\n6. Difficulty sanity (optimised state)")
    env._progress = 0.9
    env._layout = Layout(button_size=1.1, form_length=3, steps=2)
    env._device = "desktop"
    try:
        se = env.task_dict["easy"].grader()
        sm = env.task_dict["medium"].grader()
        sh = env.task_dict["hard"].grader()
    except Exception as e:
        check("easy/medium/hard graders available", False,
              f"{type(e).__name__}: {e}")
    else:
        print(f" easy={se:.4f} medium={sm:.4f} hard={sh:.4f}")
        # 0.05 tolerance: easy may trail a harder task slightly and still pass.
        check("easy >= medium (optimised)", se >= sm - 0.05,
              f"easy={se:.4f} medium={sm:.4f}")
        check("easy >= hard (optimised)", se >= sh - 0.05,
              f"easy={se:.4f} hard={sh:.4f}")

    # ---- Summary ----
    print("\n" + banner)
    total = PASS + FAIL
    if FAIL == 0:
        print(f" ALL {total} CHECKS PASSED OK")
    else:
        print(f" {FAIL}/{total} CHECKS FAILED")
    print(banner)
    sys.exit(1 if FAIL > 0 else 0)
# Script entry point: run the validator only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()