r"""
CivicAI — Full Grader & Task Validation

Run:  venv/Scripts/python.exe validate_graders.py

Runs the task graders through five groups of checks, in order:

1. the grader registry exposes exactly the expected task ids;
2. ``grade()`` returns a ``GradeResult`` with a float score in [0, 1];
3. grading the same state repeatedly yields a single score;
4. hand-built best/worst states land on the expected side of fixed thresholds,
   and the social-unrest cascade penalty lowers the score;
5. ``env.step()`` exposes the grade under ``info["task_grade"]``.

The first failing check raises ``AssertionError`` and aborts the run.
"""
from __future__ import annotations

import sys


def _require(condition: bool, message: str) -> None:
    """Raise ``AssertionError(message)`` unless *condition* is truthy.

    Used instead of bare ``assert`` statements: Python drops ``assert`` when
    run with ``-O`` / ``PYTHONOPTIMIZE``, which would let this suite report
    success without checking anything.
    """
    if not condition:
        raise AssertionError(message)


# Single source of truth for the task ids exercised below.
TASK_IDS = ("stabilize_economy", "manage_pandemic", "control_crisis")

print("=" * 60)
print(" CivicAI Grader Validation Suite")
print("=" * 60)

# ── Imports ────────────────────────────────────────────────────────────────
# Kept after the banner so the banner is shown even when an import fails.
# EmergentMetrics is imported here (not mid-script) so the success message
# below is only printed once every import has actually succeeded.
from civicai.environment import CivicAIEnv
from civicai.models import (
    Action,
    EmergentMetrics,
    Observation,
    SocietyState,
    SubsidyPolicy,
)
from civicai.graders import (
    grade,
    GradeResult,
    EconomicStabilityGrader,
    PandemicManagementGrader,
    SocialCrisisGrader,
    GRADERS,
)

print("[OK] All grader imports successful")

# ── 1. Registry check ──────────────────────────────────────────────────────
print("\n── Task Registry ──")
_require(
    set(GRADERS.keys()) == set(TASK_IDS),
    f"Unexpected grader registry: {sorted(GRADERS.keys())}",
)
print(f"[OK] 3 tasks registered: {sorted(GRADERS.keys())}")

# ── 2. Return type & range checks ─────────────────────────────────────────
print("\n── Return Type & Range ──")
env = CivicAIEnv()
for task_id in TASK_IDS:
    env.reset(task_id=task_id)
    state = env.state()
    result = grade(state, task_id)
    _require(
        isinstance(result, GradeResult),
        f"grade() must return GradeResult, got {type(result)}",
    )
    _require(
        isinstance(result.score, float),
        f"score must be float, got {type(result.score)}",
    )
    _require(0.0 <= result.score <= 1.0, f"score out of [0,1]: {result.score}")
    _require(isinstance(result.success, bool), "success must be bool")
    payload = result.to_dict()
    _require(isinstance(payload, dict), "to_dict() must return dict")
    missing = [
        key
        for key in ("score", "components", "success", "summary")
        if key not in payload
    ]
    _require(not missing, f"to_dict() is missing keys: {missing}")
    print(f"[OK] {task_id:25s} score={result.score:.4f} success={result.success}")

# ── 3. DETERMINISM — same state always gives same score ──────────────────
print("\n── Determinism Test ──")
for task_id in TASK_IDS:
    env.reset(task_id=task_id)
    # advance 5 steps with default action
    for _ in range(5):
        env.step(Action())
    state = env.state()
    scores = [grade(state, task_id).score for _ in range(50)]  # call 50 times
    distinct_scores = set(scores)
    _require(
        len(distinct_scores) == 1,
        f"[FAIL] Non-deterministic! Got {len(distinct_scores)} distinct scores for {task_id}",
    )
    print(f"[OK] {task_id:25s} deterministic over 50 calls, score={scores[0]:.4f}")

# ── 4. Boundary values — perfect state scores close to 1.0 ───────────────
print("\n── Boundary Values ──")

# Perfect economy state
perfect_economy = SocietyState(
    inflation=0.02,  # very low
    employment_rate=0.95,  # very high
    gdp=600.0,  # high
    budget_balance=0.10,  # surplus
)
graded = EconomicStabilityGrader().grade(perfect_economy)
_require(
    graded.score >= 0.80,
    f"Perfect economy should score ≥ 0.80, got {graded.score}",
)
print(f"[OK] Perfect economy state score={graded.score:.4f} (expected ≥ 0.80)")

# Worst economy state
worst_economy = SocietyState(
    inflation=0.25,  # hyperinflation
    employment_rate=0.60,
    gdp=100.0,
    budget_balance=-0.50,
)
graded = EconomicStabilityGrader().grade(worst_economy)
_require(
    graded.score <= 0.25,
    f"Worst economy should score ≤ 0.25, got {graded.score}",
)
print(f"[OK] Worst economy state score={graded.score:.4f} (expected ≤ 0.25)")

# Perfect pandemic state
perfect_pandemic = SocietyState(
    infection_rate=0.01,
    health_index=0.85,
    gdp=480.0,
    medical_supplies=0.90,
)
graded = PandemicManagementGrader().grade(perfect_pandemic)
_require(
    graded.score >= 0.80,
    f"Perfect pandemic state should score ≥ 0.80, got {graded.score}",
)
print(f"[OK] Perfect pandemic state score={graded.score:.4f} (expected ≥ 0.80)")

# Worst pandemic state
worst_pandemic = SocietyState(
    infection_rate=0.50,  # out-of-control epidemic
    health_index=0.25,
    gdp=180.0,
    medical_supplies=0.10,
)
graded = PandemicManagementGrader().grade(worst_pandemic)
_require(
    graded.score <= 0.25,
    f"Worst pandemic should score ≤ 0.25, got {graded.score}",
)
print(f"[OK] Worst pandemic state score={graded.score:.4f} (expected ≤ 0.25)")

# Perfect social state
perfect_social = SocietyState(
    public_satisfaction=0.80,
    crime_rate=0.03,
    employment_rate=0.92,
    emergent=EmergentMetrics(wealth_inequality=0.18, social_unrest=0.10),
)
graded = SocialCrisisGrader().grade(perfect_social)
_require(
    graded.score >= 0.75,
    f"Perfect social state should score ≥ 0.75, got {graded.score}",
)
print(f"[OK] Perfect social state score={graded.score:.4f} (expected ≥ 0.75)")

# Cascade penalty fires
cascade_social = SocietyState(
    public_satisfaction=0.55,
    crime_rate=0.10,
    employment_rate=0.82,
    emergent=EmergentMetrics(wealth_inequality=0.35, social_unrest=0.80),  # >0.65 → cascade
)
r_cascade = SocialCrisisGrader().grade(cascade_social)
# Re-grade the very same object with only social_unrest lowered, so any score
# difference between the two results comes from that one field.
cascade_social.emergent.social_unrest = 0.30  # same metrics, no cascade
r_no_cascade = SocialCrisisGrader().grade(cascade_social)
_require(r_cascade.score < r_no_cascade.score, "Cascade penalty must reduce score")
print(f"[OK] Cascade penalty fires: with_cascade={r_cascade.score:.4f} < no_cascade={r_no_cascade.score:.4f}")

# ── 5. step() info contains task_grade ───────────────────────────────────
print("\n── Environment Integration ──")
env.reset(task_id="stabilize_economy")
# step() is unpacked as a 4-tuple; only the info dict is inspected here.
_obs, _reward, _done, info = env.step(Action())
_require("task_grade" in info, "step() info must contain 'task_grade'")
tg = info["task_grade"]
_require(
    all(key in tg for key in ("score", "components", "success")),
    f"task_grade must contain score/components/success, got keys {sorted(tg)}",
)
_require(0.0 <= tg["score"] <= 1.0, f"task_grade score out of [0,1]: {tg['score']}")
print(f"[OK] step() info['task_grade'] score={tg['score']:.4f} success={tg['success']}")

# Verify all 3 tasks via step()
for task_id in TASK_IDS:
    env.reset(task_id=task_id)
    _obs, _reward, _done, info = env.step(Action())
    tg = info["task_grade"]
    _require(
        tg["task_id"] == task_id,
        f"task_grade task_id mismatch: expected {task_id!r}, got {tg['task_id']!r}",
    )
    _require(
        0.0 <= tg["score"] <= 1.0,
        f"task_grade score out of [0,1]: {tg['score']}",
    )
    _require(
        isinstance(tg["success"], bool),
        f"task_grade success must be bool, got {type(tg['success'])}",
    )
    comp_keys = set(tg["components"].keys())
    _require(len(comp_keys) >= 4, f"Expected ≥4 components, got {comp_keys}")
    print(f"[OK] {task_id:25s} grade={tg['score']:.4f} components={sorted(comp_keys)}")

# ── Summary ───────────────────────────────────────────────────────────────
print()
print("=" * 60)
print(" ALL GRADER CHECKS PASSED")
print()
print(" Tasks:")
print(" stabilize_economy [EASY] — Macroeconomic governance")
print(" manage_pandemic [MEDIUM] — Public-health policy")
print(" control_crisis [HARD] — Multi-domain social crisis")
print()
print(" Grader properties:")
print(" ✅ Returns float ∈ [0.0, 1.0]")
print(" ✅ Fully deterministic (no randomness)")
print(" ✅ Per-component breakdown included")
print(" ✅ Exposed in step() info['task_grade']")
print("=" * 60)