Spaces:
Sleeping
Sleeping
r"""
CivicAI — Full Grader & Task Validation

Run: venv/Scripts/python.exe validate_graders.py

Flat validation script: every check is a top-level statement executed in
order, and the first failing ``assert`` aborts the run with its message.
"""
from __future__ import annotations

import sys

# The banners and threshold messages printed by this script contain non-ASCII
# characters (box-drawing rules, ≥, ≤). When stdout is redirected on Windows
# the default code page may be unable to encode them, so switch the stream to
# UTF-8 where the stream object supports it.
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8")

print("=" * 60)
print(" CivicAI Grader Validation Suite")
print("=" * 60)

# ── Imports ────────────────────────────────────────────────────────────────
# Kept below the banner, as in the original layout, so the banner is printed
# before any import error from the civicai package can surface.
from civicai.environment import CivicAIEnv
from civicai.models import Action, Observation, SocietyState, SubsidyPolicy
from civicai.graders import (
    grade,
    GradeResult,
    EconomicStabilityGrader,
    PandemicManagementGrader,
    SocialCrisisGrader,
    GRADERS,
)

print("[OK] All grader imports successful")
# ── 1. Registry check ──────────────────────────────────────────────────────
# The grader registry must expose exactly the three known task ids.
print("\n── Task Registry ──")
expected_task_ids = {"stabilize_economy", "manage_pandemic", "control_crisis"}
registered_task_ids = set(GRADERS.keys())
assert registered_task_ids == expected_task_ids, (
    f"Unexpected task registry: {sorted(registered_task_ids)}"
)
# Report the real count instead of a hard-coded literal.
print(f"[OK] {len(registered_task_ids)} tasks registered: {sorted(registered_task_ids)}")
# ── 2. Return type & range checks ──────────────────────────────────────────
# Grade the freshly reset state of every task and verify the shape of the
# result: a GradeResult with a float score in [0, 1], a bool success flag and
# a dict export carrying the four expected keys.
print("\n── Return Type & Range ──")
env = CivicAIEnv()  # the same environment instance is reused by later sections
for task_id in ["stabilize_economy", "manage_pandemic", "control_crisis"]:
    env.reset(task_id=task_id)
    state = env.state()
    result = grade(state, task_id)
    assert isinstance(result, GradeResult), f"grade() must return GradeResult, got {type(result)}"
    assert isinstance(result.score, float), f"score must be float, got {type(result.score)}"
    assert 0.0 <= result.score <= 1.0, f"score out of [0,1]: {result.score}"
    assert isinstance(result.success, bool), "success must be bool"
    result_dict = result.to_dict()
    assert isinstance(result_dict, dict), "to_dict() must return dict"
    missing_keys = [
        key
        for key in ("score", "components", "success", "summary")
        if key not in result_dict
    ]
    assert not missing_keys, f"to_dict() is missing keys: {missing_keys}"
    print(f"[OK] {task_id:25s} score={result.score:.4f} success={result.success}")
# ── 3. DETERMINISM — same state always gives same score ────────────────────
# Grading one fixed state repeatedly must always yield the same score.
print("\n── Determinism Test ──")
WARMUP_STEPS = 5   # environment steps taken before the state is captured
REPEAT_CALLS = 50  # how many times the captured state is graded
for task_id in ["stabilize_economy", "manage_pandemic", "control_crisis"]:
    env.reset(task_id=task_id)
    # advance a few steps with the default action
    for _ in range(WARMUP_STEPS):
        env.step(Action())
    state = env.state()
    scores = [grade(state, task_id).score for _ in range(REPEAT_CALLS)]
    distinct_scores = set(scores)
    assert len(distinct_scores) == 1, (
        f"[FAIL] Non-deterministic! Got {len(distinct_scores)} distinct scores for {task_id}"
    )
    print(f"[OK] {task_id:25s} deterministic over {REPEAT_CALLS} calls, score={scores[0]:.4f}")
# ── 4. Boundary values — perfect states score high, worst states score low ──
print("\n── Boundary Values ──")

# Perfect economy state
perfect_economy = SocietyState(
    inflation=0.02,         # very low
    employment_rate=0.95,   # very high
    gdp=600.0,              # high
    budget_balance=0.10,    # surplus
)
r = EconomicStabilityGrader().grade(perfect_economy)
assert r.score >= 0.80, f"Perfect economy should score ≥ 0.80, got {r.score}"
print(f"[OK] Perfect economy state score={r.score:.4f} (expected ≥ 0.80)")

# Worst economy state
worst_economy = SocietyState(
    inflation=0.25,         # hyperinflation
    employment_rate=0.60,
    gdp=100.0,
    budget_balance=-0.50,
)
r = EconomicStabilityGrader().grade(worst_economy)
assert r.score <= 0.25, f"Worst economy should score ≤ 0.25, got {r.score}"
print(f"[OK] Worst economy state score={r.score:.4f} (expected ≤ 0.25)")
# Perfect pandemic state
# NOTE(review): EmergentMetrics is first needed by the social-crisis checks
# further down; PEP 8 would place this import in the top-of-file import group.
from civicai.models import EmergentMetrics

perfect_pandemic = SocietyState(
    infection_rate=0.01,
    health_index=0.85,
    gdp=480.0,
    medical_supplies=0.90,
)
r = PandemicManagementGrader().grade(perfect_pandemic)
assert r.score >= 0.80, f"Perfect pandemic state should score ≥ 0.80, got {r.score}"
print(f"[OK] Perfect pandemic state score={r.score:.4f} (expected ≥ 0.80)")

# Worst pandemic state
worst_pandemic = SocietyState(
    infection_rate=0.50,    # out-of-control epidemic
    health_index=0.25,
    gdp=180.0,
    medical_supplies=0.10,
)
r = PandemicManagementGrader().grade(worst_pandemic)
assert r.score <= 0.25, f"Worst pandemic should score ≤ 0.25, got {r.score}"
print(f"[OK] Worst pandemic state score={r.score:.4f} (expected ≤ 0.25)")
# Perfect social state
perfect_social = SocietyState(
    public_satisfaction=0.80,
    crime_rate=0.03,
    employment_rate=0.92,
    emergent=EmergentMetrics(wealth_inequality=0.18, social_unrest=0.10),
)
r = SocialCrisisGrader().grade(perfect_social)
assert r.score >= 0.75, f"Perfect social state should score ≥ 0.75, got {r.score}"
print(f"[OK] Perfect social state score={r.score:.4f} (expected ≥ 0.75)")

# Cascade penalty fires
cascade_social = SocietyState(
    public_satisfaction=0.55,
    crime_rate=0.10,
    employment_rate=0.82,
    emergent=EmergentMetrics(wealth_inequality=0.35, social_unrest=0.80),  # >0.65 → cascade
)
r_cascade = SocialCrisisGrader().grade(cascade_social)
# Lower only the unrest value on the same object so every other metric is
# identical between the two grades; the score gap then isolates the penalty.
cascade_social.emergent.social_unrest = 0.30  # same metrics, no cascade
r_no_cascade = SocialCrisisGrader().grade(cascade_social)
assert r_cascade.score < r_no_cascade.score, "Cascade penalty must reduce score"
print(f"[OK] Cascade penalty fires: with_cascade={r_cascade.score:.4f} < no_cascade={r_no_cascade.score:.4f}")
# ── 5. step() info contains task_grade ─────────────────────────────────────
# The info dict returned by env.step() must carry the grade of the active task.
print("\n── Environment Integration ──")
env.reset(task_id="stabilize_economy")
# Unpacking four names also checks that step() returns a 4-tuple; only the
# info dict is inspected here.
_obs, _reward, _done, info = env.step(Action())
assert "task_grade" in info, "step() info must contain 'task_grade'"
tg = info["task_grade"]
assert "score" in tg and "components" in tg and "success" in tg, (
    f"task_grade is missing required keys, got {sorted(tg)}"
)
assert 0.0 <= tg["score"] <= 1.0, f"task_grade score out of [0,1]: {tg['score']}"
print(f"[OK] step() info['task_grade'] score={tg['score']:.4f} success={tg['success']}")

# Verify all 3 tasks via step()
for task_id in ["stabilize_economy", "manage_pandemic", "control_crisis"]:
    env.reset(task_id=task_id)
    _obs, _reward, _done, info = env.step(Action())
    tg = info["task_grade"]
    assert tg["task_id"] == task_id, (
        f"task_grade reports task_id={tg['task_id']!r}, expected {task_id!r}"
    )
    assert 0.0 <= tg["score"] <= 1.0, f"task_grade score out of [0,1]: {tg['score']}"
    assert isinstance(tg["success"], bool), "task_grade success must be bool"
    comp_keys = set(tg["components"].keys())
    assert len(comp_keys) >= 4, f"Expected ≥4 components, got {comp_keys}"
    print(f"[OK] {task_id:25s} grade={tg['score']:.4f} components={sorted(comp_keys)}")
# ── Summary ────────────────────────────────────────────────────────────────
# Only reached when every assertion above has passed.
# NOTE(review): the dash, check-mark and "element of" symbols below were
# restored from mis-encoded text; confirm them against the original file.
summary_lines = (
    "",
    "=" * 60,
    " ALL GRADER CHECKS PASSED",
    "",
    " Tasks:",
    " stabilize_economy [EASY] — Macroeconomic governance",
    " manage_pandemic [MEDIUM] — Public-health policy",
    " control_crisis [HARD] — Multi-domain social crisis",
    "",
    " Grader properties:",
    " ✓ Returns float ∈ [0.0, 1.0]",
    " ✓ Fully deterministic (no randomness)",
    " ✓ Per-component breakdown included",
    " ✓ Exposed in step() info['task_grade']",
    "=" * 60,
)
# One print of the joined lines emits the same text as one print per line.
print("\n".join(summary_lines))