CivicAI / validate_graders.py
mahammadaftab's picture
Final updated
6298125
r"""
CivicAI — Full Grader & Task Validation
Run: venv/Scripts/python.exe validate_graders.py
"""
from __future__ import annotations
import sys
# Every check in this script is an `assert` statement. Under `python -O` or
# PYTHONOPTIMIZE asserts are compiled away, so the suite would report success
# without having verified anything. Refuse to run in that mode.
if not __debug__:
    raise SystemExit(
        "validate_graders.py relies on assert statements; "
        "run it without -O / PYTHONOPTIMIZE."
    )

# Banner goes out ahead of the civicai imports that follow
# (presumably so the header is visible even when an import fails).
print("=" * 60)
print(" CivicAI Grader Validation Suite")
print("=" * 60)
# ── Imports ────────────────────────────────────────────────────────────────
from civicai.environment import CivicAIEnv
from civicai.models import Action, Observation, SocietyState, SubsidyPolicy
from civicai.graders import (
grade,
GradeResult,
EconomicStabilityGrader,
PandemicManagementGrader,
SocialCrisisGrader,
GRADERS,
)
# Reaching this line means every name imported from civicai above resolved.
print("[OK] All grader imports successful")

# ── 1. Registry check ──────────────────────────────────────────────────────
# GRADERS must expose exactly these three task ids: no more, no fewer.
print("\n── Task Registry ──")
expected_tasks = {"stabilize_economy", "manage_pandemic", "control_crisis"}
registered_tasks = set(GRADERS.keys())
assert registered_tasks == expected_tasks, (
    f"GRADERS registry mismatch: expected {sorted(expected_tasks)}, "
    f"got {sorted(map(str, registered_tasks))}"
)
print(f"[OK] 3 tasks registered: {sorted(GRADERS.keys())}")
# ── 2. Return type & range checks ─────────────────────────────────────────
print("\n── Return Type & Range ──")
env = CivicAIEnv()
for task_id in ["stabilize_economy", "manage_pandemic", "control_crisis"]:
obs = env.reset(task_id=task_id)
state = env.state()
result = grade(state, task_id)
assert isinstance(result, GradeResult), f"grade() must return GradeResult, got {type(result)}"
assert isinstance(result.score, float), f"score must be float, got {type(result.score)}"
assert 0.0 <= result.score <= 1.0, f"score out of [0,1]: {result.score}"
assert isinstance(result.success, bool), "success must be bool"
assert isinstance(result.to_dict(), dict), "to_dict() must return dict"
d = result.to_dict()
assert "score" in d and "components" in d and "success" in d and "summary" in d
print(f"[OK] {task_id:25s} score={result.score:.4f} success={result.success}")
# ── 3. DETERMINISM β€” same state always gives same score ──────────────────
print("\n── Determinism Test ──")
for task_id in ["stabilize_economy", "manage_pandemic", "control_crisis"]:
env.reset(task_id=task_id)
# advance 5 steps with default action
for _ in range(5):
env.step(Action())
state = env.state()
scores = [grade(state, task_id).score for _ in range(50)] # call 50 times
assert len(set(scores)) == 1, (
f"[FAIL] Non-deterministic! Got {len(set(scores))} distinct scores for {task_id}"
)
print(f"[OK] {task_id:25s} deterministic over 50 calls, score={scores[0]:.4f}")
# ── 4. Boundary values β€” perfect state scores close to 1.0 ───────────────
print("\n── Boundary Values ──")
# Perfect economy state
perfect_economy = SocietyState(
inflation=0.02, # very low
employment_rate=0.95, # very high
gdp=600.0, # high
budget_balance=0.10, # surplus
)
r = EconomicStabilityGrader().grade(perfect_economy)
assert r.score >= 0.80, f"Perfect economy should score β‰₯ 0.80, got {r.score}"
print(f"[OK] Perfect economy state score={r.score:.4f} (expected β‰₯ 0.80)")
# Worst economy state
worst_economy = SocietyState(
inflation=0.25, # hyperinflation
employment_rate=0.60,
gdp=100.0,
budget_balance=-0.50,
)
r = EconomicStabilityGrader().grade(worst_economy)
assert r.score <= 0.25, f"Worst economy should score ≀ 0.25, got {r.score}"
print(f"[OK] Worst economy state score={r.score:.4f} (expected ≀ 0.25)")
# EmergentMetrics is imported here rather than with the imports at the top of
# the file. The pandemic states in this section do not use it; the
# social-crisis states built further down do.
from civicai.models import EmergentMetrics

# Perfect pandemic state: must land in the top band of the score range.
perfect_pandemic = SocietyState(
    infection_rate=0.01,
    health_index=0.85,
    gdp=480.0,
    medical_supplies=0.90,
)
r = PandemicManagementGrader().grade(perfect_pandemic)
assert r.score >= 0.80, f"Perfect pandemic state should score ≥ 0.80, got {r.score}"
print(f"[OK] Perfect pandemic state score={r.score:.4f} (expected ≥ 0.80)")

# Worst pandemic state: must land in the bottom band of the score range.
worst_pandemic = SocietyState(
    infection_rate=0.50,  # out-of-control epidemic
    health_index=0.25,
    gdp=180.0,
    medical_supplies=0.10,
)
r = PandemicManagementGrader().grade(worst_pandemic)
assert r.score <= 0.25, f"Worst pandemic should score ≤ 0.25, got {r.score}"
print(f"[OK] Worst pandemic state score={r.score:.4f} (expected ≤ 0.25)")
# Perfect social state: must reach at least 0.75.
perfect_social = SocietyState(
    public_satisfaction=0.80,
    crime_rate=0.03,
    employment_rate=0.92,
    emergent=EmergentMetrics(wealth_inequality=0.18, social_unrest=0.10),
)
r = SocialCrisisGrader().grade(perfect_social)
assert r.score >= 0.75, f"Perfect social state should score ≥ 0.75, got {r.score}"
print(f"[OK] Perfect social state score={r.score:.4f} (expected ≥ 0.75)")

# Cascade penalty fires: grade one state twice, changing only social_unrest
# between the two calls, and require the high-unrest grade to be strictly lower.
cascade_social = SocietyState(
    public_satisfaction=0.55,
    crime_rate=0.10,
    employment_rate=0.82,
    emergent=EmergentMetrics(wealth_inequality=0.35, social_unrest=0.80),  # >0.65 → cascade
)
r_cascade = SocialCrisisGrader().grade(cascade_social)
# The first grade is already captured in r_cascade, so the state can be
# mutated in place for the comparison run.
cascade_social.emergent.social_unrest = 0.30  # same metrics, no cascade
r_no_cascade = SocialCrisisGrader().grade(cascade_social)
assert r_cascade.score < r_no_cascade.score, "Cascade penalty must reduce score"
print(f"[OK] Cascade penalty fires: with_cascade={r_cascade.score:.4f} < no_cascade={r_no_cascade.score:.4f}")
# ── 5. step() info contains task_grade ───────────────────────────────────
print("\n── Environment Integration ──")
env.reset(task_id="stabilize_economy")
obs, reward, done, info = env.step(Action())
assert "task_grade" in info, "step() info must contain 'task_grade'"
tg = info["task_grade"]
assert "score" in tg and "components" in tg and "success" in tg
assert 0.0 <= tg["score"] <= 1.0
print(f"[OK] step() info['task_grade'] score={tg['score']:.4f} success={tg['success']}")
# Verify all 3 tasks via step()
for task_id in ["stabilize_economy", "manage_pandemic", "control_crisis"]:
obs = env.reset(task_id=task_id)
obs, reward, done, info = env.step(Action())
tg = info["task_grade"]
assert tg["task_id"] == task_id
assert 0.0 <= tg["score"] <= 1.0
assert isinstance(tg["success"], bool)
comp_keys = set(tg["components"].keys())
assert len(comp_keys) >= 4, f"Expected β‰₯4 components, got {comp_keys}"
print(f"[OK] {task_id:25s} grade={tg['score']:.4f} components={sorted(comp_keys)}")
# ── Summary ───────────────────────────────────────────────────────────────
print()
print("=" * 60)
print(" ALL GRADER CHECKS PASSED")
print()
print(" Tasks:")
print(" stabilize_economy [EASY] β€” Macroeconomic governance")
print(" manage_pandemic [MEDIUM] β€” Public-health policy")
print(" control_crisis [HARD] β€” Multi-domain social crisis")
print()
print(" Grader properties:")
print(" βœ… Returns float ∈ [0.0, 1.0]")
print(" βœ… Fully deterministic (no randomness)")
print(" βœ… Per-component breakdown included")
print(" βœ… Exposed in step() info['task_grade']")
print("=" * 60)