Spaces:
Running
Running
| """ | |
| Manual grader testing CLI. | |
| Run a simulated episode and score it with the official grader. | |
| Usage: | |
| python scripts/run_grader.py --task single_crash --agent correct | |
| python scripts/run_grader.py --task cascading_failure --agent wrong | |
| python scripts/run_grader.py --task silent_degradation --agent correct | |
| python scripts/run_grader.py --all | |
| """ | |
| import argparse | |
| import sys | |
| import os | |
| sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) | |
| from server.environment import LogTriageEnvironment | |
| from server.models import TriageAction | |
| from server.graders import score_episode | |
# ─── CORRECT AGENT SCRIPTS ────────────────────────────────────────────────────
# Scripted action sequences replayed for the "correct" agent, keyed by task id.
# Each step is written as (action_type, value, confidence) and expanded into a
# TriageAction, preserving both task order and step order.
CORRECT_ACTIONS = {
    task_name: [
        TriageAction(action_type=kind, value=target, confidence=certainty)
        for kind, target, certainty in steps
    ]
    for task_name, steps in {
        "single_crash": (
            ("classify_severity", "P1", 0.95),
            ("identify_root_cause", "payment-service", 0.90),
            ("remediate", "restart:payment-service", 0.85),
            ("resolve", "resolved", 1.00),
        ),
        "cascading_failure": (
            ("classify_severity", "P1", 0.90),
            ("identify_root_cause", "user-db", 0.85),
            ("remediate", "kill-query:user-db", 0.90),
            ("resolve", "resolved", 1.00),
        ),
        "silent_degradation": (
            ("request_more_logs", "payment-db", 0.70),
            ("classify_severity", "P2", 0.80),
            ("identify_root_cause", "payment-db", 0.85),
            ("remediate", "flush-cache:payment-db", 0.80),
            ("resolve", "resolved", 1.00),
        ),
    }.items()
}
# ─── WRONG AGENT SCRIPTS ──────────────────────────────────────────────────────
# Scripted action sequences replayed for the "wrong" agent, keyed by task id.
# Each step is written as (action_type, value, confidence) and expanded into a
# TriageAction, preserving both task order and step order.
WRONG_ACTIONS = {
    task_name: [
        TriageAction(action_type=kind, value=target, confidence=certainty)
        for kind, target, certainty in steps
    ]
    for task_name, steps in {
        "single_crash": (
            ("classify_severity", "P3", 0.50),
            ("identify_root_cause", "api-gateway", 0.50),
            ("remediate", "restart:api-gateway", 0.50),
            ("resolve", "resolved", 1.00),
        ),
        "cascading_failure": (
            ("classify_severity", "P2", 0.60),
            ("identify_root_cause", "api-gateway", 0.60),
            ("remediate", "restart:api-gateway", 0.60),
            ("resolve", "resolved", 1.00),
        ),
        "silent_degradation": (
            ("classify_severity", "P1", 0.90),
            ("identify_root_cause", "api-gateway", 0.70),
            ("remediate", "restart:api-gateway", 0.70),
            ("resolve", "resolved", 1.00),
        ),
    }.items()
}
def run_test(task_id: str, agent_type: str, seed: int = 42) -> dict:
    """Replay one scripted episode and return the grader's result for it.

    Args:
        task_id: Task to run; passed to ``env.reset`` and used as the key into
            ``CORRECT_ACTIONS`` / ``WRONG_ACTIONS``.
        agent_type: ``"correct"`` or ``"wrong"`` -- selects which script is
            replayed against the environment.
        seed: Forwarded to ``env.reset``.

    Returns:
        The value produced by ``score_episode(task_id, env.state)``.

    Raises:
        ValueError: If ``agent_type`` is neither ``"correct"`` nor ``"wrong"``.
        KeyError: If ``task_id`` has no entry in the selected script table.
    """
    # Reject unknown agent types up front: previously anything other than
    # "correct" (including typos) silently replayed the "wrong" script and
    # produced a misleading score.
    if agent_type not in ("correct", "wrong"):
        raise ValueError(
            f"agent_type must be 'correct' or 'wrong', got {agent_type!r}"
        )

    env = LogTriageEnvironment()
    env.reset(task_id=task_id, seed=seed)

    scripts = CORRECT_ACTIONS if agent_type == "correct" else WRONG_ACTIONS
    for action in scripts[task_id]:
        obs = env.step(action)
        # Stop replaying as soon as the environment reports the episode done.
        if obs.done:
            break

    return score_episode(task_id, env.state)
def print_result(task_id: str, agent_type: str, result: dict):
    """Print a human-readable report for one graded episode.

    Reads ``score``, ``steps_taken``, ``max_steps`` and ``resolved`` from
    ``result`` (all required) and the optional ``breakdown`` mapping, whose
    entries are listed one per line.
    """
    # Looked up before anything is printed, so a result without a score
    # fails without emitting a partial report.
    episode_score = result["score"]
    rule = "=" * 60

    print("\n" + rule)
    print(f"Task: {task_id}")
    print(f"Agent: {agent_type}")
    print(f"Score: {episode_score:.4f}")
    print(f"Steps: {result['steps_taken']}/{result['max_steps']}")
    print(f"Resolved: {result['resolved']}")
    print("\nBreakdown:")

    breakdown = result.get("breakdown", {})
    for label, value in breakdown.items():
        print(f" {label:<20} {value}")

    print(rule)
def main():
    """Command-line entry point for manual grader testing.

    Modes, in order of precedence:
      * ``--all``  -- run every task with both the correct and the wrong
        script and print the two scores plus their difference (``--task`` and
        ``--agent`` are not consulted in this mode).
      * ``--task`` -- run one task with the script chosen by ``--agent``
        (default ``correct``) and print the full report.
      * neither    -- print the help text.
    """
    task_names = ["single_crash", "cascading_failure", "silent_degradation"]

    cli = argparse.ArgumentParser(description="Test LogTriageEnv graders")
    cli.add_argument("--task", choices=task_names, help="Task to test")
    cli.add_argument(
        "--agent",
        choices=["correct", "wrong"],
        default="correct",
        help="Agent type to simulate",
    )
    cli.add_argument(
        "--all",
        action="store_true",
        help="Run all tasks with both correct and wrong agents",
    )
    opts = cli.parse_args()

    # Single-task and no-argument modes are handled first so the comparison
    # table below is the only thing left in the main body.
    if not opts.all:
        if opts.task:
            outcome = run_test(opts.task, opts.agent)
            print_result(opts.task, opts.agent, outcome)
        else:
            cli.print_help()
        return

    print("\n[TEST] Running all tasks with correct and wrong agents...\n")
    print(f"{'Task':<25} {'Agent':<10} {'Score':<8} {'Variance'}")
    print("-" * 60)

    for name in task_names:
        # Run both episodes before reading either score.
        outcomes = [run_test(name, kind) for kind in ("correct", "wrong")]
        good, bad = (outcome["score"] for outcome in outcomes)
        gap = good - bad

        # The correct script is expected to beat the wrong one by more than
        # 0.10; otherwise the row is marked [LOW].
        if gap > 0.10:
            flag = "[OK]"
        else:
            flag = "[LOW]"

        print(f"{name:<25} correct {good:.4f}")
        print(f"{name:<25} wrong {bad:.4f} delta={gap:.4f} {flag}")
        print()


if __name__ == "__main__":
    main()