| """Evaluate heuristic baseline on all tasks and print scores.""" | |
| import copy | |
| import json | |
| import sys | |
| from src.tasks import TASKS | |
| from src.grader import RobustnessGrader | |
| from src.baseline import heuristic_policy | |
def main(n_episodes: int = 10):
    """Run the heuristic baseline on every task and report the outcome.

    For each entry in ``TASKS`` a ``RobustnessGrader`` is built from a deep
    copy of the task config, ``heuristic_policy`` is evaluated through
    ``evaluate_policy``, and every key/value pair of the result is printed.

    Args:
        n_episodes: Value forwarded to ``evaluate_policy`` as ``n_episodes``.

    Returns:
        A dict keyed by task id. Each value is whatever ``evaluate_policy``
        returned, or ``{"error": <message>}`` when evaluating or printing
        that task raised an ``Exception``.
    """
    outcomes = {}
    for task_id, task_cfg in TASKS.items():
        try:
            # The grader receives its own deep copy of the config
            # (presumably so the shared TASKS entry is never mutated --
            # TODO confirm against RobustnessGrader).
            metrics = RobustnessGrader(copy.deepcopy(task_cfg)).evaluate_policy(
                heuristic_policy,
                n_episodes=n_episodes,
            )
            outcomes[task_id] = metrics

            # Printing stays inside the try: a failure while reporting
            # replaces the stored result with an error entry as well.
            print(f"{task_id}:")
            for metric_name, metric_value in metrics.items():
                print(f" {metric_name}: {metric_value}")
            print()
        except Exception as exc:
            # One failing task must not abort the remaining tasks.
            outcomes[task_id] = {"error": str(exc)}
            print(f"{task_id}: FAILED — {exc}\n")
    return outcomes
if __name__ == "__main__":
    # Script entry point: an optional first command-line argument overrides
    # the default of 10 episodes; it must parse as an integer.
    episodes = 10
    if sys.argv[1:]:
        episodes = int(sys.argv[1])
    main(n_episodes=episodes)