phonepilot-qwen7b / eval /null_complex_multi_objective_dinner.jsonl
pranav-1100's picture
eval: 5 baselines, 3 seeds
bcff434 verified
{"task_id": "complex_multi_objective_dinner", "policy": "null", "seed": 1, "total_reward": -0.23999999999999996, "reward_components": {"goal": 0.0, "truthfulness": 0.0, "efficiency": -0.23999999999999996, "appropriateness": 0.0, "format": 0.0, "calibration": 0.0}, "steps_taken": 12, "terminated": true, "end_claim": null, "end_summary": "", "end_confidence": null}
{"task_id": "complex_multi_objective_dinner", "policy": "null", "seed": 2, "total_reward": -0.23999999999999996, "reward_components": {"goal": 0.0, "truthfulness": 0.0, "efficiency": -0.23999999999999996, "appropriateness": 0.0, "format": 0.0, "calibration": 0.0}, "steps_taken": 12, "terminated": true, "end_claim": null, "end_summary": "", "end_confidence": null}
{"task_id": "complex_multi_objective_dinner", "policy": "null", "seed": 3, "total_reward": -0.23999999999999996, "reward_components": {"goal": 0.0, "truthfulness": 0.0, "efficiency": -0.23999999999999996, "appropriateness": 0.0, "format": 0.0, "calibration": 0.0}, "steps_taken": 12, "terminated": true, "end_claim": null, "end_summary": "", "end_confidence": null}