def compute_grade(
    action_taken: str,
    correct_action: str,
    output: object,
    correct_answer: object,
) -> float:
    """Return a score between 0.0 and 1.0 for a single graded step.

    The score is a weighted sum of three components:

    * action correctness (weight 0.4): 1.0 when ``action_taken`` equals
      ``correct_action``, otherwise 0.0;
    * answer correctness (weight 0.5): 1.0 when ``output`` equals
      ``correct_answer``, otherwise 0.0;
    * efficiency (weight 0.1): 0.5 when ``action_taken`` is one of the
      tool actions ``"use_calculator"`` / ``"use_search"``, otherwise 1.0.

    Args:
        action_taken: The action that was chosen.
        correct_action: The action that was expected.
        output: The produced answer; compared to ``correct_answer`` with
            ``==`` (exact equality, no normalization).
        correct_answer: The expected answer.

    Returns:
        The weighted score rounded to two decimal places.
    """
    # Actions that invoke a tool; using one halves the efficiency component.
    tool_actions = ("use_calculator", "use_search")

    action_weight = 0.4
    answer_weight = 0.5
    efficiency_weight = 0.1

    # 1. Action correctness
    action_correct = 1.0 if action_taken == correct_action else 0.0

    # 2. Answer correctness
    answer_correct = 1.0 if output == correct_answer else 0.0

    # 3. Efficiency (simple version): a tool call has a cost, a direct
    #    answer does not. This is independent of whether the action or
    #    the answer was correct.
    efficiency = 0.5 if action_taken in tool_actions else 1.0

    # Final score. The weights sum to 1.0, so the result stays in [0.0, 1.0].
    score = (
        action_weight * action_correct
        + answer_weight * answer_correct
        + efficiency_weight * efficiency
    )
    return round(score, 2)