# inference.py
"""Run the customer-support benchmark tasks and print validator-readable logs.

For every task returned by ``get_tasks()`` the script prints one ``[START]``
line, one ``[STEP]`` line per environment step, one ``[END]`` summary line,
and finally a JSON object holding the task id and its score.
"""

import json
import os
import sys

from agent_llm import get_action
from app.env import CustomerSupportEnv, get_tasks
from graders import grade_easy, grade_medium, grade_hard

TASKS = get_tasks()


def compute_score(task_type, env, success, steps, rewards):
    """Score an episode with the grader registered for ``task_type``.

    Args:
        task_type: Difficulty label; ``"easy"``, ``"medium"`` and ``"hard"``
            are recognised.
        env: The environment instance the episode ran in.
        success: Whether the episode was reported as successful.
        steps: Number of steps taken.
        rewards: Per-step rewards collected during the episode.

    Returns:
        The selected grader's result, or ``0.5`` for an unrecognised
        ``task_type``.
    """
    # Built per call so the graders are resolved at call time, exactly like
    # the original if/elif chain did.
    graders_by_type = {
        "easy": grade_easy,
        "medium": grade_medium,
        "hard": grade_hard,
    }
    grader = graders_by_type.get(task_type)
    if grader is None:
        return 0.5  # fallback (should never hit)
    return grader(env, success, steps, rewards)


# =========================
# ACTION FORMATTER
# =========================
def format_action(action: dict) -> str:
    """Render an action dict as the compact call-like string used in logs.

    Returns ``"null"`` for a missing/empty action, a call-style string for the
    known action types, and ``str(action)`` for anything else.
    """
    if not action:
        return "null"

    action_type = action.get("type")

    if action_type == "ask_info":
        return f"ask_info('{action.get('field')}')"
    if action_type == "resolve":
        return "resolve()"
    if action_type == "classify":
        return "classify()"

    return str(action)


def _valid_actions() -> list:
    """Return a fresh list of the actions offered to the agent at each step."""
    return [
        {"type": "ask_info", "field": "order_id"},
        {"type": "ask_info", "field": "account_email"},
        {"type": "ask_info", "field": "device_type"},
        {"type": "ask_info", "field": "browser"},
        {"type": "resolve"},
        {"type": "classify"},
    ]


def _single_line(value) -> str:
    """Flatten ``str(value)`` onto one line so it cannot split a log record."""
    return " ".join(str(value).splitlines())


# =========================
# RUN SINGLE TASK
# =========================
def run_single_task(task):
    """Run one task to completion and print its step, end and JSON records.

    Args:
        task: Mapping with at least ``"id"`` and ``"difficulty"``; it may also
            carry a callable under ``"grader"`` that is invoked as
            ``grader(env, success, step_count, rewards)``.

    Any exception raised while stepping the episode is reported as a final
    ``[STEP]`` record with ``error=<message>`` instead of propagating; scoring
    and the closing records are still emitted afterwards.
    """
    task_name = task["id"]

    env = CustomerSupportEnv(difficulty=task["difficulty"])
    obs = env.reset()

    step_count = 0
    rewards = []
    success = False

    try:
        done = False

        while not done:
            # A new list is built for every step, so the agent always receives
            # an untouched action catalogue.
            action = get_action(obs, _valid_actions())

            next_obs, reward, done, info = env.step(action)

            step_count += 1
            rewards.append(reward)

            print(
                f"[STEP] task={task_name} step={step_count} "
                f"action={format_action(action)} "
                f"reward={reward:.2f} "
                f"done={'true' if done else 'false'} "
                f"error=null"
            )

            obs = next_obs
            success = info.get("task_success", False)

    except Exception as e:
        # The log is line-oriented, so a multi-line exception message must be
        # flattened to keep the whole record on a single line.
        print(
            f"[STEP] task={task_name} step={step_count + 1} "
            f"action=null reward=0.00 done=true "
            f"error={_single_line(e)}"
        )

    # =========================
    # SCORE USING TASK-SPECIFIC GRADER
    # =========================
    grader = task.get("grader")

    if grader:
        score = grader(env, success, step_count, rewards)
    else:
        score = 0.5

    rewards_str = ",".join(f"{r:.2f}" for r in rewards)

    print(
        f"[END] task={task_name} "
        f"success={'true' if success else 'false'} "
        f"steps={step_count} "
        f"score={score:.2f} "
        f"rewards={rewards_str}"
    )

    # =========================
    # CRITICAL: JSON OUTPUT (GRADER SIGNAL)
    # =========================
    print(json.dumps({
        "task_id": task_name,
        "score": float(round(score, 4))
    }), flush=True)


# =========================
# MAIN
# =========================
def main():
    """Print the run configuration, then run every task in ``TASKS``."""
    model_name = os.getenv("MODEL_NAME", "unknown-model")
    api_base_url = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")

    print(f"[CONFIG] api_base_url={api_base_url}")

    benchmark = "openenv"

    # CRITICAL: one START record per task (the validator reads this).
    for task in TASKS:
        task_name = task["id"]

        print(f"[START] task={task_name} env={benchmark} model={model_name}")

        run_single_task(task)


if __name__ == "__main__":
    main()