# Spaces: Sleeping
# File size: 4,580 Bytes
# (File-viewer gutter of per-line commit hashes and line numbers 1-188 omitted.)
# inference.py
import os
import json
from agent_llm import get_action
from app.env import CustomerSupportEnv
from graders import grade_easy, grade_medium, grade_hard
#from tasks import TASKS
from app.env import get_tasks
import sys
# Task definitions returned by app.env.get_tasks(), loaded once at import time.
# The code below reads the "id", "difficulty" and optional "grader" keys of
# each entry.
TASKS = get_tasks()
def compute_score(task_type, env, success, steps, rewards):
    """Score a finished episode with the grader matching ``task_type``.

    Args:
        task_type: Difficulty label; "easy", "medium" and "hard" are
            recognised.
        env: Forwarded unchanged to the grader.
        success: Forwarded unchanged to the grader.
        steps: Forwarded unchanged to the grader.
        rewards: Forwarded unchanged to the grader.

    Returns:
        Whatever the selected grader returns, or 0.5 when ``task_type`` is
        not one of the three recognised labels.
    """
    if task_type == "easy":
        grader = grade_easy
    elif task_type == "medium":
        grader = grade_medium
    elif task_type == "hard":
        grader = grade_hard
    else:
        # Fallback value; the original author notes it should never be hit.
        return 0.5
    return grader(env, success, steps, rewards)
# =========================
# ACTION FORMATTER
# =========================
def format_action(action: dict) -> str:
    """Render an action dict as the short call-like text used in step logs.

    Args:
        action: Mapping with a "type" key and, for "ask_info", a "field" key.

    Returns:
        "null" for a falsy action, "ask_info('<field>')" for ask_info,
        "resolve()" / "classify()" for those types, and ``str(action)`` for
        anything else.
    """
    if not action:
        return "null"

    kind = action.get("type")
    if kind == "ask_info":
        field = action.get("field")
        return f"ask_info('{field}')"

    # Argument-less actions are rendered as a bare call.
    for name in ("resolve", "classify"):
        if kind == name:
            return name + "()"

    return str(action)
# =========================
# RUN SINGLE TASK
# =========================
def run_single_task(task):
    """Run one episode of ``task``, log every step, grade it and print the result.

    Args:
        task: Mapping with an "id" key (used in all log lines), a
            "difficulty" key (passed to ``CustomerSupportEnv``) and an
            optional "grader" callable invoked as
            ``grader(env, success, step_count, rewards)``.

    Output (stdout):
        * one ``[STEP] ...`` line per environment step;
        * one ``[STEP] ... error=<message>`` line if the episode loop raises;
        * one ``[END] ...`` summary line;
        * one JSON object ``{"task_id": ..., "score": ...}`` with the score
          rounded to 4 decimals, flushed immediately.

    Exceptions raised inside the episode loop are caught and reported as an
    error step; exceptions from building/resetting the environment or from
    the grader are not caught here.
    """
    task_name = task["id"]
    env = CustomerSupportEnv(difficulty=task["difficulty"])
    obs = env.reset()
    step_count = 0
    rewards = []
    success = False

    try:
        done = False
        while not done:
            # Built inside the loop so each agent call receives its own list.
            valid_actions = [
                {"type": "ask_info", "field": "order_id"},
                {"type": "ask_info", "field": "account_email"},
                {"type": "ask_info", "field": "device_type"},
                {"type": "ask_info", "field": "browser"},
                {"type": "resolve"},
                {"type": "classify"},
            ]
            action = get_action(obs, valid_actions)
            next_obs, reward, done, info = env.step(action)
            step_count += 1
            rewards.append(reward)
            print(
                f"[STEP] task={task_name} step={step_count} "
                f"action={format_action(action)} "
                f"reward={reward:.2f} "
                f"done={'true' if done else 'false'} "
                f"error=null"
            )
            obs = next_obs
            # Keep the latest success flag reported by the environment.
            success = info.get("task_success", False)
    except Exception as e:
        # Report the failure as a terminal step; the step counter and the
        # rewards list are left as they were when the exception was raised.
        print(
            f"[STEP] task={task_name} step={step_count+1} "
            f"action=null reward=0.00 done=true error={str(e)}"
        )

    # Score with the grader attached to the task; 0.5 when none is provided.
    grader = task.get("grader")
    if grader:
        score = grader(env, success, step_count, rewards)
    else:
        score = 0.5

    rewards_str = ",".join(f"{r:.2f}" for r in rewards)
    print(
        f"[END] task={task_name} "
        f"success={'true' if success else 'false'} "
        f"steps={step_count} "
        f"score={score:.2f} "
        f"rewards={rewards_str}"
    )

    # Machine-readable result line (the original comments call this the
    # grader signal); flushed so it is written out immediately.
    print(json.dumps({
        "task_id": task_name,
        "score": float(round(score, 4))
    }), flush=True)
# =========================
# MAIN
# =========================
def main():
    """Print the run configuration, then run every task in ``TASKS`` in order.

    Reads ``MODEL_NAME`` (default "unknown-model") and ``API_BASE_URL``
    (default "https://router.huggingface.co/v1") from the environment.
    Prints one ``[CONFIG]`` line, then for each task a ``[START]`` line
    followed by ``run_single_task(task)``.
    """
    model_name = os.getenv("MODEL_NAME", "unknown-model")
    api_base_url = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
    print(f"[CONFIG] api_base_url={api_base_url}")
    benchmark = "openenv"
    # One [START] line per task, printed right before that task runs; the
    # original note says a validator reads these lines.
    for task in TASKS:
        task_name = task["id"]
        print(f"[START] task={task_name} env={benchmark} model={model_name}")
        run_single_task(task)
# Run the tasks only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()