""" Baseline inference script for the API Debug Environment. MANDATORY: - Must be named inference.py and placed in the root directory. - Must use OpenAI Client for all LLM calls. - Must read env vars: API_BASE_URL, MODEL_NAME, HF_TOKEN. - Must emit [START], [STEP], [END] structured logs to stdout. STDOUT FORMAT: [START] task= env= model= [STEP] step= action= reward=<0.00> done= error= [END] success= steps= score= rewards= """ import asyncio import json import os import re import textwrap from typing import List, Optional from openai import OpenAI from client import APIDebugEnv from models import APIDebugAction # Environment variables (mandatory for hackathon evaluation) API_BASE_URL = os.getenv("API_BASE_URL") or "https://router.huggingface.co/v1" HF_TOKEN = os.getenv("HF_TOKEN") API_KEY = HF_TOKEN or os.getenv("OPENAI_API_KEY") or os.getenv("API_KEY") MODEL_NAME = os.getenv("MODEL_NAME") or "Qwen/Qwen2.5-72B-Instruct" ENV_URL = os.getenv("ENV_URL") or "https://avichauhan-api-debug-env.hf.space" IMAGE_NAME = os.getenv("LOCAL_IMAGE_NAME") # Task configuration TASKS = ["easy", "classify", "medium", "headers", "response", "hard"] EPISODES_PER_TASK = 3 MAX_STEPS = {"easy": 3, "classify": 4, "medium": 5, "headers": 4, "response": 4, "hard": 7} BENCHMARK_NAME = "api_debug" # ========================================================================= # Structured logging (exact format required by evaluator) # ========================================================================= def log_start(task: str, env: str, model: str) -> None: print(f"[START] task={task} env={env} model={model}", flush=True) def log_step( step: int, action: str, reward: float, done: bool, error: Optional[str] ) -> None: done_val = str(done).lower() error_val = error if error else "null" print( f"[STEP] step={step} action={action} reward={reward:.4f} " f"done={done_val} error={error_val}", flush=True, ) def log_end( success: bool, steps: int, score: float, rewards: List[float] ) -> None: rewards_str = ",".join(f"{r:.4f}" for r in rewards) print( f"[END] success={str(success).lower()} steps={steps} " f"score={score:.4f} rewards={rewards_str}", flush=True, ) # ========================================================================= # System prompts per task # ========================================================================= SYSTEM_PROMPTS = { "easy": textwrap.dedent(""" You are an API debugging expert. You receive a broken API request and its specification. Your job: identify the error type and the affected fields. Respond with ONLY a JSON object in this format: {"error_type": "", "affected_fields": ["field1", "field2"]} Valid error types: missing_required_field, wrong_field_type, invalid_email_format, missing_auth_header, extra_unknown_field, null_value_in_required, wrong_http_method, malformed_json_value, invalid_enum_value, datetime_format_error, wrong_content_type, expired_auth_token """).strip(), "classify": textwrap.dedent(""" You are an API debugging expert. You receive a broken API request with MULTIPLE errors. Your job: identify ALL error types and ALL affected fields. Respond with ONLY a JSON object in this format: {"error_types": ["type1", "type2"], "affected_fields": ["field1", "field2"]} Valid error types: missing_required_field, wrong_field_type, invalid_email_format, missing_auth_header, extra_unknown_field, null_value_in_required, wrong_http_method, malformed_json_value, invalid_enum_value, datetime_format_error, wrong_content_type, expired_auth_token """).strip(), "medium": textwrap.dedent(""" You are an API debugging expert. You receive a broken API request and its specification. Your job: fix the request so it matches the spec. Respond with ONLY a JSON object in this format: {"fixed_request": "", "fixed_headers": {"Header": "value"}} The fixed_request must be a valid JSON string. Include all required fields with correct types. """).strip(), "headers": textwrap.dedent(""" You are an API debugging expert. You receive a broken API request with header-level errors. Your job: identify the header error type and provide the corrected headers. Respond with ONLY a JSON object in this format: {"error_type": "", "fixed_headers": {"Header-Name": "correct-value"}} Valid header error types: missing_auth_header, wrong_content_type, expired_auth_token Common headers: Authorization (Bearer token), Content-Type (application/json) """).strip(), "response": textwrap.dedent(""" You are an API response validation expert. You receive an API request, its specification, and the server's response. Your job: identify issues in the response. Respond with ONLY a JSON object in this format: {"response_issues": ["issue_type1", "issue_type2"], "affected_fields": ["field1"], "expected_status_code": 200} Valid response issue types: wrong_status_code, missing_response_field, wrong_response_type, extra_response_field, inconsistent_error_format Only include expected_status_code if you detect a wrong_status_code issue. """).strip(), "hard": textwrap.dedent(""" You are an API debugging expert. You receive a broken API request with multiple errors. Your job: diagnose the errors, fix the request, and explain the fix for a developer. Respond with ONLY a JSON object in this format: { "error_type": "", "affected_fields": ["field1"], "fixed_request": "", "fixed_headers": {"Header": "value"}, "explanation": "Clear explanation of what was wrong and how to fix it." } """).strip(), } # ========================================================================= # Prompt building # ========================================================================= def build_user_prompt(obs, step_num: int) -> str: """Build the user prompt from the observation.""" parts = [ f"API: {obs.http_method} {obs.endpoint} ({obs.api_name})", f"Error count: {obs.error_count}", f"Step {step_num}/{obs.max_steps}", f"\nRequest body:\n{obs.broken_request}", f"\nRequest headers: {json.dumps(obs.broken_headers)}", f"\nAPI Specification:\n{obs.api_spec}", ] # Include response data for response validation task if obs.response_body: parts.append(f"\nResponse status code: {obs.response_status_code}") parts.append(f"\nResponse body:\n{obs.response_body}") if obs.feedback: parts.append(f"\nFeedback from previous attempt:\n{obs.feedback}") return "\n".join(parts) # ========================================================================= # LLM response parsing # ========================================================================= def parse_llm_response(text: str) -> dict: """Extract a JSON object from the LLM response. Handles cases where the LLM wraps JSON in markdown code blocks or adds extra text around it. """ if not text: return {} # Try direct parse first try: return json.loads(text) except json.JSONDecodeError: pass # Try extracting from markdown code block code_block = re.search(r"```(?:json)?\s*\n?(.*?)\n?\s*```", text, re.DOTALL) if code_block: try: return json.loads(code_block.group(1)) except json.JSONDecodeError: pass # Try finding any JSON object in the text brace_match = re.search(r"\{[^{}]*\}", text, re.DOTALL) if brace_match: try: return json.loads(brace_match.group(0)) except json.JSONDecodeError: pass return {} def build_action(data: dict) -> APIDebugAction: """Convert parsed JSON dict to APIDebugAction.""" # Handle fixed_request: if it's a dict, serialize to JSON string fixed_req = data.get("fixed_request") if isinstance(fixed_req, dict): fixed_req = json.dumps(fixed_req) return APIDebugAction( error_type=data.get("error_type"), error_types=data.get("error_types"), affected_fields=data.get("affected_fields"), fixed_request=fixed_req, fixed_headers=data.get("fixed_headers"), explanation=data.get("explanation"), response_issues=data.get("response_issues"), expected_status_code=data.get("expected_status_code"), ) # ========================================================================= # Episode runner # ========================================================================= async def run_episode( env: APIDebugEnv, llm_client: OpenAI, task: str, ) -> float: """Run a single episode for the given task. Returns the final score.""" log_start(task=task, env=BENCHMARK_NAME, model=MODEL_NAME) result = await env.reset(task=task) obs = result.observation rewards: List[float] = [] steps_taken = 0 max_steps = MAX_STEPS[task] for step in range(1, max_steps + 1): if result.done: break user_prompt = build_user_prompt(obs, step) # Call the LLM try: completion = llm_client.chat.completions.create( model=MODEL_NAME, messages=[ {"role": "system", "content": SYSTEM_PROMPTS[task]}, {"role": "user", "content": user_prompt}, ], max_tokens=500, temperature=0.0, ) llm_text = completion.choices[0].message.content or "" except Exception as exc: print(f"[DEBUG] LLM request failed: {exc}", flush=True) llm_text = "" # Parse LLM output into action parsed = parse_llm_response(llm_text) action = build_action(parsed) # Step the environment result = await env.step(action) obs = result.observation reward = result.reward or 0.0 done = result.done rewards.append(reward) steps_taken = step # Build a short action summary for the log action_summary = _action_summary(action, task) log_step(step=step, action=action_summary, reward=reward, done=done, error=None) if done: break # Final score is the max reward achieved (environment already tracks best) # Clamp to open interval (0, 1) - evaluator rejects exactly 0.0 and 1.0 score = max(rewards) if rewards else 0.001 score = min(max(score, 0.001), 0.999) success = score >= 0.5 log_end(success=success, steps=steps_taken, score=score, rewards=rewards) return score def _action_summary(action: APIDebugAction, task: str) -> str: """Short summary of the action for logging.""" if task == "easy": return f"diagnose:{action.error_type or 'none'}" elif task == "classify": types = action.error_types or [action.error_type or "none"] return f"classify:{','.join(str(t) for t in types)}" elif task == "medium": fix_len = len(action.fixed_request or "") return f"fix:len={fix_len}" elif task == "headers": hdr_count = len(action.fixed_headers or {}) return f"headers:{action.error_type or 'none'}+fix:{hdr_count}" elif task == "response": issues = action.response_issues or [] return f"response:{','.join(issues) or 'none'}+status:{action.expected_status_code or 'none'}" else: fix_len = len(action.fixed_request or "") exp_len = len(action.explanation or "") return f"fix:len={fix_len}+explain:len={exp_len}" # ========================================================================= # Main # ========================================================================= async def main() -> None: llm_client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY) # Connect to environment (via Docker image or direct URL) # Use longer timeout for HF Spaces (LLM calls can be slow) if IMAGE_NAME: try: env = await APIDebugEnv.from_docker_image(IMAGE_NAME) except Exception as exc: print(f"[DEBUG] from_docker_image failed ({exc}), falling back to ENV_URL", flush=True) env = APIDebugEnv(base_url=ENV_URL, message_timeout_s=120.0) else: env = APIDebugEnv(base_url=ENV_URL, message_timeout_s=120.0) all_scores: dict = {} try: for task in TASKS: task_scores = [] for ep in range(EPISODES_PER_TASK): try: score = await run_episode(env, llm_client, task) except Exception as exc: print(f"[DEBUG] Episode failed: {exc}", flush=True) # Reconnect on WebSocket failure try: await env.close() except Exception: pass env = APIDebugEnv(base_url=ENV_URL, message_timeout_s=120.0) score = 0.0 task_scores.append(score) avg = sum(task_scores) / len(task_scores) all_scores[task] = avg # Print summary print("\n--- Baseline Scores ---", flush=True) for task, avg in all_scores.items(): print(f" {task}: {avg:.3f}", flush=True) overall = sum(all_scores.values()) / len(all_scores) print(f" overall: {overall:.3f}", flush=True) finally: try: await env.close() except Exception as e: print(f"[DEBUG] env.close() error: {e}", flush=True) if __name__ == "__main__": asyncio.run(main())