Spaces:

avichauhan
/

api-debug-env

Running

App Files Files Community

api-debug-env / inference.py

avichauhan

Upload folder using huggingface_hub

d73bfc0 verified 6 days ago

raw

history blame contribute delete

14.2 kB

	"""
	Baseline inference script for the API Debug Environment.

	MANDATORY:
	- Must be named inference.py and placed in the root directory.
	- Must use OpenAI Client for all LLM calls.
	- Must read env vars: API_BASE_URL, MODEL_NAME, HF_TOKEN.
	- Must emit [START], [STEP], [END] structured logs to stdout.

	STDOUT FORMAT:
	[START] task=<task_name> env=<benchmark> model=<model_name>
	[STEP] step=<n> action=<action_str> reward=<0.00> done=<true\|false> error=<msg\|null>
	[END] success=<true\|false> steps=<n> score=<score> rewards=<r1,r2,...,rn>
	"""

	import asyncio
	import json
	import os
	import re
	import textwrap
	from typing import List, Optional

	from openai import OpenAI

	from client import APIDebugEnv
	from models import APIDebugAction

	# Environment variables (mandatory for hackathon evaluation)
	API_BASE_URL = os.getenv("API_BASE_URL") or "https://router.huggingface.co/v1"
	HF_TOKEN = os.getenv("HF_TOKEN")
	API_KEY = HF_TOKEN or os.getenv("OPENAI_API_KEY") or os.getenv("API_KEY")
	MODEL_NAME = os.getenv("MODEL_NAME") or "Qwen/Qwen2.5-72B-Instruct"
	ENV_URL = os.getenv("ENV_URL") or "https://avichauhan-api-debug-env.hf.space"
	IMAGE_NAME = os.getenv("LOCAL_IMAGE_NAME")

	# Task configuration
	TASKS = ["easy", "classify", "medium", "headers", "response", "hard"]
	EPISODES_PER_TASK = 3
	MAX_STEPS = {"easy": 3, "classify": 4, "medium": 5, "headers": 4, "response": 4, "hard": 7}
	BENCHMARK_NAME = "api_debug"


	# =========================================================================
	# Structured logging (exact format required by evaluator)
	# =========================================================================

	def log_start(task: str, env: str, model: str) -> None:
	print(f"[START] task={task} env={env} model={model}", flush=True)


	def log_step(
	step: int, action: str, reward: float, done: bool, error: Optional[str]
	) -> None:
	done_val = str(done).lower()
	error_val = error if error else "null"
	print(
	f"[STEP] step={step} action={action} reward={reward:.4f} "
	f"done={done_val} error={error_val}",
	flush=True,
	)


	def log_end(
	success: bool, steps: int, score: float, rewards: List[float]
	) -> None:
	rewards_str = ",".join(f"{r:.4f}" for r in rewards)
	print(
	f"[END] success={str(success).lower()} steps={steps} "
	f"score={score:.4f} rewards={rewards_str}",
	flush=True,
	)


	# =========================================================================
	# System prompts per task
	# =========================================================================

	SYSTEM_PROMPTS = {
	"easy": textwrap.dedent("""
	You are an API debugging expert. You receive a broken API request and its specification.
	Your job: identify the error type and the affected fields.

	Respond with ONLY a JSON object in this format:
	{"error_type": "<type>", "affected_fields": ["field1", "field2"]}

	Valid error types:
	missing_required_field, wrong_field_type, invalid_email_format,
	missing_auth_header, extra_unknown_field, null_value_in_required,
	wrong_http_method, malformed_json_value, invalid_enum_value,
	datetime_format_error, wrong_content_type, expired_auth_token
	""").strip(),

	"classify": textwrap.dedent("""
	You are an API debugging expert. You receive a broken API request with MULTIPLE errors.
	Your job: identify ALL error types and ALL affected fields.

	Respond with ONLY a JSON object in this format:
	{"error_types": ["type1", "type2"], "affected_fields": ["field1", "field2"]}

	Valid error types:
	missing_required_field, wrong_field_type, invalid_email_format,
	missing_auth_header, extra_unknown_field, null_value_in_required,
	wrong_http_method, malformed_json_value, invalid_enum_value,
	datetime_format_error, wrong_content_type, expired_auth_token
	""").strip(),

	"medium": textwrap.dedent("""
	You are an API debugging expert. You receive a broken API request and its specification.
	Your job: fix the request so it matches the spec.

	Respond with ONLY a JSON object in this format:
	{"fixed_request": "<valid JSON string matching the spec>", "fixed_headers": {"Header": "value"}}

	The fixed_request must be a valid JSON string. Include all required fields with correct types.
	""").strip(),

	"headers": textwrap.dedent("""
	You are an API debugging expert. You receive a broken API request with header-level errors.
	Your job: identify the header error type and provide the corrected headers.

	Respond with ONLY a JSON object in this format:
	{"error_type": "<type>", "fixed_headers": {"Header-Name": "correct-value"}}

	Valid header error types:
	missing_auth_header, wrong_content_type, expired_auth_token

	Common headers: Authorization (Bearer token), Content-Type (application/json)
	""").strip(),

	"response": textwrap.dedent("""
	You are an API response validation expert. You receive an API request, its specification,
	and the server's response. Your job: identify issues in the response.

	Respond with ONLY a JSON object in this format:
	{"response_issues": ["issue_type1", "issue_type2"], "affected_fields": ["field1"], "expected_status_code": 200}

	Valid response issue types:
	wrong_status_code, missing_response_field, wrong_response_type,
	extra_response_field, inconsistent_error_format

	Only include expected_status_code if you detect a wrong_status_code issue.
	""").strip(),

	"hard": textwrap.dedent("""
	You are an API debugging expert. You receive a broken API request with multiple errors.
	Your job: diagnose the errors, fix the request, and explain the fix for a developer.

	Respond with ONLY a JSON object in this format:
	{
	"error_type": "<primary error type>",
	"affected_fields": ["field1"],
	"fixed_request": "<valid JSON string>",
	"fixed_headers": {"Header": "value"},
	"explanation": "Clear explanation of what was wrong and how to fix it."
	}
	""").strip(),
	}


	# =========================================================================
	# Prompt building
	# =========================================================================

	def build_user_prompt(obs, step_num: int) -> str:
	"""Build the user prompt from the observation."""
	parts = [
	f"API: {obs.http_method} {obs.endpoint} ({obs.api_name})",
	f"Error count: {obs.error_count}",
	f"Step {step_num}/{obs.max_steps}",
	f"\nRequest body:\n{obs.broken_request}",
	f"\nRequest headers: {json.dumps(obs.broken_headers)}",
	f"\nAPI Specification:\n{obs.api_spec}",
	]
	# Include response data for response validation task
	if obs.response_body:
	parts.append(f"\nResponse status code: {obs.response_status_code}")
	parts.append(f"\nResponse body:\n{obs.response_body}")
	if obs.feedback:
	parts.append(f"\nFeedback from previous attempt:\n{obs.feedback}")
	return "\n".join(parts)


	# =========================================================================
	# LLM response parsing
	# =========================================================================

	def parse_llm_response(text: str) -> dict:
	"""Extract a JSON object from the LLM response.

	Handles cases where the LLM wraps JSON in markdown code blocks
	or adds extra text around it.
	"""
	if not text:
	return {}

	# Try direct parse first
	try:
	return json.loads(text)
	except json.JSONDecodeError:
	pass

	# Try extracting from markdown code block
	code_block = re.search(r"```(?:json)?\s\n?(.?)\n?\s*```", text, re.DOTALL)
	if code_block:
	try:
	return json.loads(code_block.group(1))
	except json.JSONDecodeError:
	pass

	# Try finding any JSON object in the text
	brace_match = re.search(r"\{[^{}]*\}", text, re.DOTALL)
	if brace_match:
	try:
	return json.loads(brace_match.group(0))
	except json.JSONDecodeError:
	pass

	return {}


	def build_action(data: dict) -> APIDebugAction:
	"""Convert parsed JSON dict to APIDebugAction."""
	# Handle fixed_request: if it's a dict, serialize to JSON string
	fixed_req = data.get("fixed_request")
	if isinstance(fixed_req, dict):
	fixed_req = json.dumps(fixed_req)

	return APIDebugAction(
	error_type=data.get("error_type"),
	error_types=data.get("error_types"),
	affected_fields=data.get("affected_fields"),
	fixed_request=fixed_req,
	fixed_headers=data.get("fixed_headers"),
	explanation=data.get("explanation"),
	response_issues=data.get("response_issues"),
	expected_status_code=data.get("expected_status_code"),
	)


	# =========================================================================
	# Episode runner
	# =========================================================================

	async def run_episode(
	env: APIDebugEnv,
	llm_client: OpenAI,
	task: str,
	) -> float:
	"""Run a single episode for the given task. Returns the final score."""
	log_start(task=task, env=BENCHMARK_NAME, model=MODEL_NAME)

	result = await env.reset(task=task)
	obs = result.observation
	rewards: List[float] = []
	steps_taken = 0

	max_steps = MAX_STEPS[task]

	for step in range(1, max_steps + 1):
	if result.done:
	break

	user_prompt = build_user_prompt(obs, step)

	# Call the LLM
	try:
	completion = llm_client.chat.completions.create(
	model=MODEL_NAME,
	messages=[
	{"role": "system", "content": SYSTEM_PROMPTS[task]},
	{"role": "user", "content": user_prompt},
	],
	max_tokens=500,
	temperature=0.0,
	)
	llm_text = completion.choices[0].message.content or ""
	except Exception as exc:
	print(f"[DEBUG] LLM request failed: {exc}", flush=True)
	llm_text = ""

	# Parse LLM output into action
	parsed = parse_llm_response(llm_text)
	action = build_action(parsed)

	# Step the environment
	result = await env.step(action)
	obs = result.observation
	reward = result.reward or 0.0
	done = result.done

	rewards.append(reward)
	steps_taken = step

	# Build a short action summary for the log
	action_summary = _action_summary(action, task)
	log_step(step=step, action=action_summary, reward=reward, done=done, error=None)

	if done:
	break

	# Final score is the max reward achieved (environment already tracks best)
	# Clamp to open interval (0, 1) - evaluator rejects exactly 0.0 and 1.0
	score = max(rewards) if rewards else 0.001
	score = min(max(score, 0.001), 0.999)
	success = score >= 0.5

	log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
	return score


	def _action_summary(action: APIDebugAction, task: str) -> str:
	"""Short summary of the action for logging."""
	if task == "easy":
	return f"diagnose:{action.error_type or 'none'}"
	elif task == "classify":
	types = action.error_types or [action.error_type or "none"]
	return f"classify:{','.join(str(t) for t in types)}"
	elif task == "medium":
	fix_len = len(action.fixed_request or "")
	return f"fix:len={fix_len}"
	elif task == "headers":
	hdr_count = len(action.fixed_headers or {})
	return f"headers:{action.error_type or 'none'}+fix:{hdr_count}"
	elif task == "response":
	issues = action.response_issues or []
	return f"response:{','.join(issues) or 'none'}+status:{action.expected_status_code or 'none'}"
	else:
	fix_len = len(action.fixed_request or "")
	exp_len = len(action.explanation or "")
	return f"fix:len={fix_len}+explain:len={exp_len}"


	# =========================================================================
	# Main
	# =========================================================================

	async def main() -> None:
	llm_client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)

	# Connect to environment (via Docker image or direct URL)
	# Use longer timeout for HF Spaces (LLM calls can be slow)
	if IMAGE_NAME:
	try:
	env = await APIDebugEnv.from_docker_image(IMAGE_NAME)
	except Exception as exc:
	print(f"[DEBUG] from_docker_image failed ({exc}), falling back to ENV_URL", flush=True)
	env = APIDebugEnv(base_url=ENV_URL, message_timeout_s=120.0)
	else:
	env = APIDebugEnv(base_url=ENV_URL, message_timeout_s=120.0)

	all_scores: dict = {}

	try:
	for task in TASKS:
	task_scores = []
	for ep in range(EPISODES_PER_TASK):
	try:
	score = await run_episode(env, llm_client, task)
	except Exception as exc:
	print(f"[DEBUG] Episode failed: {exc}", flush=True)
	# Reconnect on WebSocket failure
	try:
	await env.close()
	except Exception:
	pass
	env = APIDebugEnv(base_url=ENV_URL, message_timeout_s=120.0)
	score = 0.0
	task_scores.append(score)
	avg = sum(task_scores) / len(task_scores)
	all_scores[task] = avg

	# Print summary
	print("\n--- Baseline Scores ---", flush=True)
	for task, avg in all_scores.items():
	print(f" {task}: {avg:.3f}", flush=True)
	overall = sum(all_scores.values()) / len(all_scores)
	print(f" overall: {overall:.3f}", flush=True)

	finally:
	try:
	await env.close()
	except Exception as e:
	print(f"[DEBUG] env.close() error: {e}", flush=True)


	if __name__ == "__main__":
	asyncio.run(main())