# Source: openenv_project / inference.py (Hugging Face Space upload)
# Commit: 0b55673 (verified) — "Deploy OpenEnv Submission" by ark406
"""
inference.py β€” Baseline inference script for Python Bug Fixer OpenEnv.
Usage:
export API_BASE_URL="https://api-inference.huggingface.co/v1"
export MODEL_NAME="meta-llama/Meta-Llama-3-8B-Instruct"
export HF_TOKEN="hf_YOUR_TOKEN"
export SPACE_URL="https://YOUR_USERNAME-python-bug-fixer.hf.space"
python inference.py
Log format (required β€” do not change):
[START] {...json...}
[STEP] {...json...}
[END] {...json...}
"""
import os
import json
import requests
from datetime import datetime, timezone
from openai import OpenAI
# ── Environment variables ──────────────────────────────────────────────────────
# Defaults are placeholders only — real values must be set via env vars.
API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
MODEL_NAME = os.getenv("MODEL_NAME", "meta-llama/Meta-Llama-3-8B-Instruct")
HF_TOKEN = os.getenv("HF_TOKEN", "hf_YOUR_TOKEN")  # placeholder — never commit a real token
SPACE_URL = os.getenv("SPACE_URL", "http://localhost:7860")

# ── OpenAI client (uses API_BASE_URL + HF_TOKEN) ──────────────────────────────
client = OpenAI(
    base_url=API_BASE_URL,
    api_key=HF_TOKEN,
)

# Tasks to evaluate (in order)
TASK_IDS = ["task_easy", "task_medium", "task_hard"]

# System prompt for the debugger agent.
# FIX: the original string contained mojibake ("β€”") where an em dash
# belonged; repaired so the model receives clean text.
SYSTEM_PROMPT = (
    "You are an expert Python developer and debugger. "
    "You will be shown buggy Python code along with the expected output. "
    "Your job is to return ONLY the corrected Python code — raw Python, "
    "no explanations, no markdown, no code fences (no ```). "
    "The code you return will be executed directly. Make it print the exact expected output."
)
# ── Helper functions ───────────────────────────────────────────────────────────
def now_iso() -> str:
    """Return the current UTC time as an ISO-8601 timestamp string."""
    utc_now = datetime.now(tz=timezone.utc)
    return utc_now.isoformat()
def reset_task(task_id: str) -> dict:
    """Start a fresh episode via POST /reset and return the decoded JSON.

    The response is expected to carry at least ``session_id`` and
    ``observation`` (consumed by run_task). Raises ``requests.HTTPError``
    on a non-2xx status.
    """
    url = f"{SPACE_URL}/reset"
    payload = {"task_id": task_id}
    response = requests.post(url, json=payload, timeout=30)
    response.raise_for_status()
    return response.json()
def step_task(session_id: str, action: str) -> dict:
    """Submit *action* (the fixed code) via POST /step and return the JSON.

    Raises ``requests.HTTPError`` on a non-2xx status.
    """
    url = f"{SPACE_URL}/step"
    payload = {"session_id": session_id, "action": action}
    response = requests.post(url, json=payload, timeout=30)
    response.raise_for_status()
    return response.json()
def get_fixed_code(observation: str) -> str:
    """Ask the LLM for corrected code given the buggy-code *observation*.

    Sends SYSTEM_PROMPT plus the observation to the chat-completions
    endpoint configured via API_BASE_URL + MODEL_NAME, and returns the
    model's reply with surrounding whitespace stripped.
    """
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": observation},
    ]
    completion = client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        max_tokens=1000,
        temperature=0.1,
    )
    reply = completion.choices[0].message.content
    return reply.strip()
# ── Core task runner ───────────────────────────────────────────────────────────
def run_task(task_id: str, max_steps: int = 10) -> dict:
    """Run a single task episode from reset until done (or *max_steps*).

    Emits the required ``[START]``, ``[STEP]`` and ``[END]`` log lines to
    stdout — the format must not change, downstream tooling parses it.

    Args:
        task_id: Identifier of the task to reset the environment into.
        max_steps: Safety cap on LLM/environment round-trips. FIX: the
            original looped unconditionally until ``done`` — an episode
            the environment never terminates would hang forever and burn
            LLM calls; the cap bounds it while leaving normal episodes
            unchanged.

    Returns:
        Summary dict: ``task_id``, final ``reward``, ``steps`` taken and
        a ``success`` flag (final reward >= 0.8).
    """
    # Reset the environment; grab the episode handle and first observation.
    reset_data = reset_task(task_id)
    session_id = reset_data["session_id"]
    observation = reset_data["observation"]

    # [START] log — required format
    start_log = {
        "task_id": task_id,
        "session_id": session_id,
        "model": MODEL_NAME,
        "timestamp": now_iso(),
    }
    print(f"[START] {json.dumps(start_log)}", flush=True)

    step_num = 0
    reward = 0.0
    done = False
    while not done and step_num < max_steps:
        step_num += 1
        # Ask the LLM for a candidate fix, then submit it to the environment.
        action = get_fixed_code(observation)
        result = step_task(session_id, action)
        observation = result["observation"]
        reward = result["reward"]
        done = result["done"]
        # [STEP] log — required format
        step_log = {
            "step": step_num,
            "action_chars": len(action),
            "reward": reward,
            "done": done,
            "observation": observation[:200],  # truncated for log readability
        }
        print(f"[STEP] {json.dumps(step_log)}", flush=True)

    # [END] log — required format
    end_log = {
        "task_id": task_id,
        "session_id": session_id,
        "total_reward": reward,
        "steps": step_num,
        "success": reward >= 0.8,
        "timestamp": now_iso(),
    }
    print(f"[END] {json.dumps(end_log)}", flush=True)
    return {"task_id": task_id, "reward": reward, "steps": step_num, "success": reward >= 0.8}
# ── Entry point ────────────────────────────────────────────────────────────────
def main():
    """Evaluate every task in TASK_IDS and print a pass/fail summary.

    FIX: the original aborted the whole evaluation if any single task
    raised (network error, non-2xx HTTP status, malformed response) and
    would ZeroDivisionError on an empty result list. A failed task is
    now logged and scored 0.0 so the remaining tasks still run.
    """
    # FIX: repaired mojibake ("β€”" -> em dash) in this status line.
    print(f"Starting inference — model={MODEL_NAME} space={SPACE_URL}", flush=True)
    print("-" * 60, flush=True)
    results = []
    for task_id in TASK_IDS:
        try:
            result = run_task(task_id)
        except Exception as exc:
            # Keep going: one broken task must not kill the whole run.
            print(f"[ERROR] task_id={task_id} error={exc}", flush=True)
            result = {"task_id": task_id, "reward": 0.0, "steps": 0, "success": False}
        results.append(result)
        print("-" * 60, flush=True)

    # Summary
    print("\n=== SUMMARY ===")
    total_reward = 0.0
    for r in results:
        status = "PASS" if r["success"] else "FAIL"
        print(f" [{status}] {r['task_id']:15s} reward={r['reward']:.2f} steps={r['steps']}")
        total_reward += r["reward"]
    # Guard the average against an empty task list.
    avg = total_reward / len(results) if results else 0.0
    print(f"\n Average reward: {avg:.2f}")
    print("=== END SUMMARY ===")


if __name__ == "__main__":
    main()