# Source: openenv_project / inference.py (Hugging Face Space upload)
# Commit: 0b55673 (verified) — "Deploy OpenEnv Submission" by ark406
"""
inference.py β€” Baseline inference script for Python Bug Fixer OpenEnv.
Usage:
export API_BASE_URL="https://api-inference.huggingface.co/v1"
export MODEL_NAME="meta-llama/Meta-Llama-3-8B-Instruct"
export HF_TOKEN="hf_YOUR_TOKEN"
export SPACE_URL="https://YOUR_USERNAME-python-bug-fixer.hf.space"
python inference.py
Log format (required β€” do not change):
[START] {...json...}
[STEP] {...json...}
[END] {...json...}
"""
import os
import json
import requests
from datetime import datetime, timezone
from openai import OpenAI
# ── Environment variables ──────────────────────────────────────────────────────
# Defaults are placeholders only — real values must be set via env vars.
API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
MODEL_NAME = os.getenv("MODEL_NAME", "meta-llama/Meta-Llama-3-8B-Instruct")
HF_TOKEN = os.getenv("HF_TOKEN", "hf_YOUR_TOKEN")  # placeholder — never commit a real token
SPACE_URL = os.getenv("SPACE_URL", "http://localhost:7860")

# ── OpenAI client (uses API_BASE_URL + HF_TOKEN) ──────────────────────────────
client = OpenAI(
    base_url=API_BASE_URL,
    api_key=HF_TOKEN,
)

# Tasks to evaluate (in order)
TASK_IDS = ["task_easy", "task_medium", "task_hard"]

# System prompt for the debugger agent.
# FIX: the original string contained mojibake ("β€”") where an em dash
# belonged; repaired so the model receives clean text.
SYSTEM_PROMPT = (
    "You are an expert Python developer and debugger. "
    "You will be shown buggy Python code along with the expected output. "
    "Your job is to return ONLY the corrected Python code — raw Python, "
    "no explanations, no markdown, no code fences (no ```). "
    "The code you return will be executed directly. Make it print the exact expected output."
)
# ── Helper functions ───────────────────────────────────────────────────────────
def now_iso() -> str:
    """Return the current UTC time as an ISO-8601 timestamp string."""
    utc_now = datetime.now(tz=timezone.utc)
    return utc_now.isoformat()
def reset_task(task_id: str) -> dict:
    """Start a fresh episode via POST /reset and return the decoded JSON.

    The response is expected to carry at least ``session_id`` and
    ``observation`` (consumed by run_task). Raises ``requests.HTTPError``
    on a non-2xx status.
    """
    url = f"{SPACE_URL}/reset"
    payload = {"task_id": task_id}
    response = requests.post(url, json=payload, timeout=30)
    response.raise_for_status()
    return response.json()
def step_task(session_id: str, action: str) -> dict:
    """Submit *action* (the fixed code) via POST /step and return the JSON.

    Raises ``requests.HTTPError`` on a non-2xx status.
    """
    url = f"{SPACE_URL}/step"
    payload = {"session_id": session_id, "action": action}
    response = requests.post(url, json=payload, timeout=30)
    response.raise_for_status()
    return response.json()
def get_fixed_code(observation: str) -> str:
    """Ask the LLM for corrected code given the buggy-code *observation*.

    Sends SYSTEM_PROMPT plus the observation to the chat-completions
    endpoint configured via API_BASE_URL + MODEL_NAME, and returns the
    model's reply with surrounding whitespace stripped.
    """
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": observation},
    ]
    completion = client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        max_tokens=1000,
        temperature=0.1,
    )
    reply = completion.choices[0].message.content
    return reply.strip()
# ── Core task runner ───────────────────────────────────────────────────────────
def run_task(task_id: str, max_steps: int = 10) -> dict:
    """Run a single task episode from reset until done (or *max_steps*).

    Emits the required ``[START]``, ``[STEP]`` and ``[END]`` log lines to
    stdout — the format must not change, downstream tooling parses it.

    Args:
        task_id: Identifier of the task to reset the environment into.
        max_steps: Safety cap on LLM/environment round-trips. FIX: the
            original looped unconditionally until ``done`` — an episode
            the environment never terminates would hang forever and burn
            LLM calls; the cap bounds it while leaving normal episodes
            unchanged.

    Returns:
        Summary dict: ``task_id``, final ``reward``, ``steps`` taken and
        a ``success`` flag (final reward >= 0.8).
    """
    # Reset the environment; grab the episode handle and first observation.
    reset_data = reset_task(task_id)
    session_id = reset_data["session_id"]
    observation = reset_data["observation"]

    # [START] log — required format
    start_log = {
        "task_id": task_id,
        "session_id": session_id,
        "model": MODEL_NAME,
        "timestamp": now_iso(),
    }
    print(f"[START] {json.dumps(start_log)}", flush=True)

    step_num = 0
    reward = 0.0
    done = False
    while not done and step_num < max_steps:
        step_num += 1
        # Ask the LLM for a candidate fix, then submit it to the environment.
        action = get_fixed_code(observation)
        result = step_task(session_id, action)
        observation = result["observation"]
        reward = result["reward"]
        done = result["done"]
        # [STEP] log — required format
        step_log = {
            "step": step_num,
            "action_chars": len(action),
            "reward": reward,
            "done": done,
            "observation": observation[:200],  # truncated for log readability
        }
        print(f"[STEP] {json.dumps(step_log)}", flush=True)

    # [END] log — required format
    end_log = {
        "task_id": task_id,
        "session_id": session_id,
        "total_reward": reward,
        "steps": step_num,
        "success": reward >= 0.8,
        "timestamp": now_iso(),
    }
    print(f"[END] {json.dumps(end_log)}", flush=True)
    return {"task_id": task_id, "reward": reward, "steps": step_num, "success": reward >= 0.8}
# ── Entry point ────────────────────────────────────────────────────────────────
def main():
    """Evaluate every task in TASK_IDS and print a pass/fail summary.

    FIX: the original aborted the whole evaluation if any single task
    raised (network error, non-2xx HTTP status, malformed response) and
    would ZeroDivisionError on an empty result list. A failed task is
    now logged and scored 0.0 so the remaining tasks still run.
    """
    # FIX: repaired mojibake ("β€”" -> em dash) in this status line.
    print(f"Starting inference — model={MODEL_NAME} space={SPACE_URL}", flush=True)
    print("-" * 60, flush=True)
    results = []
    for task_id in TASK_IDS:
        try:
            result = run_task(task_id)
        except Exception as exc:
            # Keep going: one broken task must not kill the whole run.
            print(f"[ERROR] task_id={task_id} error={exc}", flush=True)
            result = {"task_id": task_id, "reward": 0.0, "steps": 0, "success": False}
        results.append(result)
        print("-" * 60, flush=True)

    # Summary
    print("\n=== SUMMARY ===")
    total_reward = 0.0
    for r in results:
        status = "PASS" if r["success"] else "FAIL"
        print(f" [{status}] {r['task_id']:15s} reward={r['reward']:.2f} steps={r['steps']}")
        total_reward += r["reward"]
    # Guard the average against an empty task list.
    avg = total_reward / len(results) if results else 0.0
    print(f"\n Average reward: {avg:.2f}")
    print("=== END SUMMARY ===")


if __name__ == "__main__":
    main()