Spaces:
Sleeping
Sleeping
| """ | |
| inference.py β Baseline inference script for Python Bug Fixer OpenEnv. | |
| Usage: | |
| export API_BASE_URL="https://api-inference.huggingface.co/v1" | |
| export MODEL_NAME="meta-llama/Meta-Llama-3-8B-Instruct" | |
| export HF_TOKEN="hf_YOUR_TOKEN" | |
| export SPACE_URL="https://YOUR_USERNAME-python-bug-fixer.hf.space" | |
| python inference.py | |
| Log format (required β do not change): | |
| [START] {...json...} | |
| [STEP] {...json...} | |
| [END] {...json...} | |
| """ | |
| import os | |
| import json | |
| import requests | |
| from datetime import datetime, timezone | |
| from openai import OpenAI | |
| # ββ Environment variables ββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # Defaults are placeholders only β real values must be set via env vars. | |
| API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1") | |
| MODEL_NAME = os.getenv("MODEL_NAME", "meta-llama/Meta-Llama-3-8B-Instruct") | |
| HF_TOKEN = os.getenv("HF_TOKEN", "hf_YOUR_TOKEN") | |
| SPACE_URL = os.getenv("SPACE_URL", "http://localhost:7860") | |
| # ββ OpenAI client (uses API_BASE_URL + HF_TOKEN) ββββββββββββββββββββββββββββββ | |
| client = OpenAI( | |
| base_url=API_BASE_URL, | |
| api_key=HF_TOKEN, | |
| ) | |
| # Tasks to evaluate (in order) | |
| TASK_IDS = ["task_easy", "task_medium", "task_hard"] | |
| # System prompt for the debugger agent | |
| SYSTEM_PROMPT = ( | |
| "You are an expert Python developer and debugger. " | |
| "You will be shown buggy Python code along with the expected output. " | |
| "Your job is to return ONLY the corrected Python code β raw Python, " | |
| "no explanations, no markdown, no code fences (no ```). " | |
| "The code you return will be executed directly. Make it print the exact expected output." | |
| ) | |
| # ββ Helper functions βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
def now_iso() -> str:
    """Return the current UTC time as an ISO-8601 timestamp string."""
    moment = datetime.now(tz=timezone.utc)
    return moment.isoformat()
def reset_task(task_id: str) -> dict:
    """POST /reset for *task_id* on the Space and return the parsed JSON body.

    Raises requests.HTTPError on a non-2xx response.
    """
    url = f"{SPACE_URL}/reset"
    payload = {"task_id": task_id}
    response = requests.post(url, json=payload, timeout=30)
    response.raise_for_status()
    return response.json()
def step_task(session_id: str, action: str) -> dict:
    """POST /step with the candidate fixed code and return the parsed JSON body.

    Raises requests.HTTPError on a non-2xx response.
    """
    url = f"{SPACE_URL}/step"
    payload = {"session_id": session_id, "action": action}
    response = requests.post(url, json=payload, timeout=30)
    response.raise_for_status()
    return response.json()
def get_fixed_code(observation: str) -> str:
    """
    Ask the LLM to repair the buggy code contained in *observation*.

    Sends SYSTEM_PROMPT plus the observation to the chat-completions
    endpoint configured via API_BASE_URL / MODEL_NAME and returns the
    model's reply as raw Python source.

    Models frequently wrap replies in markdown code fences despite
    being told not to, and ``message.content`` may be ``None`` on an
    empty completion — both cases are handled defensively so the
    returned text can be executed verbatim by the environment.
    """
    response = client.chat.completions.create(
        model=MODEL_NAME,
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": observation},
        ],
        max_tokens=1000,
        temperature=0.1,
    )
    code = (response.choices[0].message.content or "").strip()
    # Strip ```python ... ``` fences if the model added them anyway.
    if code.startswith("```"):
        lines = code.splitlines()
        if lines and lines[0].startswith("```"):
            lines = lines[1:]
        if lines and lines[-1].strip() == "```":
            lines = lines[:-1]
        code = "\n".join(lines).strip()
    return code
| # ββ Core task runner βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
def run_task(task_id: str) -> dict:
    """
    Execute one full episode for *task_id*: reset, then step until done.

    Emits the required [START] / [STEP] / [END] JSON log lines to stdout
    and returns a summary dict (task_id, reward, steps, success).
    """
    # Start a fresh session for this task.
    reset_payload = reset_task(task_id)
    session_id = reset_payload["session_id"]
    observation = reset_payload["observation"]

    # [START] log — required format
    start_record = {
        "task_id": task_id,
        "session_id": session_id,
        "model": MODEL_NAME,
        "timestamp": now_iso(),
    }
    print(f"[START] {json.dumps(start_record)}", flush=True)

    steps_taken = 0
    reward = 0.0
    done = False
    # NOTE(review): loop assumes the environment eventually reports
    # done=True — confirm the Space guarantees episode termination.
    while not done:
        steps_taken += 1
        fixed_code = get_fixed_code(observation)
        outcome = step_task(session_id, fixed_code)
        observation = outcome["observation"]
        reward = outcome["reward"]
        done = outcome["done"]
        # [STEP] log — required format
        step_record = {
            "step": steps_taken,
            "action_chars": len(fixed_code),
            "reward": reward,
            "done": done,
            "observation": observation[:200],  # truncated for log readability
        }
        print(f"[STEP] {json.dumps(step_record)}", flush=True)

    succeeded = reward >= 0.8
    # [END] log — required format
    end_record = {
        "task_id": task_id,
        "session_id": session_id,
        "total_reward": reward,
        "steps": steps_taken,
        "success": succeeded,
        "timestamp": now_iso(),
    }
    print(f"[END] {json.dumps(end_record)}", flush=True)
    return {"task_id": task_id, "reward": reward, "steps": steps_taken, "success": succeeded}
| # ββ Entry point ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
def main():
    """Run every task in TASK_IDS in order and print a final summary table."""
    print(f"Starting inference — model={MODEL_NAME} space={SPACE_URL}", flush=True)
    print("-" * 60, flush=True)

    results = []
    for task_id in TASK_IDS:
        results.append(run_task(task_id))
        print("-" * 60, flush=True)

    # Summary
    print("\n=== SUMMARY ===")
    for r in results:
        status = "PASS" if r["success"] else "FAIL"
        print(f" [{status}] {r['task_id']:15s} reward={r['reward']:.2f} steps={r['steps']}")
    total_reward = sum(r["reward"] for r in results)
    # Guard against an empty task list so the summary never divides by zero.
    avg = total_reward / len(results) if results else 0.0
    print(f"\n Average reward: {avg:.2f}")
    print("=== END SUMMARY ===")


if __name__ == "__main__":
    main()