#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

"""
Baseline inference script for the Self-Healing DevOps Sandbox.

Uses an LLM (via the OpenAI-compatible API) to diagnose and fix a broken
Node.js backend running inside a Docker container.

Protocol lines ([START], [STEP], [END]) are written to stdout; diagnostics
are written to stderr so they never pollute the protocol stream.
"""

import json
import os
import sys

try:
    from openai import OpenAI
except ImportError:
    print("ERROR: 'openai' package is required. Install with: pip install openai")
    sys.exit(1)

from client import DevopsSandboxEnv
from models import BashAction

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

API_BASE_URL = os.getenv("API_BASE_URL") or "https://router.huggingface.co/v1"
MODEL_NAME = os.getenv("MODEL_NAME") or "gpt-4o-mini"
HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("API_KEY")

ENV_URL = os.getenv("DEVOPS_SANDBOX_URL", "http://localhost:8000")
TASK_NAME = os.getenv("MY_ENV_V4_TASK", "devops_sandbox")
BENCHMARK = os.getenv("MY_ENV_V4_BENCHMARK", "devops_sandbox")
MAX_TURNS = int(os.getenv("MAX_TURNS", "8"))

SYSTEM_PROMPT = """\
You are an expert DevOps engineer and Node.js developer.
You have been dropped into a Linux container with a broken Express.js backend in /app.
Your goal is to diagnose and fix ALL bugs so the app runs correctly.

RULES:
1. Respond ONLY with a JSON object: {"command": ""}
2. Use standard bash/Linux commands (ls, cat, grep, sed, node, npm, etc.)
3. Do NOT use interactive editors (vi, nano). Use sed or echo/cat with redirection.
4. After fixing bugs, restart the app with: cd /app && npm start &
5. Be methodical: read files first, understand the bug, then fix it.

EXPECTED FINAL STATE:
- App starts without errors on port 3000
- GET /health → 200
- GET /api/users → 200 with JSON containing "users" array
- GET /api/data → 200 with JSON containing "records" array
"""

# Score/reward used whenever the environment does not supply a usable number.
_DEFAULT_SCORE = 0.01
# Command executed when nothing usable could be extracted from the LLM reply.
_FALLBACK_COMMAND = "ls -la /app"
# First-line fence tags that mark a shell code block (```bash ... ```).
_SHELL_FENCE_TAGS = frozenset({"bash", "sh", "shell", "zsh", "console"})
# Sentinel: the text was not a JSON object carrying a "command" key.
_NO_COMMAND = object()


def _log(message):
    """Write a diagnostic line to stderr, keeping stdout protocol-only."""
    print(message, file=sys.stderr, flush=True)


def _as_float(value, default=_DEFAULT_SCORE):
    """Return *value* as a float, or *default* when it is None/non-numeric."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return default


def _command_from_json(text):
    """Return the "command" entry of a JSON object encoded in *text*.

    Returns ``_NO_COMMAND`` when *text* is not valid JSON, is not an object,
    or has no "command" key. A present but non-string value (e.g. ``null``)
    yields ``""`` so the caller can apply its own fallback command.
    """
    try:
        data = json.loads(text)
    except (json.JSONDecodeError, TypeError):
        return _NO_COMMAND
    if not isinstance(data, dict) or "command" not in data:
        return _NO_COMMAND
    value = data["command"]
    return value if isinstance(value, str) else ""


def extract_command(llm_response: str) -> str:
    """Extract a bash command from the LLM's response (JSON or raw text).

    Resolution order:

    1. The whole response is a JSON object with a "command" key.
    2. The first fenced code block that yields something usable:
       a ``json``-tagged block is parsed as JSON (skipped if it does not
       parse); a block whose first line is exactly a shell tag returns the
       rest of the block; any other block is tried as JSON and otherwise
       contributes its first line.
    3. The response with surrounding backticks stripped, parsed as JSON when
       it looks like an object, else returned verbatim.

    Returns:
        The command string; may be empty, in which case the caller is
        expected to substitute a default.
    """
    text = (llm_response or "").strip()

    command = _command_from_json(text)
    if command is not _NO_COMMAND:
        return command

    if "```" in text:
        # Odd-indexed pieces of the split are the fenced block interiors.
        for block in text.split("```")[1::2]:
            code = block.strip()

            if code[:4].lower() == "json":
                command = _command_from_json(code[4:].strip())
                if command is not _NO_COMMAND:
                    return command
                continue

            first_line, _, remainder = code.partition("\n")
            # Only an exact tag counts: a prefix test would swallow the first
            # line of blocks that begin with commands like `sha256sum`.
            if first_line.strip().lower() in _SHELL_FENCE_TAGS:
                script = remainder.strip()
                if script:
                    return script
                continue

            # Untagged block: it may still hold the JSON envelope.
            command = _command_from_json(code)
            if command is not _NO_COMMAND:
                return command
            first_line = first_line.strip()
            if first_line:
                return first_line

    stripped = text.strip("`").strip()
    if stripped.startswith("{"):
        command = _command_from_json(stripped)
        if command is not _NO_COMMAND:
            return command
    return stripped


def _run_task(client, task_name):
    """Run one episode for *task_name*, emitting [START]/[STEP]/[END] lines.

    Exactly one [START] and one [END] line are printed per call, whatever
    fails in between. The [END] line always reflects the steps and rewards
    that were actually logged.
    """
    messages = [{"role": "system", "content": SYSTEM_PROMPT}]
    rewards = []
    steps_taken = 0
    final_score = _DEFAULT_SCORE

    print(f"[START] task={task_name} env={BENCHMARK} model={MODEL_NAME}", flush=True)

    try:
        with DevopsSandboxEnv(base_url=ENV_URL).sync() as env:
            result = env.reset(task_name=task_name)
            obs = result.observation
            final_score = _as_float(getattr(obs, "grader_score", None))

            messages.append({
                "role": "user",
                "content": (
                    f"Here is the initial state of the broken app:\n\n"
                    f"```\n{obs.stdout}\n```\n\n"
                    f"Current directory: {obs.current_dir}\n"
                    f"Score: {obs.grader_score}/1.0\n\n"
                    f"What bash command should I run first?"
                ),
            })

            for _ in range(MAX_TURNS):
                try:
                    response = client.chat.completions.create(
                        model=MODEL_NAME,
                        messages=messages,
                        temperature=0.2,
                        max_tokens=256,
                    )
                    llm_text = response.choices[0].message.content or ""
                except Exception as exc:
                    # Without a model reply there is nothing left to try.
                    _log(f"LLM request failed for task '{task_name}': {exc}")
                    break

                command = extract_command(llm_text) or _FALLBACK_COMMAND

                error_msg = "null"
                # Reset so a failed step never reuses the previous result.
                result = None
                try:
                    result = env.step(BashAction(command=command))
                    obs = result.observation
                except Exception as exc:
                    # Stand-in observation; presumably lacks stdout/stderr,
                    # hence the getattr() defaults below.
                    obs = env.state
                    error_msg = str(exc).replace("\n", " ")

                steps_taken += 1

                score = _as_float(getattr(obs, "grader_score", None))
                # A missing or None reward falls back to the grader score
                # instead of crashing the ':.2f' formatting.
                reward_val = _as_float(getattr(obs, "reward", None), score)
                rewards.append(f"{reward_val:.2f}")

                # One flag for both the log line and the loop exit, so the
                # last [STEP] agrees with why the loop stopped.
                is_done = bool(
                    getattr(result, "done", False) or getattr(obs, "done", False)
                )
                done_str = "true" if is_done else "false"

                action_str = command.replace("\n", " ; ")
                print(
                    f"[STEP] step={steps_taken} action={action_str} "
                    f"reward={reward_val:.2f} done={done_str} error={error_msg}",
                    flush=True,
                )

                messages.append({"role": "assistant", "content": llm_text})
                messages.append({
                    "role": "user",
                    "content": (
                        f"Command output:\n"
                        f"stdout:\n```\n{getattr(obs, 'stdout', '')}\n```\n"
                        f"stderr:\n```\n{getattr(obs, 'stderr', '')}\n```\n"
                        f"Current score: {getattr(obs, 'grader_score', 0.01)}/1.0\n"
                        f"Grader feedback: {getattr(obs, 'grader_feedback', '')}\n\n"
                        f"What command should I run next?"
                    ),
                })

                final_score = score
                if final_score >= 0.99 or is_done:
                    break
    except Exception as exc:
        # Swallowed on purpose: the [END] line below must still be emitted so
        # a wrapper failure does not break the calling evaluation harness.
        _log(f"Episode for task '{task_name}' aborted: {exc}")

    # Clamp final score strictly within (0, 1)
    final_score = max(0.01, min(0.99, final_score))
    success_str = "true" if final_score >= 0.99 else "false"
    rewards_str = ",".join(rewards) if rewards else "0.01"
    print(
        f"[END] success={success_str} steps={steps_taken} "
        f"score={final_score:.2f} rewards={rewards_str}",
        flush=True,
    )


def main():
    """Run the baseline agent over every task difficulty in sequence."""
    if not HF_TOKEN:
        # Some endpoints accept unauthenticated calls, so only warn.
        _log("WARNING: neither HF_TOKEN nor API_KEY is set; using a placeholder API key.")

    client = OpenAI(api_key=HF_TOKEN or "dummy_key", base_url=API_BASE_URL)

    TASKS = ["easy", "medium", "hard"]

    # Note: openenv evaluation specifically needs exactly 3 things: [START], [STEP] logs, [END]
    for task_name in TASKS:
        _run_task(client, task_name)


if __name__ == "__main__":
    main()