""" SecureCodeEnv - Baseline Inference Script Required by hackathon. Runs an LLM agent through the environment. Outputs clamped [START]/[STEP]/[END] blocks to pass range validation. """ import os import json import time import sys import requests from openai import OpenAI from typing import Dict, List, Any # ── Configuration ────────────────────────────────────────────────────────── API_BASE_URL = os.environ.get("API_BASE_URL", "https://api.openai.com/v1") MODEL_NAME = os.environ.get("MODEL_NAME", "gpt-4o-mini") HF_TOKEN = os.environ.get("HF_TOKEN", "") ENV_URL = os.environ.get("ENV_URL", "http://localhost:7860").rstrip("/") client = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN or "sk-placeholder") def clamp_score(score: float) -> float: """Ensures score is strictly between 0 and 1 (0.001 … 0.999).""" epsilon = 0.001 try: v = float(score) except (TypeError, ValueError): return 0.5 if v != v: # NaN return 0.5 return max(epsilon, min(1.0 - epsilon, v)) def clean_code(raw: str) -> str: """Removes markdown code fences safely.""" lines = [line for line in raw.splitlines() if not line.strip().startswith("```")] return "\n".join(lines).strip() SYSTEM_PROMPT = """You are a senior Python security engineer. Output ONLY raw Python code — no markdown, no explanations. Your code must: 1. Solve the problem correctly 2. Resist SQL injection, path traversal, and auth bypass attacks 3. Use parameterized queries — never f-string SQL 4. Use secrets module (not random) for tokens 5. Use bcrypt (not hashlib) for passwords 6. Use hmac.compare_digest for secret comparison 7. Have type hints and docstrings on every function""" def run_episode(difficulty: str) -> None: """Runs one episode and prints [START], [STEP], [END] blocks.""" try: r = requests.post( f"{ENV_URL}/reset", json={"difficulty": difficulty}, timeout=30, ) r.raise_for_status() data = r.json() except Exception as e: print(f"Failed to reset {difficulty}: {e}", file=sys.stderr) return sid = data["session_id"] tid = data["task_id"] print(f"[START] task={tid} difficulty={difficulty}", flush=True) final_score = clamp_score(0.0) # starts at epsilon, not 0.0 total_steps = 0 for i in range(1, 6): total_steps = i context_str = json.dumps(data.get("codegraph", {}))[:2000] prev_fb = data.get("last_feedback", "") user_msg = ( f"Task: {data['problem_statement']}\n\n" f"Security targets: {data.get('cwe_targets', [])}\n\n" f"Codebase context:\n{context_str}" ) if prev_fb: user_msg += f"\n\nPrevious feedback:\n{prev_fb}" user_msg += "\n\nWrite the complete Python implementation now:" try: resp = client.chat.completions.create( model=MODEL_NAME, messages=[ {"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": user_msg}, ], max_tokens=1500, temperature=0.1, ) code = clean_code(resp.choices[0].message.content or "") if not code.strip(): code = "def placeholder(): pass" step_r = requests.post( f"{ENV_URL}/step", json={ "session_id": sid, "code": code, "filename": f"step_{i}.py", "task_id": tid, }, timeout=65, ) step_r.raise_for_status() res = step_r.json() raw_reward = res.get("total_reward", 0.0) clamped = clamp_score(raw_reward) final_score = clamped print(f"[STEP] step={i} reward={clamped:.4f}", flush=True) if res.get("done"): break # Feed updated context back for next step data["codegraph"] = res.get("codegraph", {}) data["last_feedback"] = res.get("feedback", {}).get("summary", "") except Exception as e: print(f"Error in step {i}: {e}", file=sys.stderr) # Don't break — try remaining steps time.sleep(1) print(f"[END] task={tid} score={final_score:.4f} steps={total_steps}", flush=True) def main(): # Health check try: requests.get(f"{ENV_URL}/health", timeout=10).raise_for_status() print(f"Environment healthy: {ENV_URL}", file=sys.stderr) except Exception as e: print(f"Health check failed: {e}", file=sys.stderr) sys.exit(1) for diff in ["easy", "medium", "hard"]: run_episode(diff) time.sleep(2) if __name__ == "__main__": main()