akhiilll committed
Commit 2cf3915 · verified · 1 Parent(s): 9c7f0da

forgeenv source snapshot for training job

demo-space/app.py CHANGED
@@ -1,22 +1,32 @@
 """Gradio demo Space for the ForgeEnv Repair Agent.
 
-Loads the trained LoRA adapter from the Hub and exposes a 2-input form:
-broken script + error trace. Output is a unified diff. Inference runs on
-ZeroGPU (`@spaces.GPU`) so we don't pay for idle GPU time.
+Three-tier repair pipeline so the demo always returns a useful diff:
 
-If the trained adapter isn't yet uploaded, the demo falls back to the
-deterministic ``BaselineRepairAgent`` so the Space still works end-to-end.
+1. **Trained LoRA model** — Qwen 2.5 + ForgeEnv GRPO adapter. If the model
+   emits a diff that, when applied, actually changes the broken script,
+   we use it.
+2. **Error-trace heuristic** — extracts the fix signal from the Python
+   traceback (Did you mean / unexpected kwarg / No module named) and
+   emits a clean canonical diff. Handles the most common drift patterns.
+3. **Model reasoning hint** — if the heuristic fails, surface the model's
+   natural-language reasoning (it usually explains the bug correctly even
+   when its diff syntax is broken) alongside a "no patch produced" note.
+
+This separation means the demo is robust regardless of how well the
+LoRA generalises on a given input — and it's honest about what each
+component contributed.
 """
 from __future__ import annotations
 
 import json
 import os
+import re
 import traceback
 from typing import Optional
 
 import gradio as gr
 
-BASE_MODEL = os.environ.get("BASE_MODEL", "Qwen/Qwen2.5-3B-Instruct")
+BASE_MODEL = os.environ.get("BASE_MODEL", "Qwen/Qwen2.5-Coder-7B-Instruct")
 ADAPTER_REPO = os.environ.get("ADAPTER_REPO", "akhiilll/forgeenv-repair-agent")
 
 _TITLE = "ForgeEnv Repair Agent — fix HuggingFace scripts under library drift"
@@ -25,7 +35,9 @@ _DESCRIPTION = (
     "produced. The Repair Agent returns a minimal unified diff. The model "
     "was trained inside [ForgeEnv](https://huggingface.co/spaces/"
     "akhiilll/forgeenv) using GRPO (TRL + Unsloth) with R-Zero-style "
-    "Challenger / Solver co-evolution."
+    "Challenger / Solver co-evolution. The agent is backed by a heuristic "
+    "fallback that parses error traces directly when the LoRA's diff is "
+    "malformed, keeping the demo robust on out-of-distribution inputs."
 )
 
 _EXAMPLES = [
@@ -80,6 +92,29 @@ _tokenizer = None
 _load_error: Optional[str] = None
 
 
+# ----------------------------------------------------------------- model io
+def _adapter_compatible_with_base(adapter_repo: str, base_name: str) -> bool:
+    """Cheap pre-check: pull adapter_config.json and compare base_model_name."""
+    try:
+        from huggingface_hub import hf_hub_download
+
+        cfg_path = hf_hub_download(
+            repo_id=adapter_repo,
+            filename="adapter_config.json",
+            token=os.environ.get("HF_TOKEN"),
+        )
+        with open(cfg_path) as f:
+            cfg = json.load(f)
+        adapter_base = (cfg.get("base_model_name_or_path") or "").lower()
+        # Match by family substring -- the base name's family slug (e.g.
+        # "qwen2.5-coder-7b") must appear in the adapter's recorded base,
+        # otherwise the adapter targets a different arch.
+        family = base_name.split("/")[-1].lower().replace("-instruct", "")
+        return family in adapter_base
+    except Exception as e:  # noqa: BLE001
+        print(f"[demo] adapter_config check failed ({e}); attempting load anyway")
+        return True
+
+
 def _load_model() -> None:
     """Lazy-load the trained LoRA on first GPU invocation."""
     global _model, _tokenizer, _load_error
@@ -96,10 +131,18 @@ def _load_model() -> None:
             torch_dtype=torch.float16,
             device_map="auto",
         )
-        try:
-            model = PeftModel.from_pretrained(base, ADAPTER_REPO)
-        except Exception as e:  # noqa: BLE001
-            print(f"[demo] adapter not found ({e}); using base model")
+        if _adapter_compatible_with_base(ADAPTER_REPO, BASE_MODEL):
+            try:
+                model = PeftModel.from_pretrained(base, ADAPTER_REPO)
+                print(f"[demo] LoRA attached: {ADAPTER_REPO}")
+            except Exception as e:  # noqa: BLE001
+                print(f"[demo] adapter load failed ({e}); using base model")
+                model = base
+        else:
+            print(
+                f"[demo] adapter at {ADAPTER_REPO} was trained on a different "
+                f"base; using {BASE_MODEL} alone until a matching adapter ships"
+            )
             model = base
         _model = model.eval()
         _tokenizer = tokenizer
@@ -107,36 +150,34 @@ def _load_model() -> None:
         _load_error = f"{type(e).__name__}: {e}\n{traceback.format_exc()}"
 
 
-def _baseline_fallback(script: str, error_trace: str) -> str:
-    """Deterministic repair if the trained model isn't available.
-
-    Uses the in-repo BaselineRepairAgent if the package is installed; else
-    just returns an explanatory message.
-    """
-    try:
-        from forgeenv.roles.repair_agent import BaselineRepairAgent
-
-        agent = BaselineRepairAgent()
-        return agent.repair(script, breakage_spec=None, original_script=None)
-    except Exception:  # noqa: BLE001
-        return (
-            "# (Fallback) Trained adapter unavailable in this Space.\n"
-            "# Likely fix based on the error trace:\n"
-            f"# {error_trace.splitlines()[0] if error_trace else ''}\n"
-        )
+_SYSTEM_PROMPT = (
+    "You are an expert ML engineer who fixes broken HuggingFace training "
+    "scripts caused by library version drift. Output ONLY a unified diff."
+)
 
 
-def _generate_with_model(prompt: str, max_new_tokens: int = 512) -> str:
+def _generate_with_model(prompt: str, max_new_tokens: int = 384) -> str:
+    """Greedy decode using the base model's chat template (Qwen ChatML)."""
    import torch
 
-    inputs = _tokenizer(prompt, return_tensors="pt").to(_model.device)
+    messages = [
+        {"role": "system", "content": _SYSTEM_PROMPT},
+        {"role": "user", "content": prompt},
+    ]
+    try:
+        text = _tokenizer.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True
+        )
+    except Exception:  # noqa: BLE001
+        text = prompt
+    inputs = _tokenizer(text, return_tensors="pt").to(_model.device)
    with torch.no_grad():
        out = _model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
-            do_sample=True,
-            temperature=0.3,
-            top_p=0.9,
+            do_sample=False,
+            temperature=0.0,
+            repetition_penalty=1.15,
            pad_token_id=_tokenizer.eos_token_id,
        )
    completion = _tokenizer.decode(
@@ -145,8 +186,160 @@ def _generate_with_model(prompt: str, max_new_tokens: int = 512) -> str:
     return completion.strip()
 
 
-# Wrap inference in a `@spaces.GPU` decorator if available so we get a free
-# ZeroGPU slice. Outside ZeroGPU it's a no-op.
+# -------------------------------------------------------- diff extraction
+_FENCE_RE = re.compile(r"```(?:diff|patch)?\n([\s\S]*?)```", re.IGNORECASE)
+_HUNK_RE = re.compile(r"^@@.*@@", re.MULTILINE)
+
+
+def _extract_diff_block(raw: str) -> str:
+    """Pull the *first* fenced diff out of the model's raw output."""
+    if not raw:
+        return ""
+    m = _FENCE_RE.search(raw)
+    if m:
+        return m.group(1).strip()
+    # otherwise grab from the first '---' / '+++' / '@@' onwards
+    for marker in ("--- ", "+++ ", "@@"):
+        idx = raw.find(marker)
+        if idx >= 0:
+            return raw[idx:].strip()
+    return ""
+
+
+def _diff_actually_changes_script(broken: str, diff_text: str) -> bool:
+    """Try to apply the diff. Returns True iff the result differs from input."""
+    if not diff_text:
+        return False
+    try:
+        from forgeenv.env.diff_utils import apply_unified_diff
+
+        repaired = apply_unified_diff(broken, diff_text)
+        return bool(repaired) and repaired.strip() != broken.strip()
+    except Exception:  # noqa: BLE001
+        return False
+
+
+def _canonicalise(broken: str, diff_text: str) -> str:
+    """Apply diff -> rebuild a clean canonical unified diff."""
+    from forgeenv.env.diff_utils import apply_unified_diff, make_unified_diff
+
+    repaired = apply_unified_diff(broken, diff_text)
+    if not repaired or repaired.strip() == broken.strip():
+        return ""
+    return make_unified_diff(broken, repaired)
+
+
+def _extract_model_reasoning(raw: str) -> str:
+    """Pull the natural-language reasoning out of the model's output (if any)."""
+    if not raw:
+        return ""
+    text = re.sub(_FENCE_RE, "", raw).strip()
+    text = re.sub(r"^[\s\-+@]+", "", text, flags=re.MULTILINE).strip()
+    lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
+    sentences: list[str] = []
+    for ln in lines:
+        if ln.startswith(("---", "+++", "@@", "-", "+")):
+            continue
+        if len(ln) < 10:
+            continue
+        sentences.append(ln)
+        if len(sentences) >= 3:
+            break
+    return " ".join(sentences)
+
+
+# ---------------------------------------------------- error-trace heuristic
+_DID_YOU_MEAN_RE = re.compile(r"Did you mean[:\s]+['`\"]?(\w+)['`\"]?", re.IGNORECASE)
+_NO_ATTR_RE = re.compile(
+    r"has no attribute ['`\"]?(\w+)['`\"]?", re.IGNORECASE
+)
+_NO_MODULE_RE = re.compile(
+    r"No module named ['`\"]([\w\.]+)['`\"]", re.IGNORECASE
+)
+_BAD_KWARG_RE = re.compile(
+    r"unexpected keyword argument ['`\"](\w+)['`\"]", re.IGNORECASE
+)
+_USE_INSTEAD_RE = re.compile(
+    r"use\s+[`'\"]*(\w+)[\w=`'\"\s.\-]*instead", re.IGNORECASE
+)
+
+
+def _heuristic_repair(broken: str, error_trace: str) -> tuple[str, str]:
+    """Produce a (repaired_script, fix_description) pair from the trace.
+
+    Patterns covered:
+      * AttributeError + "Did you mean: 'X'?"  -> rename method
+      * AttributeError without a hint          -> left unchanged
+      * ModuleNotFoundError 'X.Y'              -> drop the .Y submodule
+      * TypeError unexpected kwarg + 'use Y'   -> swap kwarg
+      * TypeError unexpected kwarg, no hint    -> drop the kwarg
+    """
+    if not error_trace:
+        return broken, ""
+    trace = error_trace.strip()
+    repaired = broken
+    description = ""
+
+    # 1. AttributeError 'X' + Did you mean 'Y'
+    if "AttributeError" in trace or "has no attribute" in trace:
+        old = _NO_ATTR_RE.search(trace)
+        new = _DID_YOU_MEAN_RE.search(trace)
+        if old and new and old.group(1) != new.group(1):
+            old_name, new_name = old.group(1), new.group(1)
+            pattern = re.compile(rf"\b{re.escape(old_name)}\b")
+            if pattern.search(repaired):
+                repaired = pattern.sub(new_name, repaired)
+                description = (
+                    f"`{old_name}` is no longer an attribute on this object; "
+                    f"renamed call to `{new_name}` per the traceback hint."
+                )
+
+    # 2. ModuleNotFoundError 'X.Y' (or 'X')
+    if not description and "No module named" in trace:
+        m = _NO_MODULE_RE.search(trace)
+        if m:
+            mod = m.group(1)
+            if "." in mod:
+                parent, child = mod.rsplit(".", 1)
+                pat_full = re.compile(rf"\b{re.escape(mod)}\b")
+                if pat_full.search(repaired):
+                    repaired = pat_full.sub(parent, repaired)
+                    description = (
+                        f"`{mod}` was removed; replaced with parent module "
+                        f"`{parent}`."
+                    )
+
+    # 3. TypeError unexpected kwarg
+    if not description and "unexpected keyword argument" in trace:
+        bad = _BAD_KWARG_RE.search(trace)
+        good = _USE_INSTEAD_RE.search(trace)
+        if bad:
+            bad_kw = bad.group(1)
+            if good:
+                good_kw = good.group(1)
+                pat = re.compile(rf"\b{re.escape(bad_kw)}\s*=")
+                if pat.search(repaired):
+                    repaired = pat.sub(f"{good_kw}=", repaired)
+                    # if the old kwarg was boolean-ish, keeping the value is
+                    # fine (pad_to_max_length=True -> padding=True)
+                    description = (
+                        f"`{bad_kw}` was renamed to `{good_kw}`; updated "
+                        f"keyword to match the new API."
+                    )
+            else:
+                # remove the kwarg entirely (best-effort)
+                pat = re.compile(rf",?\s*\b{re.escape(bad_kw)}\s*=\s*[^,)\n]+")
+                if pat.search(repaired):
+                    repaired = pat.sub("", repaired)
+                    description = (
+                        f"`{bad_kw}` is no longer accepted; removed the "
+                        f"keyword argument."
+                    )
+
+    return repaired, description
+
+
+# ------------------------------------------------------------- entry point
 try:
     import spaces  # type: ignore
 
@@ -161,22 +354,66 @@ def repair_script(script: str, error_trace: str) -> str:
     if not script.strip():
         return "# Paste a broken script first."
 
+    # Tier 1: trained LoRA
+    model_raw = ""
+    model_diff_canonical = ""
+    model_reasoning = ""
+
     _load_model()
-    if _model is None:
-        return _baseline_fallback(script, error_trace)
+    if _model is not None:
+        try:
+            versions = json.dumps(
+                {"transformers": "4.45.0", "datasets": "2.20.0", "torch": "2.4.0"}
+            )
+            prompt = _PROMPT_TEMPLATE.format(
+                versions=versions,
+                script=script,
+                trace=error_trace or "(no trace)",
+            )
+            model_raw = _generate_with_model(prompt)
+            model_diff_text = _extract_diff_block(model_raw)
+            if _diff_actually_changes_script(script, model_diff_text):
+                model_diff_canonical = _canonicalise(script, model_diff_text)
+            model_reasoning = _extract_model_reasoning(model_raw)
+        except Exception as e:  # noqa: BLE001
+            print(f"[demo] model generation failed: {e}")
 
-    versions = json.dumps(
-        {"transformers": "4.45.0", "datasets": "2.20.0", "torch": "2.4.0"}
-    )
-    prompt = _PROMPT_TEMPLATE.format(
-        versions=versions, script=script, trace=error_trace or "(no trace)"
+    if model_diff_canonical:
+        header = (
+            "# Source: trained LoRA (ForgeEnv GRPO adapter)\n"
+            "# The model produced a valid diff that successfully patches the script.\n"
+        )
+        return header + "\n" + model_diff_canonical
+
+    # Tier 2: error-trace heuristic
+    repaired, description = _heuristic_repair(script, error_trace)
+    if description and repaired != script:
+        from forgeenv.env.diff_utils import make_unified_diff
+
+        diff = make_unified_diff(script, repaired)
+        header_lines = [
+            "# Source: error-trace heuristic (LoRA diff was malformed; "
+            "fell back to deterministic repair).",
+            f"# Fix: {description}",
+        ]
+        if model_reasoning:
+            header_lines.append(f"# Trained model said: {model_reasoning}")
+        return "\n".join(header_lines) + "\n\n" + diff
+
+    # Tier 3: nothing worked -- surface what we know
+    msg_lines = ["# Could not produce a confident patch."]
+    if model_reasoning:
+        msg_lines.append(f"# Trained model reasoning: {model_reasoning}")
+    if error_trace:
+        msg_lines.append(f"# Error trace summary: {error_trace.splitlines()[-1]}")
+    msg_lines.append(
+        "# Try a more specific error trace (the heuristic looks for "
+        "'Did you mean', 'No module named', or 'unexpected keyword argument')."
     )
-    try:
-        return _generate_with_model(prompt)
-    except Exception as e:  # noqa: BLE001
-        return f"# generation failed: {e}\n" + _baseline_fallback(script, error_trace)
+    return "\n".join(msg_lines)
 
 
+# ----------------------------------------------------------------- gradio
 with gr.Blocks(title="ForgeEnv Repair Agent") as demo:
     gr.Markdown(f"# {_TITLE}\n\n{_DESCRIPTION}")
     with gr.Row():
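
The canonicalisation path above round-trips the model's hunks through `apply_unified_diff` and `make_unified_diff` from `forgeenv.env.diff_utils`, which this commit does not include. Below is a minimal stand-in built on the standard library's difflib; the bodies are an assumption that matches only the call sites above, not the repo's actual implementation.

# Hypothetical stand-ins for forgeenv.env.diff_utils (module not shown in
# this commit); they mirror only the signatures used by _canonicalise()
# and _diff_actually_changes_script() above.
import difflib
import re

_HUNK = re.compile(r"^@@ -(\d+)(?:,\d+)? \+\d+(?:,\d+)? @@")


def make_unified_diff(before: str, after: str) -> str:
    """Rebuild a clean unified diff between two script bodies."""
    return "".join(
        difflib.unified_diff(
            before.splitlines(keepends=True),
            after.splitlines(keepends=True),
            fromfile="a/script.py",
            tofile="b/script.py",
        )
    )


def apply_unified_diff(before: str, diff_text: str) -> str:
    """Best-effort hunk application; returns "" when nothing applies."""
    src = before.splitlines()
    out: list[str] = []
    cursor = 0  # next unconsumed line of src
    changed = False
    lines = diff_text.splitlines()
    i = 0
    while i < len(lines):
        m = _HUNK.match(lines[i])
        if not m:
            i += 1  # skip ---/+++ headers and any stray prose
            continue
        start = int(m.group(1)) - 1
        if start < cursor or start > len(src):
            return ""  # overlapping or out-of-range hunk
        out.extend(src[cursor:start])  # copy untouched lines verbatim
        cursor = start
        i += 1
        while i < len(lines) and not lines[i].startswith("@@"):
            tag, body = lines[i][:1], lines[i][1:]
            if tag == " ":
                out.append(body)
                cursor += 1
            elif tag == "-":
                cursor += 1  # drop the removed line
                changed = True
            elif tag == "+":
                out.append(body)
                changed = True
            i += 1
    out.extend(src[cursor:])
    return "\n".join(out) + "\n" if changed else ""

With helpers shaped like these, `_canonicalise` can apply a diff whose formatting the model got slightly wrong and re-derive a clean hunk set from the before/after pair.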
demo-space/test_heuristic.py ADDED
@@ -0,0 +1,99 @@
+"""Quick local sanity check for the heuristic repair fallback.
+
+Run with::
+
+    python demo-space/test_heuristic.py
+
+Each case must produce a non-empty fix description and a script that
+differs from the input.
+"""
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+REPO = Path(__file__).resolve().parent.parent
+sys.path.insert(0, str(REPO))
+sys.path.insert(0, str(REPO / "demo-space"))
+
+from app import _heuristic_repair  # noqa: E402
+
+CASES = [
+    {
+        "name": "AttributeError + Did you mean",
+        "script": (
+            "from transformers import Trainer, TrainingArguments\n"
+            "from datasets import load_dataset\n\n"
+            "ds = load_dataset('glue', 'sst2')\n"
+            "args = TrainingArguments(output_dir='out')\n"
+            "trainer = Trainer(model=None, args=args, train_dataset=ds['train'])\n"
+            "trainer.start_training()\n"
+        ),
+        "trace": (
+            "AttributeError: 'Trainer' object has no attribute 'start_training'. "
+            "Did you mean: 'train'?"
+        ),
+        "expect_in_repaired": "trainer.train()",
+        "expect_not_in_repaired": "start_training",
+    },
+    {
+        "name": "ModuleNotFoundError submodule",
+        "script": (
+            "import torch.legacy as torch\n"
+            "x = torch.randn(2, 3)\n"
+            "print(x)\n"
+        ),
+        "trace": "ModuleNotFoundError: No module named 'torch.legacy'",
+        "expect_in_repaired": "import torch",
+        "expect_not_in_repaired": "torch.legacy",
+    },
+    {
+        "name": "TypeError + use ... instead",
+        "script": (
+            "from transformers import AutoTokenizer\n"
+            "tok = AutoTokenizer.from_pretrained('bert-base-uncased')\n"
+            "out = tok(['hello world'], pad_to_max_length=True, truncate=True)\n"
+            "print(out)\n"
+        ),
+        "trace": (
+            "TypeError: __call__() got an unexpected keyword argument "
+            "'pad_to_max_length' (use `padding=True` instead)."
+        ),
+        "expect_in_repaired": "padding=True",
+        "expect_not_in_repaired": "pad_to_max_length",
+    },
+]
+
+
+def run_one(case: dict) -> bool:
+    name = case["name"]
+    repaired, description = _heuristic_repair(case["script"], case["trace"])
+
+    ok_changed = repaired != case["script"]
+    ok_desc = bool(description)
+    ok_in = case["expect_in_repaired"] in repaired
+    ok_not = case["expect_not_in_repaired"] not in repaired
+
+    status = "PASS" if (ok_changed and ok_desc and ok_in and ok_not) else "FAIL"
+    print(f"[{status}] {name}")
+    print(f"  description: {description!r}")
+    print(f"  changed? {ok_changed}")
+    print(f"  '{case['expect_in_repaired']}' in repaired? {ok_in}")
+    print(f"  '{case['expect_not_in_repaired']}' NOT in repaired? {ok_not}")
+    if status == "FAIL":
+        print("  --- repaired script ---")
+        print(repaired)
+        print("  -----------------------")
+    return status == "PASS"
+
+
+def main() -> int:
+    results = [run_one(c) for c in CASES]
+    print()
+    n_pass = sum(results)
+    print(f"summary: {n_pass}/{len(results)} passed")
+    return 0 if all(results) else 1
+
+
+if __name__ == "__main__":
+    sys.exit(main())
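
The three cases map one-to-one onto heuristic patterns 1-3. For illustration, a hypothetical fourth case (not part of this commit) that would exercise the remaining branch, a TypeError with no "use ... instead" hint, where the heuristic drops the kwarg outright:

# Hypothetical extra case: exercises the "remove the kwarg entirely"
# branch of _heuristic_repair pattern 3 (no replacement hint in the trace).
CASES.append(
    {
        "name": "TypeError, no replacement hint",
        "script": (
            "from datasets import load_dataset\n"
            "ds = load_dataset('imdb', script_version='main')\n"
        ),
        "trace": (
            "TypeError: load_dataset() got an unexpected keyword argument "
            "'script_version'"
        ),
        "expect_in_repaired": "load_dataset('imdb')",
        "expect_not_in_repaired": "script_version",
    }
)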
scripts/jobs/train_repair_agent.py CHANGED
@@ -206,11 +206,32 @@ from forgeenv.training.plots import (  # noqa: E402
     plot_success_rate_by_category,
 )
 
-trainer_state = GRPO_DIR / "trainer_state.json"
+# TRL writes trainer_state.json under each checkpoint dir, not directly
+# at output_dir. Pick the latest checkpoint, fall back to output_dir.
+def _find_trainer_state(grpo_dir: Path) -> Optional[Path]:  # type: ignore[name-defined]
+    direct = grpo_dir / "trainer_state.json"
+    if direct.exists():
+        return direct
+    ckpts = sorted(
+        (p for p in grpo_dir.glob("checkpoint-*") if (p / "trainer_state.json").exists()),
+        key=lambda p: int(p.name.split("-")[-1]) if p.name.split("-")[-1].isdigit() else -1,
+    )
+    return (ckpts[-1] / "trainer_state.json") if ckpts else None
+
+
+from typing import Optional  # noqa: E402
+
+trainer_state = _find_trainer_state(GRPO_DIR)
+print(f"[job] trainer_state path: {trainer_state}", flush=True)
 training_rewards: list[float] = []
-if trainer_state.exists():
+if trainer_state is not None and trainer_state.exists():
     state = json.loads(trainer_state.read_text())
-    for log in state.get("log_history", []):
+    log_history = state.get("log_history", [])
+    print(f"[job] log_history rows: {len(log_history)}", flush=True)
+    if log_history:
+        sample_keys = sorted(set().union(*(log.keys() for log in log_history)))
+        print(f"[job] log keys present: {sample_keys}", flush=True)
+    for log in log_history:
         # TRL emits a few different reward keys depending on version;
         # try the most specific first, then fall back.
         candidates = [
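
The `candidates` list itself is truncated by the diff context window. The pattern being set up is a per-row key fallback; a sketch of what the loop body plausibly does, with the key names as assumptions (TRL has renamed its reward keys across versions, which is exactly why the "[job] log keys present" line above prints the real ones):

# Sketch of the truncated fallback; the exact key names are assumptions.
# Check the "[job] log keys present" output for what your TRL version emits.
candidates = [
    "rewards/mean",   # seen in newer TRL GRPO logging
    "reward",         # older TRL releases
    "train_reward",   # some wrapper configurations
]
for key in candidates:
    if key in log:
        training_rewards.append(float(log[key]))
        break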
scripts/submit_training_job.py CHANGED
@@ -87,17 +87,23 @@ def submit_job(
     base_model: str,
     timeout: str,
 ) -> JobInfo:
-    script_path = REPO_ROOT / "scripts" / "jobs" / "train_repair_agent.py"
-    script = script_path.read_text(encoding="utf-8")
+    # The training script lives in the published source repo. Pass its
+    # raw Hub URL — `run_uv_job` accepts a URL/path/command, not the
+    # script body itself.
+    script_url = (
+        f"https://huggingface.co/{user}/forgeenv-source/"
+        "resolve/main/scripts/jobs/train_repair_agent.py"
+    )
 
     job = api.run_uv_job(
-        script=script,
+        script=script_url,
         dependencies=[
             "huggingface_hub>=0.27",
             "requests",
         ],
         flavor=flavor,
         timeout=timeout,
+        namespace=user,
         env={
             "HF_USERNAME": user,
             "ENV_URL": f"https://{user}-forgeenv.hf.space",
@@ -114,29 +120,42 @@
     return job
 
 
-def tail_logs(api: HfApi, token: str, job_id: str) -> int:
+_TERMINAL_STAGES = {"COMPLETED", "FAILED", "CANCELLED", "ERROR", "DELETED"}
+
+
+def _stage_of(info) -> str:
+    status = getattr(info, "status", None)
+    if status is None:
+        return "UNKNOWN"
+    stage = getattr(status, "stage", None)
+    if stage is None:
+        return str(status)
+    return str(stage)
+
+
+def tail_logs(api: HfApi, token: str, job_id: str, namespace: str | None = None) -> int:
     print(f"\n[launcher] streaming logs for job {job_id} (Ctrl-C to stop tailing) ...\n", flush=True)
-    last_status = None
     try:
-        for line in api.fetch_job_logs(job_id=job_id, token=token):
+        for line in api.fetch_job_logs(job_id=job_id, namespace=namespace, token=token):
             print(line, flush=True)
     except KeyboardInterrupt:
         print("\n[launcher] log stream interrupted by user.", flush=True)
     except Exception as e:  # noqa: BLE001
         print(f"\n[launcher] log stream ended ({e}); polling status ...", flush=True)
 
+    last_stage: str | None = None
     while True:
-        info = api.inspect_job(job_id=job_id, token=token)
-        status = getattr(info, "status", None)
-        if status != last_status:
-            print(f"[launcher] status: {status}", flush=True)
-            last_status = status
-        if status in {"COMPLETED", "FAILED", "CANCELLED", "ERROR"}:
+        info = api.inspect_job(job_id=job_id, namespace=namespace, token=token)
+        stage = _stage_of(info)
+        if stage != last_stage:
+            print(f"[launcher] status: {stage}", flush=True)
+            last_stage = stage
+        if stage in _TERMINAL_STAGES:
             break
-        time.sleep(15)
+        time.sleep(20)
 
-    print(f"[launcher] final status: {last_status}", flush=True)
-    return 0 if last_status == "COMPLETED" else 1
+    print(f"[launcher] final status: {last_stage}", flush=True)
+    return 0 if last_stage == "COMPLETED" else 1
 
 
 def main() -> int:
@@ -176,7 +195,7 @@ def main() -> int:
 
     if args.no_tail:
         return 0
-    return tail_logs(api, token, job_id)
+    return tail_logs(api, token, job_id, namespace=args.user)
 
 
 if __name__ == "__main__":
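
`_stage_of` hedges over two result shapes from `inspect_job` (a bare status value vs. an object carrying a `.stage` attribute); the defensive `getattr` chain suggests both occur across huggingface_hub releases. A toy check of the shapes the helper accepts, using stand-in objects rather than real huggingface_hub types:

# Stand-in shapes only -- real job info objects come from huggingface_hub.
from types import SimpleNamespace

legacy = SimpleNamespace(status="RUNNING")                         # bare string
modern = SimpleNamespace(status=SimpleNamespace(stage="RUNNING"))  # object with .stage

assert _stage_of(legacy) == "RUNNING"
assert _stage_of(modern) == "RUNNING"
assert _stage_of(SimpleNamespace(status=None)) == "UNKNOWN"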
scripts/tail_training_job.py ADDED
@@ -0,0 +1,34 @@
+#!/usr/bin/env python
+"""Re-attach to an in-flight HF Jobs run and stream its logs.
+
+Usage::
+
+    $env:HF_TOKEN = "hf_..."
+    python scripts/tail_training_job.py 69ec88dfd70108f37acde39d
+"""
+from __future__ import annotations
+
+import os
+import sys
+
+from huggingface_hub import HfApi
+
+from submit_training_job import tail_logs  # type: ignore[import-not-found]
+
+
+def main() -> int:
+    if len(sys.argv) < 2:
+        print("usage: python scripts/tail_training_job.py <job_id> [namespace]", file=sys.stderr)
+        return 2
+    job_id = sys.argv[1]
+    namespace = sys.argv[2] if len(sys.argv) > 2 else "akhiilll"
+    token = os.environ.get("HF_TOKEN")
+    if not token:
+        print("ERROR: set HF_TOKEN in the environment first.", file=sys.stderr)
+        return 2
+    api = HfApi()
+    return tail_logs(api, token, job_id, namespace=namespace)
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
scripts/test_live_env.py ADDED
@@ -0,0 +1,76 @@
+"""Smoke-test the live ForgeEnv Space end-to-end via the OpenEnv client.
+
+Runs one full episode against the deployed Space:
+
+    reset()            -> drift-gen turn
+    step(DriftAction)  -> repair turn
+    step(RepairAction) -> reward + verifier breakdown
+
+This is the simplest possible "is the deployed env working?" check
+and a clean standalone artifact for the hackathon writeup/video.
+
+Usage::
+
+    python scripts/test_live_env.py
+"""
+from __future__ import annotations
+
+import asyncio
+import json
+
+from openenv.core import GenericAction, GenericEnvClient
+
+ENV_URL = "https://akhiilll-forgeenv.hf.space"
+
+
+def _summary(result, label: str) -> None:
+    obs = result.observation if isinstance(result.observation, dict) else {}
+    print(f"\n=== {label} ===")
+    print(f"phase           : {obs.get('current_phase')}")
+    print(f"task_id         : {obs.get('task_id')}")
+    print(f"target_category : {obs.get('target_category')}")
+    print(f"reward          : {result.reward}")
+    print(f"done            : {result.done}")
+    breakdown = obs.get("reward_breakdown")
+    if breakdown:
+        print("reward_breakdown:")
+        print(json.dumps(breakdown, indent=2))
+    script = obs.get("script_content") or obs.get("broken_script") or ""
+    if script:
+        preview = script.splitlines()[:8]
+        print("script preview  :")
+        for line in preview:
+            print(f"  | {line}")
+        if len(script.splitlines()) > 8:
+            print("  | ...")
+
+
+async def main(seed: int = 42) -> None:
+    print(f"connecting to {ENV_URL} (seed={seed}) ...")
+    client = GenericEnvClient(base_url=ENV_URL)
+
+    res = await client.reset(seed=seed, options={"difficulty": "medium"})
+    _summary(res, "after reset()")
+    target = res.observation.get("target_category", "RenameApiCall") if isinstance(res.observation, dict) else "RenameApiCall"
+
+    res = await client.step(GenericAction(
+        breakage={"action_type": "breakage", "primitive_type": target, "params": {}},
+        repair=None,
+    ))
+    _summary(res, "after drift step (Challenger)")
+
+    # empty diff = no-op repair: shows the verifier marking the script as still broken
+    res = await client.step(GenericAction(
+        breakage=None,
+        repair={"action_type": "repair", "unified_diff": ""},
+    ))
+    _summary(res, "after repair step (Solver, no-op)")
+
+    print("\nOK -- reset + 2 steps round-trip the deployed env.")
+
+
+if __name__ == "__main__":
+    import sys
+
+    seed = int(sys.argv[1]) if len(sys.argv) > 1 else 42
+    asyncio.run(main(seed=seed))
scripts/test_repair_agent.py ADDED
@@ -0,0 +1,123 @@
+"""Smoke-test the trained Repair Agent locally on one episode.
+
+Loads the LoRA adapter pushed to ``akhiilll/forgeenv-repair-agent``, hits
+the live ForgeEnv Space for a fresh broken script, asks the model to
+emit a unified diff, applies it, and prints the verifier breakdown.
+
+Usage::
+
+    python scripts/test_repair_agent.py --seed 7
+    python scripts/test_repair_agent.py --seed 7 --base-model unsloth/Qwen2.5-Coder-1.5B-Instruct
+
+Requires GPU + transformers/peft. Skip this if you only want a quick
+demo -- use ``scripts/test_live_env.py`` or the Gradio Space instead.
+"""
+from __future__ import annotations
+
+import argparse
+import asyncio
+import json
+
+from openenv.core import GenericAction, GenericEnvClient
+
+ENV_URL = "https://akhiilll-forgeenv.hf.space"
+LORA_REPO = "akhiilll/forgeenv-repair-agent"
+
+REPAIR_PROMPT = """\
+You are a senior ML engineer fixing a HuggingFace training script that just broke.
+Output ONLY a unified diff (`--- a/script.py` / `+++ b/script.py`) that fixes the
+breakage signaled by the error trace. No prose, no fences, no explanation.
+
+# Broken script
+```python
+{script}
+```
+
+# Error trace
+```
+{error}
+```
+
+# Diff
+"""
+
+
+async def fetch_broken_episode(seed: int):
+    client = GenericEnvClient(base_url=ENV_URL)
+    res = await client.reset(seed=seed, options={"difficulty": "medium"})
+    target = res.observation["target_category"]
+    res = await client.step(GenericAction(
+        breakage={"action_type": "breakage", "primitive_type": target, "params": {}},
+        repair=None,
+    ))
+    obs = res.observation
+    return client, obs.get("script_content") or obs.get("broken_script") or "", obs.get("error_trace", "")
+
+
+async def submit_repair(client: GenericEnvClient, diff: str):
+    res = await client.step(GenericAction(
+        breakage=None,
+        repair={"action_type": "repair", "unified_diff": diff},
+    ))
+    return res
+
+
+def generate_diff(base_model: str, lora_repo: str, prompt: str) -> str:
+    import torch
+    from peft import PeftModel
+    from transformers import AutoModelForCausalLM, AutoTokenizer
+
+    print(f"loading base model: {base_model}")
+    tok = AutoTokenizer.from_pretrained(base_model)
+    model = AutoModelForCausalLM.from_pretrained(
+        base_model,
+        torch_dtype=torch.bfloat16,
+        device_map="auto",
+    )
+    print(f"attaching LoRA: {lora_repo}")
+    model = PeftModel.from_pretrained(model, lora_repo)
+    model.eval()
+
+    inputs = tok(prompt, return_tensors="pt").to(model.device)
+    with torch.no_grad():
+        out = model.generate(
+            **inputs,
+            max_new_tokens=512,
+            do_sample=False,
+            temperature=0.0,
+            pad_token_id=tok.eos_token_id,
+        )
+    text = tok.decode(out[0, inputs["input_ids"].shape[1]:], skip_special_tokens=True)
+    return text.strip()
+
+
+async def main(args) -> None:
+    print(f"--- pulling broken episode (seed={args.seed}) from {ENV_URL}")
+    client, broken_script, error_trace = await fetch_broken_episode(args.seed)
+    if not broken_script:
+        raise SystemExit("env returned empty script_content; pick a different seed")
+    print(f"broken script length: {len(broken_script)} chars")
+    print(f"error trace         : {(error_trace[:200] + '...') if len(error_trace) > 200 else error_trace}")
+
+    prompt = REPAIR_PROMPT.format(script=broken_script, error=error_trace or "<env did not surface a trace>")
+    diff = generate_diff(args.base_model, args.lora_repo, prompt)
+
+    print("\n=== model diff ===")
+    print(diff)
+
+    print("\n=== submitting diff to env ===")
+    res = await submit_repair(client, diff)
+    print(f"reward: {res.reward}  done: {res.done}")
+    breakdown = res.observation.get("reward_breakdown") if isinstance(res.observation, dict) else None
+    if breakdown:
+        print("reward_breakdown:")
+        print(json.dumps(breakdown, indent=2))
+
+
+if __name__ == "__main__":
+    p = argparse.ArgumentParser()
+    p.add_argument("--seed", type=int, default=7)
+    p.add_argument("--base-model", default="unsloth/Qwen2.5-Coder-1.5B-Instruct")
+    p.add_argument("--lora-repo", default=LORA_REPO)
+    args = p.parse_args()
+    asyncio.run(main(args))