akhiilll
/

forgeenv-source

Model card Files Files and versions

xet

Community

akhiilll committed on 13 days ago

Commit

cdeb5bc

verified ·

1 Parent(s): 2cf3915

forgeenv source snapshot for training job

Browse files

Files changed (1) hide show

scripts/jobs/train_repair_agent.py +15 -4

scripts/jobs/train_repair_agent.py CHANGED Viewed

@@ -68,21 +68,30 @@ _sh([
 # venv. We still run pip install for any setuptools side-effects.
 sys.path.insert(0, str(src_dir))
-step("1. pip install + verify GPU")
-_sh([sys.executable, "-m", "pip", "install", "-e", f"{src_dir}[openenv]"])
 _sh([
     sys.executable, "-m", "pip", "install",
     "trl", "peft", "accelerate", "datasets", "bitsandbytes",
     "matplotlib", "pyyaml", "nltk", "scikit-learn",
-    "openenv-core>=0.2.0",
 ])
 try:
-    _sh([sys.executable, "-m", "pip", "install", "unsloth"])
 except subprocess.CalledProcessError:
     print("[job] WARN: unsloth install failed — trainer will use plain HF.", flush=True)
 import torch  # noqa: E402
 print(f"[job] CUDA available: {torch.cuda.is_available()}", flush=True)
 if torch.cuda.is_available():
     print(f"[job] GPU: {torch.cuda.get_device_name(0)}", flush=True)
@@ -91,6 +100,8 @@ if torch.cuda.is_available():
         f"{torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB",
         flush=True,
     )
 step("2. ping live env Space + verify forgeenv import")
 import requests  # noqa: E402

 # venv. We still run pip install for any setuptools side-effects.
 sys.path.insert(0, str(src_dir))
+step("1. pip install (no torch/transformers churn) + verify GPU")
+# IMPORTANT: do NOT run `pip install -e .[openenv]` — it transitively
+# downgrades torch via openenv-core, which breaks CUDA on H100/H200.
+# We rely on sys.path (set above) for `import forgeenv`.
+_sh([
+    sys.executable, "-m", "pip", "install", "--no-deps",
+    "openenv-core>=0.2.0",
+])
 _sh([
     sys.executable, "-m", "pip", "install",
     "trl", "peft", "accelerate", "datasets", "bitsandbytes",
     "matplotlib", "pyyaml", "nltk", "scikit-learn",
+    "fastapi", "uvicorn", "pydantic", "requests",
 ])
 try:
+    # --no-deps is critical: prevents unsloth from pulling in a CPU-only
+    # torch wheel that overwrites the uv image's GPU torch.
+    _sh([sys.executable, "-m", "pip", "install", "--no-deps", "unsloth", "unsloth-zoo"])
 except subprocess.CalledProcessError:
     print("[job] WARN: unsloth install failed — trainer will use plain HF.", flush=True)
 import torch  # noqa: E402
+print(f"[job] torch: {torch.__version__}", flush=True)
 print(f"[job] CUDA available: {torch.cuda.is_available()}", flush=True)
 if torch.cuda.is_available():
     print(f"[job] GPU: {torch.cuda.get_device_name(0)}", flush=True)
         f"{torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB",
         flush=True,
     )
+else:
+    raise SystemExit("[job] FATAL: no CUDA — refusing to run training on CPU.")
 step("2. ping live env Space + verify forgeenv import")
 import requests  # noqa: E402