forgeenv source snapshot for training job
Browse files
scripts/jobs/train_repair_agent.py
CHANGED
|
@@ -68,21 +68,30 @@ _sh([
|
|
| 68 |
# venv. We still run pip install for any setuptools side-effects.
|
| 69 |
sys.path.insert(0, str(src_dir))
|
| 70 |
|
| 71 |
-
step("1. pip install + verify GPU")
|
| 72 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
_sh([
|
| 74 |
sys.executable, "-m", "pip", "install",
|
| 75 |
"trl", "peft", "accelerate", "datasets", "bitsandbytes",
|
| 76 |
"matplotlib", "pyyaml", "nltk", "scikit-learn",
|
| 77 |
-
"
|
| 78 |
])
|
| 79 |
try:
|
| 80 |
-
|
|
|
|
|
|
|
| 81 |
except subprocess.CalledProcessError:
|
| 82 |
print("[job] WARN: unsloth install failed — trainer will use plain HF.", flush=True)
|
| 83 |
|
| 84 |
import torch # noqa: E402
|
| 85 |
|
|
|
|
| 86 |
print(f"[job] CUDA available: {torch.cuda.is_available()}", flush=True)
|
| 87 |
if torch.cuda.is_available():
|
| 88 |
print(f"[job] GPU: {torch.cuda.get_device_name(0)}", flush=True)
|
|
@@ -91,6 +100,8 @@ if torch.cuda.is_available():
|
|
| 91 |
f"{torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB",
|
| 92 |
flush=True,
|
| 93 |
)
|
|
|
|
|
|
|
| 94 |
|
| 95 |
step("2. ping live env Space + verify forgeenv import")
|
| 96 |
import requests # noqa: E402
|
|
|
|
| 68 |
# venv. We still run pip install for any setuptools side-effects.
|
| 69 |
sys.path.insert(0, str(src_dir))
|
| 70 |
|
| 71 |
+
step("1. pip install (no torch/transformers churn) + verify GPU")
|
| 72 |
+
# IMPORTANT: do NOT run `pip install -e .[openenv]` — it transitively
|
| 73 |
+
# downgrades torch via openenv-core, which breaks CUDA on H100/H200.
|
| 74 |
+
# We rely on sys.path (set above) for `import forgeenv`.
|
| 75 |
+
_sh([
|
| 76 |
+
sys.executable, "-m", "pip", "install", "--no-deps",
|
| 77 |
+
"openenv-core>=0.2.0",
|
| 78 |
+
])
|
| 79 |
_sh([
|
| 80 |
sys.executable, "-m", "pip", "install",
|
| 81 |
"trl", "peft", "accelerate", "datasets", "bitsandbytes",
|
| 82 |
"matplotlib", "pyyaml", "nltk", "scikit-learn",
|
| 83 |
+
"fastapi", "uvicorn", "pydantic", "requests",
|
| 84 |
])
|
| 85 |
try:
|
| 86 |
+
# --no-deps is critical: prevents unsloth from pulling in a CPU-only
|
| 87 |
+
# torch wheel that overwrites the uv image's GPU torch.
|
| 88 |
+
_sh([sys.executable, "-m", "pip", "install", "--no-deps", "unsloth", "unsloth-zoo"])
|
| 89 |
except subprocess.CalledProcessError:
|
| 90 |
print("[job] WARN: unsloth install failed — trainer will use plain HF.", flush=True)
|
| 91 |
|
| 92 |
import torch # noqa: E402
|
| 93 |
|
| 94 |
+
print(f"[job] torch: {torch.__version__}", flush=True)
|
| 95 |
print(f"[job] CUDA available: {torch.cuda.is_available()}", flush=True)
|
| 96 |
if torch.cuda.is_available():
|
| 97 |
print(f"[job] GPU: {torch.cuda.get_device_name(0)}", flush=True)
|
|
|
|
| 100 |
f"{torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB",
|
| 101 |
flush=True,
|
| 102 |
)
|
| 103 |
+
else:
|
| 104 |
+
raise SystemExit("[job] FATAL: no CUDA — refusing to run training on CPU.")
|
| 105 |
|
| 106 |
step("2. ping live env Space + verify forgeenv import")
|
| 107 |
import requests # noqa: E402
|