forgeenv source snapshot for training job
Browse files
scripts/jobs/train_repair_agent.py
CHANGED
|
@@ -71,11 +71,13 @@ sys.path.insert(0, str(src_dir))
|
|
| 71 |
step("1. pin torch (cu124) + install GPU-stable deps")
|
| 72 |
# Force a CUDA 12.4 torch wheel BEFORE anything else so other packages'
|
| 73 |
# resolvers don't pull a cu130 wheel that mismatches the host driver
|
| 74 |
-
# (
|
|
|
|
|
|
|
| 75 |
_sh([
|
| 76 |
sys.executable, "-m", "pip", "install",
|
| 77 |
"--index-url", "https://download.pytorch.org/whl/cu124",
|
| 78 |
-
"torch==2.
|
| 79 |
])
|
| 80 |
# `--no-deps` on openenv-core: it pins a different transformers/torch
|
| 81 |
# stack that we don't want. We still need its *runtime* imports:
|
|
|
|
| 71 |
step("1. pin torch (cu124) + install GPU-stable deps")
|
| 72 |
# Force a CUDA 12.4 torch wheel BEFORE anything else so other packages'
|
| 73 |
# resolvers don't pull a cu130 wheel that mismatches the host driver
|
| 74 |
+
# (Error 802 on some HF Job flavors). TRL 1.2+ imports ``FSDPModule`` from
|
| 75 |
+
# ``torch.distributed.fsdp``, which exists only in PyTorch >= 2.6 — do not
|
| 76 |
+
# pin to 2.5.x.
|
| 77 |
_sh([
|
| 78 |
sys.executable, "-m", "pip", "install",
|
| 79 |
"--index-url", "https://download.pytorch.org/whl/cu124",
|
| 80 |
+
"torch==2.6.0", "torchvision==0.21.0",
|
| 81 |
])
|
| 82 |
# `--no-deps` on openenv-core: it pins a different transformers/torch
|
| 83 |
# stack that we don't want. We still need its *runtime* imports:
|