akhiilll committed on
Commit
f17aac5
·
verified ·
1 Parent(s): 8be8ee0

forgeenv source snapshot for training job

Browse files
Files changed (1) hide show
  1. scripts/jobs/train_repair_agent.py +4 -2
scripts/jobs/train_repair_agent.py CHANGED
@@ -71,11 +71,13 @@ sys.path.insert(0, str(src_dir))
71
  step("1. pin torch (cu124) + install GPU-stable deps")
72
  # Force a CUDA 12.4 torch wheel BEFORE anything else so other packages'
73
  # resolvers don't pull a cu130 wheel that mismatches the host driver
74
- # (this is what causes "Error 802: system not yet initialized" on H200).
 
 
75
  _sh([
76
  sys.executable, "-m", "pip", "install",
77
  "--index-url", "https://download.pytorch.org/whl/cu124",
78
- "torch==2.5.1", "torchvision==0.20.1",
79
  ])
80
  # `--no-deps` on openenv-core: it pins a different transformers/torch
81
  # stack that we don't want. We still need its *runtime* imports:
 
71
  step("1. pin torch (cu124) + install GPU-stable deps")
72
  # Force a CUDA 12.4 torch wheel BEFORE anything else so other packages'
73
  # resolvers don't pull a cu130 wheel that mismatches the host driver
74
+ # (Error 802 on some HF Job flavors). TRL 1.2+ imports ``FSDPModule`` from
75
+ # ``torch.distributed.fsdp``, which exists only in PyTorch >= 2.6 — do not
76
+ # pin to 2.5.x.
77
  _sh([
78
  sys.executable, "-m", "pip", "install",
79
  "--index-url", "https://download.pytorch.org/whl/cu124",
80
+ "torch==2.6.0", "torchvision==0.21.0",
81
  ])
82
  # `--no-deps` on openenv-core: it pins a different transformers/torch
83
  # stack that we don't want. We still need its *runtime* imports: