akhiilll committed on
Commit
cdeb5bc
·
verified ·
1 Parent(s): 2cf3915

forgeenv source snapshot for training job

Browse files
Files changed (1) hide show
  1. scripts/jobs/train_repair_agent.py +15 -4
scripts/jobs/train_repair_agent.py CHANGED
@@ -68,21 +68,30 @@ _sh([
68
  # venv. We still run pip install for any setuptools side-effects.
69
  sys.path.insert(0, str(src_dir))
70
 
71
- step("1. pip install + verify GPU")
72
- _sh([sys.executable, "-m", "pip", "install", "-e", f"{src_dir}[openenv]"])
 
 
 
 
 
 
73
  _sh([
74
  sys.executable, "-m", "pip", "install",
75
  "trl", "peft", "accelerate", "datasets", "bitsandbytes",
76
  "matplotlib", "pyyaml", "nltk", "scikit-learn",
77
- "openenv-core>=0.2.0",
78
  ])
79
  try:
80
- _sh([sys.executable, "-m", "pip", "install", "unsloth"])
 
 
81
  except subprocess.CalledProcessError:
82
  print("[job] WARN: unsloth install failed — trainer will use plain HF.", flush=True)
83
 
84
  import torch # noqa: E402
85
 
 
86
  print(f"[job] CUDA available: {torch.cuda.is_available()}", flush=True)
87
  if torch.cuda.is_available():
88
  print(f"[job] GPU: {torch.cuda.get_device_name(0)}", flush=True)
@@ -91,6 +100,8 @@ if torch.cuda.is_available():
91
  f"{torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB",
92
  flush=True,
93
  )
 
 
94
 
95
  step("2. ping live env Space + verify forgeenv import")
96
  import requests # noqa: E402
 
68
  # venv. We still run pip install for any setuptools side-effects.
69
  sys.path.insert(0, str(src_dir))
70
 
71
+ step("1. pip install (no torch/transformers churn) + verify GPU")
72
+ # IMPORTANT: do NOT run `pip install -e .[openenv]` — it transitively
73
+ # downgrades torch via openenv-core, which breaks CUDA on H100/H200.
74
+ # We rely on sys.path (set above) for `import forgeenv`.
75
+ _sh([
76
+ sys.executable, "-m", "pip", "install", "--no-deps",
77
+ "openenv-core>=0.2.0",
78
+ ])
79
  _sh([
80
  sys.executable, "-m", "pip", "install",
81
  "trl", "peft", "accelerate", "datasets", "bitsandbytes",
82
  "matplotlib", "pyyaml", "nltk", "scikit-learn",
83
+ "fastapi", "uvicorn", "pydantic", "requests",
84
  ])
85
  try:
86
+ # --no-deps is critical: prevents unsloth from pulling in a CPU-only
87
+ # torch wheel that overwrites the uv image's GPU torch.
88
+ _sh([sys.executable, "-m", "pip", "install", "--no-deps", "unsloth", "unsloth-zoo"])
89
  except subprocess.CalledProcessError:
90
  print("[job] WARN: unsloth install failed — trainer will use plain HF.", flush=True)
91
 
92
  import torch # noqa: E402
93
 
94
+ print(f"[job] torch: {torch.__version__}", flush=True)
95
  print(f"[job] CUDA available: {torch.cuda.is_available()}", flush=True)
96
  if torch.cuda.is_available():
97
  print(f"[job] GPU: {torch.cuda.get_device_name(0)}", flush=True)
 
100
  f"{torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB",
101
  flush=True,
102
  )
103
+ else:
104
+ raise SystemExit("[job] FATAL: no CUDA — refusing to run training on CPU.")
105
 
106
  step("2. ping live env Space + verify forgeenv import")
107
  import requests # noqa: E402