anugrah55 committed on
Commit
4fc573b
·
verified ·
1 Parent(s): 1db8346

Update CERNenv Space

Browse files
space/training/app.py CHANGED
@@ -269,7 +269,9 @@ def _training_pipeline(config: Dict[str, Any]) -> None:
269
  log,
270
  )
271
  if rc != 0:
272
- raise RuntimeError(f"pre-train eval failed (rc={rc})")
 
 
273
 
274
  log.write(f"\n--- GRPO training ({config['num_gpus']} GPU process(es)) ---\n")
275
  log.flush()
@@ -293,7 +295,8 @@ def _training_pipeline(config: Dict[str, Any]) -> None:
293
  log,
294
  )
295
  if rc != 0:
296
- raise RuntimeError(f"post-train eval failed (rc={rc})")
 
297
 
298
  log.write("\n--- evidence: before/after summary, distribution, trajectories ---\n")
299
  log.flush()
 
269
  log,
270
  )
271
  if rc != 0:
272
+ # Don't abort: we still want training + post-eval evidence.
273
+ log.write(f"\n[warn] pre-train eval failed (rc={rc}); continuing without baseline\n")
274
+ log.flush()
275
 
276
  log.write(f"\n--- GRPO training ({config['num_gpus']} GPU process(es)) ---\n")
277
  log.flush()
 
295
  log,
296
  )
297
  if rc != 0:
298
+ log.write(f"\n[warn] post-train eval failed (rc={rc}); evidence will be partial\n")
299
+ log.flush()
300
 
301
  log.write("\n--- evidence: before/after summary, distribution, trajectories ---\n")
302
  log.flush()
space/training/requirements.txt CHANGED
@@ -1,14 +1,20 @@
1
  --extra-index-url https://download.pytorch.org/whl/cu121
2
- torch==2.4.0
3
- unsloth
4
- unsloth_zoo
5
- transformers>=4.44.0
6
- trl>=0.9.0
7
- peft>=0.10.0
8
- accelerate>=1.0.0
9
- vllm>=0.5.0
10
- datasets>=2.18.0
11
- bitsandbytes>=0.43.0
 
 
 
 
 
 
12
  matplotlib>=3.8.0
13
  numpy>=1.24.0
14
  scipy>=1.10.0
 
1
  --extra-index-url https://download.pytorch.org/whl/cu121
2
+ # Pin a torch + torchao + transformers triple known to work with Unsloth on
3
+ # CUDA 12.1. We avoid newer transformers/torchao because they require
4
+ # torch>=2.6 (torch.int1 added in 2.6); we pin torch 2.5.1 which is the
5
+ # latest stable cu121 wheel that pairs cleanly with Unsloth's kernels.
6
+ torch==2.5.1
7
+ torchvision==0.20.1
8
+ torchaudio==2.5.1
9
+ torchao==0.7.0
10
+ transformers==4.46.3
11
+ trl==0.12.2
12
+ peft==0.13.2
13
+ accelerate==1.1.1
14
+ datasets==3.1.0
15
+ bitsandbytes==0.44.1
16
+ unsloth==2024.12.4
17
+ unsloth_zoo==2024.12.7
18
  matplotlib>=3.8.0
19
  numpy>=1.24.0
20
  scipy>=1.10.0