Don Rishabh Claude Opus 4.7 (1M context) committed on
Commit
156145e
·
1 Parent(s): 02851f3

Persist training artifacts: upload metrics + plots alongside adapter

Browse files

Before: train_grpo.py only pushed adapter_final/ to the Hub. Plot
rendering happened in a separate shell-script step; train_metrics.jsonl
never left the container. When the job ended, curves were gone.

Now: train_grpo.py renders plots inline (via subprocess on
training/make_plots.py) and uploads adapter + train_metrics.jsonl +
config.json + plots/ to the same Hub model repo. One source of truth
for reproducing the run.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

Files changed (1) hide show
  1. training/train_grpo.py +43 -2
training/train_grpo.py CHANGED
@@ -379,18 +379,59 @@ def main() -> None:
379
  tokenizer.save_pretrained(str(adapter_dir))
380
  print(f"[save] adapter at {adapter_dir}", flush=True)
381
 
382
- # ----- Push to hub -----
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
383
  if args.push_to_hub:
384
  from huggingface_hub import HfApi
385
  api = HfApi()
386
  api.create_repo(args.push_to_hub, exist_ok=True, repo_type="model")
 
 
387
  api.upload_folder(
388
  folder_path=str(adapter_dir),
389
  repo_id=args.push_to_hub,
390
  repo_type="model",
391
  commit_message=f"GRPO adapter, steps={args.max_steps}",
392
  )
393
- print(f"[push] uploaded to https://huggingface.co/{args.push_to_hub}", flush=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
394
 
395
 
396
  if __name__ == "__main__":
 
379
  tokenizer.save_pretrained(str(adapter_dir))
380
  print(f"[save] adapter at {adapter_dir}", flush=True)
381
 
382
+ # ----- Render plots inline so they land in output_dir/plots/ -----
383
+ metrics_path = output_dir / "train_metrics.jsonl"
384
+ plots_dir = output_dir / "plots"
385
+ if metrics_path.exists():
386
+ try:
387
+ import subprocess
388
+ subprocess.run(
389
+ ["python", "-u", "training/make_plots.py",
390
+ "--metrics", str(metrics_path),
391
+ "--out-dir", str(plots_dir)],
392
+ check=False,
393
+ )
394
+ print(f"[plots] rendered to {plots_dir}", flush=True)
395
+ except Exception as e:
396
+ print(f"[plots] render failed: {e}", flush=True)
397
+
398
+ # ----- Push adapter + metrics + plots + config to hub -----
399
  if args.push_to_hub:
400
  from huggingface_hub import HfApi
401
  api = HfApi()
402
  api.create_repo(args.push_to_hub, exist_ok=True, repo_type="model")
403
+
404
+ # 1. Adapter files at repo root (so PeftModel.from_pretrained(repo_id) works)
405
  api.upload_folder(
406
  folder_path=str(adapter_dir),
407
  repo_id=args.push_to_hub,
408
  repo_type="model",
409
  commit_message=f"GRPO adapter, steps={args.max_steps}",
410
  )
411
+
412
+ # 2. Training artifacts (metrics, config) at root
413
+ for fname in ("train_metrics.jsonl", "config.json"):
414
+ src = output_dir / fname
415
+ if src.exists():
416
+ api.upload_file(
417
+ path_or_fileobj=str(src),
418
+ path_in_repo=fname,
419
+ repo_id=args.push_to_hub,
420
+ repo_type="model",
421
+ commit_message=f"upload {fname}",
422
+ )
423
+
424
+ # 3. Plots under plots/
425
+ if plots_dir.exists():
426
+ api.upload_folder(
427
+ folder_path=str(plots_dir),
428
+ repo_id=args.push_to_hub,
429
+ path_in_repo="plots",
430
+ repo_type="model",
431
+ commit_message="training plots",
432
+ )
433
+
434
+ print(f"[push] uploaded adapter + artifacts to https://huggingface.co/{args.push_to_hub}", flush=True)
435
 
436
 
437
  if __name__ == "__main__":