Ashira Pitchayapakayakul committed on
Commit
ec28ba1
·
1 Parent(s): 4e9d4f7

v18: HUB_ID base-aware + T4 FP8 guard

Browse files

Two compat fixes for base-model swap:

1. HUB_MODEL_ID was hardcoded by detected SIZE (7B → v1.3-polymath path).
Setting BASE_MODEL=qwen3.5-9b would silently overwrite the v1.3 Qwen2.5
adapter on Hub. Now derives a short base tag and embeds it in the path:
Qwen2.5-Coder-7B-Instruct → axentx/surrogate-1-7B-v1.3-polymath (kept)
Qwen3.5-9B → axentx/surrogate-1-qwen3.5-9b-lora-v1.6
Qwen3.6-27B → axentx/surrogate-1-qwen3.6-27b-lora-v1.6

2. T4 (SM 7.5) cannot run FP8. Pre-quantized FP8 bases (e.g. Qwen3.6-27B-FP8)
crash on load or dequant silently. Trainer now detects SM<9 + 'fp8' in
path and drops the -FP8 suffix to load raw BF16 weights instead.

Files changed (1) hide show
  1. bin/kaggle-trainer.sh +43 -9
bin/kaggle-trainer.sh CHANGED
@@ -478,20 +478,54 @@ _user_base = os.environ.get("BASE_MODEL", _auto_base)
478
  BASE = _BASE_ALIASES.get(_user_base, _user_base) # alias OR full HF path
479
  if _user_base != BASE:
480
  print(f" resolved BASE_MODEL alias '{_user_base}' → '{BASE}'")
 
 
 
 
 
 
 
 
 
 
 
 
481
  MAX_SAMPLES = int(os.environ.get("MAX_SAMPLES", "100000"))
482
  EPOCHS = float(os.environ.get("EPOCHS", "1"))
483
 
484
- # HUB_MODEL_ID auto-suffixes by detected size unless explicitly set.
485
- # Strategy ladder per user 2026-05-01:
 
 
 
486
  # v1 7B + minimal LoRA (existing baseline, on Hub)
487
- # v1.1-extended 7B + FULL R1-12 + EXTENDED stack (Kaggle T4×2 — VALIDATE)
488
- # v1.5 14B/32B + winning techniques (after v1.1-extended validates)
489
  # v2 72B magnificent run (Civo $250, far future)
490
- _default_hub = {
491
- 32.0: "axentx/surrogate-1-coder-32B-v1.5",
492
- 14.0: "axentx/surrogate-1-coder-14B-v1.5-mid",
493
- 7.0: "axentx/surrogate-1-7B-v1.3-polymath", # ← V11: full ingest + TruthRL
494
- }.get(_auto_size, "axentx/surrogate-1-7B-v1.3-polymath")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
495
  HUB_ID = os.environ.get("HUB_MODEL_ID", _default_hub)
496
  # seq_len auto-shrinks for smaller hardware budget
497
  _default_seq = {32.0: 2048, 14.0: 4096, 7.0: 8192}.get(_auto_size, 2048)
 
478
  BASE = _BASE_ALIASES.get(_user_base, _user_base) # alias OR full HF path
479
  if _user_base != BASE:
480
  print(f" resolved BASE_MODEL alias '{_user_base}' → '{BASE}'")
481
+
482
+ # V18 hardware-vs-base sanity check. T4 (SM 7.5) cannot execute FP8 ops,
483
+ # so any pre-quantized FP8 base will either crash on load or silently
484
+ # dequantize to BF16 with severe perf penalty. GPTQ-Int4 is fine on T4.
485
+ if torch.cuda.is_available():
486
+ _sm = torch.cuda.get_device_capability(0)
487
+ if _sm[0] < 9 and "fp8" in BASE.lower():
488
+ print(f" ⚠ FP8 base '{BASE}' on SM {_sm[0]}.{_sm[1]} (T4=7.5, A100=8.0).")
489
+ print(f" FP8 needs Hopper (H100, SM 9.0) or Ada (L40, SM 8.9).")
490
+ print(f" Dropping '-FP8' suffix and loading raw BF16 weights instead.")
491
+ _alt = BASE.replace("-FP8", "").replace("-fp8", "")
492
+ BASE = _alt
493
  MAX_SAMPLES = int(os.environ.get("MAX_SAMPLES", "100000"))
494
  EPOCHS = float(os.environ.get("EPOCHS", "1"))
495
 
496
+ # HUB_MODEL_ID auto-suffixes by detected size + base family unless explicitly set.
497
+ # V18 fix (2026-04-30): adapters from different bases CANNOT load on each other
498
+ # (LoRA shapes are arch-locked). Embedding the base family in the hub path
499
+ # prevents Qwen3.5-9B run from overwriting Qwen2.5-Coder-7B v1.3 baseline.
500
+ # Strategy ladder per owner 2026-05-01:
501
  # v1 7B + minimal LoRA (existing baseline, on Hub)
502
+ # v1.1-extended 7B + FULL R1-12 + EXTENDED stack (Kaggle T4×2 — VALIDATED)
503
+ # v1.5 14B/32B + winning techniques
504
  # v2 72B magnificent run (Civo $250, far future)
505
+ def _base_shortname(hf_path: str) -> str:
506
+ """Map HF repo path → short tag for hub-id suffixing.
507
+ Examples:
508
+ Qwen/Qwen2.5-Coder-7B-Instruct → qwen2.5-coder-7b
509
+ Qwen/Qwen3.5-9B → qwen3.5-9b
510
+ Qwen/Qwen3.6-27B → qwen3.6-27b
511
+ zai-org/glm-4-9b-chat → glm-4-9b
512
+ """
513
+ tail = hf_path.split("/", 1)[-1].lower()
514
+ for kill in ("-instruct", "-chat", "-base", "-fp8", "-int4", "-gptq"):
515
+ tail = tail.replace(kill, "")
516
+ return tail.replace("--", "-").strip("-")
517
+
518
+ _base_short = _base_shortname(BASE)
519
+ _default_hub_by_size = {
520
+ 32.0: f"axentx/surrogate-1-{_base_short}-lora-v1.6",
521
+ 14.0: f"axentx/surrogate-1-{_base_short}-lora-v1.6",
522
+ 7.0: f"axentx/surrogate-1-{_base_short}-lora-v1.6",
523
+ }
524
+ _default_hub = _default_hub_by_size.get(_auto_size,
525
+ f"axentx/surrogate-1-{_base_short}-lora-v1.6")
526
+ # Backward-compat: keep existing v1.3-polymath path if base is original Qwen2.5-Coder-7B
527
+ if BASE == "Qwen/Qwen2.5-Coder-7B-Instruct":
528
+ _default_hub = "axentx/surrogate-1-7B-v1.3-polymath"
529
  HUB_ID = os.environ.get("HUB_MODEL_ID", _default_hub)
530
  # seq_len auto-shrinks for smaller hardware budget
531
  _default_seq = {32.0: 2048, 14.0: 4096, 7.0: 8192}.get(_auto_size, 2048)