Ashira Pitchayapakayakul committed on
Commit
ec28ba1
·
1 Parent(s): 4e9d4f7

v18: HUB_ID base-aware + T4 FP8 guard

Browse files

Two compat fixes for base-model swap:

1. HUB_MODEL_ID was hardcoded by detected SIZE (7B → v1.3-polymath path).
Setting BASE_MODEL=qwen3.5-9b would silently overwrite the v1.3 Qwen2.5
adapter on Hub. Now derives a short base tag and embeds it in the path:
Qwen2.5-Coder-7B-Instruct → axentx/surrogate-1-7B-v1.3-polymath (kept)
Qwen3.5-9B → axentx/surrogate-1-qwen3.5-9b-lora-v1.6
Qwen3.6-27B → axentx/surrogate-1-qwen3.6-27b-lora-v1.6

2. T4 (SM 7.5) cannot run FP8. Pre-quantized FP8 bases (e.g. Qwen3.6-27B-FP8)
crash on load or dequant silently. Trainer now detects SM<9 + 'fp8' in
path and drops the -FP8 suffix to load raw BF16 weights instead.

Files changed (1) hide show
  1. bin/kaggle-trainer.sh +43 -9
bin/kaggle-trainer.sh CHANGED
@@ -478,20 +478,54 @@ _user_base = os.environ.get("BASE_MODEL", _auto_base)
478
  BASE = _BASE_ALIASES.get(_user_base, _user_base) # alias OR full HF path
479
  if _user_base != BASE:
480
  print(f" resolved BASE_MODEL alias '{_user_base}' → '{BASE}'")
 
 
 
 
 
 
 
 
 
 
 
 
481
  MAX_SAMPLES = int(os.environ.get("MAX_SAMPLES", "100000"))
482
  EPOCHS = float(os.environ.get("EPOCHS", "1"))
483
 
484
- # HUB_MODEL_ID auto-suffixes by detected size unless explicitly set.
485
- # Strategy ladder per user 2026-05-01:
 
 
 
486
  # v1 7B + minimal LoRA (existing baseline, on Hub)
487
- # v1.1-extended 7B + FULL R1-12 + EXTENDED stack (Kaggle T4×2 — VALIDATE)
488
- # v1.5 14B/32B + winning techniques (after v1.1-extended validates)
489
  # v2 72B magnificent run (Civo $250, far future)
490
- _default_hub = {
491
- 32.0: "axentx/surrogate-1-coder-32B-v1.5",
492
- 14.0: "axentx/surrogate-1-coder-14B-v1.5-mid",
493
- 7.0: "axentx/surrogate-1-7B-v1.3-polymath", # ← V11: full ingest + TruthRL
494
- }.get(_auto_size, "axentx/surrogate-1-7B-v1.3-polymath")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
495
  HUB_ID = os.environ.get("HUB_MODEL_ID", _default_hub)
496
  # seq_len auto-shrinks for smaller hardware budget
497
  _default_seq = {32.0: 2048, 14.0: 4096, 7.0: 8192}.get(_auto_size, 2048)
 
478
  BASE = _BASE_ALIASES.get(_user_base, _user_base) # alias OR full HF path
479
  if _user_base != BASE:
480
  print(f" resolved BASE_MODEL alias '{_user_base}' → '{BASE}'")
481
+
482
+ # V18 hardware-vs-base sanity check. T4 (SM 7.5) cannot execute FP8 ops,
483
+ # so any pre-quantized FP8 base will either crash on load or silently
484
+ # dequantize to BF16 with severe perf penalty. GPTQ-Int4 is fine on T4.
485
+ if torch.cuda.is_available():
486
+ _sm = torch.cuda.get_device_capability(0)
487
+ if _sm[0] < 9 and "fp8" in BASE.lower():
488
+ print(f" ⚠ FP8 base '{BASE}' on SM {_sm[0]}.{_sm[1]} (T4=7.5, A100=8.0).")
489
+ print(f" FP8 needs Hopper (H100, SM 9.0) or Ada (L40, SM 8.9).")
490
+ print(f" Dropping '-FP8' suffix and loading raw BF16 weights instead.")
491
+ _alt = BASE.replace("-FP8", "").replace("-fp8", "")
492
+ BASE = _alt
493
  MAX_SAMPLES = int(os.environ.get("MAX_SAMPLES", "100000"))
494
  EPOCHS = float(os.environ.get("EPOCHS", "1"))
495
 
496
+ # HUB_MODEL_ID auto-suffixes by detected size + base family unless explicitly set.
497
+ # V18 fix (2026-04-30): adapters from different bases CANNOT load on each other
498
+ # (LoRA shapes are arch-locked). Embedding the base family in the hub path
499
+ # prevents Qwen3.5-9B run from overwriting Qwen2.5-Coder-7B v1.3 baseline.
500
+ # Strategy ladder per owner 2026-05-01:
501
  # v1 7B + minimal LoRA (existing baseline, on Hub)
502
+ # v1.1-extended 7B + FULL R1-12 + EXTENDED stack (Kaggle T4×2 — VALIDATED)
503
+ # v1.5 14B/32B + winning techniques
504
  # v2 72B magnificent run (Civo $250, far future)
505
+ def _base_shortname(hf_path: str) -> str:
506
+ """Map HF repo path → short tag for hub-id suffixing.
507
+ Examples:
508
+ Qwen/Qwen2.5-Coder-7B-Instruct → qwen2.5-coder-7b
509
+ Qwen/Qwen3.5-9B → qwen3.5-9b
510
+ Qwen/Qwen3.6-27B → qwen3.6-27b
511
+ zai-org/glm-4-9b-chat → glm-4-9b
512
+ """
513
+ tail = hf_path.split("/", 1)[-1].lower()
514
+ for kill in ("-instruct", "-chat", "-base", "-fp8", "-int4", "-gptq"):
515
+ tail = tail.replace(kill, "")
516
+ return tail.replace("--", "-").strip("-")
517
+
518
+ _base_short = _base_shortname(BASE)
519
+ _default_hub_by_size = {
520
+ 32.0: f"axentx/surrogate-1-{_base_short}-lora-v1.6",
521
+ 14.0: f"axentx/surrogate-1-{_base_short}-lora-v1.6",
522
+ 7.0: f"axentx/surrogate-1-{_base_short}-lora-v1.6",
523
+ }
524
+ _default_hub = _default_hub_by_size.get(_auto_size,
525
+ f"axentx/surrogate-1-{_base_short}-lora-v1.6")
526
+ # Backward-compat: keep existing v1.3-polymath path if base is original Qwen2.5-Coder-7B
527
+ if BASE == "Qwen/Qwen2.5-Coder-7B-Instruct":
528
+ _default_hub = "axentx/surrogate-1-7B-v1.3-polymath"
529
  HUB_ID = os.environ.get("HUB_MODEL_ID", _default_hub)
530
  # seq_len auto-shrinks for smaller hardware budget
531
  _default_seq = {32.0: 2048, 14.0: 4096, 7.0: 8192}.get(_auto_size, 2048)