Ashira Pitchayapakayakul committed on
Commit
8aaeb2d
·
1 Parent(s): ec28ba1

v18: simplify hub naming to surrogate-1-{SIZE}B-v1.5 (owner directive)

Browse files

Owner wants the consistent ladder:
axentx/surrogate-1-7B-v1.3-polymath (existing 7B baseline — kept)
axentx/surrogate-1-9B-v1.5 (Qwen3.5-9B test, V18 stack)
axentx/surrogate-1-27B-v1.5 (Qwen3.6-27B if T4x2 fits)
axentx/surrogate-1-4B-v1.5 (Qwen3.5-4B fast iteration)
axentx/surrogate-1-35B-v1.5 (35B-A3B MoE, needs L40S+)

No '-coder' or '-lora' suffix in the path; size detected via regex on the
base model name (Qwen3.5-9B → '9B', Qwen3.6-35B-A3B → '35B' from leading
total-param number for MoE).

Files changed (1) hide show
  1. bin/kaggle-trainer.sh +21 -22
bin/kaggle-trainer.sh CHANGED
@@ -502,28 +502,27 @@ EPOCHS = float(os.environ.get("EPOCHS", "1"))
502
  # v1.1-extended 7B + FULL R1-12 + EXTENDED stack (Kaggle T4×2 — VALIDATED)
503
  # v1.5 14B/32B + winning techniques
504
  # v2 72B magnificent run (Civo $250, far future)
505
- def _base_shortname(hf_path: str) -> str:
506
- """Map HF repo path → short tag for hub-id suffixing.
507
- Examples:
508
- Qwen/Qwen2.5-Coder-7B-Instruct → qwen2.5-coder-7b
509
- Qwen/Qwen3.5-9B → qwen3.5-9b
510
- Qwen/Qwen3.6-27B → qwen3.6-27b
511
- zai-org/glm-4-9b-chat → glm-4-9b
512
- """
513
- tail = hf_path.split("/", 1)[-1].lower()
514
- for kill in ("-instruct", "-chat", "-base", "-fp8", "-int4", "-gptq"):
515
- tail = tail.replace(kill, "")
516
- return tail.replace("--", "-").strip("-")
517
-
518
- _base_short = _base_shortname(BASE)
519
- _default_hub_by_size = {
520
- 32.0: f"axentx/surrogate-1-{_base_short}-lora-v1.6",
521
- 14.0: f"axentx/surrogate-1-{_base_short}-lora-v1.6",
522
- 7.0: f"axentx/surrogate-1-{_base_short}-lora-v1.6",
523
- }
524
- _default_hub = _default_hub_by_size.get(_auto_size,
525
- f"axentx/surrogate-1-{_base_short}-lora-v1.6")
526
- # Backward-compat: keep existing v1.3-polymath path if base is original Qwen2.5-Coder-7B
527
  if BASE == "Qwen/Qwen2.5-Coder-7B-Instruct":
528
  _default_hub = "axentx/surrogate-1-7B-v1.3-polymath"
529
  HUB_ID = os.environ.get("HUB_MODEL_ID", _default_hub)
 
502
  # v1.1-extended 7B + FULL R1-12 + EXTENDED stack (Kaggle T4×2 — VALIDATED)
503
  # v1.5 14B/32B + winning techniques
504
  # v2 72B magnificent run (Civo $250, far future)
505
+ import re as _re_size
506
+ def _detect_base_size(hf_path: str) -> str:
507
+ """Extract param-size tag from model name. Catches 7B, 9B, 27B, 1.5B, etc.
508
+ For MoE the leading total-param number is used (35B-A3B β†’ '35B').
509
+ Returns the matched tag (incl. trailing 'B') or empty string."""
510
+ tail = hf_path.split("/", 1)[-1]
511
+ m = _re_size.search(r"(\d+(?:\.\d+)?B)", tail, _re_size.I)
512
+ return m.group(1).upper() if m else ""
513
+
514
+ # Naming convention (owner directive 2026-05-01):
515
+ # axentx/surrogate-1-{SIZE}B-v{VERSION}[-tag]
516
+ # Examples:
517
+ # Qwen2.5-Coder-7B-Instruct → axentx/surrogate-1-7B-v1.3-polymath (kept; existing baseline)
518
+ # Qwen3.5-9B → axentx/surrogate-1-9B-v1.5
519
+ # Qwen3.6-27B → axentx/surrogate-1-27B-v1.5
520
+ # Qwen3.5-4B → axentx/surrogate-1-4B-v1.5
521
+ # v1.5 = V18 stack (R6 datasets + Phases 78-96 wired). Bump to v1.6+ when
522
+ # specialty DoRA composition or merge recipes finalize.
523
+ _size_tag = _detect_base_size(BASE) or "unknown"
524
+ _default_hub = f"axentx/surrogate-1-{_size_tag}-v1.5"
525
+ # Backward-compat: keep existing v1.3-polymath path for the original Qwen2.5-Coder-7B baseline.
 
526
  if BASE == "Qwen/Qwen2.5-Coder-7B-Instruct":
527
  _default_hub = "axentx/surrogate-1-7B-v1.3-polymath"
528
  HUB_ID = os.environ.get("HUB_MODEL_ID", _default_hub)