v18: add Qwen3.5/3.6 aliases (2026-04 releases) — Qwen3.5-9B = sweet spot for T4x2
HF API audit 2026-04-30 confirmed the Qwen3.5 (Apr-23) and Qwen3.6 (Apr-24)
families are live. User flagged that earlier base-swap recommendations
defaulted to Qwen2.5 without surfacing the Apr-2026 options.
Reality on T4x2, 16GB/card (the V5 OOM trace shows 14B already at the edge; back-of-envelope math after the list):
- Qwen3.6-35B-A3B / Qwen3.6-27B → tight to OOM, MoE/dense both ≥13.5GB
- Qwen3.5-35B-A3B / Qwen3.5-27B → same range
- Qwen3.5-9B (dense) → ~4.5GB at 4-bit, comfortable
- Qwen3.5-4B (dense) → trivial
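The VRAM figures above are straight 0.5-bytes-per-parameter arithmetic for 4-bit weights (weights only; KV cache, activations, and optimizer state come on top). A minimal sketch of that arithmetic, with an illustrative helper name that is not part of kaggle-trainer.sh:

```python
# 4-bit quantized weights cost ~0.5 bytes/param, i.e. 0.5 GB per billion params.
# Hypothetical helper for illustration only.
def weights_gb_4bit(params_billion: float) -> float:
    return params_billion * 0.5

print(weights_gb_4bit(9.0))   # 4.5  -> Qwen3.5-9B fits a 16GB T4 comfortably
print(weights_gb_4bit(27.0))  # 13.5 -> 27B dense is already at the 16GB edge
```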
New aliases (set via BASE_MODEL env):
- qwen3.5-4b / qwen3.5-9b / qwen3.5-27b / qwen3.5-27b-int4 / qwen3.5-35b-a3b
- qwen3.5-122b-a10b / qwen3.5-397b-a17b
- qwen3.6-27b / qwen3.6-27b-fp8 / qwen3.6-35b-a3b / qwen3.6-35b-fp8
- qwen3-coder-next / qwen3-coder-480b
Auto-pick stays on Qwen2.5-Coder-7B-Instruct because the existing
axentx/surrogate-1-* HF datasets were distilled with that tokenizer;
flipping the auto-default would silently re-tokenize and invalidate the v1
baseline comparison. The override is a one-line BASE_MODEL=qwen3.5-9b,
sketched below.
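Resolution is a single dict lookup with pass-through for full repo paths. A minimal sketch, assuming _BASE_ALIASES and the auto-picked default are in scope (the function name and fallback wiring are illustrative; the real logic lives in bin/kaggle-trainer.sh):

```python
import os

# Sketch: BASE_MODEL may be a short alias (resolved via the dict) or a full
# HF repo path (passed through unchanged); unset falls back to the hardware
# auto-pick. resolve_base_model is a hypothetical name for illustration.
def resolve_base_model(aliases: dict, auto_base: str) -> str:
    requested = os.environ.get("BASE_MODEL", "").strip()
    if not requested:
        return auto_base                      # Qwen/Qwen2.5-Coder-7B-Instruct on T4x2
    return aliases.get(requested, requested)  # alias hit, or verbatim repo path
```

With that wiring, BASE_MODEL=qwen3.5-9b resolves to Qwen/Qwen3.5-9B for the new-model test path while every other run keeps the tokenizer-matched default.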
- bin/kaggle-trainer.sh +38 -10
@@ -416,6 +416,13 @@ def pick_base_for_hardware():
     # 32B 4-bit (16GB) — needs per-GPU ≥30GB (forward+backward eats 2× weights)
     # 14B 4-bit (7GB) — needs per-GPU ≥22GB safely (V5 OOM proof at 16GB)
     # 7B 4-bit (3.5GB) — fits T4 16GB with margin (~12GB peak)
+    # V18 update (2026-04-30): Qwen3.5-9B (released 2026-04-23) is the
+    # newest dense Qwen that fits T4x2 16GB/card at 4-bit (~4.5GB weights).
+    # Qwen3.6 family is 27B+ — too big for T4x2. Auto-pick stays on
+    # Qwen2.5-Coder-7B-Instruct because the existing axentx/surrogate-1-*
+    # corpus was distilled with that tokenizer; switching auto-default
+    # would silently re-tokenize all data and invalidate v1 baseline.
+    # Set BASE_MODEL=qwen3.5-9b explicitly for the new-model test path.
     if per_gpu_gb >= 30:
         return "Qwen/Qwen2.5-Coder-32B-Instruct", 32.0
     if per_gpu_gb >= 22:
@@ -425,24 +432,45 @@
 
 _auto_base, _auto_size = pick_base_for_hardware()
 _BASE_ALIASES = {
-    # short-name → real HF path
-    #
-    #
-
+    # short-name → real HF path. BASE_MODEL accepts either the alias OR the
+    # full HF repo path; aliases are resolved here, anything else passes
+    # through unchanged. Audit dates: 2026-04-30 HF API sweep.
+    #
+    # Qwen2.5-Coder family (current corpus tokenizer match)
+    "qwen-coder-7b": "Qwen/Qwen2.5-Coder-7B-Instruct",         # Kaggle default
     "qwen-coder-14b": "Qwen/Qwen2.5-Coder-14B-Instruct",
     "qwen-coder-32b": "Qwen/Qwen2.5-Coder-32B-Instruct",
-
-    "olmoe-1b-7b": "allenai/OLMoE-1B-7B-0924-Instruct",
-    "qwen3-coder-7b": "Qwen/Qwen3-Coder-7B-Instruct",
-    "qwen3-coder-30b": "Qwen/Qwen3-Coder-30B-A3B-Instruct",
+    # Qwen3 dense Instruct (250k vocab, native Thai BPE)
     "qwen3-7b-instruct": "Qwen/Qwen3-7B-Instruct",
     "qwen3-8b-instruct": "Qwen/Qwen3-8B-Instruct",
+    # Qwen3-Coder family
+    "qwen3-coder-7b": "Qwen/Qwen3-Coder-7B-Instruct",
+    "qwen3-coder-30b": "Qwen/Qwen3-Coder-30B-A3B-Instruct",    # MoE 3B-active
+    "qwen3-coder-next": "Qwen/Qwen3-Coder-Next",               # 80B-A3B Next
+    "qwen3-coder-480b": "Qwen/Qwen3-Coder-480B-A35B-Instruct", # frontier MoE
+    # Qwen3.5 series (2026-04-23 release, "newest that fits T4x2")
+    "qwen3.5-4b": "Qwen/Qwen3.5-4B",                           # ✅ T4x2 trivial
+    "qwen3.5-9b": "Qwen/Qwen3.5-9B",                           # ✅ T4x2 sweet spot
+    "qwen3.5-27b": "Qwen/Qwen3.5-27B",                         # ⚠️ T4x2 risky
+    "qwen3.5-27b-int4": "Qwen/Qwen3.5-27B-GPTQ-Int4",          # pre-quant T4x2 OK
+    "qwen3.5-35b-a3b": "Qwen/Qwen3.5-35B-A3B",                 # MoE → L40S+
+    "qwen3.5-122b-a10b": "Qwen/Qwen3.5-122B-A10B",             # MoE → H100
+    "qwen3.5-397b-a17b": "Qwen/Qwen3.5-397B-A17B",             # frontier MoE
+    # Qwen3.6 series (2026-04-24 release, NEWEST — only 27B+ available)
+    "qwen3.6-27b": "Qwen/Qwen3.6-27B",                         # ⚠️ T4x2 risky
+    "qwen3.6-27b-fp8": "Qwen/Qwen3.6-27B-FP8",                 # FP8 inference
+    "qwen3.6-35b-a3b": "Qwen/Qwen3.6-35B-A3B",                 # MoE → L40S+
+    "qwen3.6-35b-fp8": "Qwen/Qwen3.6-35B-A3B-FP8",
+    # Other 7-9B class
+    "granite-4.1-8b": "ibm-granite/granite-4.1-8B-base",
+    "olmoe-1b-7b": "allenai/OLMoE-1B-7B-0924-Instruct",
+    # GLM family (template/tokenizer validation toward V19)
     "glm-4-9b-chat": "zai-org/glm-4-9b-chat",
     "glm-4-9b-chat-1m": "zai-org/glm-4-9b-chat-1m",
     "glm-4.1v-9b-think": "zai-org/GLM-4.1V-9B-Thinking",
     "glm-4.7-flash": "zai-org/GLM-4.7-Flash",
+    "glm-4.5-air": "zai-org/GLM-4.5-Air-Base",                 # ❌ T4x2
+    "glm-5": "zai-org/GLM-5",                                  # ❌ V19 only
     "glm-5.1": "zai-org/GLM-5.1",
     "glm-5.1-fp8": "zai-org/GLM-5.1-FP8",
 }
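For reference, the per_gpu_gb value those thresholds compare against is computed outside this hunk; a probe equivalent to the sketch below (illustrative name, not the script's actual helper) reports a bit under 16 GiB per Kaggle T4, so neither the ≥30GB nor the ≥22GB branch fires and auto-pick lands on the 7B default:

```python
import torch

def probe_per_gpu_gb() -> float:
    """Illustrative VRAM probe; the trainer's real detection may differ."""
    if not torch.cuda.is_available():
        return 0.0
    return torch.cuda.get_device_properties(0).total_memory / 1024**3

# A Kaggle T4x2 reports just under 16 GiB per card, so on that hardware
# pick_base_for_hardware() falls through to Qwen/Qwen2.5-Coder-7B-Instruct.
```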