{ "arch_id": "qwen3-next-mtp", "artifact_role": "small-q4-speed-test", "base_trunk": "mlx-community/Qwen3.5-4B-MLX-4bit", "exactness_baseline": { "artifact": "/Users/youssof/.mtplx/qwen35_4b_final_mtp1_gate.json", "gate": "mtp1-greedy-ar-equivalence", "max_tokens": 16, "matches": 1, "scope": "greedy AR and MTP1 generated identical token sequence on the warm_code_continuation prompt", "seed": 0, "status": "passed", "total": 1 }, "mtp_depth_max": 2, "mtp_sidecar": "official Qwen/Qwen3.5-4B MTP tensors; quantization recorded in config.json", "mtplx_version": "0.1.0-preview", "recommended_draft_lm_head": { "bits": 4, "group_size": 64, "mode": "affine" }, "recommended_draft_sampler": { "temperature": 0.6, "top_k": 20, "top_p": 0.95 }, "recommended_profile": "performance-cold", "sampler": { "temperature": 0.6, "top_k": 20, "top_p": 0.95 }, "speed_evidence": { "acceptance_by_depth": [ 0.6666666666666666, 0.2222222222222222 ], "accepted_drafts": 16, "ar_tok_s": 108.4050622912913, "artifact": "/Users/youssof/.mtplx/qwen35_4b_depth_grid_q4_body4_draftt0p6.json", "draft_lm_head": "tied embedding reused as 4-bit affine group64 draft head", "draft_temperature": 0.6, "enable_thinking": false, "generated_tokens": 48, "mtp_depth": 2, "mtp_tok_s": 120.05572596536466, "note": "Qwen3.5-4B measured fastest at depth 2. Depth 3 over-drafts this small native-MTP head and can lose to AR.", "sampler": { "temperature": 0.6, "top_k": 20, "top_p": 0.95 }, "speedup_vs_ar": 1.1074734281575087 }, "verified_on": { "hardware": "Apple Silicon local MTPLX workstation", "machine_arch": "arm64", "model": "Qwen3.5-4B-MTPLX-Optimized-Speed", "timestamp": "2026-05-04T06:17:00+0100" } }