{ "arch_id": "qwen3-next-mtp", "artifact_role": "small-q8-speed-test", "base_trunk": "mlx-community/Qwen3.5-4B-MLX-8bit", "exactness_baseline": { "artifact": "/Users/youssof/.mtplx/qwen35_4b_q8trunk_mtp1_gate.json", "gate": "mtp1-greedy-ar-equivalence", "max_tokens": 16, "matches": 1, "scope": "greedy AR and MTP1 generated identical token sequences on the warm_code_continuation prompt", "seed": 0, "status": "passed", "total": 1 }, "mtp_depth_max": 2, "mtp_sidecar": "official Qwen/Qwen3.5-4B MTP tensors; quantization recorded in config.json", "mtplx_version": "0.1.0-preview", "recommended_draft_lm_head": { "bits": 4, "group_size": 64, "mode": "affine" }, "recommended_draft_sampler": { "temperature": 0.7, "top_k": 20, "top_p": 0.95 }, "recommended_profile": "performance-cold", "sampler": { "temperature": 0.6, "top_k": 20, "top_p": 0.95 }, "speed_evidence": { "acceptance_by_depth": [ 0.7647058823529411, 0.47058823529411764 ], "accepted_drafts": 21, "ar_tok_s": 75.62701983984475, "artifact": "/Users/youssof/.mtplx/qwen35_4b_depth_grid_q8_body4_draftt0p7.json", "draft_lm_head": "8-bit tied embedding requantized as 4-bit affine group64 draft head", "draft_temperature": 0.7, "enable_thinking": false, "generated_tokens": 48, "mtp_depth": 2, "mtp_tok_s": 105.20912101297236, "note": "The Q8 trunk gives the best multiplier in the local one-prompt matrix, but the Q4 trunk remains faster in absolute tok/s.", "sampler": { "temperature": 0.6, "top_k": 20, "top_p": 0.95 }, "speedup_vs_ar": 1.391157832686963 }, "verified_on": { "hardware": "Apple Silicon local MTPLX workstation", "machine_arch": "arm64", "model": "Qwen3.5-4B-MTPLX-Optimized-Speed-Q8Trunk", "timestamp": "2026-05-04T06:17:00+0100" } }