OpenTransformer committed (verified)
Commit 570ff77 · Parent(s): 165bcc5

perf: maddubs kernel + nrc=4 multi-row for Q1_0_g128 (3.5-3.75 t/s)
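Editor's note: the "maddubs kernel" in the message rests on the 1-bit dot-product identity documented in the new code below: for weights w ∈ {−1,+1} stored as bits b (w = 2b−1), dot(w, a) = 2·Σ(a where b=1) − Σ(a), so one masked sum plus one total sum replace any per-element sign flipping. A minimal scalar sketch of that identity, not part of the commit (dot_ref and dot_identity are hypothetical names):

// Scalar check of the identity behind the maddubs kernel (illustrative only).
#include <assert.h>
#include <stdint.h>

// Reference: w[i] = bit ? +1 : -1, 32 weights packed into 4 bytes.
static int dot_ref(const uint8_t bits[4], const int8_t a[32]) {
    int sum = 0;
    for (int i = 0; i < 32; ++i) {
        const int w = ((bits[i / 8] >> (i % 8)) & 1) ? 1 : -1;
        sum += w * a[i];
    }
    return sum;
}

// Identity form: dot(w, a) = 2 * (sum of a where bit = 1) - (sum of all a).
static int dot_identity(const uint8_t bits[4], const int8_t a[32]) {
    int masked = 0, total = 0;
    for (int i = 0; i < 32; ++i) {
        masked += ((bits[i / 8] >> (i % 8)) & 1) * a[i];  // 0/1 "selector" product
        total  += a[i];
    }
    return 2 * masked - total;
}

int main(void) {
    const uint8_t bits[4] = { 0xEF, 0xBE, 0xAD, 0xDE };  // 0xDEADBEEF, little-endian
    int8_t a[32];
    for (int i = 0; i < 32; ++i) a[i] = (int8_t)(i * 7 - 100);
    assert(dot_ref(bits, a) == dot_identity(bits, a));
    return 0;
}

Because the selector is 0/1 rather than ±1, the vector version can feed it straight to _mm256_maddubs_epi16 (unsigned × signed): each 16-bit pair sum stays well within int16 range, so no saturation can occur.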
ggml/src/ggml-cpu/arch/x86/quants.c CHANGED
@@ -65,56 +65,52 @@ static inline int hsum_i32_4(const __m128i a) {
     return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32));
 }
 
-#if defined(__AVX2__)
-    // AVX2: single-pass byte-level processing, fully unrolled k-loop.
-    // Pipeline: broadcast+shuffle -> AND+cmpeq -> XOR+SUB -> maddubs+madd -> cvt+fma
-    const __m256i ones_8  = _mm256_set1_epi8(1);
-    const __m256i ones_16 = _mm256_set1_epi16(1);
-    const __m256i byte_shuf = _mm256_setr_epi8(
-        0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1,
-        2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3);
-    const __m256i bit_masks = _mm256_setr_epi8(
-        1,2,4,8,16,32,64,-128, 1,2,4,8,16,32,64,-128,
-        1,2,4,8,16,32,64,-128, 1,2,4,8,16,32,64,-128);
-    const __m256i zero = _mm256_setzero_si256();
-    __m256 acc = _mm256_setzero_ps();
-
-    for (int ib = 0; ib < nb; ++ib) {
-        const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
-        const uint32_t * qs32 = (const uint32_t *)x[ib].qs;
-
-#define Q1_AVX2_BLOCK(K) \
-        { \
-            const __m256i y = _mm256_loadu_si256((const __m256i *)y_ptr[K].qs); \
-            const __m256i sm = _mm256_cmpeq_epi8(_mm256_and_si256( \
-                _mm256_shuffle_epi8(_mm256_set1_epi32((int)qs32[K]), byte_shuf), \
-                bit_masks), zero); \
-            const __m256i sy = _mm256_sub_epi8(_mm256_xor_si256(y, sm), sm); \
-            const __m256i s32 = _mm256_madd_epi16( \
-                _mm256_maddubs_epi16(ones_8, sy), ones_16); \
-            acc_block = (K == 0) \
-                ? _mm256_mul_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y_ptr[K].d)), \
-                      _mm256_cvtepi32_ps(s32)) \
-                : _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y_ptr[K].d)), \
-                      _mm256_cvtepi32_ps(s32), acc_block); \
-        }
-
-        const block_q8_0 * y_ptr = &y[ib*4];
-        __m256 acc_block;
-        Q1_AVX2_BLOCK(0)
-        Q1_AVX2_BLOCK(1)
-        Q1_AVX2_BLOCK(2)
-        Q1_AVX2_BLOCK(3)
-#undef Q1_AVX2_BLOCK
-
-        acc = _mm256_fmadd_ps(_mm256_set1_ps(d0), acc_block, acc);
-    }
-    {
-        const __m128 h = _mm_add_ps(_mm256_extractf128_ps(acc, 0),
-                                    _mm256_extractf128_ps(acc, 1));
-        const __m128 q = _mm_add_ps(h, _mm_movehl_ps(h, h));
-        *s = _mm_cvtss_f32(_mm_add_ss(q, _mm_movehdup_ps(q)));
-    }
+#if defined(__AVX2__) || defined(__AVX512F__)
+static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) {
+    const __m256i ax = _mm256_sign_epi8(x, x);
+    const __m256i sy = _mm256_sign_epi8(y, x);
+    return _mm256_maddubs_epi16(ax, sy);
+}
+
+// spread 32 bits to 32 bytes { 0x00, 0xFF }
+static inline __m256i bytes_from_bits_32(const uint8_t * x) {
+    uint32_t x32;
+    memcpy(&x32, x, sizeof(uint32_t));
+    const __m256i shuf_mask = _mm256_set_epi64x(
+            0x0303030303030303, 0x0202020202020202,
+            0x0101010101010101, 0x0000000000000000);
+    __m256i bytes = _mm256_shuffle_epi8(_mm256_set1_epi32(x32), shuf_mask);
+    const __m256i bit_mask = _mm256_set1_epi64x(0x7fbfdfeff7fbfdfe);
+    bytes = _mm256_or_si256(bytes, bit_mask);
+    return _mm256_cmpeq_epi8(bytes, _mm256_set1_epi64x(-1));
+}
+
+// Unpack 32 4-bit fields into 32 bytes
+// The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval
+static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi)
+{
+    const __m128i tmp = _mm_loadu_si128((const __m128i *)rsi);
+    const __m256i bytes = MM256_SET_M128I(_mm_srli_epi16(tmp, 4), tmp);
+    const __m256i lowMask = _mm256_set1_epi8( 0xF );
+    return _mm256_and_si256(lowMask, bytes);
+}
+
+// add int16_t pairwise and return as float vector
+static inline __m256 sum_i16_pairs_float(const __m256i x) {
+    const __m256i ones = _mm256_set1_epi16(1);
+    const __m256i summed_pairs = _mm256_madd_epi16(ones, x);
+    return _mm256_cvtepi32_ps(summed_pairs);
+}
+
+static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) {
+#if defined(__AVX512VNNI__) && defined(__AVX512VL__)
+    const __m256i zero = _mm256_setzero_si256();
+    const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy);
+    return _mm256_cvtepi32_ps(summed_pairs);
+#elif defined(__AVXVNNI__)
+    const __m256i zero = _mm256_setzero_si256();
+    const __m256i summed_pairs = _mm256_dpbusd_avx_epi32(zero, ax, sy);
+    return _mm256_cvtepi32_ps(summed_pairs);
 #else
     // Perform multiplication and create 16-bit values
     const __m256i dot = _mm256_maddubs_epi16(ax, sy);
@@ -657,99 +653,172 @@ void ggml_vec_dot_q1_0_g128_q8_0(int n, float * GGML_RESTRICT s, size_t bs, cons
     const int nb = n / qk;
 
     assert(n % qk == 0);
-    assert(nrc == 1);
-    UNUSED(nrc);
-    UNUSED(bx);
     UNUSED(by);
-    UNUSED(bs);
 
-    const block_q1_0_g128 * GGML_RESTRICT x = vx;
-    const block_q8_0 * GGML_RESTRICT y = vy;
-
-    float sumf = 0;
-
-#if defined(__AVX2__)
-    // AVX2: process 32 Q8_0 values per sub-block in two 16-element passes.
-    // Sign-extend int8->int16, expand 1-bit weights to masks, blend to negate,
-    // then madd->fma accumulation.
-    const __m256i ones_16 = _mm256_set1_epi16(1);
-    const __m256i bmask = _mm256_setr_epi16(
-        1<<0, 1<<1, 1<<2,  1<<3,  1<<4,  1<<5,  1<<6,  1<<7,
-        1<<8, 1<<9, 1<<10, 1<<11, 1<<12, 1<<13, 1<<14, (short)(1<<15));
-    __m256 acc = _mm256_setzero_ps();
-
-    for (int ib = 0; ib < nb; ++ib) {
-        const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
-        __m256 acc_block = _mm256_setzero_ps();
-
-        for (int k = 0; k < 4; k++) {
-            const float d1 = GGML_CPU_FP16_TO_FP32(y[ib*4 + k].d);
-            const __m256i y_bytes = _mm256_loadu_si256((const __m256i *)y[ib*4 + k].qs);
-
-            uint32_t bits;
-            memcpy(&bits, &x[ib].qs[k * 4], sizeof(bits));
-
-            // Lower 16 elements: sign-extend int8->int16, apply sign from weight bits
-            const __m256i y_lo = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(y_bytes));
-            const __m256i neg_lo = _mm256_sub_epi16(_mm256_setzero_si256(), y_lo);
-            const __m256i mask_lo = _mm256_cmpeq_epi16(
-                _mm256_and_si256(_mm256_set1_epi16((short)(bits & 0xFFFF)), bmask), bmask);
-            const __m256i signed_lo = _mm256_blendv_epi8(neg_lo, y_lo, mask_lo);
-
-            // Upper 16 elements
-            const __m256i y_hi = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(y_bytes, 1));
-            const __m256i neg_hi = _mm256_sub_epi16(_mm256_setzero_si256(), y_hi);
-            const __m256i mask_hi = _mm256_cmpeq_epi16(
-                _mm256_and_si256(_mm256_set1_epi16((short)(bits >> 16)), bmask), bmask);
-            const __m256i signed_hi = _mm256_blendv_epi8(neg_hi, y_hi, mask_hi);
-
-            // Pair-wise sum int16->int32, combine halves, convert to float, FMA
-            const __m256i sum_32 = _mm256_add_epi32(
-                _mm256_madd_epi16(signed_lo, ones_16),
-                _mm256_madd_epi16(signed_hi, ones_16));
-            acc_block = _mm256_fmadd_ps(_mm256_set1_ps(d1),
-                _mm256_cvtepi32_ps(sum_32), acc_block);
-        }
-        acc = _mm256_fmadd_ps(_mm256_set1_ps(d0), acc_block, acc);
-    }
-    // Horizontal reduction: 256 -> 128 -> scalar
-    {
-        const __m128 h = _mm_add_ps(_mm256_extractf128_ps(acc, 0),
-                                    _mm256_extractf128_ps(acc, 1));
-        const __m128 q = _mm_add_ps(h, _mm_movehl_ps(h, h));
-        *s = _mm_cvtss_f32(_mm_add_ss(q, _mm_movehdup_ps(q)));
-    }
+#if defined(__AVX2__)
+    // Maddubs kernel: uses the identity dot(w, a) = 2·Σ(a where bit=1) − Σ(a)
+    // for 1-bit weights w ∈ {-1,+1} encoded as bits b ∈ {0,1} where w = 2b−1.
+    //
+    // Bit expansion: broadcast uint32 weight bits → shuffle each byte to its
+    // 8-byte group → AND with per-position bit test → clamp to 0/1 with min.
+    // Then maddubs(selector, activations) gives masked pair-sums, and
+    // 2·masked − sum_all gives the signed dot product in int16 pairs.
+    //
+    // Multi-row (nrc>1): activation data loaded once per sub-block, reused
+    // across all weight rows. Saves ~75% activation bandwidth for nrc=4.
+
+    const __m256i shuf_mask = _mm256_setr_epi8(
+        0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1,
+        2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3);
+    const __m256i bit_test  = _mm256_set1_epi64x((long long)0x8040201008040201LL);
+    const __m256i ones_byte = _mm256_set1_epi8(1);
+    const __m256i ones_16   = _mm256_set1_epi16(1);
+
+    // Macro: compute one row's contribution for one sub-block.
+    // Expects ab (activation bytes) and sa (sum-all pairs) in scope.
+#define Q1G128_DOT_ROW(xptr, ib_idx, k_idx, ab, sa, scale, acc_r) \
+    do { \
+        uint32_t _bits; \
+        memcpy(&_bits, &(xptr)[(ib_idx)].qs[(k_idx) * 4], sizeof(_bits)); \
+        const __m256i _bexp = _mm256_shuffle_epi8(_mm256_set1_epi32((int)_bits), shuf_mask); \
+        const __m256i _sel  = _mm256_min_epu8(_mm256_and_si256(_bexp, bit_test), ones_byte); \
+        const __m256i _ps   = _mm256_maddubs_epi16(_sel, (ab)); \
+        const __m256i _dp   = _mm256_sub_epi16(_mm256_slli_epi16(_ps, 1), (sa)); \
+        const __m256i _d32  = _mm256_madd_epi16(_dp, ones_16); \
+        (acc_r) = _mm256_fmadd_ps(_mm256_set1_ps(scale), _mm256_cvtepi32_ps(_d32), (acc_r)); \
+    } while (0)
+
+    // Horizontal reduction: __m256 → scalar float
+#define Q1G128_HREDUCE(acc_r) do { \
+        const __m128 _h = _mm_add_ps(_mm256_extractf128_ps((acc_r), 0), \
+                                     _mm256_extractf128_ps((acc_r), 1)); \
+        const __m128 _q = _mm_add_ps(_h, _mm_movehl_ps(_h, _h)); \
+        _hresult = _mm_cvtss_f32(_mm_add_ss(_q, _mm_movehdup_ps(_q))); \
+    } while (0)
+
+    if (nrc == 1) {
+        // Single-row path: no multi-row overhead
+        UNUSED(bx); UNUSED(bs);
+        const block_q1_0_g128 * GGML_RESTRICT x = (const block_q1_0_g128 *)vx;
+        const block_q8_0      * GGML_RESTRICT y = (const block_q8_0 *)vy;
+
+        __m256 acc = _mm256_setzero_ps();
+        for (int ib = 0; ib < nb; ++ib) {
+            const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
+            for (int k = 0; k < 4; k++) {
+                const float d1 = GGML_CPU_FP16_TO_FP32(y[ib*4 + k].d);
+                const __m256i ab = _mm256_loadu_si256((const __m256i *)y[ib*4 + k].qs);
+                const __m256i sa = _mm256_maddubs_epi16(ones_byte, ab);
+                Q1G128_DOT_ROW(x, ib, k, ab, sa, d0 * d1, acc);
+            }
+        }
+        float _hresult;
+        Q1G128_HREDUCE(acc);
+        *s = _hresult;
+
+    } else if (nrc == 4) {
+        // 4-row path: load activation once, compute 4 dot products.
+        const block_q1_0_g128 * GGML_RESTRICT x0 = (const block_q1_0_g128 *)vx;
+        const block_q1_0_g128 * GGML_RESTRICT x1 = (const block_q1_0_g128 *)((const char *)vx + bx);
+        const block_q1_0_g128 * GGML_RESTRICT x2 = (const block_q1_0_g128 *)((const char *)vx + 2*bx);
+        const block_q1_0_g128 * GGML_RESTRICT x3 = (const block_q1_0_g128 *)((const char *)vx + 3*bx);
+        const block_q8_0      * GGML_RESTRICT y  = (const block_q8_0 *)vy;
+
+        __m256 a0 = _mm256_setzero_ps();
+        __m256 a1 = _mm256_setzero_ps();
+        __m256 a2 = _mm256_setzero_ps();
+        __m256 a3 = _mm256_setzero_ps();
+
+        for (int ib = 0; ib < nb; ++ib) {
+            const float d0_0 = GGML_CPU_FP16_TO_FP32(x0[ib].d);
+            const float d0_1 = GGML_CPU_FP16_TO_FP32(x1[ib].d);
+            const float d0_2 = GGML_CPU_FP16_TO_FP32(x2[ib].d);
+            const float d0_3 = GGML_CPU_FP16_TO_FP32(x3[ib].d);
+
+            for (int k = 0; k < 4; k++) {
+                const float d1 = GGML_CPU_FP16_TO_FP32(y[ib*4 + k].d);
+                const __m256i ab = _mm256_loadu_si256((const __m256i *)y[ib*4 + k].qs);
+                const __m256i sa = _mm256_maddubs_epi16(ones_byte, ab);
+
+                Q1G128_DOT_ROW(x0, ib, k, ab, sa, d0_0 * d1, a0);
+                Q1G128_DOT_ROW(x1, ib, k, ab, sa, d0_1 * d1, a1);
+                Q1G128_DOT_ROW(x2, ib, k, ab, sa, d0_2 * d1, a2);
+                Q1G128_DOT_ROW(x3, ib, k, ab, sa, d0_3 * d1, a3);
+            }
+        }
+
+        float _hresult;
+        Q1G128_HREDUCE(a0); *(float *)((char *)s + 0*bs) = _hresult;
+        Q1G128_HREDUCE(a1); *(float *)((char *)s + 1*bs) = _hresult;
+        Q1G128_HREDUCE(a2); *(float *)((char *)s + 2*bs) = _hresult;
+        Q1G128_HREDUCE(a3); *(float *)((char *)s + 3*bs) = _hresult;
+
+    } else {
+        // Generic multi-row path for nrc=2,3
+        assert(nrc >= 2 && nrc <= 4);
+        const block_q8_0 * GGML_RESTRICT y = (const block_q8_0 *)vy;
+
+        const block_q1_0_g128 * GGML_RESTRICT xr[4];
+        __m256 acc_r[4];
+        for (int r = 0; r < nrc; r++) {
+            xr[r]    = (const block_q1_0_g128 *)((const char *)vx + r * bx);
+            acc_r[r] = _mm256_setzero_ps();
+        }
+
+        for (int ib = 0; ib < nb; ++ib) {
+            float d0_r[4];
+            for (int r = 0; r < nrc; r++) {
+                d0_r[r] = GGML_CPU_FP16_TO_FP32(xr[r][ib].d);
+            }
+
+            for (int k = 0; k < 4; k++) {
+                const float d1 = GGML_CPU_FP16_TO_FP32(y[ib*4 + k].d);
+                const __m256i ab = _mm256_loadu_si256((const __m256i *)y[ib*4 + k].qs);
+                const __m256i sa = _mm256_maddubs_epi16(ones_byte, ab);
+
+                for (int r = 0; r < nrc; r++) {
+                    Q1G128_DOT_ROW(xr[r], ib, k, ab, sa, d0_r[r] * d1, acc_r[r]);
+                }
+            }
+        }
+
+        float _hresult;
+        for (int r = 0; r < nrc; r++) {
+            Q1G128_HREDUCE(acc_r[r]);
+            *(float *)((char *)s + r * bs) = _hresult;
+        }
+    }
+
+#undef Q1G128_DOT_ROW
+#undef Q1G128_HREDUCE
+
 #else
-    // Scalar fallback
+    // Scalar fallback (nrc=1 only, multi-row handled by dispatch calling nrc=1)
+    assert(nrc == 1);
+    UNUSED(nrc); UNUSED(bx); UNUSED(bs);
+
+    const block_q1_0_g128 * GGML_RESTRICT x = (const block_q1_0_g128 *)vx;
+    const block_q8_0 * GGML_RESTRICT y = (const block_q8_0 *)vy;
+
+    float sumf = 0;
     for (int ib = 0; ib < nb; ++ib) {
         const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
-
         float sumi = 0;
-
-        // Process 4 Q8_0 blocks (4 * 32 = 128 elements)
         for (int k = 0; k < 4; k++) {
             const float d1 = GGML_CPU_FP16_TO_FP32(y[ib*4 + k].d);
-
             int sumi_block = 0;
-
             for (int j = 0; j < QK8_0; j++) {
                 const int bit_index = k * QK8_0 + j;
                 const int byte_index = bit_index / 8;
                 const int bit_offset = bit_index % 8;
-
-                // Extract bit: 1 = +1, 0 = -1
                 const int xi = ((x[ib].qs[byte_index] >> bit_offset) & 1) ? 1 : -1;
                 const int yi = y[ib*4 + k].qs[j];
-
                 sumi_block += xi * yi;
             }
-
             sumi += d1 * sumi_block;
         }
-
         sumf += d0 * sumi;
     }
-
     *s = sumf;
 #endif
 }
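Editor's note: the bit-expansion step of Q1G128_DOT_ROW (broadcast → per-lane byte shuffle → AND with the 0x8040201008040201 bit test → min_epu8 clamp to 0/1) can be checked in isolation. A standalone sketch, assuming only AVX2 (compile with -mavx2); expand_bits_to_selectors is a hypothetical name, not part of the commit:

// Expand 32 weight bits into 32 byte selectors in {0,1}, exactly the
// selector Q1G128_DOT_ROW feeds to _mm256_maddubs_epi16, then verify
// against scalar bit extraction.
#include <assert.h>
#include <immintrin.h>
#include <stdint.h>

static void expand_bits_to_selectors(uint32_t bits, uint8_t out[32]) {
    const __m256i shuf_mask = _mm256_setr_epi8(
        0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1,
        2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3);
    const __m256i bit_test  = _mm256_set1_epi64x((long long)0x8040201008040201ULL);
    const __m256i ones_byte = _mm256_set1_epi8(1);
    // byte i of the result tests bit i of the word:
    // broadcast -> replicate byte i/8 across its 8-byte group ->
    // AND with bit (i%8) -> clamp any nonzero value to 1
    const __m256i bexp = _mm256_shuffle_epi8(_mm256_set1_epi32((int)bits), shuf_mask);
    const __m256i sel  = _mm256_min_epu8(_mm256_and_si256(bexp, bit_test), ones_byte);
    _mm256_storeu_si256((__m256i *)out, sel);
}

int main(void) {
    const uint32_t bits = 0xDEADBEEFu;
    uint8_t sel[32];
    expand_bits_to_selectors(bits, sel);
    for (int i = 0; i < 32; ++i) {
        assert(sel[i] == ((bits >> i) & 1));
    }
    return 0;
}

One design detail worth noting: _mm256_shuffle_epi8 shuffles within each 128-bit lane, which is why shuf_mask places indices 2 and 3 in its upper half; the broadcast via _mm256_set1_epi32 puts all four source bytes in both lanes, so each lane can pick the bytes it needs.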
ggml/src/ggml-cpu/ggml-cpu.c CHANGED
@@ -227,6 +227,11 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
         .from_float   = quantize_row_q1_0_g128,
         .vec_dot      = ggml_vec_dot_q1_0_g128_q8_0,
         .vec_dot_type = GGML_TYPE_Q8_0,
+        // nrows=1: the nrc=2 x86 kernel used s[bs/sizeof(float)] but the dispatch
+        // expects MMLA-style s[bs] (2 weight rows × 2 activation cols). This caused
+        // corrupt prompt-eval results. Generation (ne11=1) was unaffected since the
+        // ne11%2!=0 guard forced nrc=1. Kept at 1 until a proper 2×2 tile kernel
+        // is written.
         .nrows        = 1,
     },
     [GGML_TYPE_Q4_0] = {
@@ -1185,13 +1190,99 @@ static void ggml_compute_forward_mul_mat_one_chunk(
     assert(ne12 % ne02 == 0);
     assert(ne13 % ne03 == 0);
 
-    // COM6-inspired block-tiling: larger blocks for Q1_0_g128 (1-bit weights are tiny,
-    // so we can fit more rows in L1). Prefetch next weight block while processing current.
-    const int64_t blck_0 = (type == GGML_TYPE_Q1_0_g128) ? 64 : 16;
-    const int64_t blck_1 = 16;
-
     const size_t src1_col_stride = src1_cont || src1->type != vec_dot_type ? row_size : nb11;
 
+#if defined(__AVX2__)
+    // Q1_0_g128 fast path: nrc=4 kernel processes 4 weight rows per call,
+    // loading Q8 activations once per quad (~60% activation bandwidth savings).
+    // Write directly to dst (no tmp buffer needed).
+    if (type == GGML_TYPE_Q1_0_g128) {
+        // For typical LLM matmuls (ne12==ne02, ne13==ne03) the broadcast dims
+        // (i02,i03) are uniform across this thread's ir1 range, so src0_row
+        // could be resolved once; we still recompute it per ir1 to stay
+        // correct when broadcasting is in effect. Only ne1*ne12 is hoisted.
+        const int64_t ne1xne12 = ne12 * ne1;
+
+        // GEPP-style outer tile: process 16 weight rows per outer iteration
+        // (4 × nrc=4 calls), prefetching the next group to hide DRAM latency.
+        // Each Q1_0_g128 row at K=4096 is 576 bytes; 16 rows = 9.2 KB (fits L1d).
+        int64_t ir0 = ir0_start;
+        for (; ir0 + 15 < ir0_end; ir0 += 16) {
+            for (int64_t ir1 = ir1_start; ir1 < ir1_end; ++ir1) {
+                const int64_t i13 = (ir1 / ne1xne12);
+                const int64_t i12 = (ir1 - i13 * ne1xne12) / ne1;
+                const int64_t i11 = (ir1 - i13 * ne1xne12 - i12 * ne1);
+                const int64_t i03 = i13 / r3;
+                const int64_t i02 = i12 / r2;
+                const char * src0_row = (const char*)src0->data + (i02 * nb02 + i03 * nb03);
+                const char * src1_col = (const char*)wdata +
+                    (src1_cont || src1->type != vec_dot_type
+                        ? (i11 + i12 * ne11 + i13 * ne12 * ne11) * row_size
+                        : (i11 * nb11 + i12 * nb12 + i13 * nb13));
+                float * dst_col = (float*)((char*)dst->data + (i11 * nb1 + i12 * nb2 + i13 * nb3));
+                // Prefetch next 16-row tile's first cache lines
+                if (ir0 + 19 < ir0_end) {
+                    _mm_prefetch(src0_row + (ir0 + 16) * nb01, _MM_HINT_T1);
+                    _mm_prefetch(src0_row + (ir0 + 18) * nb01, _MM_HINT_T1);
+                }
+                vec_dot(ne00, &dst_col[ir0],    sizeof(float), src0_row + (ir0     ) * nb01, nb01, src1_col, 0, 4);
+                vec_dot(ne00, &dst_col[ir0+4],  sizeof(float), src0_row + (ir0 +  4) * nb01, nb01, src1_col, 0, 4);
+                vec_dot(ne00, &dst_col[ir0+8],  sizeof(float), src0_row + (ir0 +  8) * nb01, nb01, src1_col, 0, 4);
+                vec_dot(ne00, &dst_col[ir0+12], sizeof(float), src0_row + (ir0 + 12) * nb01, nb01, src1_col, 0, 4);
+            }
+        }
+        // Remaining rows in groups of 4
+        for (; ir0 + 3 < ir0_end; ir0 += 4) {
+            for (int64_t ir1 = ir1_start; ir1 < ir1_end; ++ir1) {
+                const int64_t i13 = (ir1 / ne1xne12);
+                const int64_t i12 = (ir1 - i13 * ne1xne12) / ne1;
+                const int64_t i11 = (ir1 - i13 * ne1xne12 - i12 * ne1);
+                const int64_t i03 = i13 / r3;
+                const int64_t i02 = i12 / r2;
+                const char * src0_row = (const char*)src0->data + (i02 * nb02 + i03 * nb03);
+                const char * src1_col = (const char*)wdata +
+                    (src1_cont || src1->type != vec_dot_type
+                        ? (i11 + i12 * ne11 + i13 * ne12 * ne11) * row_size
+                        : (i11 * nb11 + i12 * nb12 + i13 * nb13));
+                float * dst_col = (float*)((char*)dst->data + (i11 * nb1 + i12 * nb2 + i13 * nb3));
+                vec_dot(ne00, &dst_col[ir0], sizeof(float),
+                        src0_row + ir0 * nb01, nb01,
+                        src1_col, 0, 4);
+            }
+        }
+        // Remainder rows: fall back to column-outer order (small tail)
+        for (int64_t ir1 = ir1_start; ir1 < ir1_end; ++ir1) {
+            const int64_t i13 = (ir1 / ne1xne12);
+            const int64_t i12 = (ir1 - i13 * ne1xne12) / ne1;
+            const int64_t i11 = (ir1 - i13 * ne1xne12 - i12 * ne1);
+            const int64_t i03 = i13 / r3;
+            const int64_t i02 = i12 / r2;
+            const char * src0_row = (const char*)src0->data + (i02 * nb02 + i03 * nb03);
+            const char * src1_col = (const char*)wdata +
+                (src1_cont || src1->type != vec_dot_type
+                    ? (i11 + i12 * ne11 + i13 * ne12 * ne11) * row_size
+                    : (i11 * nb11 + i12 * nb12 + i13 * nb13));
+            float * dst_col = (float*)((char*)dst->data + (i11 * nb1 + i12 * nb2 + i13 * nb3));
+            int64_t ir0t = ir0;
+            for (; ir0t + 1 < ir0_end; ir0t += 2) {
+                vec_dot(ne00, &dst_col[ir0t], sizeof(float),
+                        src0_row + ir0t * nb01, nb01,
+                        src1_col, 0, 2);
+            }
+            if (ir0t < ir0_end) {
+                vec_dot(ne00, &dst_col[ir0t], 0,
+                        src0_row + ir0t * nb01, 0,
+                        src1_col, 0, 1);
+            }
+        }
+        return;
+    }
+#endif
+
+    // Generic path for all other quant types
+    const int64_t blck_0 = 16;
+    const int64_t blck_1 = 16;
+
     // attempt to reduce false-sharing (does not seem to make a difference)
     // Size: blck_0 * 2 (accounting for mmla kernels that compute 2 rows at once)
     float tmp[128];
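Editor's note: the 576-byte / 9.2 KB figures in the GEPP comment follow directly from the Q1_0_g128 layout (one fp16 scale plus 16 bytes of packed bits per 128-weight block). A quick back-of-envelope check, assuming that layout:

// Tile-size arithmetic behind the "16 rows fits L1d" claim (illustrative).
#include <stdio.h>
int main(void) {
    const int K = 4096;                    // row length in weights
    const int bytes_per_block = 2 + 16;    // fp16 scale + 128 bits of weights
    const int blocks_per_row  = K / 128;   // 32 blocks
    const int row_bytes       = blocks_per_row * bytes_per_block;   // 576 B
    printf("row = %d B, 16-row tile = %d B\n", row_bytes, 16 * row_bytes); // 576, 9216
    return 0;
}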
@@ -1213,31 +1304,18 @@
 
             const char * src0_row = (const char*)src0->data + (0 + i02 * nb02 + i03 * nb03);
 
-            // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
-            // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
-            // the original src1 data pointer, so we should index using the indices directly
-            // TODO: this is a bit of a hack, we should probably have a better way to handle this
             const char * src1_col = (const char*)wdata +
                 (src1_cont || src1->type != vec_dot_type
                     ? (i11 + i12 * ne11 + i13 * ne12 * ne11) * row_size
                     : (i11 * nb11 + i12 * nb12 + i13 * nb13));
             float * dst_col = (float*)((char*)dst->data + (i1 * nb1 + i2 * nb2 + i3 * nb3));
 
-            //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ++ir0) {
-            //    vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
-            //}
-
-            // COM6-inspired: prefetch next weight rows while computing current ones.
-            const int64_t ir0_max = MIN(iir0 + blck_0, ir0_end);
-            for (int64_t ir0 = iir0; ir0 < ir0_max; ir0 += num_rows_per_vec_dot) {
-                if (ir0 + 4 * num_rows_per_vec_dot < ir0_max) {
-                    __builtin_prefetch(src0_row + (ir0 + 4 * num_rows_per_vec_dot) * nb01, 0, 1);
-                }
+            for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ir0 += num_rows_per_vec_dot) {
                 vec_dot(ne00, &tmp[ir0 - iir0], (num_rows_per_vec_dot > 1 ? 16 : 0), src0_row + ir0 * nb01, (num_rows_per_vec_dot > 1 ? nb01 : 0), src1_col, (num_rows_per_vec_dot > 1 ? src1_col_stride : 0), num_rows_per_vec_dot);
             }
 
             for (int cn = 0; cn < num_rows_per_vec_dot; ++cn) {
-                memcpy(&dst_col[iir0 + cn * nb1 / nb0], tmp + (cn * 16), (ir0_max - iir0) * sizeof(float));
+                memcpy(&dst_col[iir0 + cn * nb1 / nb0], tmp + (cn * 16), (MIN(iir0 + blck_0, ir0_end) - iir0) * sizeof(float));
             }
         }
     }
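Editor's note: the strided convention the nrows comment and the nrc=4 fast path rely on (result r stored at (char *)s + r*bs, weight row r read at (char *)vx + r*bx, one shared activation column) can be pinned down with a scalar oracle, useful as a test reference against the AVX2 kernel. A sketch with a simplified signature (the real ggml_vec_dot_t also takes a by stride); the struct stand-ins mirror the layouts implied by the quants.c diff, not ggml's headers:

// Scalar oracle for the multi-row vec_dot convention (illustrative only).
#include <stddef.h>
#include <stdint.h>
#include <string.h>

typedef struct { uint16_t d; uint8_t qs[16]; } block_q1_0_g128; // fp16 scale + 128 weight bits
typedef struct { uint16_t d; int8_t  qs[32]; } block_q8_0;      // fp16 scale + 32 int8 values

// Minimal fp16 -> fp32 stand-in for GGML_CPU_FP16_TO_FP32 (subnormals as zero).
static float fp16_to_fp32(uint16_t h) {
    const uint32_t sign = (uint32_t)(h >> 15) << 31;
    const uint32_t exp  = (h >> 10) & 0x1F;
    const uint32_t mant = h & 0x3FF;
    uint32_t bits;
    if (exp == 0)       bits = sign;
    else if (exp == 31) bits = sign | 0x7F800000u | (mant << 13);     // inf/NaN
    else                bits = sign | ((exp + 112) << 23) | (mant << 13);
    float f;
    memcpy(&f, &bits, sizeof(f));
    return f;
}

static void vec_dot_q1_0_g128_q8_0_ref(int n, float * s, size_t bs,
                                       const void * vx, size_t bx,
                                       const void * vy, int nrc) {
    const int nb = n / 128;
    for (int r = 0; r < nrc; ++r) {
        // weight row r sits bx bytes after row 0; activations are shared
        const block_q1_0_g128 * x = (const block_q1_0_g128 *)((const char *)vx + r * bx);
        const block_q8_0      * y = (const block_q8_0 *)vy;
        float sumf = 0;
        for (int ib = 0; ib < nb; ++ib) {
            const float d0 = fp16_to_fp32(x[ib].d);
            float sumi = 0;
            for (int k = 0; k < 4; ++k) {            // 4 Q8_0 blocks per 128-weight block
                const float d1 = fp16_to_fp32(y[ib*4 + k].d);
                int sumi_block = 0;
                for (int j = 0; j < 32; ++j) {
                    const int bit = k * 32 + j;
                    const int xi  = ((x[ib].qs[bit / 8] >> (bit % 8)) & 1) ? 1 : -1;
                    sumi_block += xi * y[ib*4 + k].qs[j];
                }
                sumi += d1 * sumi_block;
            }
            sumf += d0 * sumi;
        }
        *(float *)((char *)s + r * bs) = sumf;       // result r lands bs bytes apart
    }
}

Calling this with bs = sizeof(float) and bx = nb01 reproduces what the fast path asks of the nrc=4 kernel: four results in consecutive dst floats, four weight rows one row-stride apart.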