OpenTransformer committed (verified)
Commit 570ff77 · Parent(s): 165bcc5

perf: maddubs kernel + nrc=4 multi-row for Q1_0_g128 (3.5-3.75 t/s)
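Editor's note: the "maddubs kernel" in the message rests on the 1-bit dot-product identity documented in the new code below: for weights w ∈ {−1,+1} stored as bits b (w = 2b−1), dot(w, a) = 2·Σ(a where b=1) − Σ(a), so one masked sum plus one total sum replace any per-element sign flipping. A minimal scalar sketch of that identity, not part of the commit (dot_ref and dot_identity are hypothetical names):

// Scalar check of the identity behind the maddubs kernel (illustrative only).
#include <assert.h>
#include <stdint.h>

// Reference: w[i] = bit ? +1 : -1, 32 weights packed into 4 bytes.
static int dot_ref(const uint8_t bits[4], const int8_t a[32]) {
    int sum = 0;
    for (int i = 0; i < 32; ++i) {
        const int w = ((bits[i / 8] >> (i % 8)) & 1) ? 1 : -1;
        sum += w * a[i];
    }
    return sum;
}

// Identity form: dot(w, a) = 2 * (sum of a where bit = 1) - (sum of all a).
static int dot_identity(const uint8_t bits[4], const int8_t a[32]) {
    int masked = 0, total = 0;
    for (int i = 0; i < 32; ++i) {
        masked += ((bits[i / 8] >> (i % 8)) & 1) * a[i];  // 0/1 "selector" product
        total  += a[i];
    }
    return 2 * masked - total;
}

int main(void) {
    const uint8_t bits[4] = { 0xEF, 0xBE, 0xAD, 0xDE };  // 0xDEADBEEF, little-endian
    int8_t a[32];
    for (int i = 0; i < 32; ++i) a[i] = (int8_t)(i * 7 - 100);
    assert(dot_ref(bits, a) == dot_identity(bits, a));
    return 0;
}

Because the selector is 0/1 rather than ±1, the vector version can feed it straight to _mm256_maddubs_epi16 (unsigned × signed): each 16-bit pair sum stays well within int16 range, so no saturation can occur.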
ggml/src/ggml-cpu/arch/x86/quants.c CHANGED
@@ -65,56 +65,52 @@ static inline int hsum_i32_4(const __m128i a) {
     return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32));
 }
 
-#if defined(__AVX2__)
-    // AVX2: single-pass byte-level processing, fully unrolled k-loop.
-    // Pipeline: broadcast+shuffle -> AND+cmpeq -> XOR+SUB -> maddubs+madd -> cvt+fma
-    const __m256i ones_8  = _mm256_set1_epi8(1);
-    const __m256i ones_16 = _mm256_set1_epi16(1);
-    const __m256i byte_shuf = _mm256_setr_epi8(
-        0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1,
-        2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3);
-    const __m256i bit_masks = _mm256_setr_epi8(
-        1,2,4,8,16,32,64,-128, 1,2,4,8,16,32,64,-128,
-        1,2,4,8,16,32,64,-128, 1,2,4,8,16,32,64,-128);
-    const __m256i zero = _mm256_setzero_si256();
-    __m256 acc = _mm256_setzero_ps();
-
-    for (int ib = 0; ib < nb; ++ib) {
-        const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
-        const uint32_t * qs32 = (const uint32_t *)x[ib].qs;
-
-#define Q1_AVX2_BLOCK(K) \
-        { \
-            const __m256i y = _mm256_loadu_si256((const __m256i *)y_ptr[K].qs); \
-            const __m256i sm = _mm256_cmpeq_epi8(_mm256_and_si256( \
-                _mm256_shuffle_epi8(_mm256_set1_epi32((int)qs32[K]), byte_shuf), \
-                bit_masks), zero); \
-            const __m256i sy = _mm256_sub_epi8(_mm256_xor_si256(y, sm), sm); \
-            const __m256i s32 = _mm256_madd_epi16( \
-                _mm256_maddubs_epi16(ones_8, sy), ones_16); \
-            acc_block = (K == 0) \
-                ? _mm256_mul_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y_ptr[K].d)), \
-                      _mm256_cvtepi32_ps(s32)) \
-                : _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y_ptr[K].d)), \
-                      _mm256_cvtepi32_ps(s32), acc_block); \
-        }
-
-        const block_q8_0 * y_ptr = &y[ib*4];
-        __m256 acc_block;
-        Q1_AVX2_BLOCK(0)
-        Q1_AVX2_BLOCK(1)
-        Q1_AVX2_BLOCK(2)
-        Q1_AVX2_BLOCK(3)
-#undef Q1_AVX2_BLOCK
-
-        acc = _mm256_fmadd_ps(_mm256_set1_ps(d0), acc_block, acc);
-    }
-    {
-        const __m128 h = _mm_add_ps(_mm256_extractf128_ps(acc, 0),
-                                    _mm256_extractf128_ps(acc, 1));
-        const __m128 q = _mm_add_ps(h, _mm_movehl_ps(h, h));
-        *s = _mm_cvtss_f32(_mm_add_ss(q, _mm_movehdup_ps(q)));
-    }
+#if defined(__AVX2__) || defined(__AVX512F__)
+static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) {
+    const __m256i ax = _mm256_sign_epi8(x, x);
+    const __m256i sy = _mm256_sign_epi8(y, x);
+    return _mm256_maddubs_epi16(ax, sy);
+}
+
+// spread 32 bits to 32 bytes { 0x00, 0xFF }
+static inline __m256i bytes_from_bits_32(const uint8_t * x) {
+    uint32_t x32;
+    memcpy(&x32, x, sizeof(uint32_t));
+    const __m256i shuf_mask = _mm256_set_epi64x(
+            0x0303030303030303, 0x0202020202020202,
+            0x0101010101010101, 0x0000000000000000);
+    __m256i bytes = _mm256_shuffle_epi8(_mm256_set1_epi32(x32), shuf_mask);
+    const __m256i bit_mask = _mm256_set1_epi64x(0x7fbfdfeff7fbfdfe);
+    bytes = _mm256_or_si256(bytes, bit_mask);
+    return _mm256_cmpeq_epi8(bytes, _mm256_set1_epi64x(-1));
+}
+
+// Unpack 32 4-bit fields into 32 bytes
+// The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval
+static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi)
+{
+    const __m128i tmp = _mm_loadu_si128((const __m128i *)rsi);
+    const __m256i bytes = MM256_SET_M128I(_mm_srli_epi16(tmp, 4), tmp);
+    const __m256i lowMask = _mm256_set1_epi8( 0xF );
+    return _mm256_and_si256(lowMask, bytes);
+}
+
+// add int16_t pairwise and return as float vector
+static inline __m256 sum_i16_pairs_float(const __m256i x) {
+    const __m256i ones = _mm256_set1_epi16(1);
+    const __m256i summed_pairs = _mm256_madd_epi16(ones, x);
+    return _mm256_cvtepi32_ps(summed_pairs);
+}
+
+static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) {
+#if defined(__AVX512VNNI__) && defined(__AVX512VL__)
+    const __m256i zero = _mm256_setzero_si256();
+    const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy);
+    return _mm256_cvtepi32_ps(summed_pairs);
+#elif defined(__AVXVNNI__)
+    const __m256i zero = _mm256_setzero_si256();
+    const __m256i summed_pairs = _mm256_dpbusd_avx_epi32(zero, ax, sy);
+    return _mm256_cvtepi32_ps(summed_pairs);
 #else
     // Perform multiplication and create 16-bit values
     const __m256i dot = _mm256_maddubs_epi16(ax, sy);
@@ -657,99 +653,172 @@ void ggml_vec_dot_q1_0_g128_q8_0(int n, float * GGML_RESTRICT s, size_t bs, cons
     const int nb = n / qk;
 
     assert(n % qk == 0);
-    assert(nrc == 1);
-    UNUSED(nrc);
-    UNUSED(bx);
     UNUSED(by);
-    UNUSED(bs);
 
-    const block_q1_0_g128 * GGML_RESTRICT x = vx;
-    const block_q8_0 * GGML_RESTRICT y = vy;
-
-    float sumf = 0;
-
-#if defined(__AVX2__)
-    // AVX2: process 32 Q8_0 values per sub-block in two 16-element passes.
-    // Sign-extend int8->int16, expand 1-bit weights to masks, blend to negate,
-    // then madd->fma accumulation.
-    const __m256i ones_16 = _mm256_set1_epi16(1);
-    const __m256i bmask = _mm256_setr_epi16(
-        1<<0, 1<<1, 1<<2,  1<<3,  1<<4,  1<<5,  1<<6,  1<<7,
-        1<<8, 1<<9, 1<<10, 1<<11, 1<<12, 1<<13, 1<<14, (short)(1<<15));
-    __m256 acc = _mm256_setzero_ps();
-
-    for (int ib = 0; ib < nb; ++ib) {
-        const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
-        __m256 acc_block = _mm256_setzero_ps();
-
-        for (int k = 0; k < 4; k++) {
-            const float d1 = GGML_CPU_FP16_TO_FP32(y[ib*4 + k].d);
-            const __m256i y_bytes = _mm256_loadu_si256((const __m256i *)y[ib*4 + k].qs);
-
-            uint32_t bits;
-            memcpy(&bits, &x[ib].qs[k * 4], sizeof(bits));
-
-            // Lower 16 elements: sign-extend int8->int16, apply sign from weight bits
-            const __m256i y_lo = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(y_bytes));
-            const __m256i neg_lo = _mm256_sub_epi16(_mm256_setzero_si256(), y_lo);
-            const __m256i mask_lo = _mm256_cmpeq_epi16(
-                _mm256_and_si256(_mm256_set1_epi16((short)(bits & 0xFFFF)), bmask), bmask);
-            const __m256i signed_lo = _mm256_blendv_epi8(neg_lo, y_lo, mask_lo);
-
-            // Upper 16 elements
-            const __m256i y_hi = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(y_bytes, 1));
-            const __m256i neg_hi = _mm256_sub_epi16(_mm256_setzero_si256(), y_hi);
-            const __m256i mask_hi = _mm256_cmpeq_epi16(
-                _mm256_and_si256(_mm256_set1_epi16((short)(bits >> 16)), bmask), bmask);
-            const __m256i signed_hi = _mm256_blendv_epi8(neg_hi, y_hi, mask_hi);
-
-            // Pair-wise sum int16->int32, combine halves, convert to float, FMA
-            const __m256i sum_32 = _mm256_add_epi32(
-                _mm256_madd_epi16(signed_lo, ones_16),
-                _mm256_madd_epi16(signed_hi, ones_16));
-            acc_block = _mm256_fmadd_ps(_mm256_set1_ps(d1),
-                _mm256_cvtepi32_ps(sum_32), acc_block);
-        }
-        acc = _mm256_fmadd_ps(_mm256_set1_ps(d0), acc_block, acc);
-    }
-    // Horizontal reduction: 256 -> 128 -> scalar
-    {
-        const __m128 h = _mm_add_ps(_mm256_extractf128_ps(acc, 0),
-                                    _mm256_extractf128_ps(acc, 1));
-        const __m128 q = _mm_add_ps(h, _mm_movehl_ps(h, h));
-        *s = _mm_cvtss_f32(_mm_add_ss(q, _mm_movehdup_ps(q)));
-    }
+#if defined(__AVX2__)
+    // Maddubs kernel: uses the identity dot(w, a) = 2·Σ(a where bit=1) − Σ(a)
+    // for 1-bit weights w ∈ {-1,+1} encoded as bits b ∈ {0,1} where w = 2b−1.
+    //
+    // Bit expansion: broadcast uint32 weight bits → shuffle each byte to its
+    // 8-byte group → AND with per-position bit test → clamp to 0/1 with min.
+    // Then maddubs(selector, activations) gives masked pair-sums, and
+    // 2·masked − sum_all gives the signed dot product in int16 pairs.
+    //
+    // Multi-row (nrc>1): activation data loaded once per sub-block, reused
+    // across all weight rows. Saves ~75% activation bandwidth for nrc=4.
+
+    const __m256i shuf_mask = _mm256_setr_epi8(
+        0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1,
+        2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3);
+    const __m256i bit_test  = _mm256_set1_epi64x((long long)0x8040201008040201LL);
+    const __m256i ones_byte = _mm256_set1_epi8(1);
+    const __m256i ones_16   = _mm256_set1_epi16(1);
+
+    // Macro: compute one row's contribution for one sub-block.
+    // Expects ab (activation bytes) and sa (sum-all pairs) in scope.
+#define Q1G128_DOT_ROW(xptr, ib_idx, k_idx, ab, sa, scale, acc_r) \
+    do { \
+        uint32_t _bits; \
+        memcpy(&_bits, &(xptr)[(ib_idx)].qs[(k_idx) * 4], sizeof(_bits)); \
+        const __m256i _bexp = _mm256_shuffle_epi8(_mm256_set1_epi32((int)_bits), shuf_mask); \
+        const __m256i _sel  = _mm256_min_epu8(_mm256_and_si256(_bexp, bit_test), ones_byte); \
+        const __m256i _ps   = _mm256_maddubs_epi16(_sel, (ab)); \
+        const __m256i _dp   = _mm256_sub_epi16(_mm256_slli_epi16(_ps, 1), (sa)); \
+        const __m256i _d32  = _mm256_madd_epi16(_dp, ones_16); \
+        (acc_r) = _mm256_fmadd_ps(_mm256_set1_ps(scale), _mm256_cvtepi32_ps(_d32), (acc_r)); \
+    } while (0)
+
+    // Horizontal reduction: __m256 → scalar float
+#define Q1G128_HREDUCE(acc_r) do { \
+        const __m128 _h = _mm_add_ps(_mm256_extractf128_ps((acc_r), 0), \
+                                     _mm256_extractf128_ps((acc_r), 1)); \
+        const __m128 _q = _mm_add_ps(_h, _mm_movehl_ps(_h, _h)); \
+        _hresult = _mm_cvtss_f32(_mm_add_ss(_q, _mm_movehdup_ps(_q))); \
+    } while (0)
+
+    if (nrc == 1) {
+        // Single-row path: no multi-row overhead
+        UNUSED(bx); UNUSED(bs);
+        const block_q1_0_g128 * GGML_RESTRICT x = (const block_q1_0_g128 *)vx;
+        const block_q8_0      * GGML_RESTRICT y = (const block_q8_0 *)vy;
+
+        __m256 acc = _mm256_setzero_ps();
+        for (int ib = 0; ib < nb; ++ib) {
+            const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
+            for (int k = 0; k < 4; k++) {
+                const float d1 = GGML_CPU_FP16_TO_FP32(y[ib*4 + k].d);
+                const __m256i ab = _mm256_loadu_si256((const __m256i *)y[ib*4 + k].qs);
+                const __m256i sa = _mm256_maddubs_epi16(ones_byte, ab);
+                Q1G128_DOT_ROW(x, ib, k, ab, sa, d0 * d1, acc);
+            }
+        }
+        float _hresult;
+        Q1G128_HREDUCE(acc);
+        *s = _hresult;
+
+    } else if (nrc == 4) {
+        // 4-row path: load activation once, compute 4 dot products.
+        const block_q1_0_g128 * GGML_RESTRICT x0 = (const block_q1_0_g128 *)vx;
+        const block_q1_0_g128 * GGML_RESTRICT x1 = (const block_q1_0_g128 *)((const char *)vx + bx);
+        const block_q1_0_g128 * GGML_RESTRICT x2 = (const block_q1_0_g128 *)((const char *)vx + 2*bx);
+        const block_q1_0_g128 * GGML_RESTRICT x3 = (const block_q1_0_g128 *)((const char *)vx + 3*bx);
+        const block_q8_0      * GGML_RESTRICT y  = (const block_q8_0 *)vy;
+
+        __m256 a0 = _mm256_setzero_ps();
+        __m256 a1 = _mm256_setzero_ps();
+        __m256 a2 = _mm256_setzero_ps();
+        __m256 a3 = _mm256_setzero_ps();
+
+        for (int ib = 0; ib < nb; ++ib) {
+            const float d0_0 = GGML_CPU_FP16_TO_FP32(x0[ib].d);
+            const float d0_1 = GGML_CPU_FP16_TO_FP32(x1[ib].d);
+            const float d0_2 = GGML_CPU_FP16_TO_FP32(x2[ib].d);
+            const float d0_3 = GGML_CPU_FP16_TO_FP32(x3[ib].d);
+
+            for (int k = 0; k < 4; k++) {
+                const float d1 = GGML_CPU_FP16_TO_FP32(y[ib*4 + k].d);
+                const __m256i ab = _mm256_loadu_si256((const __m256i *)y[ib*4 + k].qs);
+                const __m256i sa = _mm256_maddubs_epi16(ones_byte, ab);
+
+                Q1G128_DOT_ROW(x0, ib, k, ab, sa, d0_0 * d1, a0);
+                Q1G128_DOT_ROW(x1, ib, k, ab, sa, d0_1 * d1, a1);
+                Q1G128_DOT_ROW(x2, ib, k, ab, sa, d0_2 * d1, a2);
+                Q1G128_DOT_ROW(x3, ib, k, ab, sa, d0_3 * d1, a3);
+            }
+        }
+
+        float _hresult;
+        Q1G128_HREDUCE(a0); *(float *)((char *)s + 0*bs) = _hresult;
+        Q1G128_HREDUCE(a1); *(float *)((char *)s + 1*bs) = _hresult;
+        Q1G128_HREDUCE(a2); *(float *)((char *)s + 2*bs) = _hresult;
+        Q1G128_HREDUCE(a3); *(float *)((char *)s + 3*bs) = _hresult;
+
+    } else {
+        // Generic multi-row path for nrc=2,3
+        assert(nrc >= 2 && nrc <= 4);
+        const block_q8_0 * GGML_RESTRICT y = (const block_q8_0 *)vy;
+
+        const block_q1_0_g128 * GGML_RESTRICT xr[4];
+        __m256 acc_r[4];
+        for (int r = 0; r < nrc; r++) {
+            xr[r]    = (const block_q1_0_g128 *)((const char *)vx + r * bx);
+            acc_r[r] = _mm256_setzero_ps();
+        }
+
+        for (int ib = 0; ib < nb; ++ib) {
+            float d0_r[4];
+            for (int r = 0; r < nrc; r++) {
+                d0_r[r] = GGML_CPU_FP16_TO_FP32(xr[r][ib].d);
+            }
+
+            for (int k = 0; k < 4; k++) {
+                const float d1 = GGML_CPU_FP16_TO_FP32(y[ib*4 + k].d);
+                const __m256i ab = _mm256_loadu_si256((const __m256i *)y[ib*4 + k].qs);
+                const __m256i sa = _mm256_maddubs_epi16(ones_byte, ab);
+
+                for (int r = 0; r < nrc; r++) {
+                    Q1G128_DOT_ROW(xr[r], ib, k, ab, sa, d0_r[r] * d1, acc_r[r]);
+                }
+            }
+        }
+
+        float _hresult;
+        for (int r = 0; r < nrc; r++) {
+            Q1G128_HREDUCE(acc_r[r]);
+            *(float *)((char *)s + r * bs) = _hresult;
+        }
+    }
+
+#undef Q1G128_DOT_ROW
+#undef Q1G128_HREDUCE
+
 #else
-    // Scalar fallback
+    // Scalar fallback (nrc=1 only, multi-row handled by dispatch calling nrc=1)
+    assert(nrc == 1);
+    UNUSED(nrc); UNUSED(bx); UNUSED(bs);
+
+    const block_q1_0_g128 * GGML_RESTRICT x = (const block_q1_0_g128 *)vx;
+    const block_q8_0 * GGML_RESTRICT y = (const block_q8_0 *)vy;
+
+    float sumf = 0;
     for (int ib = 0; ib < nb; ++ib) {
         const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
-
         float sumi = 0;
-
-        // Process 4 Q8_0 blocks (4 * 32 = 128 elements)
         for (int k = 0; k < 4; k++) {
             const float d1 = GGML_CPU_FP16_TO_FP32(y[ib*4 + k].d);
-
             int sumi_block = 0;
-
             for (int j = 0; j < QK8_0; j++) {
                 const int bit_index = k * QK8_0 + j;
                 const int byte_index = bit_index / 8;
                 const int bit_offset = bit_index % 8;
-
-                // Extract bit: 1 = +1, 0 = -1
                 const int xi = ((x[ib].qs[byte_index] >> bit_offset) & 1) ? 1 : -1;
                 const int yi = y[ib*4 + k].qs[j];
-
                 sumi_block += xi * yi;
             }
-
             sumi += d1 * sumi_block;
         }
-
         sumf += d0 * sumi;
     }
-
     *s = sumf;
 #endif
 }
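Editor's note: the bit-expansion step of Q1G128_DOT_ROW (broadcast → per-lane byte shuffle → AND with the 0x8040201008040201 bit test → min_epu8 clamp to 0/1) can be checked in isolation. A standalone sketch, assuming only AVX2 (compile with -mavx2); expand_bits_to_selectors is a hypothetical name, not part of the commit:

// Expand 32 weight bits into 32 byte selectors in {0,1}, exactly the
// selector Q1G128_DOT_ROW feeds to _mm256_maddubs_epi16, then verify
// against scalar bit extraction.
#include <assert.h>
#include <immintrin.h>
#include <stdint.h>

static void expand_bits_to_selectors(uint32_t bits, uint8_t out[32]) {
    const __m256i shuf_mask = _mm256_setr_epi8(
        0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1,
        2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3);
    const __m256i bit_test  = _mm256_set1_epi64x((long long)0x8040201008040201ULL);
    const __m256i ones_byte = _mm256_set1_epi8(1);
    // byte i of the result tests bit i of the word:
    // broadcast -> replicate byte i/8 across its 8-byte group ->
    // AND with bit (i%8) -> clamp any nonzero value to 1
    const __m256i bexp = _mm256_shuffle_epi8(_mm256_set1_epi32((int)bits), shuf_mask);
    const __m256i sel  = _mm256_min_epu8(_mm256_and_si256(bexp, bit_test), ones_byte);
    _mm256_storeu_si256((__m256i *)out, sel);
}

int main(void) {
    const uint32_t bits = 0xDEADBEEFu;
    uint8_t sel[32];
    expand_bits_to_selectors(bits, sel);
    for (int i = 0; i < 32; ++i) {
        assert(sel[i] == ((bits >> i) & 1));
    }
    return 0;
}

One design detail worth noting: _mm256_shuffle_epi8 shuffles within each 128-bit lane, which is why shuf_mask places indices 2 and 3 in its upper half; the broadcast via _mm256_set1_epi32 puts all four source bytes in both lanes, so each lane can pick the bytes it needs.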
ggml/src/ggml-cpu/ggml-cpu.c CHANGED
@@ -227,6 +227,11 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
         .from_float   = quantize_row_q1_0_g128,
         .vec_dot      = ggml_vec_dot_q1_0_g128_q8_0,
         .vec_dot_type = GGML_TYPE_Q8_0,
+        // nrows=1: the nrc=2 x86 kernel used s[bs/sizeof(float)] but the dispatch
+        // expects MMLA-style s[bs] (2 weight rows × 2 activation cols). This caused
+        // corrupt prompt-eval results. Generation (ne11=1) was unaffected since the
+        // ne11%2!=0 guard forced nrc=1. Kept at 1 until a proper 2×2 tile kernel
+        // is written.
         .nrows        = 1,
     },
     [GGML_TYPE_Q4_0] = {
@@ -1185,13 +1190,99 @@ static void ggml_compute_forward_mul_mat_one_chunk(
     assert(ne12 % ne02 == 0);
     assert(ne13 % ne03 == 0);
 
-    // COM6-inspired block-tiling: larger blocks for Q1_0_g128 (1-bit weights are tiny,
-    // so we can fit more rows in L1). Prefetch next weight block while processing current.
-    const int64_t blck_0 = (type == GGML_TYPE_Q1_0_g128) ? 64 : 16;
-    const int64_t blck_1 = 16;
-
     const size_t src1_col_stride = src1_cont || src1->type != vec_dot_type ? row_size : nb11;
 
+#if defined(__AVX2__)
+    // Q1_0_g128 fast path: nrc=4 kernel processes 4 weight rows per call,
+    // loading Q8 activations once per quad (~60% activation bandwidth savings).
+    // Write directly to dst (no tmp buffer needed).
+    if (type == GGML_TYPE_Q1_0_g128) {
+        // For typical LLM matmuls (ne12==ne02, ne13==ne03) the broadcast dims
+        // (i02,i03) are uniform across this thread's ir1 range, so src0_row
+        // could be resolved once; we still recompute it per ir1 to stay
+        // correct when broadcasting is in effect. Only ne1*ne12 is hoisted.
+        const int64_t ne1xne12 = ne12 * ne1;
+
+        // GEPP-style outer tile: process 16 weight rows per outer iteration
+        // (4 × nrc=4 calls), prefetching the next group to hide DRAM latency.
+        // Each Q1_0_g128 row at K=4096 is 576 bytes; 16 rows = 9.2 KB (fits L1d).
+        int64_t ir0 = ir0_start;
+        for (; ir0 + 15 < ir0_end; ir0 += 16) {
+            for (int64_t ir1 = ir1_start; ir1 < ir1_end; ++ir1) {
+                const int64_t i13 = (ir1 / ne1xne12);
+                const int64_t i12 = (ir1 - i13 * ne1xne12) / ne1;
+                const int64_t i11 = (ir1 - i13 * ne1xne12 - i12 * ne1);
+                const int64_t i03 = i13 / r3;
+                const int64_t i02 = i12 / r2;
+                const char * src0_row = (const char*)src0->data + (i02 * nb02 + i03 * nb03);
+                const char * src1_col = (const char*)wdata +
+                    (src1_cont || src1->type != vec_dot_type
+                        ? (i11 + i12 * ne11 + i13 * ne12 * ne11) * row_size
+                        : (i11 * nb11 + i12 * nb12 + i13 * nb13));
+                float * dst_col = (float*)((char*)dst->data + (i11 * nb1 + i12 * nb2 + i13 * nb3));
+                // Prefetch next 16-row tile's first cache lines
+                if (ir0 + 19 < ir0_end) {
+                    _mm_prefetch(src0_row + (ir0 + 16) * nb01, _MM_HINT_T1);
+                    _mm_prefetch(src0_row + (ir0 + 18) * nb01, _MM_HINT_T1);
+                }
+                vec_dot(ne00, &dst_col[ir0],    sizeof(float), src0_row + (ir0     ) * nb01, nb01, src1_col, 0, 4);
+                vec_dot(ne00, &dst_col[ir0+4],  sizeof(float), src0_row + (ir0 +  4) * nb01, nb01, src1_col, 0, 4);
+                vec_dot(ne00, &dst_col[ir0+8],  sizeof(float), src0_row + (ir0 +  8) * nb01, nb01, src1_col, 0, 4);
+                vec_dot(ne00, &dst_col[ir0+12], sizeof(float), src0_row + (ir0 + 12) * nb01, nb01, src1_col, 0, 4);
+            }
+        }
+        // Remaining rows in groups of 4
+        for (; ir0 + 3 < ir0_end; ir0 += 4) {
+            for (int64_t ir1 = ir1_start; ir1 < ir1_end; ++ir1) {
+                const int64_t i13 = (ir1 / ne1xne12);
+                const int64_t i12 = (ir1 - i13 * ne1xne12) / ne1;
+                const int64_t i11 = (ir1 - i13 * ne1xne12 - i12 * ne1);
+                const int64_t i03 = i13 / r3;
+                const int64_t i02 = i12 / r2;
+                const char * src0_row = (const char*)src0->data + (i02 * nb02 + i03 * nb03);
+                const char * src1_col = (const char*)wdata +
+                    (src1_cont || src1->type != vec_dot_type
+                        ? (i11 + i12 * ne11 + i13 * ne12 * ne11) * row_size
+                        : (i11 * nb11 + i12 * nb12 + i13 * nb13));
+                float * dst_col = (float*)((char*)dst->data + (i11 * nb1 + i12 * nb2 + i13 * nb3));
+                vec_dot(ne00, &dst_col[ir0], sizeof(float),
+                        src0_row + ir0 * nb01, nb01,
+                        src1_col, 0, 4);
+            }
+        }
+        // Remainder rows: fall back to column-outer order (small tail)
+        for (int64_t ir1 = ir1_start; ir1 < ir1_end; ++ir1) {
+            const int64_t i13 = (ir1 / ne1xne12);
+            const int64_t i12 = (ir1 - i13 * ne1xne12) / ne1;
+            const int64_t i11 = (ir1 - i13 * ne1xne12 - i12 * ne1);
+            const int64_t i03 = i13 / r3;
+            const int64_t i02 = i12 / r2;
+            const char * src0_row = (const char*)src0->data + (i02 * nb02 + i03 * nb03);
+            const char * src1_col = (const char*)wdata +
+                (src1_cont || src1->type != vec_dot_type
+                    ? (i11 + i12 * ne11 + i13 * ne12 * ne11) * row_size
+                    : (i11 * nb11 + i12 * nb12 + i13 * nb13));
+            float * dst_col = (float*)((char*)dst->data + (i11 * nb1 + i12 * nb2 + i13 * nb3));
+            int64_t ir0t = ir0;
+            for (; ir0t + 1 < ir0_end; ir0t += 2) {
+                vec_dot(ne00, &dst_col[ir0t], sizeof(float),
+                        src0_row + ir0t * nb01, nb01,
+                        src1_col, 0, 2);
+            }
+            if (ir0t < ir0_end) {
+                vec_dot(ne00, &dst_col[ir0t], 0,
+                        src0_row + ir0t * nb01, 0,
+                        src1_col, 0, 1);
+            }
+        }
+        return;
+    }
+#endif
+
+    // Generic path for all other quant types
+    const int64_t blck_0 = 16;
+    const int64_t blck_1 = 16;
+
     // attempt to reduce false-sharing (does not seem to make a difference)
     // Size: blck_0 * 2 (accounting for mmla kernels that compute 2 rows at once)
     float tmp[128];
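Editor's note: the 576-byte / 9.2 KB figures in the GEPP comment follow directly from the Q1_0_g128 layout (one fp16 scale plus 16 bytes of packed bits per 128-weight block). A quick back-of-envelope check, assuming that layout:

// Tile-size arithmetic behind the "16 rows fits L1d" claim (illustrative).
#include <stdio.h>
int main(void) {
    const int K = 4096;                    // row length in weights
    const int bytes_per_block = 2 + 16;    // fp16 scale + 128 bits of weights
    const int blocks_per_row  = K / 128;   // 32 blocks
    const int row_bytes       = blocks_per_row * bytes_per_block;   // 576 B
    printf("row = %d B, 16-row tile = %d B\n", row_bytes, 16 * row_bytes); // 576, 9216
    return 0;
}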
@@ -1213,31 +1304,18 @@
 
             const char * src0_row = (const char*)src0->data + (0 + i02 * nb02 + i03 * nb03);
 
-            // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
-            // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
-            // the original src1 data pointer, so we should index using the indices directly
-            // TODO: this is a bit of a hack, we should probably have a better way to handle this
             const char * src1_col = (const char*)wdata +
                 (src1_cont || src1->type != vec_dot_type
                     ? (i11 + i12 * ne11 + i13 * ne12 * ne11) * row_size
                     : (i11 * nb11 + i12 * nb12 + i13 * nb13));
             float * dst_col = (float*)((char*)dst->data + (i1 * nb1 + i2 * nb2 + i3 * nb3));
 
-            //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ++ir0) {
-            //    vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
-            //}
-
-            // COM6-inspired: prefetch next weight rows while computing current ones.
-            const int64_t ir0_max = MIN(iir0 + blck_0, ir0_end);
-            for (int64_t ir0 = iir0; ir0 < ir0_max; ir0 += num_rows_per_vec_dot) {
-                if (ir0 + 4 * num_rows_per_vec_dot < ir0_max) {
-                    __builtin_prefetch(src0_row + (ir0 + 4 * num_rows_per_vec_dot) * nb01, 0, 1);
-                }
+            for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ir0 += num_rows_per_vec_dot) {
                 vec_dot(ne00, &tmp[ir0 - iir0], (num_rows_per_vec_dot > 1 ? 16 : 0), src0_row + ir0 * nb01, (num_rows_per_vec_dot > 1 ? nb01 : 0), src1_col, (num_rows_per_vec_dot > 1 ? src1_col_stride : 0), num_rows_per_vec_dot);
             }
 
             for (int cn = 0; cn < num_rows_per_vec_dot; ++cn) {
-                memcpy(&dst_col[iir0 + cn * nb1 / nb0], tmp + (cn * 16), (ir0_max - iir0) * sizeof(float));
+                memcpy(&dst_col[iir0 + cn * nb1 / nb0], tmp + (cn * 16), (MIN(iir0 + blck_0, ir0_end) - iir0) * sizeof(float));
             }
         }
     }
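Editor's note: the strided convention the nrows comment and the nrc=4 fast path rely on (result r stored at (char *)s + r*bs, weight row r read at (char *)vx + r*bx, one shared activation column) can be pinned down with a scalar oracle, useful as a test reference against the AVX2 kernel. A sketch with a simplified signature (the real ggml_vec_dot_t also takes a by stride); the struct stand-ins mirror the layouts implied by the quants.c diff, not ggml's headers:

// Scalar oracle for the multi-row vec_dot convention (illustrative only).
#include <stddef.h>
#include <stdint.h>
#include <string.h>

typedef struct { uint16_t d; uint8_t qs[16]; } block_q1_0_g128; // fp16 scale + 128 weight bits
typedef struct { uint16_t d; int8_t  qs[32]; } block_q8_0;      // fp16 scale + 32 int8 values

// Minimal fp16 -> fp32 stand-in for GGML_CPU_FP16_TO_FP32 (subnormals as zero).
static float fp16_to_fp32(uint16_t h) {
    const uint32_t sign = (uint32_t)(h >> 15) << 31;
    const uint32_t exp  = (h >> 10) & 0x1F;
    const uint32_t mant = h & 0x3FF;
    uint32_t bits;
    if (exp == 0)       bits = sign;
    else if (exp == 31) bits = sign | 0x7F800000u | (mant << 13);     // inf/NaN
    else                bits = sign | ((exp + 112) << 23) | (mant << 13);
    float f;
    memcpy(&f, &bits, sizeof(f));
    return f;
}

static void vec_dot_q1_0_g128_q8_0_ref(int n, float * s, size_t bs,
                                       const void * vx, size_t bx,
                                       const void * vy, int nrc) {
    const int nb = n / 128;
    for (int r = 0; r < nrc; ++r) {
        // weight row r sits bx bytes after row 0; activations are shared
        const block_q1_0_g128 * x = (const block_q1_0_g128 *)((const char *)vx + r * bx);
        const block_q8_0      * y = (const block_q8_0 *)vy;
        float sumf = 0;
        for (int ib = 0; ib < nb; ++ib) {
            const float d0 = fp16_to_fp32(x[ib].d);
            float sumi = 0;
            for (int k = 0; k < 4; ++k) {            // 4 Q8_0 blocks per 128-weight block
                const float d1 = fp16_to_fp32(y[ib*4 + k].d);
                int sumi_block = 0;
                for (int j = 0; j < 32; ++j) {
                    const int bit = k * 32 + j;
                    const int xi  = ((x[ib].qs[bit / 8] >> (bit % 8)) & 1) ? 1 : -1;
                    sumi_block += xi * y[ib*4 + k].qs[j];
                }
                sumi += d1 * sumi_block;
            }
            sumf += d0 * sumi;
        }
        *(float *)((char *)s + r * bs) = sumf;       // result r lands bs bytes apart
    }
}

Calling this with bs = sizeof(float) and bx = nb01 reproduces what the fast path asks of the nrc=4 kernel: four results in consecutive dst floats, four weight rows one row-stride apart.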