perf: maddubs kernel + nrc=4 multi-row for Q1_0_g128 (3.5-3.75 t/s)

Files changed:
- ggml/src/ggml-cpu/arch/x86/quants.c  +182 -113
- ggml/src/ggml-cpu/ggml-cpu.c          +98 -20
ggml/src/ggml-cpu/arch/x86/quants.c (CHANGED)

@@ -65,56 +65,52 @@ static inline int hsum_i32_4(const __m128i a) {

Old version (lines marked '-' were removed; several removed lines were not preserved by the page capture and are elided, with the original line numbers kept so the gaps stay visible):

  65        return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32));
  66    }
  67
  68 -  #if defined(__AVX2__)
  71 -      const __m256i
  74 -          0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1,
  75 -          2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3);
  76 -      const __m256i bit_masks = _mm256_setr_epi8(
  77 -          1,2,4,8,16,32,64,-128, 1,2,4,8,16,32,64,-128,
  78 -          1,2,4,8,16,32,64,-128, 1,2,4,8,16,32,64,-128);
  79 -      const __m256i zero = _mm256_setzero_si256();
  80 -      __m256 acc = _mm256_setzero_ps();
  94 -              _mm256_maddubs_epi16(ones_8, sy), ones_16); \
  95 -          acc_block = (K == 0) \
  96 -              ? _mm256_mul_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y_ptr[K].d)), \
  97 -                    _mm256_cvtepi32_ps(s32)) \
  98 -              : _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y_ptr[K].d)), \
  99 -                    _mm256_cvtepi32_ps(s32), acc_block); \
 100 -      }
 118    #else
 119        // Perform multiplication and create 16-bit values
 120        const __m256i dot = _mm256_maddubs_epi16(ax, sy);
@@ -657,99 +653,172 @@ void ggml_vec_dot_q1_0_g128_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc)

Old version (the blendv-based AVX2 kernel and the old scalar fallback; many removed lines were lost in the page capture and are elided):

 657        const int nb = n / qk;
 658
 659        assert(n % qk == 0);
 660 -      assert(nrc == 1);
 661 -      UNUSED(nrc);
 662 -      UNUSED(bx);
 663        UNUSED(by);
 664 -      UNUSED(bs);
 665
 676 -      const __m256i bmask = _mm256_setr_epi16(
 677 -          1<<0, 1<<1, 1<<2,  1<<3,  1<<4,  1<<5,  1<<6,  1<<7,
 678 -          1<<8, 1<<9, 1<<10, 1<<11, 1<<12, 1<<13, 1<<14, (short)(1<<15));
 679 -      __m256 acc = _mm256_setzero_ps();
 680
 695 -          const __m256i mask_lo = _mm256_cmpeq_epi16(
 696 -              _mm256_and_si256(_mm256_set1_epi16((short)(bits & 0xFFFF)), bmask), bmask);
 697 -          const __m256i signed_lo = _mm256_blendv_epi8(neg_lo, y_lo, mask_lo);
 698 -
 699 -          // Upper 16 elements
 700 -          const __m256i y_hi = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(y_bytes, 1));
 701 -          const __m256i neg_hi = _mm256_sub_epi16(_mm256_setzero_si256(), y_hi);
 702 -          const __m256i mask_hi = _mm256_cmpeq_epi16(
 703 -              _mm256_and_si256(_mm256_set1_epi16((short)(bits >> 16)), bmask), bmask);
 704 -          const __m256i signed_hi = _mm256_blendv_epi8(neg_hi, y_hi, mask_hi);
 705 -
 706 -          // Pair-wise sum int16->int32, combine halves, convert to float, FMA
 707 -          const __m256i sum_32 = _mm256_add_epi32(
 708 -              _mm256_madd_epi16(signed_lo, ones_16),
 709 -              _mm256_madd_epi16(signed_hi, ones_16));
 710 -          acc_block = _mm256_fmadd_ps(_mm256_set1_ps(d1),
 711 -              _mm256_cvtepi32_ps(sum_32), acc_block);
 712        }
 713 -      acc = _mm256_fmadd_ps(_mm256_set1_ps(d0), acc_block, acc);
 714 -      }
 715 -      // Horizontal reduction: 256 -> 128 -> scalar
 716 -      {
 717 -      const __m128 h = _mm_add_ps(_mm256_extractf128_ps(acc, 0),
 718 -                                  _mm256_extractf128_ps(acc, 1));
 719 -      const __m128 q = _mm_add_ps(h, _mm_movehl_ps(h, h));
 720 -      *s = _mm_cvtss_f32(_mm_add_ss(q, _mm_movehdup_ps(q)));
 721        }
 722    #else
 723 -      // Scalar fallback
 724        for (int ib = 0; ib < nb; ++ib) {
 725            const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
 727            float sumi = 0;
 729 -          // Process 4 Q8_0 blocks (4 * 32 = 128 elements)
 730            for (int k = 0; k < 4; k++) {
 731                const float d1 = GGML_CPU_FP16_TO_FP32(y[ib*4 + k].d);
 733                int sumi_block = 0;
 735                for (int j = 0; j < QK8_0; j++) {
 736                    const int bit_index = k * QK8_0 + j;
 737                    const int byte_index = bit_index / 8;
 738                    const int bit_offset = bit_index % 8;
 740 -                  // Extract bit: 1 = +1, 0 = -1
 741                    const int xi = ((x[ib].qs[byte_index] >> bit_offset) & 1) ? 1 : -1;
 742                    const int yi = y[ib*4 + k].qs[j];
 744                    sumi_block += xi * yi;
 745                }
 747                sumi += d1 * sumi_block;
 748            }
 750            sumf += d0 * sumi;
 751        }
 753        *s = sumf;
 754    #endif
 755    }
New version of the first hunk (shared AVX2/AVX-512 helper functions, lines 65-116 of the new file):

  65        return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32));
  66    }
  67
  68 +  #if defined(__AVX2__) || defined(__AVX512F__)
  69 +  static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) {
  70 +      const __m256i ax = _mm256_sign_epi8(x, x);
  71 +      const __m256i sy = _mm256_sign_epi8(y, x);
  72 +      return _mm256_maddubs_epi16(ax, sy);
  73 +  }
  74 +
  75 +  // spread 32 bits to 32 bytes { 0x00, 0xFF }
  76 +  static inline __m256i bytes_from_bits_32(const uint8_t * x) {
  77 +      uint32_t x32;
  78 +      memcpy(&x32, x, sizeof(uint32_t));
  79 +      const __m256i shuf_mask = _mm256_set_epi64x(
  80 +          0x0303030303030303, 0x0202020202020202,
  81 +          0x0101010101010101, 0x0000000000000000);
  82 +      __m256i bytes = _mm256_shuffle_epi8(_mm256_set1_epi32(x32), shuf_mask);
  83 +      const __m256i bit_mask = _mm256_set1_epi64x(0x7fbfdfeff7fbfdfe);
  84 +      bytes = _mm256_or_si256(bytes, bit_mask);
  85 +      return _mm256_cmpeq_epi8(bytes, _mm256_set1_epi64x(-1));
  86 +  }
  87
  88 +  // Unpack 32 4-bit fields into 32 bytes
  89 +  // The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval
  90 +  static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi)
  91 +  {
  92 +      const __m128i tmp = _mm_loadu_si128((const __m128i *)rsi);
  93 +      const __m256i bytes = MM256_SET_M128I(_mm_srli_epi16(tmp, 4), tmp);
  94 +      const __m256i lowMask = _mm256_set1_epi8( 0xF );
  95 +      return _mm256_and_si256(lowMask, bytes);
  96 +  }
  97
  98 +  // add int16_t pairwise and return as float vector
  99 +  static inline __m256 sum_i16_pairs_float(const __m256i x) {
 100 +      const __m256i ones = _mm256_set1_epi16(1);
 101 +      const __m256i summed_pairs = _mm256_madd_epi16(ones, x);
 102 +      return _mm256_cvtepi32_ps(summed_pairs);
 103 +  }
 104 +
 105 +  static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) {
 106 +  #if defined(__AVX512VNNI__) && defined(__AVX512VL__)
 107 +      const __m256i zero = _mm256_setzero_si256();
 108 +      const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy);
 109 +      return _mm256_cvtepi32_ps(summed_pairs);
 110 +  #elif defined(__AVXVNNI__)
 111 +      const __m256i zero = _mm256_setzero_si256();
 112 +      const __m256i summed_pairs = _mm256_dpbusd_avx_epi32(zero, ax, sy);
 113 +      return _mm256_cvtepi32_ps(summed_pairs);
 114    #else
 115        // Perform multiplication and create 16-bit values
 116        const __m256i dot = _mm256_maddubs_epi16(ax, sy);
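Of these helpers, bytes_from_bits_32 is the least obvious. A scalar reference of what it produces (an editor's sketch for illustration, not part of the commit):

// Scalar reference for bytes_from_bits_32: output byte i is 0xFF when bit i
// of the 32-bit input word is set, 0x00 otherwise. The SIMD version reaches
// the same result by broadcasting source byte j into lanes 8j..8j+7, OR-ing
// with a mask that pre-sets every bit except the one each lane tests, and
// comparing the result against all-ones.
#include <stdint.h>
#include <string.h>

static void bytes_from_bits_32_ref(const uint8_t * x, uint8_t out[32]) {
    uint32_t x32;
    memcpy(&x32, x, sizeof(x32));
    for (int i = 0; i < 32; i++) {
        out[i] = ((x32 >> i) & 1) ? 0xFF : 0x00;
    }
}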
New version of the second hunk (the ggml_vec_dot_q1_0_g128_q8_0 body, lines 653-824 of the new file):

 653        const int nb = n / qk;
 654
 655        assert(n % qk == 0);
 656        UNUSED(by);
 657
 658 +  #if defined(__AVX2__)
 659 +      // Maddubs kernel: uses the identity dot(w, a) = 2·Σ(a where bit=1) − Σ(a)
 660 +      // for 1-bit weights w ∈ {-1,+1} encoded as bits b ∈ {0,1} where w = 2b−1.
 661 +      //
 662 +      // Bit expansion: broadcast uint32 weight bits → shuffle each byte to its
 663 +      // 8-byte group → AND with per-position bit test → clamp to 0/1 with min.
 664 +      // Then maddubs(selector, activations) gives masked pair-sums, and
 665 +      // 2·masked − sum_all gives the signed dot product in int16 pairs.
 666 +      //
 667 +      // Multi-row (nrc>1): activation data loaded once per sub-block, reused
 668 +      // across all weight rows. Saves ~75% activation bandwidth for nrc=4.
 669 +
 670 +      const __m256i shuf_mask = _mm256_setr_epi8(
 671 +          0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1,
 672 +          2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3);
 673 +      const __m256i bit_test  = _mm256_set1_epi64x((long long)0x8040201008040201LL);
 674 +      const __m256i ones_byte = _mm256_set1_epi8(1);
 675 +      const __m256i ones_16   = _mm256_set1_epi16(1);
 676 +
 677 +      // Macro: compute one row's contribution for one sub-block.
 678 +      // Expects ab (activation bytes) and sa (sum-all pairs) in scope.
 679 +  #define Q1G128_DOT_ROW(xptr, ib_idx, k_idx, ab, sa, scale, acc_r) \
 680 +      do { \
 681 +          uint32_t _bits; \
 682 +          memcpy(&_bits, &(xptr)[(ib_idx)].qs[(k_idx) * 4], sizeof(_bits)); \
 683 +          const __m256i _bexp = _mm256_shuffle_epi8(_mm256_set1_epi32((int)_bits), shuf_mask); \
 684 +          const __m256i _sel  = _mm256_min_epu8(_mm256_and_si256(_bexp, bit_test), ones_byte); \
 685 +          const __m256i _ps   = _mm256_maddubs_epi16(_sel, (ab)); \
 686 +          const __m256i _dp   = _mm256_sub_epi16(_mm256_slli_epi16(_ps, 1), (sa)); \
 687 +          const __m256i _d32  = _mm256_madd_epi16(_dp, ones_16); \
 688 +          (acc_r) = _mm256_fmadd_ps(_mm256_set1_ps(scale), _mm256_cvtepi32_ps(_d32), (acc_r)); \
 689 +      } while (0)
 690 +
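The identity this macro implements is easy to verify in scalar code. A self-contained check (editor's sketch, not part of the commit):

#include <assert.h>
#include <stdint.h>

// For weights w[j] in {-1,+1} stored as bits b[j] in {0,1} with w = 2b - 1:
//     sum_j w[j]*a[j] = 2 * sum_{b[j]=1} a[j] - sum_j a[j]
// which is what _mm256_slli_epi16(_ps, 1) minus sa evaluates per int16 pair
// in Q1G128_DOT_ROW above (the selector is in {0,1}, so maddubs cannot
// saturate here).
static void q1_identity_check(const uint8_t bits[4], const int8_t a[32]) {
    int direct = 0, masked = 0, total = 0;
    for (int j = 0; j < 32; j++) {
        const int b = (bits[j / 8] >> (j % 8)) & 1;  // bit j selects the sign
        direct += (2 * b - 1) * a[j];                // signed dot product
        masked += b ? a[j] : 0;                      // sum of a where bit = 1
        total  += a[j];                              // sum of all a
    }
    assert(direct == 2 * masked - total);
}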
 691 +      // Horizontal reduction: __m256 → scalar float
 692 +  #define Q1G128_HREDUCE(acc_r) do { \
 693 +      const __m128 _h = _mm_add_ps(_mm256_extractf128_ps((acc_r), 0), \
 694 +                                   _mm256_extractf128_ps((acc_r), 1)); \
 695 +      const __m128 _q = _mm_add_ps(_h, _mm_movehl_ps(_h, _h)); \
 696 +      _hresult = _mm_cvtss_f32(_mm_add_ss(_q, _mm_movehdup_ps(_q))); \
 697 +  } while (0)
 698 +
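The reduction folds 8 lanes in three steps: 8 to 4 by adding the two 128-bit halves, 4 to 2 with movehl, 2 to 1 with movehdup. Scalar equivalent (editor's sketch):

// What Q1G128_HREDUCE computes: the sum of the 8 float lanes of acc_r.
static inline float hsum8_ref(const float v[8]) {
    float sum = 0.0f;
    for (int i = 0; i < 8; i++) {
        sum += v[i];
    }
    return sum;
}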
 699 +      if (nrc == 1) {
 700 +          // Single-row path: no multi-row overhead
 701 +          UNUSED(bx); UNUSED(bs);
 702 +          const block_q1_0_g128 * GGML_RESTRICT x = (const block_q1_0_g128 *)vx;
 703 +          const block_q8_0 * GGML_RESTRICT y = (const block_q8_0 *)vy;
 704 +
 705 +          __m256 acc = _mm256_setzero_ps();
 706 +          for (int ib = 0; ib < nb; ++ib) {
 707 +              const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
 708 +              for (int k = 0; k < 4; k++) {
 709 +                  const float d1 = GGML_CPU_FP16_TO_FP32(y[ib*4 + k].d);
 710 +                  const __m256i ab = _mm256_loadu_si256((const __m256i *)y[ib*4 + k].qs);
 711 +                  const __m256i sa = _mm256_maddubs_epi16(ones_byte, ab);
 712 +                  Q1G128_DOT_ROW(x, ib, k, ab, sa, d0 * d1, acc);
 713 +              }
 714 +          }
 715 +          float _hresult;
 716 +          Q1G128_HREDUCE(acc);
 717 +          *s = _hresult;
 718 +
 719 +      } else if (nrc == 4) {
 720 +          // 4-row path: load activation once, compute 4 dot products.
 721 +          const block_q1_0_g128 * GGML_RESTRICT x0 = (const block_q1_0_g128 *)vx;
 722 +          const block_q1_0_g128 * GGML_RESTRICT x1 = (const block_q1_0_g128 *)((const char *)vx + bx);
 723 +          const block_q1_0_g128 * GGML_RESTRICT x2 = (const block_q1_0_g128 *)((const char *)vx + 2*bx);
 724 +          const block_q1_0_g128 * GGML_RESTRICT x3 = (const block_q1_0_g128 *)((const char *)vx + 3*bx);
 725 +          const block_q8_0 * GGML_RESTRICT y = (const block_q8_0 *)vy;
 726 +
 727 +          __m256 a0 = _mm256_setzero_ps();
 728 +          __m256 a1 = _mm256_setzero_ps();
 729 +          __m256 a2 = _mm256_setzero_ps();
 730 +          __m256 a3 = _mm256_setzero_ps();
 731 +
 732 +          for (int ib = 0; ib < nb; ++ib) {
 733 +              const float d0_0 = GGML_CPU_FP16_TO_FP32(x0[ib].d);
 734 +              const float d0_1 = GGML_CPU_FP16_TO_FP32(x1[ib].d);
 735 +              const float d0_2 = GGML_CPU_FP16_TO_FP32(x2[ib].d);
 736 +              const float d0_3 = GGML_CPU_FP16_TO_FP32(x3[ib].d);
 737 +
 738 +              for (int k = 0; k < 4; k++) {
 739 +                  const float d1 = GGML_CPU_FP16_TO_FP32(y[ib*4 + k].d);
 740 +                  const __m256i ab = _mm256_loadu_si256((const __m256i *)y[ib*4 + k].qs);
 741 +                  const __m256i sa = _mm256_maddubs_epi16(ones_byte, ab);
 742 +
 743 +                  Q1G128_DOT_ROW(x0, ib, k, ab, sa, d0_0 * d1, a0);
 744 +                  Q1G128_DOT_ROW(x1, ib, k, ab, sa, d0_1 * d1, a1);
 745 +                  Q1G128_DOT_ROW(x2, ib, k, ab, sa, d0_2 * d1, a2);
 746 +                  Q1G128_DOT_ROW(x3, ib, k, ab, sa, d0_3 * d1, a3);
 747 +              }
 748 +          }
 749
 750 +          float _hresult;
 751 +          Q1G128_HREDUCE(a0); *(float *)((char *)s + 0*bs) = _hresult;
 752 +          Q1G128_HREDUCE(a1); *(float *)((char *)s + 1*bs) = _hresult;
 753 +          Q1G128_HREDUCE(a2); *(float *)((char *)s + 2*bs) = _hresult;
 754 +          Q1G128_HREDUCE(a3); *(float *)((char *)s + 3*bs) = _hresult;
 755 +
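How a caller is expected to drive the 4-row path, reconstructed from the strides the code dereferences (editor's sketch; the real call sites are in the ggml-cpu.c hunk below):

#include <stddef.h>

// Signature as defined above (GGML_RESTRICT qualifiers omitted).
void ggml_vec_dot_q1_0_g128_q8_0(int n, float * s, size_t bs,
                                 const void * vx, size_t bx,
                                 const void * vy, size_t by, int nrc);

// Hypothetical wrapper: 4 consecutive weight rows against one shared
// activation column. bx = byte stride between weight rows, bs = byte stride
// between the 4 output floats, by = 0 because the activation column is shared.
static void q1g128_dot4(int n, float out[4],
                        const void * w_rows, size_t row_bytes,
                        const void * act_col) {
    ggml_vec_dot_q1_0_g128_q8_0(n, out, sizeof(float),
                                w_rows, row_bytes, act_col, 0, 4);
}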
 756 +      } else {
 757 +          // Generic multi-row path for nrc=2,3
 758 +          assert(nrc >= 2 && nrc <= 4);
 759 +          const block_q8_0 * GGML_RESTRICT y = (const block_q8_0 *)vy;
 760 +
 761 +          const block_q1_0_g128 * GGML_RESTRICT xr[4];
 762 +          __m256 acc_r[4];
 763 +          for (int r = 0; r < nrc; r++) {
 764 +              xr[r] = (const block_q1_0_g128 *)((const char *)vx + r * bx);
 765 +              acc_r[r] = _mm256_setzero_ps();
 766 +          }
 767
 768 +          for (int ib = 0; ib < nb; ++ib) {
 769 +              float d0_r[4];
 770 +              for (int r = 0; r < nrc; r++) {
 771 +                  d0_r[r] = GGML_CPU_FP16_TO_FP32(xr[r][ib].d);
 772 +              }
 773
 774 +              for (int k = 0; k < 4; k++) {
 775 +                  const float d1 = GGML_CPU_FP16_TO_FP32(y[ib*4 + k].d);
 776 +                  const __m256i ab = _mm256_loadu_si256((const __m256i *)y[ib*4 + k].qs);
 777 +                  const __m256i sa = _mm256_maddubs_epi16(ones_byte, ab);
 778
 779 +                  for (int r = 0; r < nrc; r++) {
 780 +                      Q1G128_DOT_ROW(xr[r], ib, k, ab, sa, d0_r[r] * d1, acc_r[r]);
 781 +                  }
 782 +              }
 783 +          }
 784 +
 785 +          float _hresult;
 786 +          for (int r = 0; r < nrc; r++) {
 787 +              Q1G128_HREDUCE(acc_r[r]);
 788 +              *(float *)((char *)s + r * bs) = _hresult;
 789            }
 790        }
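A back-of-envelope for the ~75% figure in the kernel comment above (editor's arithmetic; assumes the standard Q8_0 block of 32 int8 values plus a 2-byte fp16 scale):

#include <stdio.h>

int main(void) {
    const int act_bytes = 32 + 2;             // one Q8_0 sub-block: qs[32] + d
    const int rows      = 4;                  // nrc = 4
    const int naive     = rows * act_bytes;   // reload the activations per row
    const int shared    = act_bytes;          // load once, reuse for all 4 rows
    printf("activation bytes saved: %.0f%%\n",
           100.0 * (naive - shared) / naive); // prints 75%
    return 0;
}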
 791 +
 792 +  #undef Q1G128_DOT_ROW
 793 +  #undef Q1G128_HREDUCE
 794 +
 795    #else
 796 +      // Scalar fallback (nrc=1 only, multi-row handled by dispatch calling nrc=1)
 797 +      assert(nrc == 1);
 798 +      UNUSED(nrc); UNUSED(bx); UNUSED(bs);
 799 +
 800 +      const block_q1_0_g128 * GGML_RESTRICT x = (const block_q1_0_g128 *)vx;
 801 +      const block_q8_0 * GGML_RESTRICT y = (const block_q8_0 *)vy;
 802 +
 803 +      float sumf = 0;
 804        for (int ib = 0; ib < nb; ++ib) {
 805            const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
 806            float sumi = 0;
 807            for (int k = 0; k < 4; k++) {
 808                const float d1 = GGML_CPU_FP16_TO_FP32(y[ib*4 + k].d);
 809                int sumi_block = 0;
 810                for (int j = 0; j < QK8_0; j++) {
 811                    const int bit_index = k * QK8_0 + j;
 812                    const int byte_index = bit_index / 8;
 813                    const int bit_offset = bit_index % 8;
 814                    const int xi = ((x[ib].qs[byte_index] >> bit_offset) & 1) ? 1 : -1;
 815                    const int yi = y[ib*4 + k].qs[j];
 816                    sumi_block += xi * yi;
 817                }
 818                sumi += d1 * sumi_block;
 819            }
 820            sumf += d0 * sumi;
 821        }
 822        *s = sumf;
 823    #endif
 824    }
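For reference, the block layout these kernels imply. This is an editor's reconstruction from the accesses above and the 576-byte row figure quoted in ggml-cpu.c; the field order is an assumption and the authoritative definition lives in the ggml headers:

#include <stdint.h>

typedef uint16_t ggml_half_bits;   // stand-in for ggml_half (fp16 bit pattern)

// One Q1_0_g128 block covers qk = 128 weights: 128 sign bits plus one fp16
// scale, i.e. 18 bytes per 128 weights (~1.125 bits per weight). A K=4096
// row is then 4096/128 * 18 = 576 bytes, matching the comment in ggml-cpu.c.
typedef struct {
    ggml_half_bits d;              // per-block scale (GGML_CPU_FP16_TO_FP32)
    uint8_t        qs[128 / 8];    // bit = 1 -> weight +1, bit = 0 -> weight -1
} block_q1_0_g128_sketch;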
ggml/src/ggml-cpu/ggml-cpu.c (CHANGED)

@@ -227,6 +227,11 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {

Old version (context only; this hunk's additions appear in the new version below):

 227        .from_float = quantize_row_q1_0_g128,
 228        .vec_dot = ggml_vec_dot_q1_0_g128_q8_0,
 229        .vec_dot_type = GGML_TYPE_Q8_0,
 230        .nrows = 1,
 231    },
 232    [GGML_TYPE_Q4_0] = {
@@ -1185,13 +1190,99 @@ static void ggml_compute_forward_mul_mat_one_chunk(

Old version (the old block-tiling code that is removed; this hunk's many added lines appear in the new version below):

 1185       assert(ne12 % ne02 == 0);
 1186       assert(ne13 % ne03 == 0);
 1187
 1188 -     // COM6-inspired block-tiling: larger blocks for Q1_0_g128 (1-bit weights are tiny,
 1189 -     // so we can fit more rows in L1). Prefetch next weight block while processing current.
 1190 -     const int64_t blck_0 = (type == GGML_TYPE_Q1_0_g128) ? 64 : 16;
 1191 -     const int64_t blck_1 = 16;
 1192 -
 1193       const size_t src1_col_stride = src1_cont || src1->type != vec_dot_type ? row_size : nb11;
 1194
 1195       // attempt to reduce false-sharing (does not seem to make a difference)
 1196       // Size: blck_0 * 2 (accounting for mmla kernels that compute 2 rows at once)
 1197       float tmp[128];
@@ -1213,31 +1304,18 @@

Old version (stale comments and the old prefetch loop that are removed; the removed memcpy line is cut off mid-statement in the page capture and is left as captured):

 1213
 1214       const char * src0_row = (const char*)src0->data + (0 + i02 * nb02 + i03 * nb03);
 1215
 1216 -     // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
 1217 -     // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
 1218 -     // the original src1 data pointer, so we should index using the indices directly
 1219 -     // TODO: this is a bit of a hack, we should probably have a better way to handle this
 1220       const char * src1_col = (const char*)wdata +
 1221           (src1_cont || src1->type != vec_dot_type
 1222            ? (i11 + i12 * ne11 + i13 * ne12 * ne11) * row_size
 1223            : (i11 * nb11 + i12 * nb12 + i13 * nb13));
 1224       float * dst_col = (float*)((char*)dst->data + (i1 * nb1 + i2 * nb2 + i3 * nb3));
 1225
 1226 -
 1227 -     // vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
 1228 -     //}
 1229 -
 1230 -     // COM6-inspired: prefetch next weight rows while computing current ones.
 1231 -     const int64_t ir0_max = MIN(iir0 + blck_0, ir0_end);
 1232 -     for (int64_t ir0 = iir0; ir0 < ir0_max; ir0 += num_rows_per_vec_dot) {
 1233 -         if (ir0 + 4 * num_rows_per_vec_dot < ir0_max) {
 1234 -             __builtin_prefetch(src0_row + (ir0 + 4 * num_rows_per_vec_dot) * nb01, 0, 1);
 1235 -         }
 1236           vec_dot(ne00, &tmp[ir0 - iir0], (num_rows_per_vec_dot > 1 ? 16 : 0), src0_row + ir0 * nb01, (num_rows_per_vec_dot > 1 ? nb01 : 0), src1_col, (num_rows_per_vec_dot > 1 ? src1_col_stride : 0), num_rows_per_vec_dot);
 1237       }
 1238
 1239       for (int cn = 0; cn < num_rows_per_vec_dot; ++cn) {
 1240 -         memcpy(&dst_col[iir0 + cn * nb1 / nb0], tmp + (cn * 16), (
 1241       }
 1242   }
 1243   }
New version of the type-traits hunk:

 227        .from_float = quantize_row_q1_0_g128,
 228        .vec_dot = ggml_vec_dot_q1_0_g128_q8_0,
 229        .vec_dot_type = GGML_TYPE_Q8_0,
 230 +      // nrows=1: the nrc=2 x86 kernel used s[bs/sizeof(float)] but the dispatch
 231 +      // expects MMLA-style s[bs] (2 weight rows × 2 activation cols). This caused
 232 +      // corrupt prompt-eval results. Generation (ne11=1) was unaffected since the
 233 +      // ne11%2!=0 guard forced nrc=1. Kept at 1 until a proper 2×2 tile kernel
 234 +      // is written.
 235        .nrows = 1,
 236    },
 237    [GGML_TYPE_Q4_0] = {
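Spelled out, the output layout that comment refers to (editor's reading of the dispatch's vec_dot call and tmp[] scatter in the hunk below; illustrative only, not the actual MMLA kernel):

#include <stddef.h>

// An nrows=2 kernel is expected to produce a 2x2 tile over (weight row,
// activation column), with bs = 16 floats, the tmp[] group stride that the
// dispatch passes:
static void write_2x2_tile_expected(float * s, size_t bs,
                                    float r0c0, float r1c0,
                                    float r0c1, float r1c1) {
    s[0]      = r0c0;   // row ir0,   column i11
    s[1]      = r1c0;   // row ir0+1, column i11
    s[bs]     = r0c1;   // row ir0,   column i11+1
    s[bs + 1] = r1c1;   // row ir0+1, column i11+1
}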
New version of the mul_mat chunk hunk (the Q1_0_g128 fast path):

 1190       assert(ne12 % ne02 == 0);
 1191       assert(ne13 % ne03 == 0);
 1192
 1193       const size_t src1_col_stride = src1_cont || src1->type != vec_dot_type ? row_size : nb11;
 1194
 1195 +  #if defined(__AVX2__)
 1196 +      // Q1_0_g128 fast path: nrc=4 kernel processes 4 weight rows per call,
 1197 +      // loading Q8 activations once per quad (~60% activation bandwidth savings).
 1198 +      // Write directly to dst (no tmp buffer needed).
 1199 +      if (type == GGML_TYPE_Q1_0_g128) {
 1200 +          // Resolve src0_row once: in mul_mat the broadcast dims (i02,i03)
 1201 +          // are uniform across ir1 in this thread's range for typical LLM
 1202 +          // matmuls (ne12==ne02, ne13==ne03), so we hoist it out. We still
 1203 +          // recompute per-ir1 to be safe when broadcasting is in effect.
 1204 +          const int64_t ne1xne12 = ne12 * ne1;
 1205 +
 1206 +          // GEPP-style outer tile: process 16 weight rows per outer iteration
 1207 +          // (4 × nrc=4 calls), prefetching the next group to hide DRAM latency.
 1208 +          // Each Q1_0_g128 row at K=4096 is 576 bytes; 16 rows = 9.2 KB (fits L1d).
 1209 +          int64_t ir0 = ir0_start;
 1210 +          for (; ir0 + 15 < ir0_end; ir0 += 16) {
 1211 +              for (int64_t ir1 = ir1_start; ir1 < ir1_end; ++ir1) {
 1212 +                  const int64_t i13 = (ir1 / ne1xne12);
 1213 +                  const int64_t i12 = (ir1 - i13 * ne1xne12) / ne1;
 1214 +                  const int64_t i11 = (ir1 - i13 * ne1xne12 - i12 * ne1);
 1215 +                  const int64_t i03 = i13 / r3;
 1216 +                  const int64_t i02 = i12 / r2;
 1217 +                  const char * src0_row = (const char*)src0->data + (i02 * nb02 + i03 * nb03);
 1218 +                  const char * src1_col = (const char*)wdata +
 1219 +                      (src1_cont || src1->type != vec_dot_type
 1220 +                       ? (i11 + i12 * ne11 + i13 * ne12 * ne11) * row_size
 1221 +                       : (i11 * nb11 + i12 * nb12 + i13 * nb13));
 1222 +                  float * dst_col = (float*)((char*)dst->data + (i11 * nb1 + i12 * nb2 + i13 * nb3));
 1223 +                  // Prefetch next 16-row tile's first cache lines
 1224 +                  if (ir0 + 19 < ir0_end) {
 1225 +                      _mm_prefetch(src0_row + (ir0 + 16) * nb01, _MM_HINT_T1);
 1226 +                      _mm_prefetch(src0_row + (ir0 + 18) * nb01, _MM_HINT_T1);
 1227 +                  }
 1228 +                  vec_dot(ne00, &dst_col[ir0],      sizeof(float), src0_row + ir0 * nb01,         nb01, src1_col, 0, 4);
 1229 +                  vec_dot(ne00, &dst_col[ir0 + 4],  sizeof(float), src0_row + (ir0 + 4) * nb01,   nb01, src1_col, 0, 4);
 1230 +                  vec_dot(ne00, &dst_col[ir0 + 8],  sizeof(float), src0_row + (ir0 + 8) * nb01,   nb01, src1_col, 0, 4);
 1231 +                  vec_dot(ne00, &dst_col[ir0 + 12], sizeof(float), src0_row + (ir0 + 12) * nb01,  nb01, src1_col, 0, 4);
 1232 +              }
 1233 +          }
 1234 +          // Remaining rows in groups of 4
 1235 +          for (; ir0 + 3 < ir0_end; ir0 += 4) {
 1236 +              for (int64_t ir1 = ir1_start; ir1 < ir1_end; ++ir1) {
 1237 +                  const int64_t i13 = (ir1 / ne1xne12);
 1238 +                  const int64_t i12 = (ir1 - i13 * ne1xne12) / ne1;
 1239 +                  const int64_t i11 = (ir1 - i13 * ne1xne12 - i12 * ne1);
 1240 +                  const int64_t i03 = i13 / r3;
 1241 +                  const int64_t i02 = i12 / r2;
 1242 +                  const char * src0_row = (const char*)src0->data + (i02 * nb02 + i03 * nb03);
 1243 +                  const char * src1_col = (const char*)wdata +
 1244 +                      (src1_cont || src1->type != vec_dot_type
 1245 +                       ? (i11 + i12 * ne11 + i13 * ne12 * ne11) * row_size
 1246 +                       : (i11 * nb11 + i12 * nb12 + i13 * nb13));
 1247 +                  float * dst_col = (float*)((char*)dst->data + (i11 * nb1 + i12 * nb2 + i13 * nb3));
 1248 +                  vec_dot(ne00, &dst_col[ir0], sizeof(float),
 1249 +                          src0_row + ir0 * nb01, nb01,
 1250 +                          src1_col, 0, 4);
 1251 +              }
 1252 +          }
 1253 +          // Remainder rows: fall back to column-outer order (small tail)
 1254 +          for (int64_t ir1 = ir1_start; ir1 < ir1_end; ++ir1) {
 1255 +              const int64_t i13 = (ir1 / ne1xne12);
 1256 +              const int64_t i12 = (ir1 - i13 * ne1xne12) / ne1;
 1257 +              const int64_t i11 = (ir1 - i13 * ne1xne12 - i12 * ne1);
 1258 +              const int64_t i03 = i13 / r3;
 1259 +              const int64_t i02 = i12 / r2;
 1260 +              const char * src0_row = (const char*)src0->data + (i02 * nb02 + i03 * nb03);
 1261 +              const char * src1_col = (const char*)wdata +
 1262 +                  (src1_cont || src1->type != vec_dot_type
 1263 +                   ? (i11 + i12 * ne11 + i13 * ne12 * ne11) * row_size
 1264 +                   : (i11 * nb11 + i12 * nb12 + i13 * nb13));
 1265 +              float * dst_col = (float*)((char*)dst->data + (i11 * nb1 + i12 * nb2 + i13 * nb3));
 1266 +              int64_t ir0t = ir0;
 1267 +              for (; ir0t + 1 < ir0_end; ir0t += 2) {
 1268 +                  vec_dot(ne00, &dst_col[ir0t], sizeof(float),
 1269 +                          src0_row + ir0t * nb01, nb01,
 1270 +                          src1_col, 0, 2);
 1271 +              }
 1272 +              if (ir0t < ir0_end) {
 1273 +                  vec_dot(ne00, &dst_col[ir0t], 0,
 1274 +                          src0_row + ir0t * nb01, 0,
 1275 +                          src1_col, 0, 1);
 1276 +              }
 1277 +          }
 1278 +          return;
 1279 +      }
 1280 +  #endif
 1281 +
 1282 +      // Generic path for all other quant types
 1283 +      const int64_t blck_0 = 16;
 1284 +      const int64_t blck_1 = 16;
 1285 +
 1286       // attempt to reduce false-sharing (does not seem to make a difference)
 1287       // Size: blck_0 * 2 (accounting for mmla kernels that compute 2 rows at once)
 1288       float tmp[128];
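Checking the tile-size comment's arithmetic (editor's sketch; the 576-byte row figure is taken from the comment itself):

#include <stdio.h>

int main(void) {
    const int row_bytes  = 576;               // Q1_0_g128 row at K = 4096
    const int tile_rows  = 16;                // rows per outer GEPP tile
    const int tile_bytes = tile_rows * row_bytes;
    printf("tile = %d bytes (~%.1f KB)\n", tile_bytes, tile_bytes / 1000.0);
    // 9216 bytes, i.e. ~9.2 KB, well inside a typical 32 KB L1d.
    return 0;
}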
And the generic path's inner loop (new lines 1304-1321):

 1304
 1305       const char * src0_row = (const char*)src0->data + (0 + i02 * nb02 + i03 * nb03);
 1306
 1307       const char * src1_col = (const char*)wdata +
 1308           (src1_cont || src1->type != vec_dot_type
 1309            ? (i11 + i12 * ne11 + i13 * ne12 * ne11) * row_size
 1310            : (i11 * nb11 + i12 * nb12 + i13 * nb13));
 1311       float * dst_col = (float*)((char*)dst->data + (i1 * nb1 + i2 * nb2 + i3 * nb3));
 1312
 1313 +     for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ir0 += num_rows_per_vec_dot) {
 1314           vec_dot(ne00, &tmp[ir0 - iir0], (num_rows_per_vec_dot > 1 ? 16 : 0), src0_row + ir0 * nb01, (num_rows_per_vec_dot > 1 ? nb01 : 0), src1_col, (num_rows_per_vec_dot > 1 ? src1_col_stride : 0), num_rows_per_vec_dot);
 1315       }
 1316
 1317       for (int cn = 0; cn < num_rows_per_vec_dot; ++cn) {
 1318 +         memcpy(&dst_col[iir0 + cn * nb1 / nb0], tmp + (cn * 16), (MIN(iir0 + blck_0, ir0_end) - iir0) * sizeof(float));
 1319       }
 1320   }
 1321   }