Add files using upload-large-folder tool

19ed98b verified 2 months ago

22.4 kB

	/*
	* LOG-UNARY TRANSFORMER ENGINE
	*
	* Unary base-1 with logarithmic compression:
	* Linear unary: value 7 = 1111111 (7 planes, each = +1)
	* Log unary: value 7 = 111 (3 planes, plane p = 2^p)
	*
	* Matmul kernel: acc += popcount(w_plane[p] AND x_plane[q]) << (p+q)
	* Still pure AND+popcount+shift, no float in hot path.
	*
	* 3 log-planes = magnitudes 0..7 with sign = {-7..+7} = 15 levels
	* 4 log-planes = magnitudes 0..15 with sign = {-15..+15} = 31 levels
	* 5 log-planes = magnitudes 0..31 with sign = {-31..+31} = 63 levels
	*
	* vs linear 7 planes = {-7..+7} = 15 levels using 7 planes
	*
	* (c) 2026 OpenTransformers Ltd / Scott Bisset
	*/

	#include <immintrin.h>
	#include <omp.h>
	#include <stdint.h>
	#include <stdlib.h>
	#include <string.h>
	#include <math.h>
	#include <stdio.h>
	#include <time.h>

	/* Maximum number of positions: the per-layer stride of the KV cache and the
	 * length of the attention-score scratch buffer. */
	#define MAX_SEQ 4096
	/* Epsilon added to the mean square in rmsnorm() before taking the square root. */
	#define RMS_EPS 1e-6f

	/* ============================================================
	* Config
	* ============================================================ */
	/* Model hyper-parameters; filled in by model_alloc() and read throughout the forward pass. */
	typedef struct {
	int hidden; /* width of the residual stream and of one embedding row */
	int inter; /* MLP intermediate width (gate/up outputs, down input) */
	int n_heads; /* number of query heads */
	int n_kv_heads; /* number of key/value heads; n_heads / n_kv_heads query heads share one */
	int head_dim; /* floats per attention head */
	int n_layers; /* number of transformer layers */
	int vocab; /* number of logits produced per token */
	float rope_theta; /* base handed to apply_rope() */
	int tie_embeddings; /* non-zero: forward_token() computes logits from the embedding table */
	int w_planes; /* weight log-planes */
	int a_planes; /* activation log-planes */
	} Config;

	/*
	 * Log-unary weight matrix (out_dim rows, in_dim columns).
	 *
	 * Each row is packed into `chunks` 64-bit words, one bit per input column.
	 * A weight is stored as a sign bit plus n_planes magnitude bit-planes, where
	 * plane p contributes 2^p; the per-row float scale converts the integer
	 * accumulator back to real units. The struct only holds pointers: the
	 * buffers are supplied through init_weight() and are not owned here.
	 */
	typedef struct {
		uint64_t *sign_bits;  /* [out_dim * chunks] */
		uint64_t *log_planes; /* [n_planes][out_dim * chunks] - plane p = 2^p */
		float *scales;        /* [out_dim] */
		int out_dim;
		int in_dim;
		int n_planes;
		int chunks;           /* (in_dim + 63) / 64 */
	} LogUnaryWeight;

	/*
	 * One transformer layer: seven log-unary projections plus norm weights.
	 *
	 * The norm pointers are stored as given by layer_set_norms() and
	 * layer_set_qk_norm() (no copy is made). q_norm / k_norm are optional:
	 * attention() skips the per-head QK norm when the pointer is NULL.
	 */
	typedef struct {
		LogUnaryWeight q_proj, k_proj, v_proj, o_proj;
		LogUnaryWeight gate_proj, up_proj, down_proj;
		float *input_norm;      /* RMSNorm weights applied before attention */
		float *post_norm;       /* RMSNorm weights applied before the MLP */
		float *q_norm, *k_norm; /* per-head RMSNorm weights, or NULL */
	} Layer;

	/* Full model: configuration, weights, KV cache and all per-token scratch
	 * buffers. Buffer sizes below are the ones model_alloc() allocates. */
	typedef struct {
	Config cfg;
	uint16_t *embed; /* fp16 embedding table, one row of `hidden` values per token; pointer set by model_set_embed() */
	Layer *layers; /* [n_layers] */
	float *final_norm; /* [hidden], filled by model_set_final_norm() */

	/* KV cache: [n_layers][MAX_SEQ][n_kv_heads][head_dim] floats each, addressed via kv_ptr() */
	float *k_cache;
	float *v_cache;

	/* Float scratch (O(dim) ops only) */
	float *hidden; /* [hidden] residual stream */
	float *normed; /* [max(inter, hidden)] rmsnorm output; also receives o_proj and down_proj results */
	float *q_float; /* [n_heads * head_dim] */
	float *k_float; /* [n_kv_heads * head_dim] */
	float *v_float; /* [n_kv_heads * head_dim] */
	float *attn_out; /* [n_heads * head_dim] per-head attention result, then the o_proj output */
	float *gate_float; /* [inter] */
	float *up_float; /* [inter] */
	float *mlp_act; /* [inter] SiLU(gate) * up */
	float *logits; /* [vocab] */
	float *attn_scores; /* [MAX_SEQ] scores of one head over all cached positions */

	/* Unary scratch for activation quantization */
	uint64_t *act_sign; /* [(hidden + 63) / 64] */
	uint64_t *act_planes; /* [a_planes][(hidden + 63) / 64] */

	/* Larger scratch for intermediate dim */
	uint64_t *mlp_act_sign; /* [(inter + 63) / 64] */
	uint64_t *mlp_act_planes; /* [a_planes][(inter + 63) / 64] */
	} Model;

	/* ============================================================
	* LOG-UNARY ACTIVATION QUANTIZATION
	*
	* Encode each float value as a sign bit plus n_planes magnitude bit-planes.
	*
	* Compute the integer magnitude = round(|x| / amax * max_level), with
	* max_level = 2^n_planes - 1, then decompose it into binary:
	* plane p is set iff bit p of the magnitude is set.
	*
	* In other words, "log-unary" is the BINARY encoding of the magnitude,
	* stored as separate bitplanes (it is not a per-plane threshold test).
	* The magic is that AND+popcount+shift over pairs of planes MULTIPLIES them.
	* ============================================================ */
	/*
	 * Quantize a float vector into sign + binary magnitude bit-planes.
	 *
	 * x          input vector of `dim` floats
	 * n_planes   number of magnitude planes; magnitudes span 0 .. 2^n_planes - 1
	 * sign_out   [(dim + 63) / 64] words; bit i is set when x[i] < 0
	 * planes_out [n_planes][(dim + 63) / 64] words; plane p holds bit p of each magnitude
	 * scale_out  receives amax / max_level, the factor that maps magnitudes back to floats
	 *
	 * Both outputs are zeroed first, so padding bits past `dim` in the last word
	 * stay clear. An all-zero input uses amax = 1 to avoid dividing by zero.
	 */
	static void quantize_log_unary(
		const float *x, int dim, int n_planes,
		uint64_t *sign_out, uint64_t *planes_out, float *scale_out
	) {
		const int chunks = (dim + 63) / 64;
		const int max_level = (1 << n_planes) - 1; /* 2^n - 1 */

		/* Find absmax */
		float amax = 0.0f;
		for (int i = 0; i < dim; i++) {
			const float a = fabsf(x[i]);
			if (a > amax) amax = a;
		}
		if (amax == 0.0f) amax = 1.0f;
		*scale_out = amax / max_level;

		memset(sign_out, 0, (size_t)chunks * sizeof(uint64_t));
		memset(planes_out, 0, (size_t)n_planes * chunks * sizeof(uint64_t));

		const float inv_scale = max_level / amax;
		for (int i = 0; i < dim; i++) {
			const int chunk = i / 64;
			const uint64_t mask = 1ULL << (i % 64);

			if (x[i] < 0.0f)
				sign_out[chunk] |= mask;

			/* Round to nearest and clamp. Written as !(v < max) so that NaN
			 * (from a NaN input or an infinite amax) saturates instead of
			 * reaching the float->int conversion, which is undefined for
			 * NaN and out-of-range values. */
			const float scaled = fabsf(x[i]) * inv_scale + 0.5f;
			const int mag = !(scaled < (float)max_level) ? max_level : (int)scaled;

			/* Binary decomposition: plane p gets bit p of magnitude */
			for (int p = 0; p < n_planes; p++) {
				if (mag & (1 << p))
					planes_out[(size_t)p * chunks + chunk] |= mask;
			}
		}
	}

	/* ============================================================
	* LOG-UNARY MATVEC: y = W @ x
	*
	* W: log-unary (sign + wp log-planes, scales)
	* x: log-unary (sign + xp log-planes, scale)
	*
	* For each output element i:
	* acc = 0
	* for each chunk c:
	* same = ~(w_sign[c] ^ x_sign[c])
	* diff = w_sign[c] ^ x_sign[c]
	* for p in 0..wp-1:
	* for q in 0..xp-1:
	* active = w_plane[p][c] & x_plane[q][c]
	* pos = popcount(active & same)
	* neg = popcount(active & diff)
	* acc += (pos - neg) << (p + q) <-- THE KEY: shift by p+q
	* y[i] = acc * w_scale[i] * x_scale
	* ============================================================ */
	/*
	 * y_out = W @ x, with both operands in sign + bit-plane form.
	 *
	 * W          weight matrix; signs are laid out [out_dim][chunks] and
	 *            magnitude planes [n_planes][out_dim][chunks]
	 * x_sign     [chunks] activation sign words
	 * x_planes   [x_n_planes][chunks] activation magnitude planes
	 * x_scale    activation scale from quantize_log_unary()
	 * y_out      [out_dim] float results
	 *
	 * For every weight plane p and activation plane q, the set bits of
	 * (w_plane & x_plane) mark products worth 2^(p+q); bits whose signs agree
	 * count positive, the others negative. The integer sum is scaled once per
	 * row. Rows are independent and are distributed with OpenMP.
	 */
	static void log_unary_matvec(
		const LogUnaryWeight *W,
		const uint64_t *x_sign, const uint64_t *x_planes,
		float x_scale, int x_n_planes,
		float *y_out
	) {
		const int out_dim = W->out_dim;
		const int chunks = W->chunks;
		const int wp = W->n_planes;
		const int xp = x_n_planes;

		#pragma omp parallel for schedule(dynamic, 32)
		for (int i = 0; i < out_dim; i++) {
			const uint64_t *w_sign_row = W->sign_bits + (size_t)i * chunks;
			long long acc = 0;

			for (int c = 0; c < chunks; c++) {
				const uint64_t diff = w_sign_row[c] ^ x_sign[c];
				const uint64_t same = ~diff;

				for (int p = 0; p < wp; p++) {
					const uint64_t w_mag = W->log_planes[((size_t)p * out_dim + i) * chunks + c];
					if (!w_mag) continue; /* no activation plane can intersect an empty word */

					for (int q = 0; q < xp; q++) {
						const uint64_t active = w_mag & x_planes[(size_t)q * chunks + c];
						if (!active) continue; /* skip zero — common with log encoding */

						const int n_pos = __builtin_popcountll(active & same);
						const int n_neg = __builtin_popcountll(active & diff);
						/* Multiply by 2^(p+q) rather than shifting: the
						 * difference can be negative and left-shifting a
						 * negative value is undefined behaviour in C. */
						acc += (long long)(n_pos - n_neg) * (1LL << (p + q));
					}
				}
			}

			y_out[i] = (float)acc * W->scales[i] * x_scale;
		}
	}

	/* ============================================================
	* FP16 ops (embedding, lm_head) — not in the critical per-layer path
	* ============================================================ */
	/*
	 * Convert one IEEE 754 binary16 value (raw bits) to float in plain C.
	 * Handles zeros, subnormals, infinities and NaNs (NaNs come back quiet).
	 * Used for loop tails and as the fallback when AVX-512 is not enabled, so
	 * the FP16 helpers do not depend on the separate F16C instruction set.
	 */
	static float fp16_to_float(uint16_t h) {
		const uint32_t sign = (uint32_t)(h & 0x8000u) << 16;
		uint32_t exponent = (h >> 10) & 0x1Fu;
		uint32_t mantissa = h & 0x3FFu;
		uint32_t bits;

		if (exponent == 0x1Fu) {
			/* Inf / NaN */
			bits = sign | 0x7F800000u | (mantissa << 13);
			if (mantissa) bits |= 0x00400000u;
		} else if (exponent != 0) {
			/* Normal number: rebias the exponent from 15 to 127 */
			bits = sign | ((exponent + 112u) << 23) | (mantissa << 13);
		} else if (mantissa == 0) {
			bits = sign; /* +/- 0 */
		} else {
			/* Subnormal half: normalise until the implicit bit appears */
			exponent = 113u;
			while (!(mantissa & 0x400u)) {
				mantissa <<= 1;
				exponent--;
			}
			bits = sign | (exponent << 23) | ((mantissa & 0x3FFu) << 13);
		}

		float f;
		memcpy(&f, &bits, sizeof f);
		return f;
	}

	/*
	 * Copy row `token_id` of the fp16 embedding table into `out` as floats.
	 *
	 * embed   table of rows with `hidden` fp16 values each
	 * out     [hidden] destination
	 *
	 * The caller must pass a token_id that addresses a valid row.
	 */
	static void embed_token(const uint16_t *embed, int token_id, float *out, int hidden) {
		const uint16_t *row = embed + (size_t)token_id * hidden;
		int i = 0;
	#if defined(__AVX512F__)
		for (; i + 16 <= hidden; i += 16) {
			__m256i h = _mm256_loadu_si256((const __m256i *)(row + i));
			_mm512_storeu_ps(out + i, _mm512_cvtph_ps(h));
		}
	#endif
		for (; i < hidden; i++)
			out[i] = fp16_to_float(row[i]);
	}

	/*
	 * y = w @ x for an fp16 row-major matrix and a float vector.
	 *
	 * w        [out_dim][in_dim] fp16 weights
	 * x        [in_dim] floats
	 * y        [out_dim] floats
	 *
	 * Rows are independent and are distributed with OpenMP. With AVX-512 the
	 * bulk of each row is accumulated 16 lanes at a time; the remainder (or the
	 * whole row without AVX-512) is accumulated in scalar code.
	 */
	static void fp16_matvec(const uint16_t *w, const float *x, float *y, int out_dim, int in_dim) {
		#pragma omp parallel for schedule(dynamic, 256)
		for (int i = 0; i < out_dim; i++) {
			const uint16_t *row = w + (size_t)i * in_dim;
			float sum = 0.0f;
			int j = 0;
	#if defined(__AVX512F__)
			__m512 acc = _mm512_setzero_ps();
			for (; j + 16 <= in_dim; j += 16) {
				__m256i h = _mm256_loadu_si256((const __m256i *)(row + j));
				__m512 wv = _mm512_cvtph_ps(h);
				__m512 xv = _mm512_loadu_ps(x + j);
				acc = _mm512_fmadd_ps(wv, xv, acc);
			}
			sum = _mm512_reduce_add_ps(acc);
	#endif
			for (; j < in_dim; j++)
				sum += fp16_to_float(row[j]) * x[j];
			y[i] = sum;
		}
	}

	/* ============================================================
	* O(dim) float ops — RMSNorm, SiLU, Softmax, RoPE, residual
	* ============================================================ */
	/*
	 * RMS normalisation: y[i] = x[i] / sqrt(mean(x^2) + RMS_EPS) * w[i].
	 *
	 * x    [dim] input
	 * w    [dim] per-element gain
	 * y    [dim] output; may be the same buffer as x (attention() normalises
	 *      heads in place), because the sum of squares is finished before any
	 *      element is written
	 */
	static void rmsnorm(const float *x, const float *w, float *y, int dim) {
		float ss = 0.0f;
		for (int i = 0; i < dim; i++)
			ss += x[i] * x[i];

		const float rms = 1.0f / sqrtf(ss / dim + RMS_EPS);
		for (int i = 0; i < dim; i++)
			y[i] = x[i] * rms * w[i];
	}

	static void silu_mul(const float gate, const float up, float *out, int n) {
	for (int i = 0; i < n; i++)
	out[i] = (gate[i] / (1.0f + expf(-gate[i]))) * up[i];
	}

	/* In-place accumulate: y[i] += x[i] for the first n elements (residual add). */
	static void vec_add(float *y, const float *x, int n) {
		for (int i = 0; i < n; i++)
			y[i] += x[i];
	}

	static void apply_rope(float *vec, int pos, int dim, float theta) {
	for (int i = 0; i < dim; i += 2) {
	float freq = 1.0f / powf(theta, (float)i / dim);
	float angle = pos * freq;
	float co = cosf(angle), si = sinf(angle);
	float v0 = vec[i], v1 = vec[i+1];
	vec[i] = v0co - v1si;
	vec[i+1] = v0si + v1co;
	}
	}

	static void softmax(float *x, int n) {
	float mx = x[0];
	for (int i = 1; i < n; i++) if (x[i] > mx) mx = x[i];
	float sum = 0.0f;
	for (int i = 0; i < n; i++) { x[i] = expf(x[i] - mx); sum += x[i]; }
	float inv = 1.0f / sum;
	for (int i = 0; i < n; i++) x[i] *= inv;
	}

	static float* kv_ptr(float cache, const Config c, int layer, int pos, int kv_head) {
	return cache + ((size_t)layer * MAX_SEQ * c->n_kv_heads +
	(size_t)pos * c->n_kv_heads + kv_head) * c->head_dim;
	}

	/* ============================================================
	* ATTENTION
	* ============================================================ */
	/*
	 * Single-token causal self-attention for one layer.
	 *
	 * Input:  m->normed (the normalised hidden state for this token).
	 * Output: m->attn_out[0 .. hidden-1], the o_proj result, ready to be added
	 *         to the residual stream. m->normed is overwritten as scratch.
	 *
	 * Steps: quantize the input, run the Q/K/V log-unary projections, apply
	 * the optional per-head QK RMSNorm and RoPE, store K/V for `pos` in the
	 * cache, attend over positions 0..pos (grouped-query: n_heads /
	 * n_kv_heads query heads share each KV head), then quantize the result
	 * and run o_proj.
	 *
	 * The caller must keep 0 <= pos < MAX_SEQ: both the KV cache stride and
	 * the attn_scores buffer are sized by MAX_SEQ.
	 */
	static void attention(Model *m, int layer_idx, int pos) {
		Config *c = &m->cfg;
		Layer *L = &m->layers[layer_idx];
		const int head_dim = c->head_dim;
		const int heads_per_kv = c->n_heads / c->n_kv_heads;
		float act_scale;

		/* Quantize normed hidden -> log-unary */
		quantize_log_unary(m->normed, c->hidden, c->a_planes,
			m->act_sign, m->act_planes, &act_scale);

		/* Q, K, V — log-unary matmul */
		log_unary_matvec(&L->q_proj, m->act_sign, m->act_planes, act_scale, c->a_planes, m->q_float);
		log_unary_matvec(&L->k_proj, m->act_sign, m->act_planes, act_scale, c->a_planes, m->k_float);
		log_unary_matvec(&L->v_proj, m->act_sign, m->act_planes, act_scale, c->a_planes, m->v_float);

		/* QK-Norm (optional, applied per head in place) */
		if (L->q_norm) {
			for (int h = 0; h < c->n_heads; h++) {
				float *qh = m->q_float + (size_t)h * head_dim;
				rmsnorm(qh, L->q_norm, qh, head_dim);
			}
		}
		if (L->k_norm) {
			for (int h = 0; h < c->n_kv_heads; h++) {
				float *kh = m->k_float + (size_t)h * head_dim;
				rmsnorm(kh, L->k_norm, kh, head_dim);
			}
		}

		/* RoPE */
		for (int h = 0; h < c->n_heads; h++)
			apply_rope(m->q_float + (size_t)h * head_dim, pos, head_dim, c->rope_theta);
		for (int h = 0; h < c->n_kv_heads; h++)
			apply_rope(m->k_float + (size_t)h * head_dim, pos, head_dim, c->rope_theta);

		/* KV cache store */
		for (int h = 0; h < c->n_kv_heads; h++) {
			memcpy(kv_ptr(m->k_cache, c, layer_idx, pos, h),
				m->k_float + (size_t)h * head_dim, (size_t)head_dim * sizeof(float));
			memcpy(kv_ptr(m->v_cache, c, layer_idx, pos, h),
				m->v_float + (size_t)h * head_dim, (size_t)head_dim * sizeof(float));
		}

		/* Attention dot products + softmax + weighted sum */
		const float scale = 1.0f / sqrtf((float)head_dim);
		memset(m->attn_out, 0, (size_t)c->n_heads * head_dim * sizeof(float));

		for (int h = 0; h < c->n_heads; h++) {
			const int kv_h = h / heads_per_kv;
			const float *qh = m->q_float + (size_t)h * head_dim;
			float *oh = m->attn_out + (size_t)h * head_dim;

			for (int t = 0; t <= pos; t++) {
				const float *kc = kv_ptr(m->k_cache, c, layer_idx, t, kv_h);
				float dot = 0.0f;
				for (int d = 0; d < head_dim; d++) dot += qh[d] * kc[d];
				m->attn_scores[t] = dot * scale;
			}
			softmax(m->attn_scores, pos + 1);
			for (int t = 0; t <= pos; t++) {
				const float w = m->attn_scores[t];
				if (w < 1e-8f) continue; /* negligible weight: skip the V row */
				const float *vc = kv_ptr(m->v_cache, c, layer_idx, t, kv_h);
				for (int d = 0; d < head_dim; d++) oh[d] += w * vc[d];
			}
		}

		/* O projection — quantize attn_out, then log-unary matmul */
		const int o_dim = c->n_heads * head_dim;
		const size_t o_chunks = ((size_t)o_dim + 63) / 64;
		/* aligned_alloc() requires the size to be a multiple of the alignment,
		 * so both requests are rounded up to whole 64-byte units. */
		const size_t sign_bytes = (o_chunks * sizeof(uint64_t) + 63) & ~(size_t)63;
		const size_t plane_bytes =
			((size_t)c->a_planes * o_chunks * sizeof(uint64_t) + 63) & ~(size_t)63;
		uint64_t *o_sign = aligned_alloc(64, sign_bytes);
		uint64_t *o_planes = aligned_alloc(64, plane_bytes);
		if (!o_sign || !o_planes) {
			/* No error channel here; stop with a message rather than
			 * dereferencing NULL inside quantize_log_unary(). */
			fprintf(stderr, "attention: out of memory for O-projection scratch\n");
			free(o_sign);
			free(o_planes);
			abort();
		}

		float o_scale;
		quantize_log_unary(m->attn_out, o_dim, c->a_planes, o_sign, o_planes, &o_scale);

		float *o_tmp = m->normed; /* reuse */
		log_unary_matvec(&L->o_proj, o_sign, o_planes, o_scale, c->a_planes, o_tmp);
		memcpy(m->attn_out, o_tmp, (size_t)c->hidden * sizeof(float));

		free(o_sign);
		free(o_planes);
	}

	/* ============================================================
	* MLP
	* ============================================================ */
	/*
	 * Gated feed-forward block for one layer.
	 *
	 * Reads m->normed, computes down_proj(SiLU(gate_proj(x)) * up_proj(x))
	 * with log-unary matmuls, and writes the result back into m->normed.
	 * Activations are re-quantized before each group of projections.
	 */
	static void mlp(Model *m, int layer_idx) {
		const Config *cfg = &m->cfg;
		Layer *layer = &m->layers[layer_idx];
		const int n_act = cfg->a_planes;
		float in_scale;
		float mid_scale;

		/* hidden-width input -> bit-planes */
		quantize_log_unary(m->normed, cfg->hidden, n_act,
			m->act_sign, m->act_planes, &in_scale);

		/* gate and up share the same quantized input */
		log_unary_matvec(&layer->gate_proj, m->act_sign, m->act_planes,
			in_scale, n_act, m->gate_float);
		log_unary_matvec(&layer->up_proj, m->act_sign, m->act_planes,
			in_scale, n_act, m->up_float);

		/* SiLU(gate) * up, in float */
		silu_mul(m->gate_float, m->up_float, m->mlp_act, cfg->inter);

		/* intermediate-width activation -> bit-planes */
		quantize_log_unary(m->mlp_act, cfg->inter, n_act,
			m->mlp_act_sign, m->mlp_act_planes, &mid_scale);

		/* down projection lands in m->normed */
		log_unary_matvec(&layer->down_proj, m->mlp_act_sign, m->mlp_act_planes,
			mid_scale, n_act, m->normed);
	}

	/* ============================================================
	* FORWARD
	* ============================================================ */
	/*
	 * Run one token through the model at sequence position `pos`.
	 *
	 * Embeds the token, then for every layer applies
	 * rmsnorm -> attention -> residual add -> rmsnorm -> mlp -> residual add,
	 * and finishes with the final rmsnorm. Logits are only recomputed when
	 * cfg.tie_embeddings is non-zero (the embedding table doubles as the
	 * output projection); otherwise m->logits is left as it was.
	 *
	 * Returns m->logits, or NULL without touching any state when token_id is
	 * outside [0, vocab) or pos is outside [0, MAX_SEQ). Those values would
	 * otherwise index past the embedding table, the KV cache and the
	 * attention-score buffer.
	 */
	float* forward_token(Model *m, int token_id, int pos) {
		Config *c = &m->cfg;

		if (token_id < 0 || token_id >= c->vocab || pos < 0 || pos >= MAX_SEQ)
			return NULL;

		embed_token(m->embed, token_id, m->hidden, c->hidden);

		for (int l = 0; l < c->n_layers; l++) {
			rmsnorm(m->hidden, m->layers[l].input_norm, m->normed, c->hidden);
			attention(m, l, pos);
			vec_add(m->hidden, m->attn_out, c->hidden);
			rmsnorm(m->hidden, m->layers[l].post_norm, m->normed, c->hidden);
			mlp(m, l);
			vec_add(m->hidden, m->normed, c->hidden);
		}

		rmsnorm(m->hidden, m->final_norm, m->normed, c->hidden);

		if (c->tie_embeddings)
			fp16_matvec(m->embed, m->normed, m->logits, c->vocab, c->hidden);

		return m->logits;
	}

	/* ============================================================
	* SAMPLING
	* ============================================================ */
	/*
	 * Nucleus (top-p) sampling with a hard cap of 40 candidates.
	 *
	 * logits       [vocab] raw scores; overwritten with temperature-scaled
	 *              probabilities
	 * temperature  applied as 1/temperature when > 0
	 * top_p        cumulative probability at which candidate selection stops
	 *
	 * Candidates are taken in descending probability until their mass reaches
	 * top_p, the cap is hit, or the vocabulary is exhausted; one is then drawn
	 * with rand() in proportion to its probability. At least one candidate is
	 * always taken, so top_p <= 0 yields the most likely token. If the scratch
	 * arrays cannot be allocated the most likely token is returned. Expects
	 * vocab >= 1.
	 */
	static int sample_top_p(float *logits, int vocab, float temperature, float top_p) {
		enum { MAX_CANDIDATES = 40 };

		if (temperature > 0) {
			const float inv_t = 1.0f / temperature;
			for (int i = 0; i < vocab; i++) logits[i] *= inv_t;
		}
		softmax(logits, vocab);

		float *probs = malloc((size_t)vocab * sizeof *probs);
		int *indices = malloc((size_t)vocab * sizeof *indices);
		if (!probs || !indices) {
			free(probs);
			free(indices);
			int best = 0;
			for (int i = 1; i < vocab; i++)
				if (logits[i] > logits[best]) best = i;
			return best;
		}
		memcpy(probs, logits, (size_t)vocab * sizeof *probs);
		for (int i = 0; i < vocab; i++) indices[i] = i;

		/* Partial selection sort: move the next-largest probability to slot n. */
		int n = 0;
		float cum = 0.0f;
		do {
			int best = n;
			for (int i = n + 1; i < vocab; i++)
				if (probs[i] > probs[best]) best = i;
			const float tp = probs[n]; probs[n] = probs[best]; probs[best] = tp;
			const int ti = indices[n]; indices[n] = indices[best]; indices[best] = ti;
			cum += probs[n];
			n++;
		} while (cum < top_p && n < vocab && n < MAX_CANDIDATES);

		/* Draw from the kept candidates, renormalised by their total mass. */
		float sum = 0;
		for (int i = 0; i < n; i++) sum += probs[i];
		const float r = (float)rand() / RAND_MAX * sum;
		float a = 0;
		int ch = indices[0];
		for (int i = 0; i < n; i++) {
			a += probs[i];
			if (a >= r) { ch = indices[i]; break; }
		}

		free(probs);
		free(indices);
		return ch;
	}

	/*
	 * Feed a prompt through the model and generate up to max_new tokens.
	 *
	 * prompt / plen   prompt token ids and their count
	 * out             receives the generated ids (room for max_new entries)
	 * temperature     <= 0 selects greedy argmax, otherwise top-p sampling
	 * eos             generation stops after emitting this id
	 *
	 * Returns the number of tokens written to `out`. Returns 0 without running
	 * the model when plen is not in 1..MAX_SEQ: an empty prompt leaves no
	 * logits to sample from, and a longer one does not fit the KV cache.
	 * Generation also stops once position MAX_SEQ is reached. The random
	 * generator is re-seeded from the clock on every call.
	 */
	int generate(Model *m, const int *prompt, int plen, int *out, int max_new,
		float temperature, float top_p, int eos) {
		if (plen <= 0 || plen > MAX_SEQ)
			return 0;

		srand((unsigned)time(NULL));
		for (int i = 0; i < plen; i++) forward_token(m, prompt[i], i);

		int pos = plen, gen = 0;
		for (int t = 0; t < max_new; t++) {
			int next;
			if (temperature <= 0) {
				next = 0;
				for (int i = 1; i < m->cfg.vocab; i++)
					if (m->logits[i] > m->logits[next]) next = i;
			} else {
				next = sample_top_p(m->logits, m->cfg.vocab, temperature, top_p);
			}
			out[t] = next;
			gen++;
			if (next == eos) break;
			if (pos >= MAX_SEQ) break; /* KV cache is full */
			forward_token(m, next, pos);
			pos++;
		}
		return gen;
	}

	/* ============================================================
	* ALLOCATION
	* ============================================================ */
	Model* model_alloc(
	int w_planes, int a_planes,
	int hidden, int inter, int n_heads, int n_kv_heads,
	int head_dim, int n_layers, int vocab,
	float rope_theta, int tie_embeddings
	) {
	Model m = (Model )calloc(1, sizeof(Model));
	Config *c = &m->cfg;
	c->hidden = hidden; c->inter = inter;
	c->n_heads = n_heads; c->n_kv_heads = n_kv_heads;
	c->head_dim = head_dim; c->n_layers = n_layers;
	c->vocab = vocab; c->rope_theta = rope_theta;
	c->tie_embeddings = tie_embeddings;
	c->w_planes = w_planes; c->a_planes = a_planes;

	m->layers = (Layer *)calloc(n_layers, sizeof(Layer));

	size_t kv_size = (size_t)n_layers * MAX_SEQ * n_kv_heads * head_dim;
	m->k_cache = (float *)calloc(kv_size, sizeof(float));
	m->v_cache = (float *)calloc(kv_size, sizeof(float));

	int max_dim = inter > hidden ? inter : hidden;
	m->hidden = (float )aligned_alloc(64, hidden sizeof(float));
	m->normed = (float )aligned_alloc(64, max_dim sizeof(float));
	m->q_float = (float )aligned_alloc(64, n_heads head_dim * sizeof(float));
	m->k_float = (float )aligned_alloc(64, n_kv_heads head_dim * sizeof(float));
	m->v_float = (float )aligned_alloc(64, n_kv_heads head_dim * sizeof(float));
	m->attn_out = (float )aligned_alloc(64, n_heads head_dim * sizeof(float));
	m->gate_float = (float )aligned_alloc(64, inter sizeof(float));
	m->up_float = (float )aligned_alloc(64, inter sizeof(float));
	m->mlp_act = (float )aligned_alloc(64, inter sizeof(float));
	m->logits = (float )aligned_alloc(64, vocab sizeof(float));
	m->attn_scores = (float )aligned_alloc(64, MAX_SEQ sizeof(float));
	m->final_norm = (float )aligned_alloc(64, hidden sizeof(float));

	/* Unary scratch for hidden dim */
	int h_chunks = (hidden + 63) / 64;
	m->act_sign = (uint64_t )aligned_alloc(64, h_chunks sizeof(uint64_t));
	m->act_planes = (uint64_t )aligned_alloc(64, (size_t)a_planes h_chunks * sizeof(uint64_t));

	/* Unary scratch for intermediate dim */
	int i_chunks = (inter + 63) / 64;
	m->mlp_act_sign = (uint64_t )aligned_alloc(64, i_chunks sizeof(uint64_t));
	m->mlp_act_planes = (uint64_t )aligned_alloc(64, (size_t)a_planes i_chunks * sizeof(uint64_t));

	int w_max = (1 << w_planes) - 1;
	int a_max = (1 << a_planes) - 1;

	printf("LOG-UNARY ENGINE\n");
	printf(" Model: hidden=%d inter=%d heads=%d/%d layers=%d vocab=%d\n",
	hidden, inter, n_heads, n_kv_heads, n_layers, vocab);
	printf(" Weight: %d log-planes -> %d levels (range -%d..+%d)\n",
	w_planes, 2*w_max+1, w_max, w_max);
	printf(" Activation: %d log-planes -> %d levels (range -%d..+%d)\n",
	a_planes, 2*a_max+1, a_max, a_max);
	printf(" Plane pairs per element: %d (vs %d linear)\n",
	w_planes * a_planes, 7 * 4);
	printf(" KV cache: %zu MB\n", kv_size * 2 * sizeof(float) / (1024*1024));

	return m;
	}

	/* Weight setters */
	/* Attach the fp16 embedding table. Only the pointer is stored, so `data` must outlive the model. */
	void model_set_embed(Model *m, uint16_t *data) { m->embed = data; }
	/* Copy cfg.hidden final-norm weights from `data` into the model's own final_norm buffer. */
	void model_set_final_norm(Model *m, float *data) { memcpy(m->final_norm, data, (size_t)m->cfg.hidden * sizeof(float)); }

	/*
	 * Attach the pre-attention and pre-MLP RMSNorm weights of layer `l`.
	 * Only the pointers are stored, so both arrays must outlive the model.
	 */
	void layer_set_norms(Model *m, int l, float *in_norm, float *post_norm) {
		m->layers[l].input_norm = in_norm;
		m->layers[l].post_norm = post_norm;
	}

	/*
	 * Attach the optional per-head Q and K RMSNorm weights of layer `l`.
	 * Only the pointers are stored; passing NULL disables the corresponding
	 * norm, because attention() checks the pointer before applying it.
	 */
	void layer_set_qk_norm(Model *m, int l, float *q_norm, float *k_norm) {
		m->layers[l].q_norm = q_norm;
		m->layers[l].k_norm = k_norm;
	}

	/*
	 * Fill a LogUnaryWeight descriptor from caller-owned buffers (pointers are
	 * stored, nothing is copied) and derive chunks = ceil(in_dim / 64).
	 */
	static void init_weight(LogUnaryWeight *w, uint64_t *sign, uint64_t *planes, float *scales,
		int out_dim, int in_dim, int n_planes) {
		w->sign_bits = sign;
		w->log_planes = planes;
		w->scales = scales;
		w->out_dim = out_dim;
		w->in_dim = in_dim;
		w->n_planes = n_planes;
		w->chunks = (in_dim + 63) / 64;
	}

	/*
	 * Attach all seven log-unary projections of layer `l`.
	 *
	 * For each projection (q, k, v, o, gate, up, down) the caller passes its
	 * sign words (*_s), magnitude planes (*_p), per-row scales (*_sc) and its
	 * output/input dimensions. All projections share `n_planes`. Buffers are
	 * referenced, not copied.
	 */
	void layer_set_linears(
		Model *m, int l,
		uint64_t *q_s, uint64_t *q_p, float *q_sc, int q_out, int q_in,
		uint64_t *k_s, uint64_t *k_p, float *k_sc, int k_out, int k_in,
		uint64_t *v_s, uint64_t *v_p, float *v_sc, int v_out, int v_in,
		uint64_t *o_s, uint64_t *o_p, float *o_sc, int o_out, int o_in,
		uint64_t *g_s, uint64_t *g_p, float *g_sc, int g_out, int g_in,
		uint64_t *u_s, uint64_t *u_p, float *u_sc, int u_out, int u_in,
		uint64_t *d_s, uint64_t *d_p, float *d_sc, int d_out, int d_in,
		int n_planes
	) {
		Layer *layer = &m->layers[l];
		init_weight(&layer->q_proj, q_s, q_p, q_sc, q_out, q_in, n_planes);
		init_weight(&layer->k_proj, k_s, k_p, k_sc, k_out, k_in, n_planes);
		init_weight(&layer->v_proj, v_s, v_p, v_sc, v_out, v_in, n_planes);
		init_weight(&layer->o_proj, o_s, o_p, o_sc, o_out, o_in, n_planes);
		init_weight(&layer->gate_proj, g_s, g_p, g_sc, g_out, g_in, n_planes);
		init_weight(&layer->up_proj, u_s, u_p, u_sc, u_out, u_in, n_planes);
		init_weight(&layer->down_proj, d_s, d_p, d_sc, d_out, d_in, n_planes);
	}

	/* Zero both KV caches (n_layers * MAX_SEQ * n_kv_heads * head_dim floats each). */
	void model_reset_cache(Model *m) {
		const Config *cfg = &m->cfg;
		const size_t n_floats =
			(size_t)cfg->n_layers * MAX_SEQ * cfg->n_kv_heads * cfg->head_dim;
		const size_t n_bytes = n_floats * sizeof(float);

		memset(m->k_cache, 0, n_bytes);
		memset(m->v_cache, 0, n_bytes);
	}