littlefig-bench / lila_engine_phase2.py
Engine Phase 2: Full transformer forward pass + tokenizer + attention
#!/usr/bin/env python3
"""Push transformer forward pass, attention, tokenizer to Lila engine."""
import subprocess, os
TOKEN = os.environ["GITHUB_TOKEN"]  # push token read from the environment (env var name assumed) rather than hard-coded
subprocess.run(["git", "clone", f"https://{TOKEN}@github.com/ticketguy/Lila.git", "/app/lila"], check=True)
os.chdir("/app/lila")
subprocess.run(["git", "config", "user.name", "0xticketguy"], check=True)
subprocess.run(["git", "config", "user.email", "0xticketguy@harboria.dev"], check=True)
# ═══════════════════════════════════════════════════════════════════════════════
# engine/runtime/attention.c - Multi-Head Attention with RoPE
# ═══════════════════════════════════════════════════════════════════════════════
with open("engine/runtime/attention.c", "w") as f:
f.write('''#include "model.h"
#include <math.h>
#include <stdlib.h>
#include <string.h>
/*
* Multi-Head Attention with Rotary Position Embeddings (RoPE)
* and KV Cache for efficient autoregressive generation.
*
* For Gemma 4B: n_heads=16, n_kv_heads=8 (GQA), head_dim=256
* GQA: key/value heads are shared across query head groups
*/
/* dequant_matvec is defined in inference.c; declare it before first use. */
extern void dequant_matvec(float *out, const LilaQuantWeight *w, const float *vec);

/* Apply RoPE to a single head vector */
static void apply_rope(float *vec, int head_dim, int position, float theta) {
for (int i = 0; i < head_dim; i += 2) {
float freq = 1.0f / powf(theta, (float)i / head_dim);
float angle = position * freq;
float cos_a = cosf(angle);
float sin_a = sinf(angle);
float v0 = vec[i];
float v1 = vec[i + 1];
vec[i] = v0 * cos_a - v1 * sin_a;
vec[i + 1] = v0 * sin_a + v1 * cos_a;
}
}
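/*
 * Worked rotation (illustrative): for the pair (v0, v1) at even index i,
 *   freq = theta^(-i / head_dim), angle = position * freq
 *   v0' = v0*cos(angle) - v1*sin(angle)
 *   v1' = v0*sin(angle) + v1*cos(angle)
 * i.e. each adjacent pair of dimensions is rotated by a position-dependent
 * angle (the interleaved, adjacent-pair RoPE layout used here).
 */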
/* Initialize KV cache */
void lila_init_kv_cache(LilaKVCache *cache, int n_layers, int max_seq,
int n_kv_heads, int head_dim) {
cache->max_seq_len = max_seq;
cache->current_pos = 0;
size_t layer_size = (size_t)max_seq * n_kv_heads * head_dim * sizeof(float);
cache->key_cache = calloc(n_layers, layer_size);
cache->value_cache = calloc(n_layers, layer_size);
}
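/*
 * Size sketch (illustrative numbers, not measured): with max_seq=8192,
 * n_kv_heads=8, head_dim=256, one layer's key (or value) cache is
 * 8192 * 8 * 256 floats = 64 MiB in fp32, so K+V together cost 128 MiB per layer.
 */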
/* Single-token attention (for autoregressive generation) */
void lila_attention(
float *output, /* [hidden_size] */
const float *input, /* [hidden_size] */
LilaLayer *layer,
LilaKVCache *cache,
int layer_idx,
int position
) {
int n_heads = layer->n_heads;
int n_kv_heads = layer->n_kv_heads;
int head_dim = layer->head_dim;
int kv_group = n_heads / n_kv_heads; /* GQA group size */
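/* e.g. n_heads=16, n_kv_heads=8 -> kv_group=2: query heads 2j and 2j+1 read KV head j */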
/* Allocate scratch (TODO: pre-allocate in model struct).
 * Note: Q and the attention scratch hold n_heads * head_dim floats, which
 * need not equal hidden_size; o_proj maps the result back to hidden_size. */
float *q = malloc((size_t)n_heads * head_dim * sizeof(float));
float *k = malloc((size_t)n_kv_heads * head_dim * sizeof(float));
float *v = malloc((size_t)n_kv_heads * head_dim * sizeof(float));
float *attn_out = calloc((size_t)n_heads * head_dim, sizeof(float));
/* Project Q, K, V using quantized weights */
/* TODO: replace with dequant_matvec from kernels */
dequant_matvec(q, &layer->q_proj, input);
dequant_matvec(k, &layer->k_proj, input);
dequant_matvec(v, &layer->v_proj, input);
/* Apply RoPE to Q and K */
for (int h = 0; h < n_heads; h++) {
apply_rope(q + h * head_dim, head_dim, position, 10000.0f);
}
for (int h = 0; h < n_kv_heads; h++) {
apply_rope(k + h * head_dim, head_dim, position, 10000.0f);
}
/* Store K, V in cache */
size_t kv_offset = (size_t)position * n_kv_heads * head_dim;
size_t layer_offset = (size_t)layer_idx * cache->max_seq_len * n_kv_heads * head_dim;
memcpy(cache->key_cache + layer_offset + kv_offset, k, n_kv_heads * head_dim * sizeof(float));
memcpy(cache->value_cache + layer_offset + kv_offset, v, n_kv_heads * head_dim * sizeof(float));
/* Compute attention scores for each head */
float scale = 1.0f / sqrtf((float)head_dim);
for (int h = 0; h < n_heads; h++) {
int kv_h = h / kv_group; /* GQA: which KV head this Q head uses */
float *q_h = q + h * head_dim;
/* Attention scores: dot(q, all cached keys) */
float *scores = malloc((position + 1) * sizeof(float));
float max_score = -1e30f;
for (int t = 0; t <= position; t++) {
float *k_t = cache->key_cache + layer_offset + (size_t)t * n_kv_heads * head_dim + kv_h * head_dim;
float score = 0.0f;
for (int d = 0; d < head_dim; d++) {
score += q_h[d] * k_t[d];
}
score *= scale;
scores[t] = score;
if (score > max_score) max_score = score;
}
/* Softmax */
float sum = 0.0f;
for (int t = 0; t <= position; t++) {
scores[t] = expf(scores[t] - max_score);
sum += scores[t];
}
for (int t = 0; t <= position; t++) {
scores[t] /= sum;
}
/* Weighted sum of values */
float *out_h = attn_out + h * head_dim;
for (int t = 0; t <= position; t++) {
float *v_t = cache->value_cache + layer_offset + (size_t)t * n_kv_heads * head_dim + kv_h * head_dim;
for (int d = 0; d < head_dim; d++) {
out_h[d] += scores[t] * v_t[d];
}
}
free(scores);
}
/* Output projection */
dequant_matvec(output, &layer->o_proj, attn_out);
free(q);
free(k);
free(v);
free(attn_out);
}
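/*
 * Cost note: the token at position p costs O((p + 1) * n_heads * head_dim)
 * multiply-adds for scores plus the same again for the value mix, on top of
 * the four projection matvecs. The per-head malloc of `scores` is another
 * candidate for the pre-allocated scratch mentioned in the TODO above.
 */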
''')
# ═══════════════════════════════════════════════════════════════════════════════
# engine/runtime/transformer.c - Full transformer block
# ═══════════════════════════════════════════════════════════════════════════════
with open("engine/runtime/transformer.c", "w") as f:
f.write('''#include "model.h"
#include <math.h>
#include <stdlib.h>
#include <string.h>
/*
* Full transformer decoder block:
* residual = x
* x = rmsnorm(x)
* x = attention(x) + residual
* residual = x
* x = rmsnorm(x)
* x = mlp(x) + residual
*/
/* External kernel declarations */
extern void lila_rmsnorm_avx2(float *out, const float *x, const float *weight, int size, float eps);
extern void lila_attention(float *output, const float *input, LilaLayer *layer,
LilaKVCache *cache, int layer_idx, int position);
extern void dequant_matvec(float *out, const LilaQuantWeight *w, const float *vec);
/* SiLU activation (will be assembly in Phase 4) */
static inline float silu_f(float x) {
return x / (1.0f + expf(-x));
}
/* MLP: gate_proj + up_proj → SiLU(gate) * up → down_proj */
static void lila_mlp(float *output, const float *input, LilaLayer *layer) {
int hidden = layer->hidden_size;
int inter = layer->intermediate_size;
float *gate = malloc(inter * sizeof(float));
float *up = malloc(inter * sizeof(float));
/* Gate and up projections */
dequant_matvec(gate, &layer->gate_proj, input);
dequant_matvec(up, &layer->up_proj, input);
/* SiLU(gate) * up */
for (int i = 0; i < inter; i++) {
gate[i] = silu_f(gate[i]) * up[i];
}
/* Down projection */
dequant_matvec(output, &layer->down_proj, gate);
free(gate);
free(up);
}
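/*
 * Scratch sketch (illustrative sizes): with intermediate=16384, gate and up
 * are 16384 floats each, i.e. 2 * 64 KiB of fp32 scratch per token, plus
 * three dequant_matvec calls per MLP.
 */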
/* Memory Fabric contribution (multi-LoRA gated adapters) */
static void lila_memory_fabric(float *output, const float *input, LilaMemoryFabric *fabric,
int in_features, int out_features) {
/* For each active namespace adapter, compute gated LoRA correction */
for (int ns = 0; ns < LILA_N_NAMESPACES; ns++) {
LilaLoRA *adapter = &fabric->adapters[ns];
if (adapter->gate < 0.01f || adapter->A == NULL) continue;
int r = adapter->rank;
/* Compute: gate * (input @ A) @ B */
float *mid = calloc(r, sizeof(float));
/* mid = input @ A [in_features] @ [in_features, r] → [r] */
for (int j = 0; j < r; j++) {
float sum = 0.0f;
for (int i = 0; i < in_features; i++) {
sum += input[i] * adapter->A[i * r + j];
}
mid[j] = sum;
}
/* output += gate * (mid @ B) [r] @ [r, out_features] → [out_features] */
float scale = adapter->gate * (32.0f / r); /* gate * (alpha / rank), alpha fixed at 32 */
for (int i = 0; i < out_features; i++) {
float sum = 0.0f;
for (int j = 0; j < r; j++) {
sum += mid[j] * adapter->B[j * out_features + i];
}
output[i] += sum * scale;
}
free(mid);
}
}
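/*
 * Adapter math sketch (illustrative, assuming rank r=16 and in=out=4096):
 * delta = gate * (alpha / r) * ((x @ A) @ B) with A:[in, r] and B:[r, out],
 * so each active namespace adds roughly (in + out) * r ~= 131K multiply-adds
 * per call, small next to the dense projections it corrects.
 */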
/* Full transformer block forward pass */
void lila_transformer_block(
float *hidden_state, /* [hidden_size], modified in place */
LilaLayer *layer,
LilaKVCache *cache,
int layer_idx,
int position
) {
int hidden = layer->hidden_size;
float *residual = malloc(hidden * sizeof(float));
float *normed = malloc(hidden * sizeof(float));
float *attn_out = malloc(hidden * sizeof(float));
float *mlp_out = malloc(hidden * sizeof(float));
/* ── Pre-attention norm ── */
memcpy(residual, hidden_state, hidden * sizeof(float));
lila_rmsnorm_avx2(normed, hidden_state, layer->input_layernorm, hidden, 1e-6f);
/* ── Attention ── */
lila_attention(attn_out, normed, layer, cache, layer_idx, position);
/* ── Add Memory Fabric to attention output ── */
lila_memory_fabric(attn_out, normed, &layer->fabric, hidden, hidden);
/* ── Residual connection ── */
for (int i = 0; i < hidden; i++) hidden_state[i] = residual[i] + attn_out[i];
/* ── Pre-MLP norm ── */
memcpy(residual, hidden_state, hidden * sizeof(float));
lila_rmsnorm_avx2(normed, hidden_state, layer->post_attention_layernorm, hidden, 1e-6f);
/* ── MLP ── */
lila_mlp(mlp_out, normed, layer);
/* ── Residual connection ── */
for (int i = 0; i < hidden; i++) hidden_state[i] = residual[i] + mlp_out[i];
free(residual);
free(normed);
free(attn_out);
free(mlp_out);
}
/* Full model forward pass β€” single token */
int lila_forward(LilaModel *model, int token, int position) {
int hidden = model->hidden_size;
/* Token embedding */
float *hidden_state = malloc(hidden * sizeof(float));
memcpy(hidden_state, model->token_embedding + (size_t)token * hidden,
hidden * sizeof(float));
/* Transformer layers */
for (int l = 0; l < model->n_layers; l++) {
lila_transformer_block(hidden_state, &model->layers[l],
&model->kv_cache, l, position);
}
/* Final norm */
float *normed = malloc(hidden * sizeof(float));
lila_rmsnorm_avx2(normed, hidden_state, model->final_norm, hidden, 1e-6f);
/* LM head: project to vocab logits */
float *logits = malloc(model->vocab_size * sizeof(float));
/* matvec: logits = lm_head @ normed */
/* lm_head is [vocab_size, hidden_size] */
for (int i = 0; i < model->vocab_size; i++) {
float sum = 0.0f;
for (int j = 0; j < hidden; j++) {
sum += model->lm_head[i * hidden + j] * normed[j];
}
logits[i] = sum;
}
/* Sample */
/* Greedy for now; temperature sampling in Phase 4 */
int next_token = 0;
float max_val = logits[0];
for (int i = 1; i < model->vocab_size; i++) {
if (logits[i] > max_val) { max_val = logits[i]; next_token = i; }
}
free(hidden_state);
free(normed);
free(logits);
return next_token;
}
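/*
 * Generation loop sketch (illustrative only; model/tokenizer loading and the
 * identifiers bos_token_id and max_new_tokens are assumed, not defined here):
 *
 *   int tok = bos_token_id;
 *   for (int pos = 0; pos < max_new_tokens; pos++) {
 *       tok = lila_forward(model, tok, pos);
 *       fputs(lila_decode_token(tokenizer, tok), stdout);
 *   }
 *
 * Prompt tokens would first be pushed through lila_forward() at increasing
 * positions (ignoring the sampled ids) so the KV cache is warm before
 * free-running generation starts.
 */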
''')
# ═══════════════════════════════════════════════════════════════════════════════
# engine/runtime/tokenizer.c - BPE Tokenizer
# ═══════════════════════════════════════════════════════════════════════════════
with open("engine/runtime/tokenizer.c", "w") as f:
f.write('''#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
* BPE Tokenizer for Gemma/LLaMA-family models.
* Loads sentencepiece vocabulary and performs encoding/decoding.
*
* For full functionality, this would need:
* 1. Load .model file (protobuf) or vocab.json
* 2. BPE merge rules
* 3. Byte-fallback for unknown characters
*
* Phase 1: Load vocab from a simple text format (one token per line).
* Phase 4: Full sentencepiece compatibility.
*/
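/*
 * Illustrative vocab file layout (line number = token id); the actual special
 * token names depend on the exported model:
 *   <pad>
 *   <bos>
 *   <eos>
 *   a
 *   ...
 */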
#define MAX_VOCAB 128000
#define MAX_TOKEN_LEN 128
typedef struct LilaTokenizer {
char **tokens; /* Array of token strings */
int vocab_size;
/* TODO: merge rules, scores */
} LilaTokenizer;
LilaTokenizer *lila_load_tokenizer(const char *vocab_path) {
LilaTokenizer *tok = calloc(1, sizeof(LilaTokenizer));
tok->tokens = calloc(MAX_VOCAB, sizeof(char *));
FILE *f = fopen(vocab_path, "r");
if (!f) {
fprintf(stderr, "Cannot open vocab: %s\\n", vocab_path);
free(tok->tokens);
free(tok);
return NULL;
}
char line[MAX_TOKEN_LEN];
int i = 0;
while (fgets(line, sizeof(line), f) && i < MAX_VOCAB) {
line[strcspn(line, "\\n")] = 0;
tok->tokens[i] = strdup(line);
i++;
}
tok->vocab_size = i;
fclose(f);
fprintf(stderr, "Tokenizer loaded: %d tokens\\n", tok->vocab_size);
return tok;
}
/* Decode token ID to string */
const char *lila_decode_token(LilaTokenizer *tok, int token_id) {
if (token_id < 0 || token_id >= tok->vocab_size) return "<unk>";
return tok->tokens[token_id];
}
/* Simple encode (character-level fallback; full BPE in Phase 4) */
int lila_encode_char(LilaTokenizer *tok, char c) {
/* Search for single-character token */
char target[2] = {c, 0};
for (int i = 0; i < tok->vocab_size; i++) {
if (tok->tokens[i] && strcmp(tok->tokens[i], target) == 0) {
return i;
}
}
return 0; /* unknown → first token */
}
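/*
 * Usage sketch (illustrative; `prompt`, `ids`, `n` are assumed locals):
 *   for (const char *p = prompt; *p; p++) ids[n++] = lila_encode_char(tok, *p);
 * Good enough to smoke-test decode round-trips until real BPE lands in Phase 4.
 */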
void lila_free_tokenizer(LilaTokenizer *tok) {
if (!tok) return;
for (int i = 0; i < tok->vocab_size; i++) {
free(tok->tokens[i]);
}
free(tok->tokens);
free(tok);
}
''')
# ═══════════════════════════════════════════════════════════════════════════════
# engine/runtime/tokenizer.h
# ═══════════════════════════════════════════════════════════════════════════════
with open("engine/runtime/tokenizer.h", "w") as f:
f.write('''#ifndef LILA_TOKENIZER_H
#define LILA_TOKENIZER_H
typedef struct LilaTokenizer LilaTokenizer;
LilaTokenizer *lila_load_tokenizer(const char *vocab_path);
const char *lila_decode_token(LilaTokenizer *tok, int token_id);
int lila_encode_char(LilaTokenizer *tok, char c);
void lila_free_tokenizer(LilaTokenizer *tok);
#endif
''')
# ═══════════════════════════════════════════════════════════════════════════════
# engine/runtime/attention.h
# ═══════════════════════════════════════════════════════════════════════════════
with open("engine/runtime/attention.h", "w") as f:
f.write('''#ifndef LILA_ATTENTION_H
#define LILA_ATTENTION_H
#include "model.h"
void lila_init_kv_cache(LilaKVCache *cache, int n_layers, int max_seq,
int n_kv_heads, int head_dim);
void lila_attention(float *output, const float *input, LilaLayer *layer,
LilaKVCache *cache, int layer_idx, int position);
#endif
''')
# ═══════════════════════════════════════════════════════════════════════════════
# engine/runtime/transformer.h
# ═══════════════════════════════════════════════════════════════════════════════
with open("engine/runtime/transformer.h", "w") as f:
f.write('''#ifndef LILA_TRANSFORMER_H
#define LILA_TRANSFORMER_H
#include "model.h"
void lila_transformer_block(float *hidden_state, LilaLayer *layer,
LilaKVCache *cache, int layer_idx, int position);
int lila_forward(LilaModel *model, int token, int position);
#endif
''')
# Commit and push
subprocess.run(["git", "add", "-A"], check=True)
subprocess.run(["git", "commit", "-m",
"Engine Phase 2: Full transformer forward pass\n\n"
"runtime/attention.c:\n"
" - Multi-head attention with Grouped Query Attention (GQA)\n"
" - Rotary Position Embeddings (RoPE)\n"
" - KV Cache for autoregressive generation\n"
" - Memory Fabric (multi-LoRA) integrated into attention\n\n"
"runtime/transformer.c:\n"
" - Full decoder block: norm β†’ attention β†’ residual β†’ norm β†’ MLP β†’ residual\n"
" - Memory Fabric adapter contribution added to attention output\n"
" - lila_forward(): complete single-token forward pass\n"
" - Token embedding β†’ N layers β†’ final norm β†’ LM head β†’ sample\n\n"
"runtime/tokenizer.c:\n"
" - Vocab loading from text file\n"
" - Token decode (ID β†’ string)\n"
" - Character-level encode fallback (full BPE in Phase 4)\n\n"
"The full inference path is structurally complete.\n"
"Remaining: wire format converter to produce loadable .lila files,\n"
"then test end-to-end token generation."],
check=True)
subprocess.run(["git", "push", "origin", "main"], check=True)
print("✅ Engine Phase 2 pushed!")