#!/usr/bin/env python3 """Push transformer forward pass, attention, tokenizer to Lila engine.""" import subprocess, os TOKEN = "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" subprocess.run(["git", "clone", f"https://{TOKEN}@github.com/ticketguy/Lila.git", "/app/lila"], check=True) os.chdir("/app/lila") subprocess.run(["git", "config", "user.name", "0xticketguy"], check=True) subprocess.run(["git", "config", "user.email", "0xticketguy@harboria.dev"], check=True) # ═══════════════════════════════════════════════════════════════════════════════ # engine/runtime/attention.c — Multi-Head Attention with RoPE # ═══════════════════════════════════════════════════════════════════════════════ with open("engine/runtime/attention.c", "w") as f: f.write('''#include "model.h" #include #include #include /* * Multi-Head Attention with Rotary Position Embeddings (RoPE) * and KV Cache for efficient autoregressive generation. * * For Gemma 4B: n_heads=16, n_kv_heads=8 (GQA), head_dim=256 * GQA: key/value heads are shared across query head groups */ /* Apply RoPE to a single head vector */ static void apply_rope(float *vec, int head_dim, int position, float theta) { for (int i = 0; i < head_dim; i += 2) { float freq = 1.0f / powf(theta, (float)i / head_dim); float angle = position * freq; float cos_a = cosf(angle); float sin_a = sinf(angle); float v0 = vec[i]; float v1 = vec[i + 1]; vec[i] = v0 * cos_a - v1 * sin_a; vec[i + 1] = v0 * sin_a + v1 * cos_a; } } /* Initialize KV cache */ void lila_init_kv_cache(LilaKVCache *cache, int n_layers, int max_seq, int n_kv_heads, int head_dim) { cache->max_seq_len = max_seq; cache->current_pos = 0; size_t layer_size = (size_t)max_seq * n_kv_heads * head_dim * sizeof(float); cache->key_cache = calloc(n_layers, layer_size); cache->value_cache = calloc(n_layers, layer_size); } /* Single-token attention (for autoregressive generation) */ void lila_attention( float *output, /* [hidden_size] */ const float *input, /* [hidden_size] */ LilaLayer *layer, LilaKVCache *cache, int layer_idx, int position ) { int hidden = layer->hidden_size; int n_heads = layer->n_heads; int n_kv_heads = layer->n_kv_heads; int head_dim = layer->head_dim; int kv_group = n_heads / n_kv_heads; /* GQA group size */ /* Allocate scratch (TODO: pre-allocate in model struct) */ float *q = malloc(hidden * sizeof(float)); float *k = malloc(n_kv_heads * head_dim * sizeof(float)); float *v = malloc(n_kv_heads * head_dim * sizeof(float)); float *attn_out = calloc(hidden, sizeof(float)); /* Project Q, K, V using quantized weights */ /* TODO: replace with dequant_matvec from kernels */ dequant_matvec(q, &layer->q_proj, input); dequant_matvec(k, &layer->k_proj, input); dequant_matvec(v, &layer->v_proj, input); /* Apply RoPE to Q and K */ for (int h = 0; h < n_heads; h++) { apply_rope(q + h * head_dim, head_dim, position, 10000.0f); } for (int h = 0; h < n_kv_heads; h++) { apply_rope(k + h * head_dim, head_dim, position, 10000.0f); } /* Store K, V in cache */ size_t kv_offset = (size_t)position * n_kv_heads * head_dim; size_t layer_offset = (size_t)layer_idx * cache->max_seq_len * n_kv_heads * head_dim; memcpy(cache->key_cache + layer_offset + kv_offset, k, n_kv_heads * head_dim * sizeof(float)); memcpy(cache->value_cache + layer_offset + kv_offset, v, n_kv_heads * head_dim * sizeof(float)); /* Compute attention scores for each head */ float scale = 1.0f / sqrtf((float)head_dim); for (int h = 0; h < n_heads; h++) { int kv_h = h / kv_group; /* GQA: which KV head this Q head uses */ float *q_h = q + h * head_dim; /* Attention scores: dot(q, all cached keys) */ float *scores = malloc((position + 1) * sizeof(float)); float max_score = -1e30f; for (int t = 0; t <= position; t++) { float *k_t = cache->key_cache + layer_offset + (size_t)t * n_kv_heads * head_dim + kv_h * head_dim; float score = 0.0f; for (int d = 0; d < head_dim; d++) { score += q_h[d] * k_t[d]; } score *= scale; scores[t] = score; if (score > max_score) max_score = score; } /* Softmax */ float sum = 0.0f; for (int t = 0; t <= position; t++) { scores[t] = expf(scores[t] - max_score); sum += scores[t]; } for (int t = 0; t <= position; t++) { scores[t] /= sum; } /* Weighted sum of values */ float *out_h = attn_out + h * head_dim; for (int t = 0; t <= position; t++) { float *v_t = cache->value_cache + layer_offset + (size_t)t * n_kv_heads * head_dim + kv_h * head_dim; for (int d = 0; d < head_dim; d++) { out_h[d] += scores[t] * v_t[d]; } } free(scores); } /* Output projection */ dequant_matvec(output, &layer->o_proj, attn_out); free(q); free(k); free(v); free(attn_out); } /* Forward declaration for dequant_matvec (defined in inference.c) */ extern void dequant_matvec(float *out, const LilaQuantWeight *w, const float *vec); ''') # ═══════════════════════════════════════════════════════════════════════════════ # engine/runtime/transformer.c — Full transformer block # ═══════════════════════════════════════════════════════════════════════════════ with open("engine/runtime/transformer.c", "w") as f: f.write('''#include "model.h" #include #include #include /* * Full transformer decoder block: * residual = x * x = rmsnorm(x) * x = attention(x) + residual * residual = x * x = rmsnorm(x) * x = mlp(x) + residual */ /* External kernel declarations */ extern void lila_rmsnorm_avx2(float *out, const float *x, const float *weight, int size, float eps); extern void lila_attention(float *output, const float *input, LilaLayer *layer, LilaKVCache *cache, int layer_idx, int position); extern void dequant_matvec(float *out, const LilaQuantWeight *w, const float *vec); /* SiLU activation (will be assembly in Phase 4) */ static inline float silu_f(float x) { return x / (1.0f + expf(-x)); } /* MLP: gate_proj + up_proj → SiLU(gate) * up → down_proj */ static void lila_mlp(float *output, const float *input, LilaLayer *layer) { int hidden = layer->hidden_size; int inter = layer->intermediate_size; float *gate = malloc(inter * sizeof(float)); float *up = malloc(inter * sizeof(float)); /* Gate and up projections */ dequant_matvec(gate, &layer->gate_proj, input); dequant_matvec(up, &layer->up_proj, input); /* SiLU(gate) * up */ for (int i = 0; i < inter; i++) { gate[i] = silu_f(gate[i]) * up[i]; } /* Down projection */ dequant_matvec(output, &layer->down_proj, gate); free(gate); free(up); } /* Memory Fabric contribution (multi-LoRA gated adapters) */ static void lila_memory_fabric(float *output, const float *input, LilaMemoryFabric *fabric, int in_features, int out_features) { /* For each active namespace adapter, compute gated LoRA correction */ for (int ns = 0; ns < LILA_N_NAMESPACES; ns++) { LilaLoRA *adapter = &fabric->adapters[ns]; if (adapter->gate < 0.01f || adapter->A == NULL) continue; int r = adapter->rank; /* Compute: gate * (input @ A) @ B */ float *mid = calloc(r, sizeof(float)); /* mid = input @ A [in_features] @ [in_features, r] → [r] */ for (int j = 0; j < r; j++) { float sum = 0.0f; for (int i = 0; i < in_features; i++) { sum += input[i] * adapter->A[i * r + j]; } mid[j] = sum; } /* output += gate * (mid @ B) [r] @ [r, out_features] → [out_features] */ float scale = adapter->gate * (32.0f / r); /* alpha/rank */ for (int i = 0; i < out_features; i++) { float sum = 0.0f; for (int j = 0; j < r; j++) { sum += mid[j] * adapter->B[j * out_features + i]; } output[i] += sum * scale; } free(mid); } } /* Full transformer block forward pass */ void lila_transformer_block( float *hidden_state, /* [hidden_size] — modified in place */ LilaLayer *layer, LilaKVCache *cache, int layer_idx, int position ) { int hidden = layer->hidden_size; float *residual = malloc(hidden * sizeof(float)); float *normed = malloc(hidden * sizeof(float)); float *attn_out = malloc(hidden * sizeof(float)); float *mlp_out = malloc(hidden * sizeof(float)); /* ── Pre-attention norm ── */ memcpy(residual, hidden_state, hidden * sizeof(float)); lila_rmsnorm_avx2(normed, hidden_state, layer->input_layernorm, hidden, 1e-6f); /* ── Attention ── */ lila_attention(attn_out, normed, layer, cache, layer_idx, position); /* ── Add Memory Fabric to attention output ── */ lila_memory_fabric(attn_out, normed, &layer->fabric, hidden, hidden); /* ── Residual connection ── */ for (int i = 0; i < hidden; i++) hidden_state[i] = residual[i] + attn_out[i]; /* ── Pre-MLP norm ── */ memcpy(residual, hidden_state, hidden * sizeof(float)); lila_rmsnorm_avx2(normed, hidden_state, layer->post_attention_layernorm, hidden, 1e-6f); /* ── MLP ── */ lila_mlp(mlp_out, normed, layer); /* ── Residual connection ── */ for (int i = 0; i < hidden; i++) hidden_state[i] = residual[i] + mlp_out[i]; free(residual); free(normed); free(attn_out); free(mlp_out); } /* Full model forward pass — single token */ int lila_forward(LilaModel *model, int token, int position) { int hidden = model->hidden_size; /* Token embedding */ float *hidden_state = malloc(hidden * sizeof(float)); memcpy(hidden_state, model->token_embedding + (size_t)token * hidden, hidden * sizeof(float)); /* Transformer layers */ for (int l = 0; l < model->n_layers; l++) { lila_transformer_block(hidden_state, &model->layers[l], &model->kv_cache, l, position); } /* Final norm */ float *normed = malloc(hidden * sizeof(float)); lila_rmsnorm_avx2(normed, hidden_state, model->final_norm, hidden, 1e-6f); /* LM head: project to vocab logits */ float *logits = malloc(model->vocab_size * sizeof(float)); /* matvec: logits = lm_head @ normed */ /* lm_head is [vocab_size, hidden_size] */ for (int i = 0; i < model->vocab_size; i++) { float sum = 0.0f; for (int j = 0; j < hidden; j++) { sum += model->lm_head[i * hidden + j] * normed[j]; } logits[i] = sum; } /* Sample */ /* Greedy for now — temperature sampling in Phase 4 */ int next_token = 0; float max_val = logits[0]; for (int i = 1; i < model->vocab_size; i++) { if (logits[i] > max_val) { max_val = logits[i]; next_token = i; } } free(hidden_state); free(normed); free(logits); return next_token; } ''') # ═══════════════════════════════════════════════════════════════════════════════ # engine/runtime/tokenizer.c — BPE Tokenizer # ═══════════════════════════════════════════════════════════════════════════════ with open("engine/runtime/tokenizer.c", "w") as f: f.write('''#include #include #include /* * BPE Tokenizer for Gemma/LLaMA-family models. * Loads sentencepiece vocabulary and performs encoding/decoding. * * For full functionality, this would need: * 1. Load .model file (protobuf) or vocab.json * 2. BPE merge rules * 3. Byte-fallback for unknown characters * * Phase 1: Load vocab from a simple text format (one token per line). * Phase 4: Full sentencepiece compatibility. */ #define MAX_VOCAB 128000 #define MAX_TOKEN_LEN 128 typedef struct { char **tokens; /* Array of token strings */ int vocab_size; /* TODO: merge rules, scores */ } LilaTokenizer; LilaTokenizer *lila_load_tokenizer(const char *vocab_path) { LilaTokenizer *tok = calloc(1, sizeof(LilaTokenizer)); tok->tokens = calloc(MAX_VOCAB, sizeof(char *)); FILE *f = fopen(vocab_path, "r"); if (!f) { fprintf(stderr, "Cannot open vocab: %s\\n", vocab_path); free(tok->tokens); free(tok); return NULL; } char line[MAX_TOKEN_LEN]; int i = 0; while (fgets(line, sizeof(line), f) && i < MAX_VOCAB) { line[strcspn(line, "\\n")] = 0; tok->tokens[i] = strdup(line); i++; } tok->vocab_size = i; fclose(f); fprintf(stderr, "Tokenizer loaded: %d tokens\\n", tok->vocab_size); return tok; } /* Decode token ID to string */ const char *lila_decode_token(LilaTokenizer *tok, int token_id) { if (token_id < 0 || token_id >= tok->vocab_size) return ""; return tok->tokens[token_id]; } /* Simple encode (character-level fallback — full BPE in Phase 4) */ int lila_encode_char(LilaTokenizer *tok, char c) { /* Search for single-character token */ char target[2] = {c, 0}; for (int i = 0; i < tok->vocab_size; i++) { if (tok->tokens[i] && strcmp(tok->tokens[i], target) == 0) { return i; } } return 0; /* unknown → first token */ } void lila_free_tokenizer(LilaTokenizer *tok) { if (!tok) return; for (int i = 0; i < tok->vocab_size; i++) { free(tok->tokens[i]); } free(tok->tokens); free(tok); } ''') # ═══════════════════════════════════════════════════════════════════════════════ # engine/runtime/tokenizer.h # ═══════════════════════════════════════════════════════════════════════════════ with open("engine/runtime/tokenizer.h", "w") as f: f.write('''#ifndef LILA_TOKENIZER_H #define LILA_TOKENIZER_H typedef struct LilaTokenizer LilaTokenizer; LilaTokenizer *lila_load_tokenizer(const char *vocab_path); const char *lila_decode_token(LilaTokenizer *tok, int token_id); int lila_encode_char(LilaTokenizer *tok, char c); void lila_free_tokenizer(LilaTokenizer *tok); #endif ''') # ═══════════════════════════════════════════════════════════════════════════════ # engine/runtime/attention.h # ═══════════════════════════════════════════════════════════════════════════════ with open("engine/runtime/attention.h", "w") as f: f.write('''#ifndef LILA_ATTENTION_H #define LILA_ATTENTION_H #include "model.h" void lila_init_kv_cache(LilaKVCache *cache, int n_layers, int max_seq, int n_kv_heads, int head_dim); void lila_attention(float *output, const float *input, LilaLayer *layer, LilaKVCache *cache, int layer_idx, int position); #endif ''') # ═══════════════════════════════════════════════════════════════════════════════ # engine/runtime/transformer.h # ═══════════════════════════════════════════════════════════════════════════════ with open("engine/runtime/transformer.h", "w") as f: f.write('''#ifndef LILA_TRANSFORMER_H #define LILA_TRANSFORMER_H #include "model.h" void lila_transformer_block(float *hidden_state, LilaLayer *layer, LilaKVCache *cache, int layer_idx, int position); int lila_forward(LilaModel *model, int token, int position); #endif ''') # Commit and push subprocess.run(["git", "add", "-A"], check=True) subprocess.run(["git", "commit", "-m", "Engine Phase 2: Full transformer forward pass\n\n" "runtime/attention.c:\n" " - Multi-head attention with Grouped Query Attention (GQA)\n" " - Rotary Position Embeddings (RoPE)\n" " - KV Cache for autoregressive generation\n" " - Memory Fabric (multi-LoRA) integrated into attention\n\n" "runtime/transformer.c:\n" " - Full decoder block: norm → attention → residual → norm → MLP → residual\n" " - Memory Fabric adapter contribution added to attention output\n" " - lila_forward(): complete single-token forward pass\n" " - Token embedding → N layers → final norm → LM head → sample\n\n" "runtime/tokenizer.c:\n" " - Vocab loading from text file\n" " - Token decode (ID → string)\n" " - Character-level encode fallback (full BPE in Phase 4)\n\n" "The full inference path is structurally complete.\n" "Remaining: wire format converter to produce loadable .lila files,\n" "then test end-to-end token generation."], check=True) subprocess.run(["git", "push", "origin", "main"], check=True) print("✅ Engine Phase 2 pushed!")