ticketguy committed · verified
Commit bc38a2c · 1 Parent(s): bb40248

Engine Phase 2: Full transformer forward pass + tokenizer + attention

Files changed (1):
  1. lila_engine_phase2.py +489 -0
lila_engine_phase2.py ADDED
@@ -0,0 +1,489 @@
+ #!/usr/bin/env python3
+ """Push transformer forward pass, attention, tokenizer to Lila engine."""
+ import subprocess, os
+ TOKEN = "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"
+ subprocess.run(["git", "clone", f"https://{TOKEN}@github.com/ticketguy/Lila.git", "/app/lila"], check=True)
+ os.chdir("/app/lila")
+ subprocess.run(["git", "config", "user.name", "0xticketguy"], check=True)
+ subprocess.run(["git", "config", "user.email", "0xticketguy@harboria.dev"], check=True)
+
+ # ═══════════════════════════════════════════════════════════════════════════════
+ # engine/runtime/attention.c — Multi-Head Attention with RoPE
+ # ═══════════════════════════════════════════════════════════════════════════════
+ with open("engine/runtime/attention.c", "w") as f:
+     f.write('''#include "model.h"
+ #include <math.h>
+ #include <stdlib.h>
+ #include <string.h>
+
+ /* Implemented in inference.c — declared up front so it is visible before use */
+ extern void dequant_matvec(float *out, const LilaQuantWeight *w, const float *vec);
+
+ /*
+  * Multi-Head Attention with Rotary Position Embeddings (RoPE)
+  * and KV Cache for efficient autoregressive generation.
+  *
+  * For Gemma 4B: n_heads=16, n_kv_heads=8 (GQA), head_dim=256
+  * GQA: key/value heads are shared across query head groups
+  */
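+
+ /*
+  * RoPE in one line of math (restating apply_rope below): pair index i uses
+  *   freq_i = theta^(-i/head_dim)  (i even), angle = position * freq_i,
+  * and each pair (v[i], v[i+1]) is rotated by that angle:
+  *   v[i]   <- v[i]*cos(angle) - v[i+1]*sin(angle)
+  *   v[i+1] <- v[i]*sin(angle) + v[i+1]*cos(angle)
+  * Rotating Q and K this way makes dot(q, k) depend only on relative position.
+  */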
+
+ /* Apply RoPE to a single head vector */
+ static void apply_rope(float *vec, int head_dim, int position, float theta) {
+     for (int i = 0; i < head_dim; i += 2) {
+         float freq = 1.0f / powf(theta, (float)i / head_dim);
+         float angle = position * freq;
+         float cos_a = cosf(angle);
+         float sin_a = sinf(angle);
+
+         float v0 = vec[i];
+         float v1 = vec[i + 1];
+         vec[i] = v0 * cos_a - v1 * sin_a;
+         vec[i + 1] = v0 * sin_a + v1 * cos_a;
+     }
+ }
+
+ /* Initialize KV cache */
+ void lila_init_kv_cache(LilaKVCache *cache, int n_layers, int max_seq,
+                         int n_kv_heads, int head_dim) {
+     cache->max_seq_len = max_seq;
+     cache->current_pos = 0;
+
+     size_t layer_size = (size_t)max_seq * n_kv_heads * head_dim * sizeof(float);
+     cache->key_cache = calloc(n_layers, layer_size);
+     cache->value_cache = calloc(n_layers, layer_size);
+ }
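+
+ /*
+  * Cache layout is [layer][position][kv_head][head_dim] as flat floats.
+  * Rough size with the head counts assumed above (n_kv_heads=8, head_dim=256)
+  * and max_seq=4096: 4096 * 8 * 256 * 4 bytes = 32 MiB per layer for K, and
+  * the same again for V — the motivation for a quantized cache in a later phase.
+  */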
+
+ /* Single-token attention (for autoregressive generation) */
+ void lila_attention(
+     float *output,       /* [hidden_size] */
+     const float *input,  /* [hidden_size] */
+     LilaLayer *layer,
+     LilaKVCache *cache,
+     int layer_idx,
+     int position
+ ) {
+     int hidden = layer->hidden_size;
+     int n_heads = layer->n_heads;
+     int n_kv_heads = layer->n_kv_heads;
+     int head_dim = layer->head_dim;
+     int kv_group = n_heads / n_kv_heads; /* GQA group size */
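+     /* Example with the counts above: n_heads=16, n_kv_heads=8 gives
+        kv_group=2, so query heads {0,1} share KV head 0, {2,3} share
+        KV head 1, and so on — halving KV cache size vs. full MHA. */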
+
+     /* Allocate scratch (TODO: pre-allocate in model struct) */
+     int q_dim = n_heads * head_dim; /* query width; may differ from hidden_size */
+     float *q = malloc(q_dim * sizeof(float));
+     float *k = malloc(n_kv_heads * head_dim * sizeof(float));
+     float *v = malloc(n_kv_heads * head_dim * sizeof(float));
+     float *attn_out = calloc(q_dim, sizeof(float));
+
+     /* Project Q, K, V using the quantized matvec kernel */
+     dequant_matvec(q, &layer->q_proj, input);
+     dequant_matvec(k, &layer->k_proj, input);
+     dequant_matvec(v, &layer->v_proj, input);
+
+     /* Apply RoPE to Q and K */
+     for (int h = 0; h < n_heads; h++) {
+         apply_rope(q + h * head_dim, head_dim, position, 10000.0f);
+     }
+     for (int h = 0; h < n_kv_heads; h++) {
+         apply_rope(k + h * head_dim, head_dim, position, 10000.0f);
+     }
+
+     /* Store K, V in cache */
+     size_t kv_offset = (size_t)position * n_kv_heads * head_dim;
+     size_t layer_offset = (size_t)layer_idx * cache->max_seq_len * n_kv_heads * head_dim;
+     memcpy(cache->key_cache + layer_offset + kv_offset, k, n_kv_heads * head_dim * sizeof(float));
+     memcpy(cache->value_cache + layer_offset + kv_offset, v, n_kv_heads * head_dim * sizeof(float));
+
+     /* Compute attention scores for each head */
+     float scale = 1.0f / sqrtf((float)head_dim);
+
+     for (int h = 0; h < n_heads; h++) {
+         int kv_h = h / kv_group; /* GQA: which KV head this Q head uses */
+         float *q_h = q + h * head_dim;
+
+         /* Attention scores: dot(q, all cached keys) */
+         float *scores = malloc((position + 1) * sizeof(float));
+         float max_score = -1e30f;
+
+         for (int t = 0; t <= position; t++) {
+             float *k_t = cache->key_cache + layer_offset + (size_t)t * n_kv_heads * head_dim + kv_h * head_dim;
+             float score = 0.0f;
+             for (int d = 0; d < head_dim; d++) {
+                 score += q_h[d] * k_t[d];
+             }
+             score *= scale;
+             scores[t] = score;
+             if (score > max_score) max_score = score;
+         }
+
+         /* Softmax (max-subtracted for numerical stability) */
+         float sum = 0.0f;
+         for (int t = 0; t <= position; t++) {
+             scores[t] = expf(scores[t] - max_score);
+             sum += scores[t];
+         }
+         for (int t = 0; t <= position; t++) {
+             scores[t] /= sum;
+         }
+
+         /* Weighted sum of values */
+         float *out_h = attn_out + h * head_dim;
+         for (int t = 0; t <= position; t++) {
+             float *v_t = cache->value_cache + layer_offset + (size_t)t * n_kv_heads * head_dim + kv_h * head_dim;
+             for (int d = 0; d < head_dim; d++) {
+                 out_h[d] += scores[t] * v_t[d];
+             }
+         }
+
+         free(scores);
+     }
+
+     /* Output projection back to hidden_size */
+     dequant_matvec(output, &layer->o_proj, attn_out);
+
+     free(q);
+     free(k);
+     free(v);
+     free(attn_out);
+ }
+ ''')
+
+ # ═══════════════════════════════════════════════════════════════════════════════
+ # engine/runtime/transformer.c — Full transformer block
+ # ═══════════════════════════════════════════════════════════════════════════════
+ with open("engine/runtime/transformer.c", "w") as f:
+     f.write('''#include "model.h"
+ #include <math.h>
+ #include <stdlib.h>
+ #include <string.h>
+
+ /*
+  * Full transformer decoder block:
+  *   residual = x
+  *   x = rmsnorm(x)
+  *   x = attention(x) + residual
+  *   residual = x
+  *   x = rmsnorm(x)
+  *   x = mlp(x) + residual
+  */
+
+ /* External kernel declarations */
+ extern void lila_rmsnorm_avx2(float *out, const float *x, const float *weight, int size, float eps);
+ extern void lila_attention(float *output, const float *input, LilaLayer *layer,
+                            LilaKVCache *cache, int layer_idx, int position);
+ extern void dequant_matvec(float *out, const LilaQuantWeight *w, const float *vec);
+
+ /* SiLU activation (will be assembly in Phase 4) */
+ static inline float silu_f(float x) {
+     return x / (1.0f + expf(-x));
+ }
+
+ /* MLP: gate_proj + up_proj → SiLU(gate) * up → down_proj */
+ static void lila_mlp(float *output, const float *input, LilaLayer *layer) {
+     int inter = layer->intermediate_size;
+
+     float *gate = malloc(inter * sizeof(float));
+     float *up = malloc(inter * sizeof(float));
+
+     /* Gate and up projections */
+     dequant_matvec(gate, &layer->gate_proj, input);
+     dequant_matvec(up, &layer->up_proj, input);
+
+     /* SiLU(gate) * up */
+     for (int i = 0; i < inter; i++) {
+         gate[i] = silu_f(gate[i]) * up[i];
+     }
+
+     /* Down projection */
+     dequant_matvec(output, &layer->down_proj, gate);
+
+     free(gate);
+     free(up);
+ }
+
+ /* Memory Fabric contribution (multi-LoRA gated adapters) */
+ static void lila_memory_fabric(float *output, const float *input, LilaMemoryFabric *fabric,
+                                int in_features, int out_features) {
+     /* For each active namespace adapter, compute gated LoRA correction */
+     for (int ns = 0; ns < LILA_N_NAMESPACES; ns++) {
+         LilaLoRA *adapter = &fabric->adapters[ns];
+         if (adapter->gate < 0.01f || adapter->A == NULL) continue;
+
+         int r = adapter->rank;
+
+         /* Compute: gate * (input @ A) @ B */
+         float *mid = calloc(r, sizeof(float));
+
+         /* mid = input @ A: [in_features] @ [in_features, r] → [r] */
+         for (int j = 0; j < r; j++) {
+             float sum = 0.0f;
+             for (int i = 0; i < in_features; i++) {
+                 sum += input[i] * adapter->A[i * r + j];
+             }
+             mid[j] = sum;
+         }
+
+         /* output += gate * (mid @ B): [r] @ [r, out_features] → [out_features] */
+         float scale = adapter->gate * (32.0f / r); /* alpha/rank */
+         for (int i = 0; i < out_features; i++) {
+             float sum = 0.0f;
+             for (int j = 0; j < r; j++) {
+                 sum += mid[j] * adapter->B[j * out_features + i];
+             }
+             output[i] += sum * scale;
+         }
+
+         free(mid);
+     }
+ }
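+
+ /*
+  * LoRA refresher for the loop above: each adapter stores A [in, r] and
+  * B [r, out] with r much smaller than in/out, so the correction is
+  *   output += gate * (alpha / r) * (input @ A) @ B
+  * with alpha fixed at 32 here. At rank 16 and in = out = 2560 (illustrative
+  * numbers, not from this file) that is ~82K floats per adapter instead of
+  * the ~6.5M a full [in, out] weight delta would need.
+  */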
+
+ /* Full transformer block forward pass */
+ void lila_transformer_block(
+     float *hidden_state, /* [hidden_size] — modified in place */
+     LilaLayer *layer,
+     LilaKVCache *cache,
+     int layer_idx,
+     int position
+ ) {
+     int hidden = layer->hidden_size;
+     float *residual = malloc(hidden * sizeof(float));
+     float *normed = malloc(hidden * sizeof(float));
+     float *attn_out = malloc(hidden * sizeof(float));
+     float *mlp_out = malloc(hidden * sizeof(float));
+
+     /* ── Pre-attention norm ── */
+     memcpy(residual, hidden_state, hidden * sizeof(float));
+     lila_rmsnorm_avx2(normed, hidden_state, layer->input_layernorm, hidden, 1e-6f);
+
+     /* ── Attention ── */
+     lila_attention(attn_out, normed, layer, cache, layer_idx, position);
+
+     /* ── Add Memory Fabric to attention output ── */
+     lila_memory_fabric(attn_out, normed, &layer->fabric, hidden, hidden);
+
+     /* ── Residual connection ── */
+     for (int i = 0; i < hidden; i++) hidden_state[i] = residual[i] + attn_out[i];
+
+     /* ── Pre-MLP norm ── */
+     memcpy(residual, hidden_state, hidden * sizeof(float));
+     lila_rmsnorm_avx2(normed, hidden_state, layer->post_attention_layernorm, hidden, 1e-6f);
+
+     /* ── MLP ── */
+     lila_mlp(mlp_out, normed, layer);
+
+     /* ── Residual connection ── */
+     for (int i = 0; i < hidden; i++) hidden_state[i] = residual[i] + mlp_out[i];
+
+     free(residual);
+     free(normed);
+     free(attn_out);
+     free(mlp_out);
+ }
+
+ /* Full model forward pass — single token */
+ int lila_forward(LilaModel *model, int token, int position) {
+     int hidden = model->hidden_size;
+
+     /* Token embedding */
+     float *hidden_state = malloc(hidden * sizeof(float));
+     memcpy(hidden_state, model->token_embedding + (size_t)token * hidden,
+            hidden * sizeof(float));
+
+     /* Transformer layers */
+     for (int l = 0; l < model->n_layers; l++) {
+         lila_transformer_block(hidden_state, &model->layers[l],
+                                &model->kv_cache, l, position);
+     }
+
+     /* Final norm */
+     float *normed = malloc(hidden * sizeof(float));
+     lila_rmsnorm_avx2(normed, hidden_state, model->final_norm, hidden, 1e-6f);
+
+     /* LM head: project to vocab logits */
+     float *logits = malloc(model->vocab_size * sizeof(float));
+
+     /* matvec: logits = lm_head @ normed; lm_head is [vocab_size, hidden_size] */
+     for (int i = 0; i < model->vocab_size; i++) {
+         float sum = 0.0f;
+         for (int j = 0; j < hidden; j++) {
+             sum += model->lm_head[(size_t)i * hidden + j] * normed[j];
+         }
+         logits[i] = sum;
+     }
+
+     /* Sample: greedy for now — temperature sampling in Phase 4 */
+     int next_token = 0;
+     float max_val = logits[0];
+     for (int i = 1; i < model->vocab_size; i++) {
+         if (logits[i] > max_val) { max_val = logits[i]; next_token = i; }
+     }
+
+     free(hidden_state);
+     free(normed);
+     free(logits);
+
+     return next_token;
+ }
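+
+ /*
+  * Sketch of the decode loop this enables (caller-side; the token constants
+  * are assumptions, not defined in this file):
+  *
+  *   int tok = BOS_TOKEN;
+  *   for (int pos = 0; pos < max_new_tokens; pos++) {
+  *       tok = lila_forward(model, tok, pos);
+  *       if (tok == EOS_TOKEN) break;
+  *       fputs(lila_decode_token(tokenizer, tok), stdout);
+  *   }
+  */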
+ ''')
+
+ # ═══════════════════════════════════════════════════════════════════════════════
+ # engine/runtime/tokenizer.c — BPE Tokenizer
+ # ═══════════════════════════════════════════════════════════════════════════════
+ with open("engine/runtime/tokenizer.c", "w") as f:
+     f.write('''#include <stdio.h>
+ #include <stdlib.h>
+ #include <string.h>
+ #include "tokenizer.h"
+
+ /*
+  * BPE Tokenizer for Gemma/LLaMA-family models.
+  * Loads sentencepiece vocabulary and performs encoding/decoding.
+  *
+  * For full functionality, this would need:
+  *   1. Load .model file (protobuf) or vocab.json
+  *   2. BPE merge rules
+  *   3. Byte-fallback for unknown characters
+  *
+  * Phase 1: Load vocab from a simple text format (one token per line).
+  * Phase 4: Full sentencepiece compatibility.
+  */
+
+ #define MAX_VOCAB 128000
+ #define MAX_TOKEN_LEN 128
+
+ /* Definition of the opaque struct declared in tokenizer.h */
+ struct LilaTokenizer {
+     char **tokens; /* Array of token strings */
+     int vocab_size;
+     /* TODO: merge rules, scores */
+ };
+
+ LilaTokenizer *lila_load_tokenizer(const char *vocab_path) {
+     LilaTokenizer *tok = calloc(1, sizeof(LilaTokenizer));
+     tok->tokens = calloc(MAX_VOCAB, sizeof(char *));
+
+     FILE *f = fopen(vocab_path, "r");
+     if (!f) {
+         fprintf(stderr, "Cannot open vocab: %s\\n", vocab_path);
+         free(tok->tokens);
+         free(tok);
+         return NULL;
+     }
+
+     char line[MAX_TOKEN_LEN];
+     int i = 0;
+     while (fgets(line, sizeof(line), f) && i < MAX_VOCAB) {
+         line[strcspn(line, "\\n")] = 0;
+         tok->tokens[i] = strdup(line);
+         i++;
+     }
+     tok->vocab_size = i;
+     fclose(f);
+
+     fprintf(stderr, "Tokenizer loaded: %d tokens\\n", tok->vocab_size);
+     return tok;
+ }
+
+ /* Decode token ID to string */
+ const char *lila_decode_token(LilaTokenizer *tok, int token_id) {
+     if (token_id < 0 || token_id >= tok->vocab_size) return "<unk>";
+     return tok->tokens[token_id];
+ }
+
+ /* Simple encode (character-level fallback — full BPE in Phase 4) */
+ int lila_encode_char(LilaTokenizer *tok, char c) {
+     /* Search for single-character token */
+     char target[2] = {c, 0};
+     for (int i = 0; i < tok->vocab_size; i++) {
+         if (tok->tokens[i] && strcmp(tok->tokens[i], target) == 0) {
+             return i;
+         }
+     }
+     return 0; /* unknown → first token */
+ }
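+
+ /*
+  * Minimal whole-string encode built on the char fallback (hypothetical
+  * helper sketch, not part of this commit's API):
+  *
+  *   int lila_encode(LilaTokenizer *tok, const char *s, int *ids, int max_ids) {
+  *       int n = 0;
+  *       while (*s && n < max_ids) ids[n++] = lila_encode_char(tok, *s++);
+  *       return n;
+  *   }
+  */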
+
+ void lila_free_tokenizer(LilaTokenizer *tok) {
+     if (!tok) return;
+     for (int i = 0; i < tok->vocab_size; i++) {
+         free(tok->tokens[i]);
+     }
+     free(tok->tokens);
+     free(tok);
+ }
+ ''')
+
+ # ═══════════════════════════════════════════════════════════════════════════════
+ # engine/runtime/tokenizer.h
+ # ═══════════════════════════════════════════════════════════════════════════════
+ with open("engine/runtime/tokenizer.h", "w") as f:
+     f.write('''#ifndef LILA_TOKENIZER_H
+ #define LILA_TOKENIZER_H
+
+ typedef struct LilaTokenizer LilaTokenizer;
+
+ LilaTokenizer *lila_load_tokenizer(const char *vocab_path);
+ const char *lila_decode_token(LilaTokenizer *tok, int token_id);
+ int lila_encode_char(LilaTokenizer *tok, char c);
+ void lila_free_tokenizer(LilaTokenizer *tok);
+
+ #endif
+ ''')
+
+ # ═══════════════════════════════════════════════════════════════════════════════
+ # engine/runtime/attention.h
+ # ═══════════════════════════════════════════════════════════════════════════════
+ with open("engine/runtime/attention.h", "w") as f:
+     f.write('''#ifndef LILA_ATTENTION_H
+ #define LILA_ATTENTION_H
+
+ #include "model.h"
+
+ void lila_init_kv_cache(LilaKVCache *cache, int n_layers, int max_seq,
+                         int n_kv_heads, int head_dim);
+ void lila_attention(float *output, const float *input, LilaLayer *layer,
+                     LilaKVCache *cache, int layer_idx, int position);
+
+ #endif
+ ''')
+
+ # ═══════════════════════════════════════════════════════════════════════════════
+ # engine/runtime/transformer.h
+ # ═══════════════════════════════════════════════════════════════════════════════
+ with open("engine/runtime/transformer.h", "w") as f:
+     f.write('''#ifndef LILA_TRANSFORMER_H
+ #define LILA_TRANSFORMER_H
+
+ #include "model.h"
+
+ void lila_transformer_block(float *hidden_state, LilaLayer *layer,
+                             LilaKVCache *cache, int layer_idx, int position);
+ int lila_forward(LilaModel *model, int token, int position);
+
+ #endif
+ ''')
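+
+ # Build sketch (assumed invocation; this commit adds no Makefile or build step):
+ #   cc -O3 -mavx2 -c engine/runtime/attention.c engine/runtime/transformer.c engine/runtime/tokenizer.c
+ # lila_rmsnorm_avx2 and dequant_matvec must come from the earlier-phase objects at link time.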
+
+ # Commit and push
+ subprocess.run(["git", "add", "-A"], check=True)
+ subprocess.run(["git", "commit", "-m",
+     "Engine Phase 2: Full transformer forward pass\n\n"
+     "runtime/attention.c:\n"
+     " - Multi-head attention with Grouped Query Attention (GQA)\n"
+     " - Rotary Position Embeddings (RoPE)\n"
+     " - KV Cache for autoregressive generation\n\n"
+     "runtime/transformer.c:\n"
+     " - Full decoder block: norm → attention → residual → norm → MLP → residual\n"
+     " - Memory Fabric (multi-LoRA) adapter contribution added to attention output\n"
+     " - lila_forward(): complete single-token forward pass\n"
+     " - Token embedding → N layers → final norm → LM head → sample\n\n"
+     "runtime/tokenizer.c:\n"
+     " - Vocab loading from text file\n"
+     " - Token decode (ID → string)\n"
+     " - Character-level encode fallback (full BPE in Phase 4)\n\n"
+     "The full inference path is structurally complete.\n"
+     "Remaining: wire format converter to produce loadable .lila files,\n"
+     "then test end-to-end token generation."],
+     check=True)
+ subprocess.run(["git", "push", "origin", "main"], check=True)
+ print("✅ Engine Phase 2 pushed!")