"""Push transformer forward pass, attention, tokenizer to Lila engine."""
import os
import subprocess
TOKEN = os.environ["GITHUB_TOKEN"]  # read the GitHub PAT from the environment instead of hard-coding a secret
| subprocess.run(["git", "clone", f"https://{TOKEN}@github.com/ticketguy/Lila.git", "/app/lila"], check=True) |
| os.chdir("/app/lila") |
| subprocess.run(["git", "config", "user.name", "0xticketguy"], check=True) |
| subprocess.run(["git", "config", "user.email", "0xticketguy@harboria.dev"], check=True) |
|
|
with open("engine/runtime/attention.c", "w") as f:
    f.write('''#include "model.h"
#include <math.h>
#include <stdlib.h>
#include <string.h>

/*
 * Multi-Head Attention with Rotary Position Embeddings (RoPE)
 * and KV Cache for efficient autoregressive generation.
 *
 * For Gemma 4B: n_heads=16, n_kv_heads=8 (GQA), head_dim=256
 * GQA: key/value heads are shared across query head groups
 */
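
/*
 * Example with the shapes above: kv_group = 16 / 8 = 2, so query heads 0 and 1
 * share KV head 0, heads 2 and 3 share KV head 1, and so on up to heads 14 and 15
 * sharing KV head 7.
 */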

/* Apply RoPE to a single head vector */
static void apply_rope(float *vec, int head_dim, int position, float theta) {
    for (int i = 0; i < head_dim; i += 2) {
        float freq = 1.0f / powf(theta, (float)i / head_dim);
        float angle = position * freq;
        float cos_a = cosf(angle);
        float sin_a = sinf(angle);

        float v0 = vec[i];
        float v1 = vec[i + 1];
        vec[i] = v0 * cos_a - v1 * sin_a;
        vec[i + 1] = v0 * sin_a + v1 * cos_a;
    }
}
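
/* Note: this rotates adjacent pairs (i, i+1). Some checkpoints instead pair
 * element i with element i + head_dim/2; the kernel and the weight converter
 * must agree on one convention. */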

/* Initialize KV cache */
void lila_init_kv_cache(LilaKVCache *cache, int n_layers, int max_seq,
                        int n_kv_heads, int head_dim) {
    cache->max_seq_len = max_seq;
    cache->current_pos = 0;

    size_t layer_size = (size_t)max_seq * n_kv_heads * head_dim * sizeof(float);
    cache->key_cache = calloc(n_layers, layer_size);
    cache->value_cache = calloc(n_layers, layer_size);
}
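
/* Rough sizing with the shapes above and, say, max_seq = 4096:
 * 4096 * 8 * 256 floats = 32 MiB of keys per layer (plus the same for values),
 * so total cache memory grows linearly with n_layers and max_seq. */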

/* dequant_matvec is defined in inference.c; declared here so it can be used below. */
extern void dequant_matvec(float *out, const LilaQuantWeight *w, const float *vec);

/* Single-token attention (for autoregressive generation) */
void lila_attention(
    float *output,          /* [hidden_size] */
    const float *input,     /* [hidden_size] */
    LilaLayer *layer,
    LilaKVCache *cache,
    int layer_idx,
    int position
) {
    int n_heads = layer->n_heads;
    int n_kv_heads = layer->n_kv_heads;
    int head_dim = layer->head_dim;
    int kv_group = n_heads / n_kv_heads; /* GQA group size */
    int q_dim = n_heads * head_dim;      /* query/attention width; may differ from hidden_size */

    /* Allocate scratch (TODO: pre-allocate in model struct) */
    float *q = malloc(q_dim * sizeof(float));
    float *k = malloc(n_kv_heads * head_dim * sizeof(float));
    float *v = malloc(n_kv_heads * head_dim * sizeof(float));
    float *attn_out = calloc(q_dim, sizeof(float));

    /* Project Q, K, V using quantized weights */
    /* TODO: replace with dequant_matvec from kernels */
    dequant_matvec(q, &layer->q_proj, input);
    dequant_matvec(k, &layer->k_proj, input);
    dequant_matvec(v, &layer->v_proj, input);

    /* Apply RoPE to Q and K */
    for (int h = 0; h < n_heads; h++) {
        apply_rope(q + h * head_dim, head_dim, position, 10000.0f);
    }
    for (int h = 0; h < n_kv_heads; h++) {
        apply_rope(k + h * head_dim, head_dim, position, 10000.0f);
    }

    /* Store K, V in cache */
    size_t kv_offset = (size_t)position * n_kv_heads * head_dim;
    size_t layer_offset = (size_t)layer_idx * cache->max_seq_len * n_kv_heads * head_dim;
    memcpy(cache->key_cache + layer_offset + kv_offset, k, n_kv_heads * head_dim * sizeof(float));
    memcpy(cache->value_cache + layer_offset + kv_offset, v, n_kv_heads * head_dim * sizeof(float));

    /* Compute attention scores for each head */
    float scale = 1.0f / sqrtf((float)head_dim); /* 1/16 for head_dim = 256 */

    for (int h = 0; h < n_heads; h++) {
        int kv_h = h / kv_group; /* GQA: which KV head this Q head uses */
        float *q_h = q + h * head_dim;

        /* Attention scores: dot(q, all cached keys) */
        float *scores = malloc((position + 1) * sizeof(float));
        float max_score = -1e30f;

        for (int t = 0; t <= position; t++) {
            float *k_t = cache->key_cache + layer_offset + (size_t)t * n_kv_heads * head_dim + kv_h * head_dim;
            float score = 0.0f;
            for (int d = 0; d < head_dim; d++) {
                score += q_h[d] * k_t[d];
            }
            score *= scale;
            scores[t] = score;
            if (score > max_score) max_score = score;
        }

        /* Softmax (max-subtracted for numerical stability) */
        float sum = 0.0f;
        for (int t = 0; t <= position; t++) {
            scores[t] = expf(scores[t] - max_score);
            sum += scores[t];
        }
        for (int t = 0; t <= position; t++) {
            scores[t] /= sum;
        }

        /* Weighted sum of values */
        float *out_h = attn_out + h * head_dim;
        for (int t = 0; t <= position; t++) {
            float *v_t = cache->value_cache + layer_offset + (size_t)t * n_kv_heads * head_dim + kv_h * head_dim;
            for (int d = 0; d < head_dim; d++) {
                out_h[d] += scores[t] * v_t[d];
            }
        }

        free(scores);
    }

    /* Output projection back to [hidden_size] */
    dequant_matvec(output, &layer->o_proj, attn_out);

    free(q);
    free(k);
    free(v);
    free(attn_out);
}
''')
|
|
with open("engine/runtime/transformer.c", "w") as f:
    f.write('''#include "model.h"
#include <math.h>
#include <stdlib.h>
#include <string.h>

/*
 * Full transformer decoder block:
 *   residual = x
 *   x = rmsnorm(x)
 *   x = attention(x) + residual
 *   residual = x
 *   x = rmsnorm(x)
 *   x = mlp(x) + residual
 */

/* External kernel declarations */
extern void lila_rmsnorm_avx2(float *out, const float *x, const float *weight, int size, float eps);
extern void lila_attention(float *output, const float *input, LilaLayer *layer,
                           LilaKVCache *cache, int layer_idx, int position);
extern void dequant_matvec(float *out, const LilaQuantWeight *w, const float *vec);

/* SiLU activation (will be assembly in Phase 4) */
static inline float silu_f(float x) {
    return x / (1.0f + expf(-x));
}
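
/* Note: some Gemma-family checkpoints gate the MLP with a tanh-approximated GELU
 * rather than SiLU; the activation here should match whatever the weight converter
 * exports. */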

/* MLP: gate_proj + up_proj → SiLU(gate) * up → down_proj */
static void lila_mlp(float *output, const float *input, LilaLayer *layer) {
    int inter = layer->intermediate_size;

    float *gate = malloc(inter * sizeof(float));
    float *up = malloc(inter * sizeof(float));

    /* Gate and up projections */
    dequant_matvec(gate, &layer->gate_proj, input);
    dequant_matvec(up, &layer->up_proj, input);

    /* SiLU(gate) * up */
    for (int i = 0; i < inter; i++) {
        gate[i] = silu_f(gate[i]) * up[i];
    }

    /* Down projection */
    dequant_matvec(output, &layer->down_proj, gate);

    free(gate);
    free(up);
}

/* Memory Fabric contribution (multi-LoRA gated adapters) */
static void lila_memory_fabric(float *output, const float *input, LilaMemoryFabric *fabric,
                               int in_features, int out_features) {
    /* For each active namespace adapter, compute gated LoRA correction */
    for (int ns = 0; ns < LILA_N_NAMESPACES; ns++) {
        LilaLoRA *adapter = &fabric->adapters[ns];
        if (adapter->gate < 0.01f || adapter->A == NULL) continue;

        int r = adapter->rank;

        /* Compute: gate * (input @ A) @ B */
        float *mid = calloc(r, sizeof(float));

        /* mid = input @ A: [in_features] @ [in_features, r] → [r] */
        for (int j = 0; j < r; j++) {
            float sum = 0.0f;
            for (int i = 0; i < in_features; i++) {
                sum += input[i] * adapter->A[i * r + j];
            }
            mid[j] = sum;
        }

        /* output += gate * (mid @ B): [r] @ [r, out_features] → [out_features] */
        float scale = adapter->gate * (32.0f / r); /* LoRA scaling alpha / rank, with alpha fixed at 32 */
        for (int i = 0; i < out_features; i++) {
            float sum = 0.0f;
            for (int j = 0; j < r; j++) {
                sum += mid[j] * adapter->B[j * out_features + i];
            }
            output[i] += sum * scale;
        }

        free(mid);
    }
}
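
/* Cost note: with in_features == out_features == hidden and rank r, each active
 * adapter adds roughly 2 * hidden * r multiply-adds per token (for a small rank
 * such as r = 16 this stays far cheaper than the dense projections it corrects). */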

/* Full transformer block forward pass */
void lila_transformer_block(
    float *hidden_state,   /* [hidden_size], modified in place */
    LilaLayer *layer,
    LilaKVCache *cache,
    int layer_idx,
    int position
) {
    int hidden = layer->hidden_size;
    float *residual = malloc(hidden * sizeof(float));
    float *normed = malloc(hidden * sizeof(float));
    float *attn_out = malloc(hidden * sizeof(float));
    float *mlp_out = malloc(hidden * sizeof(float));

    /* ── Pre-attention norm ── */
    memcpy(residual, hidden_state, hidden * sizeof(float));
    lila_rmsnorm_avx2(normed, hidden_state, layer->input_layernorm, hidden, 1e-6f);

    /* ── Attention ── */
    lila_attention(attn_out, normed, layer, cache, layer_idx, position);

    /* ── Add Memory Fabric to attention output ── */
    lila_memory_fabric(attn_out, normed, &layer->fabric, hidden, hidden);

    /* ── Residual connection ── */
    for (int i = 0; i < hidden; i++) hidden_state[i] = residual[i] + attn_out[i];

    /* ── Pre-MLP norm ── */
    memcpy(residual, hidden_state, hidden * sizeof(float));
    lila_rmsnorm_avx2(normed, hidden_state, layer->post_attention_layernorm, hidden, 1e-6f);

    /* ── MLP ── */
    lila_mlp(mlp_out, normed, layer);

    /* ── Residual connection ── */
    for (int i = 0; i < hidden; i++) hidden_state[i] = residual[i] + mlp_out[i];

    free(residual);
    free(normed);
    free(attn_out);
    free(mlp_out);
}

/* Full model forward pass for a single token */
int lila_forward(LilaModel *model, int token, int position) {
    int hidden = model->hidden_size;

    /* Token embedding */
    float *hidden_state = malloc(hidden * sizeof(float));
    memcpy(hidden_state, model->token_embedding + (size_t)token * hidden,
           hidden * sizeof(float));

    /* Transformer layers */
    for (int l = 0; l < model->n_layers; l++) {
        lila_transformer_block(hidden_state, &model->layers[l],
                               &model->kv_cache, l, position);
    }

    /* Final norm */
    float *normed = malloc(hidden * sizeof(float));
    lila_rmsnorm_avx2(normed, hidden_state, model->final_norm, hidden, 1e-6f);

    /* LM head: project to vocab logits */
    float *logits = malloc(model->vocab_size * sizeof(float));

    /* matvec: logits = lm_head @ normed */
    /* lm_head is [vocab_size, hidden_size] */
    for (int i = 0; i < model->vocab_size; i++) {
        float sum = 0.0f;
        for (int j = 0; j < hidden; j++) {
            sum += model->lm_head[i * hidden + j] * normed[j];
        }
        logits[i] = sum;
    }

    /* Sample */
    /* Greedy for now → temperature sampling in Phase 4 */
    int next_token = 0;
    float max_val = logits[0];
    for (int i = 1; i < model->vocab_size; i++) {
        if (logits[i] > max_val) { max_val = logits[i]; next_token = i; }
    }

    free(hidden_state);
    free(normed);
    free(logits);

    return next_token;
}
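
/* Usage sketch (illustrative only): a minimal greedy generation loop on top of
 * lila_forward. `model`, `prompt_token`, and `max_new_tokens` are assumed to be
 * set up by the caller; prompt ingestion and EOS handling are omitted.
 *
 *   int tok = prompt_token;
 *   for (int pos = 0; pos < max_new_tokens; pos++) {
 *       tok = lila_forward(model, tok, pos);
 *   }
 */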
''')
|
|
with open("engine/runtime/tokenizer.c", "w") as f:
    f.write('''#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * BPE Tokenizer for Gemma/LLaMA-family models.
 * Loads sentencepiece vocabulary and performs encoding/decoding.
 *
 * For full functionality, this would need:
 * 1. Load .model file (protobuf) or vocab.json
 * 2. BPE merge rules
 * 3. Byte-fallback for unknown characters
 *
 * Phase 1: Load vocab from a simple text format (one token per line).
 * Phase 4: Full sentencepiece compatibility.
 */
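
/* Usage sketch (illustrative; "vocab.txt" is a placeholder path in the
 * one-token-per-line format described above):
 *
 *   LilaTokenizer *tok = lila_load_tokenizer("vocab.txt");
 *   if (tok) {
 *       printf("%s\\n", lila_decode_token(tok, 42));
 *       lila_free_tokenizer(tok);
 *   }
 */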

#define MAX_VOCAB 128000
#define MAX_TOKEN_LEN 128

/* Tag name matches the opaque "typedef struct LilaTokenizer" in tokenizer.h */
typedef struct LilaTokenizer {
    char **tokens;   /* Array of token strings */
    int vocab_size;
    /* TODO: merge rules, scores */
} LilaTokenizer;

LilaTokenizer *lila_load_tokenizer(const char *vocab_path) {
    LilaTokenizer *tok = calloc(1, sizeof(LilaTokenizer));
    tok->tokens = calloc(MAX_VOCAB, sizeof(char *));

    FILE *f = fopen(vocab_path, "r");
    if (!f) {
        fprintf(stderr, "Cannot open vocab: %s\\n", vocab_path);
        free(tok->tokens);
        free(tok);
        return NULL;
    }

    char line[MAX_TOKEN_LEN];
    int i = 0;
    while (fgets(line, sizeof(line), f) && i < MAX_VOCAB) {
        line[strcspn(line, "\\n")] = 0;
        tok->tokens[i] = strdup(line);
        i++;
    }
    tok->vocab_size = i;
    fclose(f);

    fprintf(stderr, "Tokenizer loaded: %d tokens\\n", tok->vocab_size);
    return tok;
}

/* Decode token ID to string */
const char *lila_decode_token(LilaTokenizer *tok, int token_id) {
    if (token_id < 0 || token_id >= tok->vocab_size) return "<unk>";
    return tok->tokens[token_id];
}

/* Simple encode (character-level fallback → full BPE in Phase 4) */
int lila_encode_char(LilaTokenizer *tok, char c) {
    /* Search for single-character token */
    char target[2] = {c, 0};
    for (int i = 0; i < tok->vocab_size; i++) {
        if (tok->tokens[i] && strcmp(tok->tokens[i], target) == 0) {
            return i;
        }
    }
    return 0; /* unknown → first token */
}
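
/* Note: the linear scan above is O(vocab_size) per character; it is intended only
 * as a stopgap until the Phase 4 tokenizer adds real BPE merges and a faster
 * token lookup. */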

void lila_free_tokenizer(LilaTokenizer *tok) {
    if (!tok) return;
    for (int i = 0; i < tok->vocab_size; i++) {
        free(tok->tokens[i]);
    }
    free(tok->tokens);
    free(tok);
}
''')
|
|
with open("engine/runtime/tokenizer.h", "w") as f:
    f.write('''#ifndef LILA_TOKENIZER_H
#define LILA_TOKENIZER_H

typedef struct LilaTokenizer LilaTokenizer;

LilaTokenizer *lila_load_tokenizer(const char *vocab_path);
const char *lila_decode_token(LilaTokenizer *tok, int token_id);
int lila_encode_char(LilaTokenizer *tok, char c);
void lila_free_tokenizer(LilaTokenizer *tok);

#endif
''')
|
|
with open("engine/runtime/attention.h", "w") as f:
    f.write('''#ifndef LILA_ATTENTION_H
#define LILA_ATTENTION_H

#include "model.h"

void lila_init_kv_cache(LilaKVCache *cache, int n_layers, int max_seq,
                        int n_kv_heads, int head_dim);
void lila_attention(float *output, const float *input, LilaLayer *layer,
                    LilaKVCache *cache, int layer_idx, int position);

#endif
''')
|
|
with open("engine/runtime/transformer.h", "w") as f:
    f.write('''#ifndef LILA_TRANSFORMER_H
#define LILA_TRANSFORMER_H

#include "model.h"

void lila_transformer_block(float *hidden_state, LilaLayer *layer,
                            LilaKVCache *cache, int layer_idx, int position);
int lila_forward(LilaModel *model, int token, int position);

#endif
''')
|
|
subprocess.run(["git", "add", "-A"], check=True)
subprocess.run(["git", "commit", "-m",
    "Engine Phase 2: Full transformer forward pass\n\n"
    "runtime/attention.c:\n"
    " - Multi-head attention with Grouped Query Attention (GQA)\n"
    " - Rotary Position Embeddings (RoPE)\n"
    " - KV Cache for autoregressive generation\n\n"
    "runtime/transformer.c:\n"
    " - Full decoder block: norm → attention → residual → norm → MLP → residual\n"
    " - Memory Fabric (multi-LoRA) adapter contribution added to attention output\n"
    " - lila_forward(): complete single-token forward pass\n"
    " - Token embedding → N layers → final norm → LM head → sample\n\n"
    "runtime/tokenizer.c:\n"
    " - Vocab loading from text file\n"
    " - Token decode (ID → string)\n"
    " - Character-level encode fallback (full BPE in Phase 4)\n\n"
    "The full inference path is structurally complete.\n"
    "Remaining: wire format converter to produce loadable .lila files,\n"
    "then test end-to-end token generation."],
    check=True)
subprocess.run(["git", "push", "origin", "main"], check=True)
print("✅ Engine Phase 2 pushed!")
|
|