ticketguy committed · verified
Commit bc38a2c · 1 Parent(s): bb40248

Engine Phase 2: Full transformer forward pass + tokenizer + attention

Files changed (1):
  1. lila_engine_phase2.py +489 -0
lila_engine_phase2.py ADDED
@@ -0,0 +1,489 @@
+ #!/usr/bin/env python3
+ """Push transformer forward pass, attention, tokenizer to Lila engine."""
+ import subprocess, os
+ TOKEN = "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"
+ subprocess.run(["git", "clone", f"https://{TOKEN}@github.com/ticketguy/Lila.git", "/app/lila"], check=True)
+ os.chdir("/app/lila")
+ subprocess.run(["git", "config", "user.name", "0xticketguy"], check=True)
+ subprocess.run(["git", "config", "user.email", "0xticketguy@harboria.dev"], check=True)
+
+ # ═══════════════════════════════════════════════════════════════════════════════
+ # engine/runtime/attention.c — Multi-Head Attention with RoPE
+ # ═══════════════════════════════════════════════════════════════════════════════
+ with open("engine/runtime/attention.c", "w") as f:
+     f.write('''#include "model.h"
+ #include <math.h>
+ #include <stdlib.h>
+ #include <string.h>
+
+ /* Implemented in inference.c — declared up front so it is visible before use */
+ extern void dequant_matvec(float *out, const LilaQuantWeight *w, const float *vec);
+
+ /*
+  * Multi-Head Attention with Rotary Position Embeddings (RoPE)
+  * and KV Cache for efficient autoregressive generation.
+  *
+  * For Gemma 4B: n_heads=16, n_kv_heads=8 (GQA), head_dim=256
+  * GQA: key/value heads are shared across query head groups
+  */
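+
+ /*
+  * RoPE in one line of math (restating apply_rope below): pair index i uses
+  *   freq_i = theta^(-i/head_dim)  (i even), angle = position * freq_i,
+  * and each pair (v[i], v[i+1]) is rotated by that angle:
+  *   v[i]   <- v[i]*cos(angle) - v[i+1]*sin(angle)
+  *   v[i+1] <- v[i]*sin(angle) + v[i+1]*cos(angle)
+  * Rotating Q and K this way makes dot(q, k) depend only on relative position.
+  */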
+
+ /* Apply RoPE to a single head vector */
+ static void apply_rope(float *vec, int head_dim, int position, float theta) {
+     for (int i = 0; i < head_dim; i += 2) {
+         float freq = 1.0f / powf(theta, (float)i / head_dim);
+         float angle = position * freq;
+         float cos_a = cosf(angle);
+         float sin_a = sinf(angle);
+
+         float v0 = vec[i];
+         float v1 = vec[i + 1];
+         vec[i] = v0 * cos_a - v1 * sin_a;
+         vec[i + 1] = v0 * sin_a + v1 * cos_a;
+     }
+ }
+
+ /* Initialize KV cache */
+ void lila_init_kv_cache(LilaKVCache *cache, int n_layers, int max_seq,
+                         int n_kv_heads, int head_dim) {
+     cache->max_seq_len = max_seq;
+     cache->current_pos = 0;
+
+     size_t layer_size = (size_t)max_seq * n_kv_heads * head_dim * sizeof(float);
+     cache->key_cache = calloc(n_layers, layer_size);
+     cache->value_cache = calloc(n_layers, layer_size);
+ }
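+
+ /*
+  * Cache layout is [layer][position][kv_head][head_dim] as flat floats.
+  * Rough size with the head counts assumed above (n_kv_heads=8, head_dim=256)
+  * and max_seq=4096: 4096 * 8 * 256 * 4 bytes = 32 MiB per layer for K, and
+  * the same again for V — the motivation for a quantized cache in a later phase.
+  */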
+
+ /* Single-token attention (for autoregressive generation) */
+ void lila_attention(
+     float *output,       /* [hidden_size] */
+     const float *input,  /* [hidden_size] */
+     LilaLayer *layer,
+     LilaKVCache *cache,
+     int layer_idx,
+     int position
+ ) {
+     int hidden = layer->hidden_size;
+     int n_heads = layer->n_heads;
+     int n_kv_heads = layer->n_kv_heads;
+     int head_dim = layer->head_dim;
+     int kv_group = n_heads / n_kv_heads; /* GQA group size */
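+     /* Example with the counts above: n_heads=16, n_kv_heads=8 gives
+        kv_group=2, so query heads {0,1} share KV head 0, {2,3} share
+        KV head 1, and so on — halving KV cache size vs. full MHA. */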
+
+     /* Allocate scratch (TODO: pre-allocate in model struct) */
+     int q_dim = n_heads * head_dim; /* query width; may differ from hidden_size */
+     float *q = malloc(q_dim * sizeof(float));
+     float *k = malloc(n_kv_heads * head_dim * sizeof(float));
+     float *v = malloc(n_kv_heads * head_dim * sizeof(float));
+     float *attn_out = calloc(q_dim, sizeof(float));
+
+     /* Project Q, K, V using the quantized matvec kernel */
+     dequant_matvec(q, &layer->q_proj, input);
+     dequant_matvec(k, &layer->k_proj, input);
+     dequant_matvec(v, &layer->v_proj, input);
+
+     /* Apply RoPE to Q and K */
+     for (int h = 0; h < n_heads; h++) {
+         apply_rope(q + h * head_dim, head_dim, position, 10000.0f);
+     }
+     for (int h = 0; h < n_kv_heads; h++) {
+         apply_rope(k + h * head_dim, head_dim, position, 10000.0f);
+     }
+
+     /* Store K, V in cache */
+     size_t kv_offset = (size_t)position * n_kv_heads * head_dim;
+     size_t layer_offset = (size_t)layer_idx * cache->max_seq_len * n_kv_heads * head_dim;
+     memcpy(cache->key_cache + layer_offset + kv_offset, k, n_kv_heads * head_dim * sizeof(float));
+     memcpy(cache->value_cache + layer_offset + kv_offset, v, n_kv_heads * head_dim * sizeof(float));
+
+     /* Compute attention scores for each head */
+     float scale = 1.0f / sqrtf((float)head_dim);
+
+     for (int h = 0; h < n_heads; h++) {
+         int kv_h = h / kv_group; /* GQA: which KV head this Q head uses */
+         float *q_h = q + h * head_dim;
+
+         /* Attention scores: dot(q, all cached keys) */
+         float *scores = malloc((position + 1) * sizeof(float));
+         float max_score = -1e30f;
+
+         for (int t = 0; t <= position; t++) {
+             float *k_t = cache->key_cache + layer_offset + (size_t)t * n_kv_heads * head_dim + kv_h * head_dim;
+             float score = 0.0f;
+             for (int d = 0; d < head_dim; d++) {
+                 score += q_h[d] * k_t[d];
+             }
+             score *= scale;
+             scores[t] = score;
+             if (score > max_score) max_score = score;
+         }
+
+         /* Softmax (max-subtracted for numerical stability) */
+         float sum = 0.0f;
+         for (int t = 0; t <= position; t++) {
+             scores[t] = expf(scores[t] - max_score);
+             sum += scores[t];
+         }
+         for (int t = 0; t <= position; t++) {
+             scores[t] /= sum;
+         }
+
+         /* Weighted sum of values */
+         float *out_h = attn_out + h * head_dim;
+         for (int t = 0; t <= position; t++) {
+             float *v_t = cache->value_cache + layer_offset + (size_t)t * n_kv_heads * head_dim + kv_h * head_dim;
+             for (int d = 0; d < head_dim; d++) {
+                 out_h[d] += scores[t] * v_t[d];
+             }
+         }
+
+         free(scores);
+     }
+
+     /* Output projection back to hidden_size */
+     dequant_matvec(output, &layer->o_proj, attn_out);
+
+     free(q);
+     free(k);
+     free(v);
+     free(attn_out);
+ }
+ ''')
+
+ # ═══════════════════════════════════════════════════════════════════════════════
+ # engine/runtime/transformer.c — Full transformer block
+ # ═══════════════════════════════════════════════════════════════════════════════
+ with open("engine/runtime/transformer.c", "w") as f:
+     f.write('''#include "model.h"
+ #include <math.h>
+ #include <stdlib.h>
+ #include <string.h>
+
+ /*
+  * Full transformer decoder block:
+  *   residual = x
+  *   x = rmsnorm(x)
+  *   x = attention(x) + residual
+  *   residual = x
+  *   x = rmsnorm(x)
+  *   x = mlp(x) + residual
+  */
+
+ /* External kernel declarations */
+ extern void lila_rmsnorm_avx2(float *out, const float *x, const float *weight, int size, float eps);
+ extern void lila_attention(float *output, const float *input, LilaLayer *layer,
+                            LilaKVCache *cache, int layer_idx, int position);
+ extern void dequant_matvec(float *out, const LilaQuantWeight *w, const float *vec);
+
+ /* SiLU activation (will be assembly in Phase 4) */
+ static inline float silu_f(float x) {
+     return x / (1.0f + expf(-x));
+ }
+
+ /* MLP: gate_proj + up_proj → SiLU(gate) * up → down_proj */
+ static void lila_mlp(float *output, const float *input, LilaLayer *layer) {
+     int inter = layer->intermediate_size;
+
+     float *gate = malloc(inter * sizeof(float));
+     float *up = malloc(inter * sizeof(float));
+
+     /* Gate and up projections */
+     dequant_matvec(gate, &layer->gate_proj, input);
+     dequant_matvec(up, &layer->up_proj, input);
+
+     /* SiLU(gate) * up */
+     for (int i = 0; i < inter; i++) {
+         gate[i] = silu_f(gate[i]) * up[i];
+     }
+
+     /* Down projection */
+     dequant_matvec(output, &layer->down_proj, gate);
+
+     free(gate);
+     free(up);
+ }
+
+ /* Memory Fabric contribution (multi-LoRA gated adapters) */
+ static void lila_memory_fabric(float *output, const float *input, LilaMemoryFabric *fabric,
+                                int in_features, int out_features) {
+     /* For each active namespace adapter, compute gated LoRA correction */
+     for (int ns = 0; ns < LILA_N_NAMESPACES; ns++) {
+         LilaLoRA *adapter = &fabric->adapters[ns];
+         if (adapter->gate < 0.01f || adapter->A == NULL) continue;
+
+         int r = adapter->rank;
+
+         /* Compute: gate * (input @ A) @ B */
+         float *mid = calloc(r, sizeof(float));
+
+         /* mid = input @ A: [in_features] @ [in_features, r] → [r] */
+         for (int j = 0; j < r; j++) {
+             float sum = 0.0f;
+             for (int i = 0; i < in_features; i++) {
+                 sum += input[i] * adapter->A[i * r + j];
+             }
+             mid[j] = sum;
+         }
+
+         /* output += gate * (mid @ B): [r] @ [r, out_features] → [out_features] */
+         float scale = adapter->gate * (32.0f / r); /* alpha/rank */
+         for (int i = 0; i < out_features; i++) {
+             float sum = 0.0f;
+             for (int j = 0; j < r; j++) {
+                 sum += mid[j] * adapter->B[j * out_features + i];
+             }
+             output[i] += sum * scale;
+         }
+
+         free(mid);
+     }
+ }
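+
+ /*
+  * LoRA refresher for the loop above: each adapter stores A [in, r] and
+  * B [r, out] with r much smaller than in/out, so the correction is
+  *   output += gate * (alpha / r) * (input @ A) @ B
+  * with alpha fixed at 32 here. At rank 16 and in = out = 2560 (illustrative
+  * numbers, not from this file) that is ~82K floats per adapter instead of
+  * the ~6.5M a full [in, out] weight delta would need.
+  */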
+
+ /* Full transformer block forward pass */
+ void lila_transformer_block(
+     float *hidden_state, /* [hidden_size] — modified in place */
+     LilaLayer *layer,
+     LilaKVCache *cache,
+     int layer_idx,
+     int position
+ ) {
+     int hidden = layer->hidden_size;
+     float *residual = malloc(hidden * sizeof(float));
+     float *normed = malloc(hidden * sizeof(float));
+     float *attn_out = malloc(hidden * sizeof(float));
+     float *mlp_out = malloc(hidden * sizeof(float));
+
+     /* ── Pre-attention norm ── */
+     memcpy(residual, hidden_state, hidden * sizeof(float));
+     lila_rmsnorm_avx2(normed, hidden_state, layer->input_layernorm, hidden, 1e-6f);
+
+     /* ── Attention ── */
+     lila_attention(attn_out, normed, layer, cache, layer_idx, position);
+
+     /* ── Add Memory Fabric to attention output ── */
+     lila_memory_fabric(attn_out, normed, &layer->fabric, hidden, hidden);
+
+     /* ── Residual connection ── */
+     for (int i = 0; i < hidden; i++) hidden_state[i] = residual[i] + attn_out[i];
+
+     /* ── Pre-MLP norm ── */
+     memcpy(residual, hidden_state, hidden * sizeof(float));
+     lila_rmsnorm_avx2(normed, hidden_state, layer->post_attention_layernorm, hidden, 1e-6f);
+
+     /* ── MLP ── */
+     lila_mlp(mlp_out, normed, layer);
+
+     /* ── Residual connection ── */
+     for (int i = 0; i < hidden; i++) hidden_state[i] = residual[i] + mlp_out[i];
+
+     free(residual);
+     free(normed);
+     free(attn_out);
+     free(mlp_out);
+ }
+
+ /* Full model forward pass — single token */
+ int lila_forward(LilaModel *model, int token, int position) {
+     int hidden = model->hidden_size;
+
+     /* Token embedding */
+     float *hidden_state = malloc(hidden * sizeof(float));
+     memcpy(hidden_state, model->token_embedding + (size_t)token * hidden,
+            hidden * sizeof(float));
+
+     /* Transformer layers */
+     for (int l = 0; l < model->n_layers; l++) {
+         lila_transformer_block(hidden_state, &model->layers[l],
+                                &model->kv_cache, l, position);
+     }
+
+     /* Final norm */
+     float *normed = malloc(hidden * sizeof(float));
+     lila_rmsnorm_avx2(normed, hidden_state, model->final_norm, hidden, 1e-6f);
+
+     /* LM head: project to vocab logits */
+     float *logits = malloc(model->vocab_size * sizeof(float));
+
+     /* matvec: logits = lm_head @ normed; lm_head is [vocab_size, hidden_size] */
+     for (int i = 0; i < model->vocab_size; i++) {
+         float sum = 0.0f;
+         for (int j = 0; j < hidden; j++) {
+             sum += model->lm_head[(size_t)i * hidden + j] * normed[j];
+         }
+         logits[i] = sum;
+     }
+
+     /* Sample: greedy for now — temperature sampling in Phase 4 */
+     int next_token = 0;
+     float max_val = logits[0];
+     for (int i = 1; i < model->vocab_size; i++) {
+         if (logits[i] > max_val) { max_val = logits[i]; next_token = i; }
+     }
+
+     free(hidden_state);
+     free(normed);
+     free(logits);
+
+     return next_token;
+ }
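+
+ /*
+  * Sketch of the decode loop this enables (caller-side; the token constants
+  * are assumptions, not defined in this file):
+  *
+  *   int tok = BOS_TOKEN;
+  *   for (int pos = 0; pos < max_new_tokens; pos++) {
+  *       tok = lila_forward(model, tok, pos);
+  *       if (tok == EOS_TOKEN) break;
+  *       fputs(lila_decode_token(tokenizer, tok), stdout);
+  *   }
+  */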
+ ''')
+
+ # ═══════════════════════════════════════════════════════════════════════════════
+ # engine/runtime/tokenizer.c — BPE Tokenizer
+ # ═══════════════════════════════════════════════════════════════════════════════
+ with open("engine/runtime/tokenizer.c", "w") as f:
+     f.write('''#include <stdio.h>
+ #include <stdlib.h>
+ #include <string.h>
+ #include "tokenizer.h"
+
+ /*
+  * BPE Tokenizer for Gemma/LLaMA-family models.
+  * Loads sentencepiece vocabulary and performs encoding/decoding.
+  *
+  * For full functionality, this would need:
+  *   1. Load .model file (protobuf) or vocab.json
+  *   2. BPE merge rules
+  *   3. Byte-fallback for unknown characters
+  *
+  * Phase 1: Load vocab from a simple text format (one token per line).
+  * Phase 4: Full sentencepiece compatibility.
+  */
+
+ #define MAX_VOCAB 128000
+ #define MAX_TOKEN_LEN 128
+
+ /* Definition of the opaque struct declared in tokenizer.h */
+ struct LilaTokenizer {
+     char **tokens; /* Array of token strings */
+     int vocab_size;
+     /* TODO: merge rules, scores */
+ };
+
+ LilaTokenizer *lila_load_tokenizer(const char *vocab_path) {
+     LilaTokenizer *tok = calloc(1, sizeof(LilaTokenizer));
+     tok->tokens = calloc(MAX_VOCAB, sizeof(char *));
+
+     FILE *f = fopen(vocab_path, "r");
+     if (!f) {
+         fprintf(stderr, "Cannot open vocab: %s\\n", vocab_path);
+         free(tok->tokens);
+         free(tok);
+         return NULL;
+     }
+
+     char line[MAX_TOKEN_LEN];
+     int i = 0;
+     while (fgets(line, sizeof(line), f) && i < MAX_VOCAB) {
+         line[strcspn(line, "\\n")] = 0;
+         tok->tokens[i] = strdup(line);
+         i++;
+     }
+     tok->vocab_size = i;
+     fclose(f);
+
+     fprintf(stderr, "Tokenizer loaded: %d tokens\\n", tok->vocab_size);
+     return tok;
+ }
+
+ /* Decode token ID to string */
+ const char *lila_decode_token(LilaTokenizer *tok, int token_id) {
+     if (token_id < 0 || token_id >= tok->vocab_size) return "<unk>";
+     return tok->tokens[token_id];
+ }
+
+ /* Simple encode (character-level fallback — full BPE in Phase 4) */
+ int lila_encode_char(LilaTokenizer *tok, char c) {
+     /* Search for single-character token */
+     char target[2] = {c, 0};
+     for (int i = 0; i < tok->vocab_size; i++) {
+         if (tok->tokens[i] && strcmp(tok->tokens[i], target) == 0) {
+             return i;
+         }
+     }
+     return 0; /* unknown → first token */
+ }
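+
+ /*
+  * Minimal whole-string encode built on the char fallback (hypothetical
+  * helper sketch, not part of this commit's API):
+  *
+  *   int lila_encode(LilaTokenizer *tok, const char *s, int *ids, int max_ids) {
+  *       int n = 0;
+  *       while (*s && n < max_ids) ids[n++] = lila_encode_char(tok, *s++);
+  *       return n;
+  *   }
+  */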
+
+ void lila_free_tokenizer(LilaTokenizer *tok) {
+     if (!tok) return;
+     for (int i = 0; i < tok->vocab_size; i++) {
+         free(tok->tokens[i]);
+     }
+     free(tok->tokens);
+     free(tok);
+ }
+ ''')
+
+ # ═══════════════════════════════════════════════════════════════════════════════
+ # engine/runtime/tokenizer.h
+ # ═══════════════════════════════════════════════════════════════════════════════
+ with open("engine/runtime/tokenizer.h", "w") as f:
+     f.write('''#ifndef LILA_TOKENIZER_H
+ #define LILA_TOKENIZER_H
+
+ typedef struct LilaTokenizer LilaTokenizer;
+
+ LilaTokenizer *lila_load_tokenizer(const char *vocab_path);
+ const char *lila_decode_token(LilaTokenizer *tok, int token_id);
+ int lila_encode_char(LilaTokenizer *tok, char c);
+ void lila_free_tokenizer(LilaTokenizer *tok);
+
+ #endif
+ ''')
+
+ # ═══════════════════════════════════════════════════════════════════════════════
+ # engine/runtime/attention.h
+ # ═══════════════════════════════════════════════════════════════════════════════
+ with open("engine/runtime/attention.h", "w") as f:
+     f.write('''#ifndef LILA_ATTENTION_H
+ #define LILA_ATTENTION_H
+
+ #include "model.h"
+
+ void lila_init_kv_cache(LilaKVCache *cache, int n_layers, int max_seq,
+                         int n_kv_heads, int head_dim);
+ void lila_attention(float *output, const float *input, LilaLayer *layer,
+                     LilaKVCache *cache, int layer_idx, int position);
+
+ #endif
+ ''')
+
+ # ═══════════════════════════════════════════════════════════════════════════════
+ # engine/runtime/transformer.h
+ # ═══════════════════════════════════════════════════════════════════════════════
+ with open("engine/runtime/transformer.h", "w") as f:
+     f.write('''#ifndef LILA_TRANSFORMER_H
+ #define LILA_TRANSFORMER_H
+
+ #include "model.h"
+
+ void lila_transformer_block(float *hidden_state, LilaLayer *layer,
+                             LilaKVCache *cache, int layer_idx, int position);
+ int lila_forward(LilaModel *model, int token, int position);
+
+ #endif
+ ''')
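+
+ # Build sketch (assumed invocation; this commit adds no Makefile or build step):
+ #   cc -O3 -mavx2 -c engine/runtime/attention.c engine/runtime/transformer.c engine/runtime/tokenizer.c
+ # lila_rmsnorm_avx2 and dequant_matvec must come from the earlier-phase objects at link time.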
+
+ # Commit and push
+ subprocess.run(["git", "add", "-A"], check=True)
+ subprocess.run(["git", "commit", "-m",
+     "Engine Phase 2: Full transformer forward pass\n\n"
+     "runtime/attention.c:\n"
+     " - Multi-head attention with Grouped Query Attention (GQA)\n"
+     " - Rotary Position Embeddings (RoPE)\n"
+     " - KV Cache for autoregressive generation\n\n"
+     "runtime/transformer.c:\n"
+     " - Full decoder block: norm → attention → residual → norm → MLP → residual\n"
+     " - Memory Fabric (multi-LoRA) adapter contribution added to attention output\n"
+     " - lila_forward(): complete single-token forward pass\n"
+     " - Token embedding → N layers → final norm → LM head → sample\n\n"
+     "runtime/tokenizer.c:\n"
+     " - Vocab loading from text file\n"
+     " - Token decode (ID → string)\n"
+     " - Character-level encode fallback (full BPE in Phase 4)\n\n"
+     "The full inference path is structurally complete.\n"
+     "Remaining: wire format converter to produce loadable .lila files,\n"
+     "then test end-to-end token generation."],
+     check=True)
+ subprocess.run(["git", "push", "origin", "main"], check=True)
+ print("✅ Engine Phase 2 pushed!")