"""Push Lila Engine Phase 1 code to repo."""
import subprocess, os

# Read the push token from the environment rather than hardcoding a credential.
TOKEN = os.environ["GITHUB_TOKEN"]
subprocess.run(["git", "clone", f"https://{TOKEN}@github.com/ticketguy/Lila.git", "/app/lila"], check=True)
os.chdir("/app/lila")
subprocess.run(["git", "config", "user.name", "0xticketguy"], check=True)
subprocess.run(["git", "config", "user.email", "0xticketguy@harboria.dev"], check=True)
|
|
with open("engine/Makefile", "w") as f:
    f.write('''# Lila Inference Engine - Build System
# Detects architecture, assembles kernels, links runtime

UNAME_M := $(shell uname -m)
CC := gcc
CFLAGS := -O3 -march=native -Wall -Wextra -std=c11 -pthread
LDFLAGS := -lm -lpthread

# Architecture detection
ifeq ($(UNAME_M),x86_64)
ASM := nasm
ASMFLAGS := -f elf64
ARCH_DIR := x86_64
CFLAGS += -mavx2 -mfma
# Check for AVX-512 (grep -q so the output is exactly "1" or "0";
# "grep -c ... || echo 0" prints a count AND the fallback on a miss)
HAS_AVX512 := $(shell grep -q avx512f /proc/cpuinfo 2>/dev/null && echo 1 || echo 0)
ifeq ($(HAS_AVX512),1)
CFLAGS += -mavx512f -mavx512bw -mavx512vl
endif
else ifeq ($(UNAME_M),aarch64)
ASM := as
ASMFLAGS :=
ARCH_DIR := arm64
else
$(error Unsupported architecture: $(UNAME_M))
endif

# Source files
KERN_SRC := $(wildcard kernels/$(ARCH_DIR)/*.S)
KERN_OBJ := $(KERN_SRC:.S=.o)
RT_SRC := $(wildcard runtime/*.c)
RT_OBJ := $(RT_SRC:.c=.o)
IF_SRC := $(wildcard interface/*.c)
IF_OBJ := $(IF_SRC:.c=.o)

# Targets
.PHONY: all clean test bench

all: lila-engine

lila-engine: $(KERN_OBJ) $(RT_OBJ) $(IF_OBJ)
\t$(CC) $(CFLAGS) -o $@ $^ $(LDFLAGS)
\t@echo "Built lila-engine for $(UNAME_M)"

# Assembly kernels (same recipe on both architectures; only $(ASM)/$(ASMFLAGS) differ)
kernels/$(ARCH_DIR)/%.o: kernels/$(ARCH_DIR)/%.S
\t$(ASM) $(ASMFLAGS) -o $@ $<

# C runtime
runtime/%.o: runtime/%.c
\t$(CC) $(CFLAGS) -c -o $@ $<

# C interface
interface/%.o: interface/%.c
\t$(CC) $(CFLAGS) -c -o $@ $<

# Tests
test: lila-engine
\t./lila-engine --test

bench: lila-engine
\t./lila-engine --bench

clean:
\trm -f lila-engine $(KERN_OBJ) $(RT_OBJ) $(IF_OBJ)
''')
|
|
with open("engine/runtime/model.h", "w") as f:
    f.write('''#ifndef LILA_MODEL_H
#define LILA_MODEL_H

#include <stdint.h>
#include <stddef.h>

/*
 * Lila Model Format
 *
 * Weights stored as FigQuant INT4:
 * - 16-value codebook per layer (64 bytes)
 * - Packed 4-bit indices (2 per byte)
 * - Per-group FP16 scales
 *
 * Memory layout optimized for:
 * - mmap loading (zero-copy from disk)
 * - SIMD dequantization (codebook fits in one register)
 * - Cache-friendly access patterns
 */
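
/*
 * Worked size example (illustrative arithmetic, not part of the format spec):
 * a 4096x4096 weight tensor under this scheme costs
 *   indices:  4096*4096/2 bytes         = 8 MiB   (two 4-bit indices per byte)
 *   scales:   (4096*4096/128) * 2 bytes = 256 KiB (one FP16 scale per group)
 *   codebook: 16 * 4 bytes              = 64 B
 * versus 64 MiB in FP32, i.e. roughly 7.8x smaller.
 */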

#define LILA_MAGIC 0x4C494C41 /* "LILA" */
#define LILA_VERSION 1
#define LILA_MAX_LAYERS 64
#define LILA_MAX_VOCAB 128000
#define LILA_GROUP_SIZE 128
#define LILA_CODEBOOK_SIZE 16

/* Quantized weight tensor */
typedef struct {
    uint8_t *indices;                   /* Packed 4-bit (2 per byte) */
    float codebook[LILA_CODEBOOK_SIZE]; /* 16 dequant values */
    uint16_t *scales;                   /* Per-group FP16 scales */
    int rows;
    int cols;
    int n_groups;
} LilaQuantWeight;

/* LoRA adapter (for Memory Fabric) */
typedef struct {
    float *A;    /* [in_features, rank] */
    float *B;    /* [rank, out_features] */
    float gate;  /* Namespace gate value [0,1] */
    int rank;
    int in_features;
    int out_features;
} LilaLoRA;

/* Memory Fabric: 5 namespace adapters per layer */
#define LILA_N_NAMESPACES 5
typedef struct {
    LilaLoRA adapters[LILA_N_NAMESPACES];
    /* Namespace indices: 0=personal, 1=episodic, 2=wiki, 3=schedule, 4=contested */
} LilaMemoryFabric;
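
/*
 * Illustrative sketch (a hypothetical helper, not part of the API below):
 * how a projection output could fold in the gated namespace adapters,
 * projecting x down through A (rank dims), back up through B, scaled by
 * each namespace gate. Assumes FP32 activations and a caller-provided
 * scratch buffer `tmp` of at least `rank` floats.
 */
static inline void lila_fabric_apply(float *out, const float *x,
                                     const LilaMemoryFabric *fabric, float *tmp) {
    for (int n = 0; n < LILA_N_NAMESPACES; n++) {
        const LilaLoRA *a = &fabric->adapters[n];
        if (a->gate <= 0.0f) continue;               /* namespace switched off */
        for (int r = 0; r < a->rank; r++) {          /* tmp = x projected by A */
            float s = 0.0f;
            for (int i = 0; i < a->in_features; i++)
                s += a->A[i * a->rank + r] * x[i];
            tmp[r] = s;
        }
        for (int o = 0; o < a->out_features; o++) {  /* out += gate * (tmp by B) */
            float s = 0.0f;
            for (int r = 0; r < a->rank; r++)
                s += a->B[r * a->out_features + o] * tmp[r];
            out[o] += a->gate * s;
        }
    }
}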

/* Transformer layer */
typedef struct {
    /* Attention */
    LilaQuantWeight q_proj;
    LilaQuantWeight k_proj;
    LilaQuantWeight v_proj;
    LilaQuantWeight o_proj;

    /* MLP */
    LilaQuantWeight gate_proj;
    LilaQuantWeight up_proj;
    LilaQuantWeight down_proj;

    /* Norms */
    float *input_layernorm;  /* RMSNorm weights */
    float *post_attention_layernorm;

    /* Memory Fabric for this layer */
    LilaMemoryFabric fabric;

    int hidden_size;
    int intermediate_size;
    int n_heads;
    int n_kv_heads;
    int head_dim;
} LilaLayer;

/* KV Cache */
typedef struct {
    float *key_cache;  /* [n_layers, max_seq, n_kv_heads, head_dim] */
    float *value_cache;
    int max_seq_len;
    int current_pos;
} LilaKVCache;
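
/* Illustrative helper (assumes the [n_layers, max_seq, n_kv_heads, head_dim]
 * layout noted above): flat float offset of one (layer, pos, head) vector. */
static inline size_t lila_kv_offset(int layer, int pos, int kv_head,
                                    int max_seq, int n_kv_heads, int head_dim) {
    return (((size_t)layer * max_seq + pos) * n_kv_heads + kv_head) * head_dim;
}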

/* Full model */
typedef struct {
    /* Header */
    uint32_t magic;
    uint32_t version;

    /* Config */
    int n_layers;
    int hidden_size;
    int intermediate_size;
    int n_heads;
    int n_kv_heads;
    int head_dim;
    int vocab_size;
    int max_seq_len;
    float rope_theta;
    float rms_norm_eps;

    /* Weights */
    float *token_embedding;  /* [vocab_size, hidden_size] */
    LilaLayer layers[LILA_MAX_LAYERS];
    float *final_norm;       /* RMSNorm weights */
    float *lm_head;          /* [vocab_size, hidden_size] or tied */

    /* Runtime */
    LilaKVCache kv_cache;

    /* Memory map */
    void *mmap_addr;
    size_t mmap_size;
} LilaModel;

/* API */
LilaModel *lila_load_model(const char *path);
void lila_free_model(LilaModel *model);
int lila_generate_token(LilaModel *model, int *tokens, int n_tokens);
void lila_generate(LilaModel *model, int *tokens, int n_tokens, int max_new_tokens,
                   void (*callback)(int token, void *ctx), void *ctx);
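
/*
 * Typical call sequence (illustrative; assumes token ids come from a
 * tokenizer that lives outside the engine):
 *
 *   LilaModel *m = lila_load_model("model.lila");
 *   int toks[512] = { ...prompt token ids... };
 *   lila_generate(m, toks, n_prompt, 128, my_callback, NULL);
 *   lila_free_model(m);
 */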

#endif /* LILA_MODEL_H */
''')
|
|
with open("engine/runtime/model.c", "w") as f:
    f.write('''#include "model.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>

/*
 * Load model weights via mmap: zero copy from disk.
 * The file is memory-mapped directly, so the OS handles
 * paging weights in/out as needed. Perfect for edge devices
 * with limited RAM.
 */

LilaModel *lila_load_model(const char *path) {
    int fd = open(path, O_RDONLY);
    if (fd < 0) {
        fprintf(stderr, "Failed to open model: %s\\n", path);
        return NULL;
    }

    struct stat st;
    if (fstat(fd, &st) != 0) {
        fprintf(stderr, "Failed to stat model: %s\\n", path);
        close(fd);
        return NULL;
    }
    size_t file_size = st.st_size;

    void *mapped = mmap(NULL, file_size, PROT_READ, MAP_PRIVATE, fd, 0);
    close(fd);

    if (mapped == MAP_FAILED) {
        fprintf(stderr, "Failed to mmap model\\n");
        return NULL;
    }

    /* Advise the kernel we'll read sequentially during inference */
    madvise(mapped, file_size, MADV_SEQUENTIAL);

    LilaModel *model = calloc(1, sizeof(LilaModel));
    model->mmap_addr = mapped;
    model->mmap_size = file_size;

    /* Parse header */
    uint8_t *ptr = (uint8_t *)mapped;
    memcpy(&model->magic, ptr, 4); ptr += 4;

    if (model->magic != LILA_MAGIC) {
        fprintf(stderr, "Invalid model magic: 0x%08X\\n", model->magic);
        lila_free_model(model);
        return NULL;
    }

    memcpy(&model->version, ptr, 4); ptr += 4;

    /* Read config (field order must match format/convert.py) */
    memcpy(&model->n_layers, ptr, 4); ptr += 4;
    memcpy(&model->hidden_size, ptr, 4); ptr += 4;
    memcpy(&model->intermediate_size, ptr, 4); ptr += 4;
    memcpy(&model->n_heads, ptr, 4); ptr += 4;
    memcpy(&model->n_kv_heads, ptr, 4); ptr += 4;
    memcpy(&model->vocab_size, ptr, 4); ptr += 4;
    memcpy(&model->max_seq_len, ptr, 4); ptr += 4;

    model->head_dim = model->hidden_size / model->n_heads;
    model->rope_theta = 10000.0f;
    model->rms_norm_eps = 1e-6f;

    /* TODO: Parse weight tensors from mmap'd region */
    /* For now, this is the structural foundation */

    fprintf(stderr, "Loaded model: %d layers, hidden=%d, vocab=%d\\n",
            model->n_layers, model->hidden_size, model->vocab_size);

    return model;
}

void lila_free_model(LilaModel *model) {
    if (!model) return;
    if (model->mmap_addr) {
        munmap(model->mmap_addr, model->mmap_size);
    }
    /* Free KV cache */
    free(model->kv_cache.key_cache);
    free(model->kv_cache.value_cache);
    free(model);
}
''')
|
|
with open("engine/runtime/inference.c", "w") as f:
    f.write('''#include "model.h"
#include <math.h>
#include <string.h>
#include <stdlib.h>

/*
 * Core inference loop.
 * For each new token:
 *   1. Embed token
 *   2. For each layer: attention + MLP (with Memory Fabric)
 *   3. Final norm
 *   4. LM head -> logits
 *   5. Sample next token
 */

/* RMSNorm - will be replaced by assembly kernel */
static void rmsnorm(float *out, const float *x, const float *weight, int size, float eps) {
    float ss = 0.0f;
    for (int i = 0; i < size; i++) ss += x[i] * x[i];
    ss = 1.0f / sqrtf(ss / size + eps);
    for (int i = 0; i < size; i++) out[i] = x[i] * ss * weight[i];
}

/* SiLU activation */
static float silu(float x) {
    return x / (1.0f + expf(-x));
}

/* Softmax */
static void softmax(float *x, int size) {
    float max_val = x[0];
    for (int i = 1; i < size; i++) if (x[i] > max_val) max_val = x[i];
    float sum = 0.0f;
    for (int i = 0; i < size; i++) { x[i] = expf(x[i] - max_val); sum += x[i]; }
    for (int i = 0; i < size; i++) x[i] /= sum;
}

/* Decode IEEE 754 half precision to float (portable, no F16C needed) */
static float fp16_to_fp32(uint16_t h) {
    int exp = (h >> 10) & 0x1F;
    int mant = h & 0x3FF;
    float v;
    if (exp == 0)       v = ldexpf((float)mant, -24);               /* subnormal/zero */
    else if (exp == 31) v = mant ? NAN : INFINITY;                  /* inf / NaN */
    else                v = ldexpf((float)(mant | 0x400), exp - 25);
    return (h & 0x8000) ? -v : v;
}

/* Matrix-vector multiply - THE hot path. Will be assembly. */
static void matvec(float *out, const float *mat, const float *vec, int rows, int cols) {
    for (int i = 0; i < rows; i++) {
        float sum = 0.0f;
        for (int j = 0; j < cols; j++) {
            sum += mat[i * cols + j] * vec[j];
        }
        out[i] = sum;
    }
}

/* INT4 dequant + matvec - fused for cache efficiency */
static void dequant_matvec(float *out, const LilaQuantWeight *w, const float *vec) {
    int rows = w->rows;
    int cols = w->cols;

    for (int i = 0; i < rows; i++) {
        float sum = 0.0f;
        for (int j = 0; j < cols; j++) {
            int flat_idx = i * cols + j;
            int group_idx = flat_idx / LILA_GROUP_SIZE;
            int byte_idx = flat_idx / 2;
            int nibble = (flat_idx % 2 == 0)
                ? (w->indices[byte_idx] & 0x0F)
                : ((w->indices[byte_idx] >> 4) & 0x0F);

            /* Dequant: codebook[nibble] * scale */
            float scale = fp16_to_fp32(w->scales[group_idx]);
            float val = w->codebook[nibble] * scale;
            sum += val * vec[j];
        }
        out[i] = sum;
    }
}
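
/*
 * Illustrative sketch of the MLP half of step 2 (a hypothetical helper, not
 * yet called from lila_generate_token): Llama-style SwiGLU feed-forward,
 * down(silu(gate(x)) * up(x)). `gate` and `up` are caller-provided scratch
 * buffers of intermediate_size floats; `out` holds hidden_size floats.
 */
static inline void mlp_forward(float *out, const float *x, const LilaLayer *l,
                               float *gate, float *up) {
    dequant_matvec(gate, &l->gate_proj, x);
    dequant_matvec(up, &l->up_proj, x);
    for (int i = 0; i < l->intermediate_size; i++)
        gate[i] = silu(gate[i]) * up[i];
    dequant_matvec(out, &l->down_proj, gate);
}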

/* Sample from logits (temperature + top-p) */
static int sample_token(float *logits, int vocab_size, float temperature, float top_p) {
    /* Apply temperature */
    if (temperature > 0.0f) {
        for (int i = 0; i < vocab_size; i++) logits[i] /= temperature;
    }

    softmax(logits, vocab_size);

    /* Top-p sampling: for now, greedy (argmax); see the sample_top_p sketch below */
    (void)top_p;
    int max_idx = 0;
    float max_val = logits[0];
    for (int i = 1; i < vocab_size; i++) {
        if (logits[i] > max_val) { max_val = logits[i]; max_idx = i; }
    }
    return max_idx;
}
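
/*
 * Illustrative nucleus (top-p) sampler to eventually replace the greedy path
 * above. A hypothetical helper, not yet called: it draws from the smallest
 * prefix of probability-sorted tokens whose mass reaches top_p. Assumes
 * `probs` has already been through softmax(); uses rand() for brevity.
 */
typedef struct { float p; int idx; } ProbIdx;

static int probidx_desc(const void *a, const void *b) {
    float pa = ((const ProbIdx *)a)->p, pb = ((const ProbIdx *)b)->p;
    return (pb > pa) - (pb < pa);
}

static inline int sample_top_p(const float *probs, int vocab_size, float top_p) {
    ProbIdx *s = malloc((size_t)vocab_size * sizeof(ProbIdx));
    for (int i = 0; i < vocab_size; i++) { s[i].p = probs[i]; s[i].idx = i; }
    qsort(s, vocab_size, sizeof(ProbIdx), probidx_desc);

    /* Find the nucleus: smallest prefix with cumulative mass >= top_p */
    float mass = 0.0f;
    int cut = vocab_size;
    for (int i = 0; i < vocab_size; i++) {
        mass += s[i].p;
        if (mass >= top_p) { cut = i + 1; break; }
    }

    /* Draw uniformly within the nucleus mass and walk to the chosen bucket */
    float r = ((float)rand() / (float)RAND_MAX) * mass;
    float acc = 0.0f;
    int pick = s[cut - 1].idx;
    for (int i = 0; i < cut; i++) {
        acc += s[i].p;
        if (acc >= r) { pick = s[i].idx; break; }
    }
    free(s);
    return pick;
}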

/* Generate one token */
int lila_generate_token(LilaModel *model, int *tokens, int n_tokens) {
    /* TODO: full transformer forward pass */
    /* This is the structural skeleton - actual compute dispatches to kernels */
    (void)model; (void)tokens; (void)n_tokens;
    return 0; /* placeholder */
}

/* Generate sequence */
void lila_generate(LilaModel *model, int *tokens, int n_tokens, int max_new_tokens,
                   void (*callback)(int token, void *ctx), void *ctx) {
    for (int i = 0; i < max_new_tokens; i++) {
        int next = lila_generate_token(model, tokens, n_tokens + i);
        tokens[n_tokens + i] = next;
        if (callback) callback(next, ctx);
        if (next == 0) break; /* EOS */
    }
}
''')
|
|
with open("engine/kernels/x86_64/dequant_int4.S", "w") as f:
    f.write('''; ─────────────────────────────────────────────────────────────────────────────
; Lila Engine - INT4 Dequantization Kernel (x86_64 AVX2)
;
; Dequantizes FigQuant INT4 packed indices to FP32 using codebook lookup.
; The 16-value codebook fits in a single YMM register (256-bit).
;
; void lila_dequant_int4_avx2(
;     float *output,          ; rdi -> output FP32 buffer
;     const uint8_t *indices, ; rsi -> packed 4-bit indices (2 per byte)
;     const float *codebook,  ; rdx -> 16 float32 values
;     const float *scales,    ; rcx -> per-group scales
;     int n_elements,         ; r8  -> number of elements to dequant
;     int group_size          ; r9  -> elements per group (128)
; );
; ─────────────────────────────────────────────────────────────────────────────

section .text
global lila_dequant_int4_avx2

lila_dequant_int4_avx2:
    push rbp
    mov rbp, rsp
    push rbx
    push r12
    push r13
    push r14

    mov r12, rdi            ; output ptr
    mov r13, rsi            ; indices ptr
    mov r14, rdx            ; codebook ptr
    mov r8d, r8d            ; zero-extend n_elements (32-bit int arg)
    mov r9d, r9d            ; zero-extend group_size

    ; Load codebook into memory (will use gather for lookup)
    ; For AVX2: use vpgatherdd with index register

    xor rbx, rbx            ; element counter
    xor r10, r10            ; group counter

.loop:
    cmp rbx, r8
    jge .done

    ; Get packed byte (contains 2 indices)
    mov rax, rbx
    shr rax, 1              ; byte index = element / 2
    movzx eax, byte [r13 + rax]

    ; Extract nibble
    test rbx, 1
    jnz .high_nibble
    and eax, 0x0F           ; low nibble
    jmp .lookup
.high_nibble:
    shr eax, 4              ; high nibble

.lookup:
    ; Codebook lookup: output = codebook[index] * scale
    lea rax, [r14 + rax*4]  ; &codebook[index]
    movss xmm0, [rax]       ; codebook value

    ; Get group scale (div clobbers rdx, but the codebook ptr lives in r14)
    mov rax, rbx
    xor edx, edx
    div r9                  ; rax = element / group_size = group_idx
    movss xmm1, [rcx + rax*4]  ; scale

    ; Multiply: codebook_value * scale
    mulss xmm0, xmm1

    ; Store result
    movss [r12 + rbx*4], xmm0

    inc rbx
    jmp .loop

.done:
    pop r14
    pop r13
    pop r12
    pop rbx
    pop rbp
    ret

; ─────────────────────────────────────────────────────────────────────────────
; NOTE: This is the scalar fallback. A SIMD version would process 8 elements
; at a time using AVX2 gather instructions.
; TODO: Add vectorized version with vpgatherdd
; ─────────────────────────────────────────────────────────────────────────────
''')
|
|
with open("engine/kernels/arm64/dequant_int4.S", "w") as f:
    f.write('''// ────────────────────────────────────────────────────────────────────────────
// Lila Engine - INT4 Dequantization Kernel (ARM64 NEON)
//
// Same operation as the x86 version. The vectorized form will process
// 4 elements at a time using 128-bit NEON registers; below is the scalar
// baseline.
//
// void lila_dequant_int4_neon(
//     float *output,          // x0
//     const uint8_t *indices, // x1
//     const float *codebook,  // x2
//     const float *scales,    // x3
//     int n_elements,         // x4 (w4)
//     int group_size          // x5 (w5)
// );
// ────────────────────────────────────────────────────────────────────────────

.text
.global lila_dequant_int4_neon
.type lila_dequant_int4_neon, %function

lila_dequant_int4_neon:
    // Save callee-saved registers
    stp x19, x20, [sp, #-16]!
    stp x21, x22, [sp, #-16]!

    mov x19, x0             // output
    mov x20, x1             // indices
    mov x21, x2             // codebook
    mov x22, xzr            // counter

.Lloop:
    cmp w22, w4
    bge .Ldone

    // Get packed byte
    lsr x6, x22, #1         // byte_idx = element / 2
    ldrb w7, [x20, x6]      // load packed byte

    // Extract nibble
    tst x22, #1
    bne .Lhigh
    and w7, w7, #0x0F       // low nibble
    b .Llookup
.Lhigh:
    lsr w7, w7, #4          // high nibble

.Llookup:
    // Codebook lookup
    ldr s0, [x21, x7, lsl #2]   // codebook[index]

    // Get group scale
    udiv w8, w22, w5        // group_idx = element / group_size
    ldr s1, [x3, x8, lsl #2]    // scale

    // Multiply
    fmul s0, s0, s1

    // Store
    str s0, [x19, x22, lsl #2]

    add w22, w22, #1
    b .Lloop

.Ldone:
    ldp x21, x22, [sp], #16
    ldp x19, x20, [sp], #16
    ret

// ────────────────────────────────────────────────────────────────────────────
// NOTE: Scalar fallback. NEON vectorized version TODO.
// ────────────────────────────────────────────────────────────────────────────
''')
|
|
os.makedirs("engine/interface", exist_ok=True)
with open("engine/interface/cli.c", "w") as f:
    f.write('''#include "../runtime/model.h"
#include <stdio.h>
#include <string.h>

static void token_callback(int token, void *ctx) {
    (void)ctx;
    printf("[tok:%d] ", token);
    fflush(stdout);
}

int main(int argc, char *argv[]) {
    if (argc < 2) {
        fprintf(stderr, "Usage: lila-engine <model.lila> | --test | --bench\\n");
        return 1;
    }

    if (strcmp(argv[1], "--test") == 0) {
        printf("Running tests...\\n");
        /* TODO: unit tests */
        printf("All tests passed.\\n");
        return 0;
    }

    if (strcmp(argv[1], "--bench") == 0) {
        printf("Running benchmarks...\\n");
        /* TODO: performance benchmarks */
        return 0;
    }

    printf("\\xF0\\x9F\\x8C\\xB8 Lila Engine v0.1\\n");
    printf("Loading model: %s\\n", argv[1]);

    LilaModel *model = lila_load_model(argv[1]);
    if (!model) {
        fprintf(stderr, "Failed to load model\\n");
        return 1;
    }

    printf("Model loaded: %d layers, hidden=%d, vocab=%d\\n",
           model->n_layers, model->hidden_size, model->vocab_size);

    /* Interactive mode */
    char input[4096];
    printf("\\n\\xF0\\x9F\\x8C\\xB8 Lila is ready. Type to talk.\\n\\n");

    while (1) {
        printf("Sammie: ");
        if (!fgets(input, sizeof(input), stdin)) break;
        input[strcspn(input, "\\n")] = 0;
        if (strlen(input) == 0) continue;

        /* TODO: tokenize input, run inference, detokenize output */
        printf("Lila: [inference not yet wired]\\n\\n");
    }

    lila_free_model(model);
    return 0;
}
''')
|
|
os.makedirs("engine/format", exist_ok=True)
with open("engine/format/convert.py", "w") as f:
    f.write('''#!/usr/bin/env python3
"""
Convert a HuggingFace model (safetensors) to Lila's custom binary format.

Uses FigQuant from Little Fig for INT4 quantization.

Usage:
    python convert.py --model google/gemma-3-4b-it --output model.lila
"""

import argparse
import struct
import sys
import os

LILA_MAGIC = 0x4C494C41  # "LILA"
LILA_VERSION = 1


def convert(model_path: str, output_path: str, group_size: int = 128):
    """Convert HF model to Lila binary format."""
    import torch
    from transformers import AutoModelForCausalLM, AutoConfig

    print(f"Loading model: {model_path}")
    config = AutoConfig.from_pretrained(model_path)
    model = AutoModelForCausalLM.from_pretrained(
        model_path, torch_dtype=torch.float32, low_cpu_mem_usage=True
    )

    print(f"Model config: layers={config.num_hidden_layers}, "
          f"hidden={config.hidden_size}, vocab={config.vocab_size}")

    # Try to import FigQuant for INT4
    try:
        sys.path.insert(0, os.path.expanduser("~/littlefig/src"))
        from little_fig.engine.figquant import figquant_quantize
        has_figquant = True
        print("Using FigQuant for INT4 quantization")
    except ImportError:
        has_figquant = False
        print("WARNING: FigQuant not available. Storing FP32 (large file).")

    with open(output_path, "wb") as f:
        # Header
        f.write(struct.pack("I", LILA_MAGIC))
        f.write(struct.pack("I", LILA_VERSION))
        f.write(struct.pack("I", config.num_hidden_layers))
        f.write(struct.pack("I", config.hidden_size))
        f.write(struct.pack("I", config.intermediate_size))
        f.write(struct.pack("I", config.num_attention_heads))
        f.write(struct.pack("I", getattr(config, "num_key_value_heads", config.num_attention_heads)))
        f.write(struct.pack("I", config.vocab_size))
        f.write(struct.pack("I", getattr(config, "max_position_embeddings", 4096)))

        # TODO: Write quantized weight tensors
        # For each linear layer: quantize with FigQuant, then write
        # codebook + indices + scales (see the write_quant_tensor sketch below)

    print("Header written. Full weight conversion TODO.")
    print(f"Output: {output_path}")

    del model
    print("Done.")

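# Illustrative sketch of the per-tensor serialization the TODO above points
# at. write_quant_tensor and its (codebook, indices, scales) inputs are
# assumptions about what FigQuant produces, not its actual API; the byte
# layout mirrors LilaQuantWeight in runtime/model.h, and the nibble packing
# matches the C reader (even flat index -> low nibble, odd -> high nibble).
def write_quant_tensor(f, codebook, indices, scales, rows, cols):
    """codebook: 16 float32; indices: flat uint8 in [0,16); scales: one per group.

    Assumes an even element count (true for group_size-aligned tensors).
    """
    import numpy as np
    f.write(np.asarray(codebook, dtype=np.float32).tobytes())  # 64 B codebook
    f.write(struct.pack("iii", rows, cols, len(scales)))       # tensor shape header
    f.write(np.asarray(scales, dtype=np.float16).tobytes())    # per-group FP16 scales
    idx = np.asarray(indices, dtype=np.uint8)
    packed = (idx[0::2] & 0x0F) | ((idx[1::2] & 0x0F) << 4)    # 2 indices per byte
    f.write(packed.tobytes())

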
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", required=True)
    parser.add_argument("--output", default="model.lila")
    parser.add_argument("--group-size", type=int, default=128)
    args = parser.parse_args()
    convert(args.model, args.output, args.group_size)
''')
|
|
for keep in ["engine/kernels/x86_64/.gitkeep", "engine/kernels/arm64/.gitkeep", "engine/runtime/.gitkeep"]:
    if os.path.exists(keep):
        os.remove(keep)
|
|
subprocess.run(["git", "add", "-A"], check=True)
subprocess.run(["git", "commit", "-m",
    "Engine Phase 1: Foundation code\n\n"
    "Makefile: auto-detects x86_64/ARM64, assembles kernels, links\n"
    "runtime/model.h: Core structs (LilaModel, LilaQuantWeight, LilaLoRA, LilaMemoryFabric)\n"
    "runtime/model.c: mmap-based model loading (zero-copy from disk)\n"
    "runtime/inference.c: Token generation loop skeleton (RMSNorm, softmax, matvec, sampling)\n"
    "kernels/x86_64/dequant_int4.S: INT4 dequantization (scalar, AVX2 TODO)\n"
    "kernels/arm64/dequant_int4.S: INT4 dequantization (scalar, NEON TODO)\n"
    "interface/cli.c: Interactive CLI for testing\n"
    "format/convert.py: HF safetensors → Lila binary format converter\n\n"
    "This is the structural foundation. Next: vectorize kernels, wire full forward pass."],
    check=True)
subprocess.run(["git", "push", "origin", "main"], check=True)
print("✅ Engine Phase 1 pushed!")
|
|