"""Push LilaCore fix + inference engine build plan to Lila repo."""
import os
import subprocess

# GITHUB_TOKEN must be set in the environment; never hardcode credentials in the script.
TOKEN = os.environ["GITHUB_TOKEN"]

subprocess.run(["git", "clone", f"https://{TOKEN}@github.com/ticketguy/Lila.git", "/app/lila"], check=True)
os.chdir("/app/lila")
subprocess.run(["git", "config", "user.name", "0xticketguy"], check=True)
subprocess.run(["git", "config", "user.email", "0xticketguy@harboria.dev"], check=True)


with open("INFERENCE_ENGINE_PLAN.md", "w") as f:
    f.write('''# Lila Inference Engine: Build Plan

## What This Is

A custom inference engine for Lila written in assembly + C. No Python at runtime.
No dependency on llama.cpp, vLLM, transformers, or any third-party inference library.
Lila runs as native machine code: the fastest possible execution on any hardware.

## Why Custom

1. **Speed**: Hand-tuned assembly for the hot path (matmul, attention) beats compiler-generated code
2. **Portability**: Assembly kernels per architecture mean it runs on anything: x86 desktop, ARM phone, RISC-V edge device
3. **Control**: Memory Fabric (multi-LoRA) is native to the engine, not bolted on
4. **Identity**: Lila speaks machine language. Her own inference IS machine language. Aligned.
5. **Independence**: No supply chain risk. No one else's bugs. No license constraints.

## Architecture

```
lila-engine/
├── kernels/                  # Assembly kernels (THE hot path)
│   ├── x86_64/
│   │   ├── matmul_avx512.S   # Matrix multiply (AVX-512, INT4 fused)
│   │   ├── matmul_avx2.S     # Fallback for older CPUs
│   │   ├── rmsnorm.S         # RMS normalization
│   │   ├── softmax.S         # Softmax with online computation
│   │   ├── silu.S            # SiLU activation
│   │   ├── dequant_int4.S    # FigQuant INT4 dequantization
│   │   ├── lora_fused.S      # Dequant + base matmul + LoRA in one pass
│   │   └── rope.S            # Rotary position embeddings
│   ├── arm64/
│   │   ├── matmul_neon.S     # Matrix multiply (NEON)
│   │   ├── matmul_sve.S      # Matrix multiply (SVE, newer ARM)
│   │   ├── rmsnorm.S
│   │   ├── softmax.S
│   │   ├── silu.S
│   │   ├── dequant_int4.S
│   │   ├── lora_fused.S
│   │   └── rope.S
│   └── riscv/                # Future: RISC-V vector extension
│       └── (same pattern)
│
├── runtime/                  # C runtime (orchestrates kernels)
│   ├── model.c               # Model struct, weight loading (mmap)
│   ├── model.h
│   ├── inference.c           # Token generation loop
│   ├── attention.c           # Multi-head attention (dispatches to kernels)
│   ├── transformer.c         # Full transformer block
│   ├── kv_cache.c            # KV cache management
│   ├── memory_fabric.c       # Multi-adapter LoRA routing + gating
│   ├── tokenizer.c           # BPE tokenizer (sentencepiece compatible)
│   ├── quantize.c            # FigQuant format reader
│   ├── detect.c              # Hardware detection (which kernels to use)
│   └── allocator.c           # Custom memory allocator (arena-based)
│
├── format/                   # Model file format
│   ├── lila_format.h         # Custom binary format (or GGUF-compatible)
│   └── convert.py            # Convert safetensors → lila format
│
├── interface/                # How the outside world talks to the engine
│   ├── cli.c                 # Command-line interface
│   ├── server.c              # HTTP/WebSocket server (for harness)
│   ├── voice_bridge.c        # Audio I/O bridge
│   └── python_bind.c         # Optional: Python bindings for testing
│
├── tests/                    # Correctness tests
│   ├── test_matmul.c
│   ├── test_attention.c
│   ├── test_dequant.c
│   ├── test_lora.c
│   └── bench/                # Performance benchmarks
│       ├── bench_matmul.c
│       ├── bench_attention.c
│       └── bench_e2e.c
│
├── Makefile                  # Build system (detect arch, assemble, link)
└── README.md
```

## Build Phases

### Phase 1: Foundation (get tokens flowing)
1. Hardware detection (x86_64 feature flags: AVX2, AVX-512, AMX)
2. Model loading via mmap (zero-copy from disk)
3. Basic matmul kernel (AVX2, works on most x86 CPUs)
4. RMSNorm, SiLU, Softmax kernels
5. Single transformer block forward pass
6. Full model forward pass
7. Token generation loop (greedy decode)
8. **TEST: generate coherent text from Gemma 4B weights**
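
A rough sketch of the Phase 1 decode loop in C. The `model_forward()`, `vocab_size()`, and `eos_token()` names are placeholders for illustration, not final APIs:

```
/* Phase 1 greedy decode loop (sketch for runtime/inference.c).
 * model_forward() returns the logits for the next position;
 * all symbol names here are placeholders. */
#include <float.h>

typedef struct lila_model lila_model;                      /* opaque handle */
float *model_forward(lila_model *m, int token, int pos);   /* -> logits     */
int    vocab_size(const lila_model *m);
int    eos_token(const lila_model *m);

static int argmax(const float *logits, int n) {
    int best = 0;
    float best_val = -FLT_MAX;
    for (int i = 0; i < n; i++)
        if (logits[i] > best_val) { best_val = logits[i]; best = i; }
    return best;
}

/* Prefill the prompt, then greedily emit tokens until EOS or max_new. */
int generate(lila_model *m, const int *prompt, int prompt_len,
             int *out, int max_new) {
    float *logits = NULL;
    int pos, n_out = 0;

    for (pos = 0; pos < prompt_len; pos++)
        logits = model_forward(m, prompt[pos], pos);

    while (n_out < max_new) {
        int token = argmax(logits, vocab_size(m));
        if (token == eos_token(m)) break;
        out[n_out++] = token;
        logits = model_forward(m, token, pos++);
    }
    return n_out;
}
```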

### Phase 2: INT4 Quantization (shrink the model)
1. FigQuant INT4 dequantization kernel
2. Fused dequant + matmul (never fully dequantize to RAM)
3. Custom model format with packed INT4 weights
4. Converter: safetensors → lila INT4 format
5. **TEST: same output quality as FP16, 4x less memory**
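
The exact FigQuant layout is still open; the sketch below assumes 4-bit codes indexing a shared 16-entry codebook with one scale per 32-weight group, and shows the fused idea: dequantize in registers, never write the dequantized row back to RAM.

```
/* Fused dequant + dot product, scalar reference (no SIMD yet).
 * The layout is an assumption: 4-bit codes index a 16-entry codebook,
 * one float scale per group of 32 weights. */
#include <stdint.h>
#include <stddef.h>

#define QGROUP 32

typedef struct {
    const uint8_t *codes;     /* packed 4-bit indices, two per byte */
    const float   *scales;    /* one scale per group                */
    const float   *codebook;  /* 16 shared reconstruction values    */
} qrow;

float qdot(const qrow *w, const float *x, size_t n) {
    float acc = 0.0f;
    for (size_t g = 0; g < n / QGROUP; g++) {
        float part = 0.0f;
        for (size_t i = 0; i < QGROUP; i++) {
            size_t idx   = g * QGROUP + i;
            uint8_t byte = w->codes[idx / 2];
            uint8_t code = (idx & 1) ? (byte >> 4) : (byte & 0x0F);
            part += w->codebook[code] * x[idx];   /* dequant stays in registers */
        }
        acc += w->scales[g] * part;               /* apply the group scale once */
    }
    return acc;
}
```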

### Phase 3: Memory Fabric (multi-LoRA)
1. LoRA forward pass kernel (base + A×B correction)
2. Multi-adapter routing (5 namespaces, gated)
3. Gate computation (sigmoid projection)
4. Fused: dequant_base + lora_correction in one kernel
5. **TEST: Memory Fabric produces correct output, adapters influence generation**
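
The Memory Fabric forward for one projection is y = W·x plus the gated sum of each adapter's B(Ax) correction. A minimal sketch, assuming a `matvec()` helper for the quantized base kernel and a rank cap; both are illustrative assumptions:

```
/* Memory Fabric forward for one projection (sketch for runtime/memory_fabric.c):
 *   y = W x + sum over adapters of gate_k * B_k (A_k x)
 * matvec() stands in for the quantised base kernel. */
#define N_ADAPTERS 5
#define MAX_RANK   64

typedef struct {
    const float *A;    /* rank x d_in  */
    const float *B;    /* d_out x rank */
    int   rank;
    float gate;        /* sigmoid gate, computed per token */
} lora_adapter;

void matvec(const float *W, const float *x, float *y, int d_out, int d_in);

void fabric_forward(const float *W, const lora_adapter *adapters,
                    const float *x, float *y, int d_out, int d_in) {
    float tmp[MAX_RANK];
    matvec(W, x, y, d_out, d_in);                 /* base path */

    for (int k = 0; k < N_ADAPTERS; k++) {
        const lora_adapter *a = &adapters[k];
        if (a->gate < 1e-3f) continue;            /* adapter gated off */
        matvec(a->A, x, tmp, a->rank, d_in);      /* tmp = A x         */
        for (int o = 0; o < d_out; o++) {
            float corr = 0.0f;
            for (int r = 0; r < a->rank; r++)
                corr += a->B[o * a->rank + r] * tmp[r];
            y[o] += a->gate * corr;               /* y += gate * B tmp */
        }
    }
}
```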

### Phase 4: Optimization (make it fast)
1. AVX-512 matmul (2x over AVX2 on supported hardware)
2. Cache-optimal tiling (L1/L2/L3 aware)
3. Prefetch hints in assembly
4. Thread parallelism (one thread per transformer block layer)
5. KV cache with paged allocation (no reallocation during generation)
6. **BENCHMARK: tokens/second vs llama.cpp on same model**
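
The plan above assigns threads per layer; a simpler illustration of the same pthread-only, no-framework approach is row-partitioning one matvec across workers, sketched here (thread count capped at 16 for the example):

```
/* Row-partitioned matvec across pthreads. Simpler than per-layer threading,
 * but shows the plain-pthread style. Assumes n_threads <= 16 in this sketch. */
#include <pthread.h>

typedef struct {
    const float *W, *x;
    float *y;
    int d_in, row_start, row_end;
} mv_job;

static void *mv_worker(void *arg) {
    mv_job *j = (mv_job *)arg;
    for (int o = j->row_start; o < j->row_end; o++) {
        float acc = 0.0f;
        for (int i = 0; i < j->d_in; i++)
            acc += j->W[(long)o * j->d_in + i] * j->x[i];
        j->y[o] = acc;
    }
    return NULL;
}

void matvec_mt(const float *W, const float *x, float *y,
               int d_out, int d_in, int n_threads) {
    pthread_t tid[16];
    mv_job jobs[16];
    int rows = (d_out + n_threads - 1) / n_threads;
    for (int t = 0; t < n_threads; t++) {
        int start = t * rows;
        int end   = start + rows > d_out ? d_out : start + rows;
        jobs[t] = (mv_job){ W, x, y, d_in, start, end };
        pthread_create(&tid[t], NULL, mv_worker, &jobs[t]);
    }
    for (int t = 0; t < n_threads; t++)
        pthread_join(&tid[t], NULL);
}
```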

### Phase 5: ARM (mobile/edge)
1. Port all kernels to ARM64 NEON
2. ARM SVE/SVE2 kernels for newer cores that support scalable vectors (recent Cortex-X / Neoverse)
3. Memory-constrained mode (streaming layers from disk, sketched below)
4. **TEST: runs on Raspberry Pi 5 / phone-class hardware**
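
One way the memory-constrained mode could work, assuming weights are already mmap'd: after a layer's forward pass, hint the kernel that its pages can be dropped and re-faulted from disk later. The `layer_span()` lookup is a placeholder and offsets are assumed page-aligned.

```
/* Memory-constrained streaming (sketch): mmap the weight file once, then
 * release each layer's pages after its forward pass. */
#include <sys/mman.h>
#include <stddef.h>

void layer_span(int layer, size_t *off, size_t *len);   /* placeholder lookup */

void release_layer(void *weights_map, int layer) {
    size_t off, len;
    layer_span(layer, &off, &len);
    /* Pages are re-faulted from the file if this layer is touched again. */
    madvise((char *)weights_map + off, len, MADV_DONTNEED);
}
```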

### Phase 6: Voice + Interface
1. Audio I/O bridge (ALSA/CoreAudio/WASAPI)
2. WebSocket server for harness communication
3. Hot-reload: load new weights without restarting
4. **TEST: Lila speaks and listens in real-time**

## Key Design Decisions

| Decision | Choice | Why |
|---|---|---|
| Language | Assembly + C | Maximum speed, zero overhead |
| Weight format | Custom (FigQuant INT4) | Optimized for our fused kernels |
| Memory model | mmap + arena allocator | Zero-copy loading, no malloc fragmentation |
| Threading | pthread (1 thread per layer group) | Simple, predictable, no framework |
| Build system | Makefile with arch detection | No cmake/meson complexity |
| Architecture dispatch | Runtime CPU feature detection | One binary runs on any x86/ARM |
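
The dispatch row in practice, sketched with the GCC/Clang `__builtin_cpu_supports` builtin on the x86_64 path; the `matmul_*` symbols are placeholders for the assembly entry points:

```
/* Runtime kernel dispatch (sketch for runtime/detect.c, x86_64 path). */
typedef void (*matmul_fn)(const void *w, const float *x, float *y,
                          int d_out, int d_in);

void matmul_avx512(const void *, const float *, float *, int, int);
void matmul_avx2  (const void *, const float *, float *, int, int);
void matmul_scalar(const void *, const float *, float *, int, int);

matmul_fn select_matmul(void) {
#if defined(__x86_64__)
    __builtin_cpu_init();
    if (__builtin_cpu_supports("avx512f")) return matmul_avx512;
    if (__builtin_cpu_supports("avx2"))    return matmul_avx2;
#endif
    return matmul_scalar;   /* portable C fallback */
}
```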

## Performance Targets

| Metric | Target | Why |
|---|---|---|
| Tokens/sec (4B, INT4, CPU) | >30 tok/s | Conversational speed |
| Tokens/sec (4B, INT4, GPU) | >100 tok/s | Real-time interaction |
| First token latency | <200 ms | Feels instant |
| Memory (4B, INT4) | <3 GB | Runs on 4 GB RAM machines |
| Binary size | <1 MB | Minimal footprint |
| Startup time | <500 ms | Near-instant boot |

## Experiments Needed

Before writing final kernels, we need to test:

1. **Tiling strategy**: What tile size gives the best L1/L2 cache hit rate?
   Test: 64×64, 128×128, 256×256 tiles, measure throughput per architecture (see the tiling sketch after this list)

2. **INT4 dequant placement**: Dequant to register (per-element) vs dequant to L1 (per-tile)?
   Test: both approaches, measure memory bandwidth vs compute utilization

3. **LoRA fusion overhead**: Is fused (base+LoRA in one kernel) actually faster than separate?
   Test: fused vs split, across different LoRA ranks (4, 8, 16, 32)

4. **Thread scaling**: How many threads before diminishing returns on 4/8/16 core machines?
   Test: 1, 2, 4, 8, 16 threads on matmuls of different sizes

5. **ARM NEON vs SVE**: SVE has variable vector length. Is it worth the complexity?
   Test: same kernel in NEON (128-bit fixed) vs SVE (scalable), measured on SVE-capable hardware

6. **Memory layout**: Row-major vs column-major vs tiled storage for weights?
   Test: all three, measure matmul throughput (cache line utilization)

7. **Codebook in register**: FigQuant's 16-value codebook fits in one 512-bit register.
   Test: keep the codebook permanently in a zmm register vs reload per group
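
For experiment 1, a minimal tiled matmul where `TILE` is the compile-time parameter to sweep; the timing harness is omitted:

```
/* Tiling experiment kernel: C = A * B for n x n row-major matrices,
 * C assumed zero-initialised. Rebuild with -DTILE=64/128/256 and time it. */
#ifndef TILE
#define TILE 128
#endif

void matmul_tiled(const float *A, const float *B, float *C, int n) {
    for (int ii = 0; ii < n; ii += TILE)
        for (int kk = 0; kk < n; kk += TILE)
            for (int jj = 0; jj < n; jj += TILE)
                for (int i = ii; i < ii + TILE && i < n; i++)
                    for (int k = kk; k < kk + TILE && k < n; k++) {
                        float a = A[(long)i * n + k];
                        for (int j = jj; j < jj + TILE && j < n; j++)
                            C[(long)i * n + j] += a * B[(long)k * n + j];
                    }
}
```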

## Dependencies

- NASM or GAS (assembler)
- GCC or Clang (C compiler, for runtime/interface)
- No other dependencies. No libraries. No frameworks. Pure code.

## Relationship to Little Fig

```
Little Fig (Python, runs offline)
        │
        │ TRAINS the model
        │ Produces: model weights (safetensors)
        │
        ▼
format/convert.py
        │
        │ CONVERTS to Lila format
        │ Produces: .lila binary (INT4 packed with FigQuant)
        │
        ▼
Lila Engine (assembly + C, runs always)
        │
        │ LOADS and RUNS the model
        │ Pure machine code, no Python
        │
        ▼
LILA (the intelligence)
```
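
The .lila binary format is not pinned down yet; a hypothetical header showing the kind of fields an mmap-friendly, zero-copy layout needs (every field below is an assumption):

```
/* Hypothetical .lila header (sketch for format/lila_format.h). */
#include <stdint.h>

#define LILA_MAGIC   0x414C494CU   /* "LILA", little-endian */
#define LILA_VERSION 1

typedef struct {
    uint32_t magic;           /* LILA_MAGIC                          */
    uint32_t version;         /* format version                      */
    uint32_t n_layers;        /* transformer blocks                  */
    uint32_t d_model;         /* hidden size                         */
    uint32_t n_heads;         /* attention heads                     */
    uint32_t vocab_size;      /* tokenizer vocabulary                */
    uint32_t quant_scheme;    /* 0 = FP16, 1 = FigQuant INT4         */
    uint32_t n_adapters;      /* bundled Memory Fabric adapters      */
    uint64_t tensor_table;    /* byte offset of the tensor index     */
    uint64_t tokenizer_off;   /* byte offset of the tokenizer blob   */
} lila_header;
```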

---

*Private. Not open source. Built by Sammie for Sammie.*
''')


with open("src/core/lilacore.py", "w") as f:
    f.write('''"""
LilaCore: The Central Intelligence

Little Fig TRAINS the model (offline, produces weight files).
LilaCore RUNS the trained model (loads weights, does inference).
No Little Fig dependency at runtime.

Loading priority:
1. Lila Engine (custom assembly, once built)
2. llama-cpp-python (GGUF format): fastest existing option
3. transformers (safetensors): fallback
4. External API (Phase 1 only)

Continuous learning cycle:
Interactions logged → Little Fig trains offline → new weights → Lila hot-reloads
"""

import os
import json
import re
from typing import Optional, Dict, List
from dataclasses import dataclass
from datetime import datetime


@dataclass
class LilaResponse:
    """What Lila produces after thinking."""
    text: str
    memory_ops: List[Dict]
    actions: List[Dict]
    confidence: float
    should_speak: bool = True


class LilaCore:
    """
    The central intelligence. Loads the trained model, runs inference.

    Usage:
        lila = LilaCore()
        lila.boot()
        response = lila.think("Hey Lila, what's on my schedule today?")
    """

    def __init__(self, model_path: Optional[str] = None, gguf_path: Optional[str] = None, api_mode: bool = False):
        self.model_path = model_path
        self.gguf_path = gguf_path
        self.api_mode = api_mode
        self._model = None
        self._tokenizer = None
        self._llm = None
        self._booted = False
        self._conversation_history = []
        self._training_log = []

    def boot(self):
        """Wake Lila up. Load the model."""
        print("\\U0001f338 Lila is waking up...")

        if self.gguf_path and os.path.exists(self.gguf_path):
            self._boot_gguf()
        elif self.model_path and os.path.exists(self.model_path):
            self._boot_transformers()
        elif self.api_mode:
            self._boot_api()
        else:
            default_gguf = os.path.expanduser("~/.lila/model.gguf")
            default_hf = os.path.expanduser("~/.lila/model/")
            if os.path.exists(default_gguf):
                self.gguf_path = default_gguf
                self._boot_gguf()
            elif os.path.exists(default_hf):
                self.model_path = default_hf
                self._boot_transformers()
            else:
                print("\\U0001f338 No local model found. Phase 1 (API) mode.")
                self._boot_api()

        self._booted = True
        print("\\U0001f338 Lila is awake.")

    def _boot_gguf(self):
        try:
            from llama_cpp import Llama
            print(f" Loading GGUF: {self.gguf_path}")
            self._llm = Llama(
                model_path=self.gguf_path, n_ctx=4096,
                n_threads=os.cpu_count(), n_gpu_layers=-1, verbose=False)
            print(" Done (llama.cpp)")
        except ImportError:
            print(" llama-cpp-python not installed. Falling back...")
            self._boot_transformers()

    def _boot_transformers(self):
        try:
            import torch
            from transformers import AutoModelForCausalLM, AutoTokenizer
            path = self.model_path or "google/gemma-3-4b-it"
            print(f" Loading: {path}")
            self._tokenizer = AutoTokenizer.from_pretrained(path)
            self._model = AutoModelForCausalLM.from_pretrained(
                path, torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
                device_map="auto" if torch.cuda.is_available() else "cpu",
                low_cpu_mem_usage=True)
            if self._tokenizer.pad_token is None:
                self._tokenizer.pad_token = self._tokenizer.eos_token
            print(" Done (transformers)")
        except Exception as e:
            print(f" Failed: {e}")
            self._boot_api()

    def _boot_api(self):
        self.api_mode = True
        print(" API mode (Phase 1)")

    def think(self, input_text: str, context: Optional[Dict] = None) -> LilaResponse:
        """Core cognitive loop."""
        if not self._booted:
            raise RuntimeError("Call lila.boot() first.")

        prompt = self._build_prompt(input_text, context)

        if self._llm:
            response_text = self._generate_gguf(prompt)
        elif self._model:
            response_text = self._generate_transformers(prompt)
        else:
            response_text = self._generate_api(prompt)

        memory_ops = self._extract_memory_ops(response_text)
        clean_text = self._clean_response(response_text)
        self._log_interaction(input_text, clean_text)

        self._conversation_history.append({"role": "user", "content": input_text})
        self._conversation_history.append({"role": "assistant", "content": clean_text})

        return LilaResponse(text=clean_text, memory_ops=memory_ops,
                            actions=[], confidence=1.0, should_speak=True)

    def _build_prompt(self, input_text: str, context: Optional[Dict]) -> str:
        identity = ("You are Lila, Sammie's private family AI assistant. "
                    "Persistent, caring, grows smarter over time. "
                    "Remembers everything. Speaks with warmth and personality.")
        history = ""
        for msg in self._conversation_history[-10:]:
            role = "Sammie" if msg["role"] == "user" else "Lila"
            history += f"{role}: {msg['content']}\\n"
        if context and context.get("mode") == "reflection":
            return f"[Internal reflection]\\n{input_text}"
        return f"{identity}\\n\\n{history}Sammie: {input_text}\\nLila:"

    def _generate_gguf(self, prompt: str) -> str:
        output = self._llm(prompt, max_tokens=512, temperature=0.7,
                           top_p=0.9, stop=["Sammie:", "\\n\\n\\n"])
        return output["choices"][0]["text"]

    def _generate_transformers(self, prompt: str) -> str:
        import torch
        enc = self._tokenizer(prompt, return_tensors="pt", max_length=2048, truncation=True)
        device = next(self._model.parameters()).device
        enc = {k: v.to(device) for k, v in enc.items()}
        with torch.no_grad():
            out = self._model.generate(**enc, max_new_tokens=512, do_sample=True,
                                       temperature=0.7, top_p=0.9,
                                       pad_token_id=self._tokenizer.eos_token_id)
        return self._tokenizer.decode(out[0][enc["input_ids"].shape[1]:], skip_special_tokens=False)

    def _generate_api(self, prompt: str) -> str:
        return "[Phase 1: wire API here]"

    def _extract_memory_ops(self, text: str) -> List[Dict]:
        ops = []
        if "<|mem_store|>" in text:
            ops.append({"type": "store", "raw": text})
        if "<|mem_recall|>" in text:
            ops.append({"type": "recall", "raw": text})
        if "<|mem_conflict|>" in text:
            ops.append({"type": "conflict", "raw": text})
        return ops

    def _clean_response(self, text: str) -> str:
        clean = re.sub(r'<\\|memory_start\\|>.*?<\\|memory_end\\|>', '', text, flags=re.DOTALL)
        clean = re.sub(r'<\\|mem_\\w+\\|>', '', clean)
        clean = clean.split("Sammie:")[0].strip()
        return clean

    def _log_interaction(self, user_input: str, lila_response: str):
        entry = {"timestamp": datetime.now().isoformat(), "user": user_input, "assistant": lila_response}
        self._training_log.append(entry)
        log_dir = os.path.expanduser("~/.lila/training_data/")
        os.makedirs(log_dir, exist_ok=True)
        with open(os.path.join(log_dir, "interactions.jsonl"), "a") as f:
            f.write(json.dumps(entry) + "\\n")

    def remember(self, namespace: str, content: str):
        self._log_interaction(f"[MEMORY:{namespace}]", content)

    def what_do_i_know(self) -> Dict:
        return {"turns": len(self._conversation_history),
                "pending_training": len(self._training_log),
                "engine": "gguf" if self._llm else ("hf" if self._model else "api")}

    @property
    def is_awake(self) -> bool:
        return self._booted
''')

os.makedirs("engine/kernels/x86_64", exist_ok=True)
os.makedirs("engine/kernels/arm64", exist_ok=True)
os.makedirs("engine/runtime", exist_ok=True)
os.makedirs("engine/format", exist_ok=True)
os.makedirs("engine/interface", exist_ok=True)
os.makedirs("engine/tests/bench", exist_ok=True)
| with open("engine/README.md", "w") as f: |
| f.write("# Lila Inference Engine\n\nCustom assembly + C inference engine. See INFERENCE_ENGINE_PLAN.md for full design.\n\nStatus: Phase 1 (planning). Using llama-cpp-python as interim runtime.\n") |
|
|
| with open("engine/kernels/x86_64/.gitkeep", "w") as f: f.write("") |
| with open("engine/kernels/arm64/.gitkeep", "w") as f: f.write("") |
| with open("engine/runtime/.gitkeep", "w") as f: f.write("") |
|
|

subprocess.run(["git", "add", "-A"], check=True)
subprocess.run(["git", "commit", "-m",
                "Add inference engine build plan + fix LilaCore runtime\n\n"
                "INFERENCE_ENGINE_PLAN.md: Full design for custom assembly+C engine\n"
                " - Assembly kernels per architecture (x86_64 AVX-512, ARM NEON/SVE)\n"
                " - 6-phase build plan with experiments needed\n"
                " - Performance targets: >30 tok/s CPU, <3GB RAM, <500ms boot\n\n"
                "src/core/lilacore.py: Fixed runtime (no Little Fig dependency)\n"
                " - GGUF → transformers → API fallback chain\n"
                " - Interaction logging for offline training cycles\n\n"
                "engine/: Directory structure for custom inference engine (Phase 1: planning)"],
               check=True)
subprocess.run(["git", "push", "origin", "main"], check=True)
print("✅ Done! Engine plan + LilaCore fix pushed.")