#!/usr/bin/env python3
"""Push LilaCore fix + inference engine build plan to Lila repo."""
import os
import subprocess

# Read the GitHub token from the environment rather than hardcoding a credential.
TOKEN = os.environ["GITHUB_TOKEN"]

subprocess.run(["git", "clone", f"https://{TOKEN}@github.com/ticketguy/Lila.git", "/app/lila"], check=True)
os.chdir("/app/lila")
subprocess.run(["git", "config", "user.name", "0xticketguy"], check=True)
subprocess.run(["git", "config", "user.email", "0xticketguy@harboria.dev"], check=True)

# ═══════════════════════════════════════════════════════════════════════════════
# INFERENCE ENGINE BUILD PLAN
# ═══════════════════════════════════════════════════════════════════════════════
with open("INFERENCE_ENGINE_PLAN.md", "w") as f:
    f.write('''# Lila Inference Engine — Build Plan

## What This Is

A custom inference engine for Lila written in assembly + C. No Python at runtime. No dependency on llama.cpp, vLLM, transformers, or any third-party inference library. Lila runs as native machine code — the fastest possible execution on any hardware.

## Why Custom

1. **Speed** — Hand-tuned assembly for the hot path (matmul, attention) beats any compiler output
2. **Portability** — Assembly kernels per architecture means it runs on anything: x86 desktop, ARM phone, RISC-V edge device
3. **Control** — Memory Fabric (multi-LoRA) is native to the engine, not bolted on
4. **Identity** — Lila speaks machine language. Her own inference IS machine language. Aligned.
5. **Independence** — No supply chain risk. No one else's bugs. No license constraints.

## Architecture

```
lila-engine/
├── kernels/                  # Assembly kernels (THE hot path)
│   ├── x86_64/
│   │   ├── matmul_avx512.S   # Matrix multiply (AVX-512, INT4 fused)
│   │   ├── matmul_avx2.S     # Fallback for older CPUs
│   │   ├── rmsnorm.S         # RMS normalization
│   │   ├── softmax.S         # Softmax with online computation
│   │   ├── silu.S            # SiLU activation
│   │   ├── dequant_int4.S    # FigQuant INT4 dequantization
│   │   ├── lora_fused.S      # Dequant + base matmul + LoRA in one pass
│   │   └── rope.S            # Rotary position embeddings
│   ├── arm64/
│   │   ├── matmul_neon.S     # Matrix multiply (NEON)
│   │   ├── matmul_sve.S      # Matrix multiply (SVE — newer ARM)
│   │   ├── rmsnorm.S
│   │   ├── softmax.S
│   │   ├── silu.S
│   │   ├── dequant_int4.S
│   │   ├── lora_fused.S
│   │   └── rope.S
│   └── riscv/                # Future: RISC-V vector extension
│       └── (same pattern)
│
├── runtime/                  # C runtime (orchestrates kernels)
│   ├── model.c               # Model struct, weight loading (mmap)
│   ├── model.h
│   ├── inference.c           # Token generation loop
│   ├── attention.c           # Multi-head attention (dispatches to kernels)
│   ├── transformer.c         # Full transformer block
│   ├── kv_cache.c            # KV cache management
│   ├── memory_fabric.c       # Multi-adapter LoRA routing + gating
│   ├── tokenizer.c           # BPE tokenizer (sentencepiece compatible)
│   ├── quantize.c            # FigQuant format reader
│   ├── detect.c              # Hardware detection (which kernels to use)
│   └── allocator.c           # Custom memory allocator (arena-based)
│
├── format/                   # Model file format
│   ├── lila_format.h         # Custom binary format (or GGUF-compatible)
│   └── convert.py            # Convert safetensors → lila format
│
├── interface/                # How the outside world talks to the engine
│   ├── cli.c                 # Command-line interface
│   ├── server.c              # HTTP/WebSocket server (for harness)
│   ├── voice_bridge.c        # Audio I/O bridge
│   └── python_bind.c         # Optional: Python bindings for testing
│
├── tests/                    # Correctness tests
│   ├── test_matmul.c
│   ├── test_attention.c
│   ├── test_dequant.c
│   ├── test_lora.c
│   └── bench/                # Performance benchmarks
│       ├── bench_matmul.c
│       ├── bench_attention.c
│       └── bench_e2e.c
│
├── Makefile                  # Build system (detect arch, assemble, link)
└── README.md
```
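A first sketch of the dispatch idea behind runtime/detect.c, assuming GCC/Clang (whose __builtin_cpu_supports covers the x86 feature flags we care about). The kernel names are placeholders for the assembly entry points, not a final API:

```
/* runtime/detect.c sketch: pick the widest matmul kernel this CPU supports.
   matmul_avx512 / matmul_avx2 / matmul_ref are placeholder entry points. */
typedef void (*matmul_fn)(float *out, const float *a, const float *b,
                          int m, int n, int k);

extern void matmul_avx512(float *out, const float *a, const float *b, int m, int n, int k);
extern void matmul_avx2(float *out, const float *a, const float *b, int m, int n, int k);
extern void matmul_ref(float *out, const float *a, const float *b, int m, int n, int k);

matmul_fn select_matmul(void)
{
#if defined(__x86_64__)
    __builtin_cpu_init();                     /* populate CPU feature flags  */
    if (__builtin_cpu_supports("avx512f"))
        return matmul_avx512;                 /* widest vector path          */
    if (__builtin_cpu_supports("avx2"))
        return matmul_avx2;                   /* fallback for older x86 CPUs */
#endif
    return matmul_ref;                        /* portable scalar reference   */
}
```

Detection runs once at startup; everything downstream calls through the function pointer, so one binary carries every kernel variant.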
## Build Phases

### Phase 1: Foundation (get tokens flowing)
1. Hardware detection (x86_64 feature flags: AVX2, AVX-512, AMX)
2. Model loading via mmap (zero-copy from disk)
3. Basic matmul kernel (AVX2 — works on most x86 CPUs)
4. RMSNorm, SiLU, Softmax kernels
5. Single transformer block forward pass
6. Full model forward pass
7. Token generation loop (greedy decode)
8. **TEST: generate coherent text from Gemma 4B weights**

### Phase 2: INT4 Quantization (shrink the model)
1. FigQuant INT4 dequantization kernel
2. Fused dequant + matmul (never fully dequantize to RAM; see the scalar reference after the phase list)
3. Custom model format with packed INT4 weights
4. Converter: safetensors → lila INT4 format
5. **TEST: same output quality as FP16, 4x less memory**

### Phase 3: Memory Fabric (multi-LoRA)
1. LoRA forward pass kernel (base + A×B correction)
2. Multi-adapter routing (5 namespaces, gated)
3. Gate computation (sigmoid projection)
4. Fused: dequant_base + lora_correction in one kernel
5. **TEST: Memory Fabric produces correct output, adapters influence generation**

### Phase 4: Optimization (make it fast)
1. AVX-512 matmul (2x over AVX2 on supported hardware)
2. Cache-optimal tiling (L1/L2/L3 aware)
3. Prefetch hints in assembly
4. Thread parallelism (one thread per transformer block layer)
5. KV cache with paged allocation (no reallocation during generation)
6. **BENCHMARK: tokens/second vs llama.cpp on same model**

### Phase 5: ARM (mobile/edge)
1. Port all kernels to ARM64 NEON
2. ARM SVE kernels for newer chips (Apple M-series, Snapdragon)
3. Memory-constrained mode (streaming layers from disk)
4. **TEST: runs on Raspberry Pi 5 / phone-class hardware**

### Phase 6: Voice + Interface
1. Audio I/O bridge (ALSA/CoreAudio/WASAPI)
2. WebSocket server for harness communication
3. Hot-reload: load new weights without restarting
4. **TEST: Lila speaks and listens in real-time**
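Phase 2's fused path needs a known-good reference before any assembly exists. A scalar sketch of the idea: each 4-bit weight is dequantized through the codebook as it is consumed, so a full-precision copy of the matrix never touches RAM. The layout here (two weights per byte, one 16-entry codebook, one FP32 scale per group of 32) is an illustrative assumption, not the final FigQuant format:

```
/* Scalar reference for the fused INT4 dequant + matvec idea (Phase 2).
   Assumed layout for illustration: weights packed two per byte, a single
   16-entry codebook, and one FP32 scale per GROUP consecutive weights. */
#define GROUP 32

/* y[r] = sum over c of dequant(w[r][c]) * x[c] */
void matvec_int4_ref(float *y,
                     const unsigned char *packed,   /* rows*cols/2 bytes    */
                     const float *codebook,         /* 16 entries           */
                     const float *scales,           /* rows*cols/GROUP      */
                     const float *x,
                     int rows, int cols)
{
    for (int r = 0; r < rows; r++) {
        float acc = 0.0f;
        for (int c = 0; c < cols; c++) {
            int idx = r * cols + c;
            unsigned char byte = packed[idx / 2];
            int nibble = (idx & 1) ? (byte >> 4) : (byte & 0x0F);
            /* dequantize in-register: codebook lookup, then per-group scale */
            float w = codebook[nibble] * scales[idx / GROUP];
            acc += w * x[c];
        }
        y[r] = acc;
    }
}
```

The Phase 2 assembly kernels and the format converter can both be checked against this reference output.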
## Key Design Decisions

| Decision | Choice | Why |
|---|---|---|
| Language | Assembly + C | Maximum speed, zero overhead |
| Weight format | Custom (FigQuant INT4) | Optimized for our fused kernels |
| Memory model | mmap + arena allocator | Zero-copy loading, no malloc fragmentation |
| Threading | pthread (1 thread per layer group) | Simple, predictable, no framework |
| Build system | Makefile with arch detection | No cmake/meson complexity |
| Architecture dispatch | Runtime CPU feature detection | One binary runs on any x86/ARM |

## Performance Targets

| Metric | Target | Why |
|---|---|---|
| Tokens/sec (4B, INT4, CPU) | >30 tok/s | Conversational speed |
| Tokens/sec (4B, INT4, GPU) | >100 tok/s | Real-time interaction |
| First token latency | <200ms | Feels instant |
| Memory (4B, INT4) | <3 GB | Runs on 4GB RAM machines |
| Binary size | <1 MB | Minimal footprint |
| Startup time | <500ms | Near-instant boot |

## Experiments Needed

Before writing final kernels, we need to test:

1. **Tiling strategy** — What tile size gives best L1/L2 cache hit rate?
   Test: 64×64, 128×128, 256×256 tiles, measure throughput per architecture (see the loop sketch after this list)
2. **INT4 dequant placement** — Dequant to register (per-element) vs dequant to L1 (per-tile)?
   Test: both approaches, measure memory bandwidth vs compute utilization
3. **LoRA fusion overhead** — Is fused (base+LoRA in one kernel) actually faster than separate?
   Test: fused vs split, across different LoRA ranks (4, 8, 16, 32)
4. **Thread scaling** — How many threads before diminishing returns on 4/8/16 core machines?
   Test: 1, 2, 4, 8, 16 threads on matmul of different sizes
5. **ARM NEON vs SVE** — SVE has variable vector length. Is it worth the complexity?
   Test: same kernel in NEON (128-bit fixed) vs SVE (scalable), measure on M-series
6. **Memory layout** — Row-major vs column-major vs tiled storage for weights?
   Test: all three, measure matmul throughput (cache line utilization)
7. **Codebook in register** — FigQuant's 16-value codebook fits in one 512-bit register.
   Test: keep codebook permanently in zmm register vs reload per group
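For experiment 1 the tile size only needs to be a compile-time knob, so the same loop can be rebuilt and timed at each size. A sketch of the structure tests/bench/bench_matmul.c could sweep (illustrative only; the real kernel is assembly, this just fixes what "tiled" means for the comparison):

```
/* Tiled matmul skeleton for the tiling experiment.
   Rebuild with -DTILE=64 / 128 / 256 and time each variant; the inner loops
   work on one TILE-sized block at a time so the A and B tiles stay cache-resident. */
#ifndef TILE
#define TILE 64
#endif

void matmul_tiled(float *C, const float *A, const float *B, int n)
{
    for (int i = 0; i < n * n; i++)
        C[i] = 0.0f;

    for (int i0 = 0; i0 < n; i0 += TILE)
        for (int k0 = 0; k0 < n; k0 += TILE)
            for (int j0 = 0; j0 < n; j0 += TILE)
                /* one TILE x TILE x TILE block of the i-k-j product */
                for (int i = i0; i < i0 + TILE && i < n; i++)
                    for (int k = k0; k < k0 + TILE && k < n; k++) {
                        float a = A[i * n + k];
                        for (int j = j0; j < j0 + TILE && j < n; j++)
                            C[i * n + j] += a * B[k * n + j];
                    }
}
```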
## Dependencies

- NASM or GAS (assembler)
- GCC or Clang (C compiler, for runtime/interface)
- No other dependencies. No libraries. No frameworks. Pure code.

## Relationship to Little Fig

```
Little Fig (Python, runs offline)
        │
        │  TRAINS the model
        │  Produces: model weights (safetensors)
        │
        ▼
format/convert.py
        │
        │  CONVERTS to Lila format
        │  Produces: .lila binary (INT4 packed with FigQuant)
        │
        ▼
Lila Engine (assembly + C, runs always)
        │
        │  LOADS and RUNS the model
        │  Pure machine code, no Python
        │
        ▼
LILA (the intelligence)
```

---

*Private. Not open source. Built by Sammie for Sammie.*
''')

# ═══════════════════════════════════════════════════════════════════════════════
# Fix LilaCore (proper Python Phase 1 version)
# ═══════════════════════════════════════════════════════════════════════════════
with open("src/core/lilacore.py", "w") as f:
    f.write('''"""
LilaCore — The Central Intelligence

Little Fig TRAINS the model (offline, produces weight files).
LilaCore RUNS the trained model (loads weights, does inference).
No Little Fig dependency at runtime.

Loading priority:
    1. Lila Engine (custom assembly — when built)
    2. llama-cpp-python (GGUF format) — fastest existing option
    3. transformers (safetensors) — fallback
    4. External API (Phase 1 only)

Continuous learning cycle:
    Interactions logged → Little Fig trains offline → new weights → Lila hot-reloads
"""

import os
import json
import re
from typing import Optional, Dict, List
from dataclasses import dataclass
from datetime import datetime


@dataclass
class LilaResponse:
    """What Lila produces after thinking."""
    text: str
    memory_ops: List[Dict]
    actions: List[Dict]
    confidence: float
    should_speak: bool = True


class LilaCore:
    """
    The central intelligence. Loads trained model, runs inference.

    Usage:
        lila = LilaCore()
        lila.boot()
        response = lila.think("Hey Lila, what's on my schedule today?")
    """

    def __init__(self, model_path: Optional[str] = None, gguf_path: Optional[str] = None,
                 api_mode: bool = False):
        self.model_path = model_path
        self.gguf_path = gguf_path
        self.api_mode = api_mode
        self._model = None
        self._tokenizer = None
        self._llm = None
        self._booted = False
        self._conversation_history = []
        self._training_log = []

    def boot(self):
        """Wake Lila up. Load the model."""
        print("\\U0001f338 Lila is waking up...")
        if self.gguf_path and os.path.exists(self.gguf_path):
            self._boot_gguf()
        elif self.model_path and os.path.exists(self.model_path):
            self._boot_transformers()
        elif self.api_mode:
            self._boot_api()
        else:
            # No explicit path given: try the default install locations.
            default_gguf = os.path.expanduser("~/.lila/model.gguf")
            default_hf = os.path.expanduser("~/.lila/model/")
            if os.path.exists(default_gguf):
                self.gguf_path = default_gguf
                self._boot_gguf()
            elif os.path.exists(default_hf):
                self.model_path = default_hf
                self._boot_transformers()
            else:
                print("\\U0001f338 No local model found. Phase 1 (API) mode.")
                self._boot_api()
        self._booted = True
        print("\\U0001f338 Lila is awake.")

    def _boot_gguf(self):
        try:
            from llama_cpp import Llama
            print(f" Loading GGUF: {self.gguf_path}")
            self._llm = Llama(
                model_path=self.gguf_path,
                n_ctx=4096,
                n_threads=os.cpu_count(),
                n_gpu_layers=-1,
                verbose=False)
            print(" Done (llama.cpp)")
        except ImportError:
            print(" llama-cpp-python not installed. Falling back...")
            self._boot_transformers()

    def _boot_transformers(self):
        try:
            import torch
            from transformers import AutoModelForCausalLM, AutoTokenizer
            path = self.model_path or "google/gemma-3-4b-it"
            print(f" Loading: {path}")
            self._tokenizer = AutoTokenizer.from_pretrained(path)
            self._model = AutoModelForCausalLM.from_pretrained(
                path,
                torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
                device_map="auto" if torch.cuda.is_available() else "cpu",
                low_cpu_mem_usage=True)
            if self._tokenizer.pad_token is None:
                self._tokenizer.pad_token = self._tokenizer.eos_token
            print(" Done (transformers)")
        except Exception as e:
            print(f" Failed: {e}")
            self._boot_api()

    def _boot_api(self):
        self.api_mode = True
        print(" API mode (Phase 1)")

    def think(self, input_text: str, context: Optional[Dict] = None) -> LilaResponse:
        """Core cognitive loop."""
        if not self._booted:
            raise RuntimeError("Call lila.boot() first.")
        prompt = self._build_prompt(input_text, context)
        if self._llm:
            response_text = self._generate_gguf(prompt)
        elif self._model:
            response_text = self._generate_transformers(prompt)
        else:
            response_text = self._generate_api(prompt)
        memory_ops = self._extract_memory_ops(response_text)
        clean_text = self._clean_response(response_text)
        self._log_interaction(input_text, clean_text)
        self._conversation_history.append({"role": "user", "content": input_text})
        self._conversation_history.append({"role": "assistant", "content": clean_text})
        return LilaResponse(text=clean_text, memory_ops=memory_ops, actions=[],
                            confidence=1.0, should_speak=True)
    def _build_prompt(self, input_text: str, context: Optional[Dict]) -> str:
        identity = ("You are Lila, Sammie's private family AI assistant. "
                    "Persistent, caring, grows smarter over time. "
                    "Remembers everything. Speaks with warmth and personality.")
        history = ""
        for msg in self._conversation_history[-10:]:
            role = "Sammie" if msg["role"] == "user" else "Lila"
            history += f"{role}: {msg['content']}\\n"
        if context and context.get("mode") == "reflection":
            return f"[Internal reflection]\\n{input_text}"
        return f"{identity}\\n\\n{history}Sammie: {input_text}\\nLila:"

    def _generate_gguf(self, prompt: str) -> str:
        output = self._llm(prompt, max_tokens=512, temperature=0.7, top_p=0.9,
                           stop=["Sammie:", "\\n\\n\\n"])
        return output["choices"][0]["text"]

    def _generate_transformers(self, prompt: str) -> str:
        import torch
        enc = self._tokenizer(prompt, return_tensors="pt", max_length=2048, truncation=True)
        device = next(self._model.parameters()).device
        enc = {k: v.to(device) for k, v in enc.items()}
        with torch.no_grad():
            out = self._model.generate(**enc, max_new_tokens=512, do_sample=True,
                                       temperature=0.7, top_p=0.9,
                                       pad_token_id=self._tokenizer.eos_token_id)
        # Keep special tokens so memory-op markers survive for _extract_memory_ops.
        return self._tokenizer.decode(out[0][enc["input_ids"].shape[1]:], skip_special_tokens=False)

    def _generate_api(self, prompt: str) -> str:
        return "[Phase 1 — wire API here]"

    def _extract_memory_ops(self, text: str) -> List[Dict]:
        ops = []
        if "<|mem_store|>" in text:
            ops.append({"type": "store", "raw": text})
        if "<|mem_recall|>" in text:
            ops.append({"type": "recall", "raw": text})
        if "<|mem_conflict|>" in text:
            ops.append({"type": "conflict", "raw": text})
        return ops

    def _clean_response(self, text: str) -> str:
        clean = re.sub(r'<\\|memory_start\\|>.*?<\\|memory_end\\|>', '', text, flags=re.DOTALL)
        clean = re.sub(r'<\\|mem_\\w+\\|>', '', clean)
        clean = clean.split("Sammie:")[0].strip()
        return clean

    def _log_interaction(self, user_input: str, lila_response: str):
        entry = {"timestamp": datetime.now().isoformat(),
                 "user": user_input,
                 "assistant": lila_response}
        self._training_log.append(entry)
        log_dir = os.path.expanduser("~/.lila/training_data/")
        os.makedirs(log_dir, exist_ok=True)
        with open(os.path.join(log_dir, "interactions.jsonl"), "a") as f:
            f.write(json.dumps(entry) + "\\n")

    def remember(self, namespace: str, content: str):
        self._log_interaction(f"[MEMORY:{namespace}]", content)

    def what_do_i_know(self) -> Dict:
        return {"turns": len(self._conversation_history),
                "pending_training": len(self._training_log),
                "engine": "gguf" if self._llm else ("hf" if self._model else "api")}

    @property
    def is_awake(self) -> bool:
        return self._booted
''')

# Create engine directory placeholder
os.makedirs("engine/kernels/x86_64", exist_ok=True)
os.makedirs("engine/kernels/arm64", exist_ok=True)
os.makedirs("engine/runtime", exist_ok=True)
os.makedirs("engine/format", exist_ok=True)
os.makedirs("engine/interface", exist_ok=True)
os.makedirs("engine/tests/bench", exist_ok=True)
with open("engine/README.md", "w") as f:
    f.write("# Lila Inference Engine\n\n"
            "Custom assembly + C inference engine. See INFERENCE_ENGINE_PLAN.md for full design.\n\n"
            "Status: Phase 1 (planning). Using llama-cpp-python as interim runtime.\n")

with open("engine/kernels/x86_64/.gitkeep", "w") as f:
    f.write("")
with open("engine/kernels/arm64/.gitkeep", "w") as f:
    f.write("")
with open("engine/runtime/.gitkeep", "w") as f:
    f.write("")

# Commit and push
subprocess.run(["git", "add", "-A"], check=True)
subprocess.run(["git", "commit", "-m",
                "Add inference engine build plan + fix LilaCore runtime\n\n"
                "INFERENCE_ENGINE_PLAN.md: Full design for custom assembly+C engine\n"
                " - Assembly kernels per architecture (x86_64 AVX-512, ARM NEON/SVE)\n"
                " - 6-phase build plan with experiments needed\n"
                " - Performance targets: >30 tok/s CPU, <3GB RAM, <500ms boot\n\n"
                "src/core/lilacore.py: Fixed runtime (no Little Fig dependency)\n"
                " - GGUF → transformers → API fallback chain\n"
                " - Interaction logging for offline training cycles\n\n"
                "engine/: Directory structure for custom inference engine (Phase 1: planning)"],
               check=True)
subprocess.run(["git", "push", "origin", "main"], check=True)

print("✅ Done! Engine plan + LilaCore fix pushed.")