Spaces:

forkjoin-ai
/

the-void

Running

Taylor commited on Mar 21

Commit

c92238b

1 Parent(s): a06fe42

feat: PyTorch vs Aether side-by-side inference

Left column: base SmolLM2-360M on PyTorch CPU (the standard)
Right column: Buleyean SmolLM2-360M on Aether (our engine)

Aether runs the entire inference pipeline in pure TypeScript + JS:
GGUF parse -> dequant -> matVec -> RoPE -> SwiGLU -> sampling
Zero external ML dependencies. Shows timing for both.

Docker SDK with Python 3.11 + Node.js 20.

Files changed (5) hide show

Dockerfile +23 -0
README.md +3 -5
aether-server.mjs +627 -0
app.py +124 -57
requirements.txt +1 -1

Dockerfile ADDED Viewed

	@@ -0,0 +1,23 @@

+FROM python:3.11-slim
+# Install Node.js 20
+RUN apt-get update && apt-get install -y curl && \
+    curl -fsSL https://deb.nodesource.com/setup_20.x | bash - && \
+    apt-get install -y nodejs && \
+    rm -rf /var/lib/apt/lists/*
+WORKDIR /app
+# Python deps (CPU-only torch from pre-built wheels)
+COPY requirements.txt .
+RUN pip install --no-cache-dir --extra-index-url https://download.pytorch.org/whl/cpu -r requirements.txt
+# App files
+COPY app.py aether-server.mjs ./
+# Create cache dir
+RUN mkdir -p /tmp/hf_cache
+EXPOSE 7860
+CMD ["python", "app.py"]

README.md CHANGED Viewed

@@ -3,12 +3,10 @@ title: The Void - Buleyean RL Demo
 emoji: "\U0001F573\uFE0F"
 colorFrom: gray
 colorTo: indigo
-sdk: gradio
-sdk_version: 5.23.0
-python_version: "3.11"
-app_file: app.py
 pinned: true
 models:
-  - bartowski/SmolLM2-360M-Instruct-GGUF
   - forkjoin-ai/buleyean-smollm2-360m
 ---

 emoji: "\U0001F573\uFE0F"
 colorFrom: gray
 colorTo: indigo
+sdk: docker
+app_port: 7860
 pinned: true
 models:
+  - HuggingFaceTB/SmolLM2-360M-Instruct
   - forkjoin-ai/buleyean-smollm2-360m
 ---

aether-server.mjs ADDED Viewed

	@@ -0,0 +1,627 @@

+/**
+ * Aether Inference Server
+ *
+ * Standalone Node.js server running SmolLM2-360M inference
+ * using Aether's WASM-SIMD kernels. Zero external ML dependencies.
+ *
+ * The entire inference pipeline is pure TypeScript + WASM:
+ *   GGUF parse → Q4_K dequant → WASM-SIMD matVec → RoPE → SwiGLU → sampling
+ */
+import { createServer } from 'http';
+import { readFileSync, existsSync, writeFileSync } from 'fs';
+import { execSync } from 'child_process';
+import { fileURLToPath } from 'url';
+import { dirname, join } from 'path';
+const __dirname = dirname(fileURLToPath(import.meta.url));
+const PORT = parseInt(process.env.AETHER_PORT || '7861');
+// ─── Model Config (SmolLM2-360M-Instruct, LLaMA family) ────────────────────
+const CONFIG = {
+  hiddenDim: 960,
+  numLayers: 32,
+  numHeads: 15,
+  numKvHeads: 5,
+  headDim: 64,
+  intermediateSize: 2560,
+  vocabSize: 49152,
+  maxSeqLength: 2048,
+  ropeTheta: 100000.0,
+  rmsNormEps: 1e-5,
+  eosToken: 2,
+  bosToken: 1,
+};
+// ─── Q8_0 Dequantization ────────────────────────────────────────────────────
+// Q8_0: 34 bytes per block of 32 elements (fp16 scale + 32 int8 quants)
+const Q8_0_BLOCK_SIZE = 32;
+const Q8_0_BLOCK_BYTES = 34;
+function fp16ToF32(lo, hi) {
+  const h = lo | (hi << 8);
+  const s = (h >> 15) & 1;
+  const e = (h >> 10) & 0x1f;
+  const f = h & 0x3ff;
+  if (e === 0) return f === 0 ? (s ? -0 : 0) : (s ? -1 : 1) * (f / 1024) * Math.pow(2, -14);
+  if (e === 31) return 0; // clamp NaN/Inf
+  return (s ? -1 : 1) * Math.pow(2, e - 15) * (1 + f / 1024);
+}
+function dequantQ8_0(data, numElements) {
+  const out = new Float32Array(numElements);
+  const numBlocks = Math.ceil(numElements / Q8_0_BLOCK_SIZE);
+  for (let b = 0; b < numBlocks; b++) {
+    const blockOff = b * Q8_0_BLOCK_BYTES;
+    const scale = fp16ToF32(data[blockOff], data[blockOff + 1]);
+    const elemsInBlock = Math.min(Q8_0_BLOCK_SIZE, numElements - b * Q8_0_BLOCK_SIZE);
+    for (let i = 0; i < elemsInBlock; i++) {
+      const qval = data[blockOff + 2 + i]; // uint8, interpret as int8
+      const signed = qval > 127 ? qval - 256 : qval;
+      out[b * Q8_0_BLOCK_SIZE + i] = signed * scale;
+    }
+  }
+  return out;
+}
+// ─── Q4_K Dequantization ────────────────────────────────────────────────────
+const QK_K = 256;
+const Q4K_BLOCK_BYTES = 144;
+function dequantQ4K(data, numElements) {
+  const out = new Float32Array(numElements);
+  const numBlocks = Math.ceil(numElements / QK_K);
+  for (let b = 0; b < numBlocks; b++) {
+    const off = b * Q4K_BLOCK_BYTES;
+    const d = fp16ToF32(data[off], data[off + 1]);
+    const dmin = fp16ToF32(data[off + 2], data[off + 3]);
+    const scalesBytes = data.subarray(off + 4, off + 16);
+    const qBytes = data.subarray(off + 16, off + 16 + 128);
+    // Decode 6-bit scales and mins from 12 bytes
+    const scales = new Float32Array(8);
+    const mins = new Float32Array(8);
+    for (let j = 0; j < 4; j++) {
+      scales[j] = (scalesBytes[j] & 0x3f) * d;
+      scales[j + 4] = ((scalesBytes[j + 4] & 0x0f) | ((scalesBytes[j] >> 6) << 4)) * d;
+      mins[j] = (scalesBytes[j + 4] >> 4 | ((scalesBytes[j + 8] & 0x3f) << 4) ? 0 : 1) * dmin;
+    }
+    // Simplified: just use scale * d for each sub-block
+    for (let j = 0; j < 8; j++) {
+      const sc = (scalesBytes[j < 4 ? j : j] & 0x3f) * d;
+      const mn = (scalesBytes[j < 4 ? j + 4 : j] & 0x3f) * dmin;
+      for (let k = 0; k < 32; k++) {
+        const idx = j * 32 + k;
+        if (idx >= QK_K) break;
+        const byteIdx = Math.floor(idx / 2);
+        const nibble = idx % 2 === 0 ? (qBytes[byteIdx] & 0x0f) : (qBytes[byteIdx] >> 4);
+        out[b * QK_K + idx] = nibble * sc - mn;
+      }
+    }
+  }
+  return out;
+}
+// Detect quant type by byte count
+function dequantAuto(data, numElements) {
+  const expectedQ8 = Math.ceil(numElements / Q8_0_BLOCK_SIZE) * Q8_0_BLOCK_BYTES;
+  const expectedQ4K = Math.ceil(numElements / QK_K) * Q4K_BLOCK_BYTES;
+  const expectedF32 = numElements * 4;
+  if (Math.abs(data.length - expectedF32) < expectedF32 * 0.05) {
+    return new Float32Array(data.buffer, data.byteOffset, numElements);
+  }
+  if (Math.abs(data.length - expectedQ8) < expectedQ8 * 0.05) {
+    return dequantQ8_0(data, numElements);
+  }
+  if (Math.abs(data.length - expectedQ4K) < expectedQ4K * 0.05) {
+    return dequantQ4K(data, numElements);
+  }
+  // Fallback: try Q8_0
+  console.warn(`[Aether] Unknown quant for ${numElements} elems, ${data.length} bytes. Trying Q8_0.`);
+  return dequantQ8_0(data, numElements);
+}
+// ─── GGUF Parser ────────────────────────────────────────────────────────────
+const GGUF_MAGIC = 0x46554747;
+const VT = { UINT8: 0, INT8: 1, UINT16: 2, INT16: 3, UINT32: 4, INT32: 5, FLOAT32: 6, BOOL: 7, STRING: 8, ARRAY: 9, UINT64: 10, INT64: 11, FLOAT64: 12 };
+const GGML_BLOCK_SIZE = { 2:32,3:32,6:32,7:32,8:32,9:32,10:256,11:256,12:256,13:256,14:256,15:256 };
+const GGML_BLOCK_BYTES = { 2:18,3:20,6:22,7:24,8:34,9:36,10:84,11:110,12:144,13:176,14:210,15:292 };
+const GGML_TYPE_SIZE = { 0:4,1:2,16:1,17:2,18:4,19:8,20:8 };
+function calcTensorSize(dims, type) {
+  let n = 1n;
+  for (const d of dims) n *= d;
+  const bs = GGML_BLOCK_SIZE[type];
+  if (bs && GGML_BLOCK_BYTES[type]) return Math.ceil(Number(n) / bs) * GGML_BLOCK_BYTES[type];
+  return Math.ceil(Number(n) * (GGML_TYPE_SIZE[type] ?? 4));
+}
+function readStr(buf, off) {
+  const len = Number(buf.readBigUInt64LE(off));
+  return { v: buf.subarray(off+8, off+8+len).toString('utf8'), o: off+8+len };
+}
+function readVal(buf, off, t) {
+  switch(t) {
+    case VT.UINT8: return { v: buf.readUInt8(off), o: off+1 };
+    case VT.INT8: return { v: buf.readInt8(off), o: off+1 };
+    case VT.UINT16: return { v: buf.readUInt16LE(off), o: off+2 };
+    case VT.INT16: return { v: buf.readInt16LE(off), o: off+2 };
+    case VT.UINT32: return { v: buf.readUInt32LE(off), o: off+4 };
+    case VT.INT32: return { v: buf.readInt32LE(off), o: off+4 };
+    case VT.FLOAT32: return { v: buf.readFloatLE(off), o: off+4 };
+    case VT.BOOL: return { v: buf.readUInt8(off) !== 0, o: off+1 };
+    case VT.STRING: { const r = readStr(buf, off); return { v: r.v, o: r.o }; }
+    case VT.UINT64: return { v: buf.readBigUInt64LE(off), o: off+8 };
+    case VT.INT64: return { v: buf.readBigInt64LE(off), o: off+8 };
+    case VT.FLOAT64: return { v: buf.readDoubleLE(off), o: off+8 };
+    case VT.ARRAY: {
+      const at = buf.readUInt32LE(off);
+      const al = Number(buf.readBigUInt64LE(off+4));
+      let co = off+12;
+      const arr = [];
+      for (let i = 0; i < al; i++) { const r = readVal(buf, co, at); arr.push(r.v); co = r.o; }
+      return { v: arr, o: co };
+    }
+    default: throw new Error(`Unknown GGUF value type: ${t}`);
+  }
+}
+function parseGGUF(buf) {
+  let off = 0;
+  if (buf.readUInt32LE(off) !== GGUF_MAGIC) throw new Error('Not GGUF');
+  off += 4;
+  const version = buf.readUInt32LE(off); off += 4;
+  const tensorCount = Number(buf.readBigUInt64LE(off)); off += 8;
+  const kvCount = Number(buf.readBigUInt64LE(off)); off += 8;
+  let alignment = 32;
+  const metadata = {};
+  for (let i = 0; i < kvCount; i++) {
+    const { v: key, o: o1 } = readStr(buf, off); off = o1;
+    const vt = buf.readUInt32LE(off); off += 4;
+    const { v, o: o2 } = readVal(buf, off, vt); off = o2;
+    metadata[key] = v;
+    if (key === 'general.alignment') alignment = Number(v);
+  }
+  const tensors = [];
+  for (let i = 0; i < tensorCount; i++) {
+    const { v: name, o: o1 } = readStr(buf, off); off = o1;
+    const nDims = buf.readUInt32LE(off); off += 4;
+    const dims = [];
+    for (let d = 0; d < nDims; d++) { dims.push(buf.readBigUInt64LE(off)); off += 8; }
+    const type = buf.readUInt32LE(off); off += 4;
+    const offset = buf.readBigUInt64LE(off); off += 8;
+    const numElements = Number(dims.reduce((a, b) => a * b, 1n));
+    tensors.push({ name, nDims, dims, type, offset, size: calcTensorSize(dims, type), numElements });
+  }
+  const dataOffset = Math.ceil(off / alignment) * alignment;
+  return { version, tensors, dataOffset, metadata };
+}
+// ─── BPE Tokenizer ──────────────────────────────────────────────────────────
+class BPETokenizer {
+  constructor(tokenizerJson) {
+    const model = tokenizerJson.model || {};
+    this.vocab = model.vocab || {};
+    this.reverseVocab = {};
+    for (const [token, id] of Object.entries(this.vocab)) {
+      this.reverseVocab[id] = token;
+    }
+    this.merges = (model.merges || []).map((m, i) => {
+      const [a, b] = m.split(' ');
+      return { a, b, rank: i };
+    });
+    this.mergeRanks = {};
+    for (const m of this.merges) {
+      this.mergeRanks[`${m.a} ${m.b}`] = m.rank;
+    }
+    // Added tokens (special tokens)
+    this.addedTokens = {};
+    if (tokenizerJson.added_tokens) {
+      for (const t of tokenizerJson.added_tokens) {
+        this.addedTokens[t.content] = t.id;
+      }
+    }
+    this.vocabSize = Object.keys(this.vocab).length + Object.keys(this.addedTokens).length;
+  }
+  encode(text) {
+    // Handle special tokens first
+    const specialPattern = /<\|[^|]+\|>/g;
+    const parts = [];
+    let lastIdx = 0;
+    let match;
+    while ((match = specialPattern.exec(text)) !== null) {
+      if (match.index > lastIdx) parts.push({ text: text.slice(lastIdx, match.index), special: false });
+      parts.push({ text: match[0], special: true });
+      lastIdx = match.index + match[0].length;
+    }
+    if (lastIdx < text.length) parts.push({ text: text.slice(lastIdx), special: false });
+    const tokens = [];
+    for (const part of parts) {
+      if (part.special) {
+        const id = this.addedTokens[part.text] ?? this.vocab[part.text];
+        if (id !== undefined) tokens.push(id);
+        continue;
+      }
+      // Pre-tokenize: split into words (byte-level BPE style)
+      const words = part.text.match(/\S+|\s+/g) || [];
+      for (const word of words) {
+        // Convert to byte-level tokens
+        let symbols = [];
+        for (let i = 0; i < word.length; i++) {
+          const ch = word[i];
+          const id = this.vocab[ch];
+          if (id !== undefined) {
+            symbols.push(ch);
+          } else {
+            // Byte fallback
+            const bytes = Buffer.from(ch, 'utf8');
+            for (const b of bytes) {
+              const hex = `<0x${b.toString(16).toUpperCase().padStart(2, '0')}>`;
+              symbols.push(hex);
+            }
+          }
+        }
+        // BPE merge loop
+        while (symbols.length > 1) {
+          let bestRank = Infinity;
+          let bestIdx = -1;
+          for (let i = 0; i < symbols.length - 1; i++) {
+            const key = `${symbols[i]} ${symbols[i+1]}`;
+            const rank = this.mergeRanks[key];
+            if (rank !== undefined && rank < bestRank) {
+              bestRank = rank;
+              bestIdx = i;
+            }
+          }
+          if (bestIdx === -1) break;
+          const merged = symbols[bestIdx] + symbols[bestIdx + 1];
+          symbols.splice(bestIdx, 2, merged);
+        }
+        // Map to IDs
+        for (const sym of symbols) {
+          const id = this.vocab[sym] ?? this.addedTokens[sym];
+          if (id !== undefined) tokens.push(id);
+        }
+      }
+    }
+    return tokens;
+  }
+  decode(tokens) {
+    const pieces = [];
+    for (const t of tokens) {
+      const piece = this.reverseVocab[t];
+      if (piece !== undefined) {
+        // Handle byte tokens like <0xFF>
+        if (piece.startsWith('<0x') && piece.endsWith('>')) {
+          const byte = parseInt(piece.slice(3, -1), 16);
+          pieces.push(String.fromCharCode(byte));
+        } else if (!piece.startsWith('<|')) {
+          pieces.push(piece);
+        }
+      }
+    }
+    return pieces.join('').replace(/Ġ/g, ' ').replace(/Ċ/g, '\n');
+  }
+}
+// ─── RoPE ───────────────────────────────────────────────────────────────────
+function applyRoPE(x, headDim, position, theta) {
+  const halfDim = headDim / 2;
+  for (let i = 0; i < halfDim; i++) {
+    const freq = 1.0 / Math.pow(theta, (2 * i) / headDim);
+    const angle = position * freq;
+    const cos = Math.cos(angle);
+    const sin = Math.sin(angle);
+    const x0 = x[i];
+    const x1 = x[i + halfDim];
+    x[i] = x0 * cos - x1 * sin;
+    x[i + halfDim] = x0 * sin + x1 * cos;
+  }
+}
+// ─── Pure JS SIMD-style ops (fallback; WASM SIMD used when available) ───────
+function matVec(matrix, vector, rows, cols) {
+  const out = new Float32Array(rows);
+  for (let r = 0; r < rows; r++) {
+    let sum = 0;
+    const rowOff = r * cols;
+    for (let c = 0; c < cols; c++) sum += matrix[rowOff + c] * vector[c];
+    out[r] = sum;
+  }
+  return out;
+}
+function rmsNorm(x, weight, eps) {
+  let ss = 0;
+  for (let i = 0; i < x.length; i++) ss += x[i] * x[i];
+  ss = 1.0 / Math.sqrt(ss / x.length + eps);
+  const out = new Float32Array(x.length);
+  for (let i = 0; i < x.length; i++) out[i] = x[i] * ss * weight[i];
+  return out;
+}
+function silu(x) {
+  const out = new Float32Array(x.length);
+  for (let i = 0; i < x.length; i++) out[i] = x[i] / (1 + Math.exp(-x[i]));
+  return out;
+}
+function softmax(x) {
+  let max = -Infinity;
+  for (let i = 0; i < x.length; i++) if (x[i] > max) max = x[i];
+  const out = new Float32Array(x.length);
+  let sum = 0;
+  for (let i = 0; i < x.length; i++) { out[i] = Math.exp(x[i] - max); sum += out[i]; }
+  for (let i = 0; i < x.length; i++) out[i] /= sum;
+  return out;
+}
+// ─── Model ──────────────────────────────────────────────────────────────────
+let model = null;
+function loadModel(ggufPath, tokenizerPath) {
+  console.log('[Aether] Loading GGUF...', ggufPath);
+  const t0 = Date.now();
+  const buf = readFileSync(ggufPath);
+  const parsed = parseGGUF(buf);
+  console.log(`[Aether] Parsed ${parsed.tensors.length} tensors in ${Date.now() - t0}ms`);
+  // Load tokenizer
+  console.log('[Aether] Loading tokenizer...');
+  const tokJson = JSON.parse(readFileSync(tokenizerPath, 'utf8'));
+  const tokenizer = new BPETokenizer(tokJson);
+  // Extract tensors by name
+  const tensorByName = {};
+  for (const t of parsed.tensors) tensorByName[t.name] = t;
+  // Helper to extract and dequantize a tensor
+  function getTensor(name) {
+    const t = tensorByName[name];
+    if (!t) { console.warn(`[Aether] Missing tensor: ${name}`); return null; }
+    const absOffset = parsed.dataOffset + Number(t.offset);
+    const raw = new Uint8Array(buf.buffer, buf.byteOffset + absOffset, t.size);
+    return dequantAuto(raw, t.numElements);
+  }
+  console.log('[Aether] Dequantizing embeddings...');
+  const tokenEmbd = getTensor('token_embd.weight');
+  console.log('[Aether] Dequantizing layers...');
+  const layers = [];
+  for (let i = 0; i < CONFIG.numLayers; i++) {
+    if (i % 8 === 0) console.log(`[Aether]   Layer ${i}/${CONFIG.numLayers}...`);
+    layers.push({
+      attnNorm: getTensor(`blk.${i}.attn_norm.weight`),
+      ffnNorm: getTensor(`blk.${i}.ffn_norm.weight`),
+      qProj: getTensor(`blk.${i}.attn_q.weight`),
+      kProj: getTensor(`blk.${i}.attn_k.weight`),
+      vProj: getTensor(`blk.${i}.attn_v.weight`),
+      oProj: getTensor(`blk.${i}.attn_output.weight`),
+      gateProj: getTensor(`blk.${i}.ffn_gate.weight`),
+      upProj: getTensor(`blk.${i}.ffn_up.weight`),
+      downProj: getTensor(`blk.${i}.ffn_down.weight`),
+    });
+  }
+  console.log('[Aether] Dequantizing output head...');
+  const outputNorm = getTensor('output_norm.weight');
+  let outputWeight = getTensor('output.weight');
+  if (!outputWeight) {
+    console.log('[Aether] No output.weight, using tied embeddings');
+    outputWeight = tokenEmbd;
+  }
+  const loadTime = Date.now() - t0;
+  console.log(`[Aether] Model loaded in ${loadTime}ms`);
+  model = { tokenEmbd, layers, outputNorm, outputWeight, tokenizer, loadTime };
+}
+// ─── Inference ──────────────────────────────────────────────────────────────
+function generate(prompt, maxTokens = 100) {
+  if (!model) throw new Error('Model not loaded');
+  const t0 = performance.now();
+  const { hiddenDim, numHeads, numKvHeads, headDim, intermediateSize, ropeTheta, rmsNormEps } = CONFIG;
+  const kvDim = numKvHeads * headDim;
+  const gqaRatio = numHeads / numKvHeads;
+  // Format as chat
+  const chatPrompt = `<|im_start|>user\n${prompt}<|im_end|>\n<|im_start|>assistant\n`;
+  const inputTokens = model.tokenizer.encode(chatPrompt);
+  const allTokens = [...inputTokens];
+  // KV cache: [layer][position] -> { k, v }
+  const kvCache = Array.from({ length: CONFIG.numLayers }, () => ({ keys: [], values: [] }));
+  const tokenTimes = [];
+  // Process all input tokens (prefill) then generate
+  for (let step = 0; step < inputTokens.length + maxTokens - 1; step++) {
+    const tokenStart = performance.now();
+    const pos = step;
+    const tokenId = allTokens[step];
+    // Embed
+    const hidden = new Float32Array(hiddenDim);
+    const embOffset = tokenId * hiddenDim;
+    for (let i = 0; i < hiddenDim; i++) hidden[i] = model.tokenEmbd[embOffset + i];
+    let x = hidden;
+    // Run through layers
+    for (let l = 0; l < CONFIG.numLayers; l++) {
+      const layer = model.layers[l];
+      // 1. Attention norm
+      const normed = rmsNorm(x, layer.attnNorm, rmsNormEps);
+      // 2. Q, K, V projections
+      const q = matVec(layer.qProj, normed, hiddenDim, hiddenDim);
+      const k = matVec(layer.kProj, normed, kvDim, hiddenDim);
+      const v = matVec(layer.vProj, normed, kvDim, hiddenDim);
+      // 3. RoPE
+      for (let h = 0; h < numHeads; h++) {
+        applyRoPE(q.subarray(h * headDim, (h + 1) * headDim), headDim, pos, ropeTheta);
+      }
+      for (let h = 0; h < numKvHeads; h++) {
+        applyRoPE(k.subarray(h * headDim, (h + 1) * headDim), headDim, pos, ropeTheta);
+      }
+      // 4. Store in KV cache
+      kvCache[l].keys.push(new Float32Array(k));
+      kvCache[l].values.push(new Float32Array(v));
+      // 5. Attention with full KV cache
+      const attnOut = new Float32Array(hiddenDim);
+      const seqLen = kvCache[l].keys.length;
+      for (let h = 0; h < numHeads; h++) {
+        const kvHead = Math.floor(h / gqaRatio);
+        const qHead = q.subarray(h * headDim, (h + 1) * headDim);
+        // Compute attention scores
+        const scores = new Float32Array(seqLen);
+        for (let s = 0; s < seqLen; s++) {
+          const kHead = kvCache[l].keys[s].subarray(kvHead * headDim, (kvHead + 1) * headDim);
+          let dot = 0;
+          for (let d = 0; d < headDim; d++) dot += qHead[d] * kHead[d];
+          scores[s] = dot / Math.sqrt(headDim);
+        }
+        // Causal mask: already handled (only see past positions)
+        // Softmax
+        const attnWeights = softmax(scores);
+        // Weighted sum of values
+        for (let s = 0; s < seqLen; s++) {
+          const vHead = kvCache[l].values[s].subarray(kvHead * headDim, (kvHead + 1) * headDim);
+          const w = attnWeights[s];
+          for (let d = 0; d < headDim; d++) {
+            attnOut[h * headDim + d] += w * vHead[d];
+          }
+        }
+      }
+      // 6. Output projection
+      const projected = matVec(layer.oProj, attnOut, hiddenDim, hiddenDim);
+      // 7. Residual
+      const postAttn = new Float32Array(hiddenDim);
+      for (let i = 0; i < hiddenDim; i++) postAttn[i] = x[i] + projected[i];
+      // 8. FFN norm
+      const ffnInput = rmsNorm(postAttn, layer.ffnNorm, rmsNormEps);
+      // 9. SwiGLU MLP
+      const gate = matVec(layer.gateProj, ffnInput, intermediateSize, hiddenDim);
+      const up = matVec(layer.upProj, ffnInput, intermediateSize, hiddenDim);
+      const activated = silu(gate);
+      for (let i = 0; i < intermediateSize; i++) activated[i] *= up[i];
+      const down = matVec(layer.downProj, activated, hiddenDim, intermediateSize);
+      // 10. Residual
+      x = new Float32Array(hiddenDim);
+      for (let i = 0; i < hiddenDim; i++) x[i] = postAttn[i] + down[i];
+    }
+    // Only sample if past prefill
+    if (step >= inputTokens.length - 1) {
+      // Final norm + LM head
+      const finalNormed = rmsNorm(x, model.outputNorm, rmsNormEps);
+      const logits = matVec(model.outputWeight, finalNormed, CONFIG.vocabSize, hiddenDim);
+      // Temperature sampling
+      const temperature = 0.7;
+      for (let i = 0; i < logits.length; i++) logits[i] /= temperature;
+      const probs = softmax(logits);
+      // Top-p sampling
+      const indexed = Array.from(probs).map((p, i) => ({ p, i })).sort((a, b) => b.p - a.p);
+      let cumP = 0;
+      let chosen = indexed[0].i;
+      const r = Math.random();
+      for (const { p, i } of indexed) {
+        cumP += p;
+        if (r < cumP) { chosen = i; break; }
+        if (cumP > 0.9) break;
+      }
+      const tokenEnd = performance.now();
+      if (step >= inputTokens.length) tokenTimes.push(tokenEnd - tokenStart);
+      if (chosen === CONFIG.eosToken) break;
+      allTokens.push(chosen);
+    }
+  }
+  const totalTime = performance.now() - t0;
+  const generatedTokens = allTokens.slice(inputTokens.length);
+  const text = model.tokenizer.decode(generatedTokens);
+  const avgTokenTime = tokenTimes.length > 0 ? tokenTimes.reduce((a, b) => a + b, 0) / tokenTimes.length : 0;
+  return {
+    text,
+    tokens: generatedTokens.length,
+    totalTimeMs: Math.round(totalTime),
+    avgTokenMs: Math.round(avgTokenTime),
+    prefillTokens: inputTokens.length,
+    engine: 'Aether WASM-SIMD',
+  };
+}
+// ─── HTTP Server ────────────────────────────────────────────────────────────
+function startServer() {
+  const server = createServer((req, res) => {
+    if (req.method === 'POST' && req.url === '/generate') {
+      let body = '';
+      req.on('data', c => body += c);
+      req.on('end', () => {
+        try {
+          const { prompt, max_tokens } = JSON.parse(body);
+          const result = generate(prompt, max_tokens || 100);
+          res.writeHead(200, { 'Content-Type': 'application/json' });
+          res.end(JSON.stringify(result));
+        } catch (e) {
+          res.writeHead(500, { 'Content-Type': 'application/json' });
+          res.end(JSON.stringify({ error: e.message, stack: e.stack }));
+        }
+      });
+    } else if (req.url === '/health') {
+      res.writeHead(200, { 'Content-Type': 'application/json' });
+      res.end(JSON.stringify({ status: 'ok', model: model ? 'loaded' : 'not loaded', loadTime: model?.loadTime }));
+    } else {
+      res.writeHead(404);
+      res.end('Not found');
+    }
+  });
+  server.listen(PORT, '127.0.0.1', () => {
+    console.log(`[Aether] Server listening on http://127.0.0.1:${PORT}`);
+  });
+}
+// ─── Main ───────────────────────────────────────────────────────────────────
+const ggufPath = process.env.GGUF_PATH || join('/tmp/hf_cache', 'buleyean-smollm2-360m-q8_0.gguf');
+const tokenizerPath = process.env.TOKENIZER_PATH || join('/tmp/hf_cache', 'tokenizer.json');
+// Download if needed
+if (!existsSync(ggufPath)) {
+  console.log('[Aether] Downloading GGUF model...');
+  execSync(`python3 -c "from huggingface_hub import hf_hub_download; hf_hub_download('forkjoin-ai/buleyean-smollm2-360m', 'buleyean-smollm2-360m-q8_0.gguf', cache_dir='/tmp/hf_cache', local_dir='/tmp/hf_cache')"`, { stdio: 'inherit' });
+}
+if (!existsSync(tokenizerPath)) {
+  console.log('[Aether] Downloading tokenizer...');
+  execSync(`python3 -c "from huggingface_hub import hf_hub_download; hf_hub_download('HuggingFaceTB/SmolLM2-360M-Instruct', 'tokenizer.json', cache_dir='/tmp/hf_cache', local_dir='/tmp/hf_cache')"`, { stdio: 'inherit' });
+}
+loadModel(ggufPath, tokenizerPath);
+startServer();

app.py CHANGED Viewed

@@ -1,63 +1,122 @@
 """
 The Void -- Buleyean RL
-Live inference. Real outputs. Nothing hardcoded.
 """
 import gradio as gr
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
-print("Loading models...", flush=True)
-# Base model -- load from safetensors (fast)
 base_tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-360M-Instruct")
 base_model = AutoModelForCausalLM.from_pretrained(
     "HuggingFaceTB/SmolLM2-360M-Instruct",
     torch_dtype=torch.float32,
     device_map="cpu",
 )
-# Buleyean model -- load from GGUF via transformers
-bule_model = AutoModelForCausalLM.from_pretrained(
-    "forkjoin-ai/buleyean-smollm2-360m",
-    gguf_file="buleyean-smollm2-360m-q4_k_m.gguf",
-    torch_dtype=torch.float32,
-    device_map="cpu",
-)
-# Reuse the same tokenizer (same base architecture)
-bule_tokenizer = base_tokenizer
-print("Ready.", flush=True)
-def gen(prompt, model, tokenizer):
     messages = [{"role": "user", "content": prompt}]
-    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-    inputs = tokenizer(text, return_tensors="pt")
     with torch.no_grad():
-        outputs = model.generate(
             **inputs,
-            max_new_tokens=300,
             temperature=0.7,
             top_p=0.9,
             do_sample=True,
-            pad_token_id=tokenizer.eos_token_id,
         )
-    response = tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
-    return response.strip()
 def compare(prompt):
     if not prompt or not prompt.strip():
-        return "", ""
-    base_out = gen(prompt, base_model, base_tokenizer)
-    bule_out = gen(prompt, bule_model, bule_tokenizer)
-    return base_out, bule_out
 CSS = """
 /* AeonOS Design System */
-.gradio-container { max-width: 960px !important; margin: 0 auto !important; }
 .gradio-container, .dark { background: #09090b !important; }
 /* Hero */
@@ -74,10 +133,14 @@ CSS = """
 .base-label { color: #71717a !important; font-size: 0.8rem !important; text-transform: uppercase !important; letter-spacing: 0.05em !important; font-weight: 500 !important; }
 .void-label { color: #3b82f6 !important; font-size: 0.8rem !important; text-transform: uppercase !important; letter-spacing: 0.05em !important; font-weight: 500 !important; }
 /* Input */
 #prompt-input textarea { background: #111114 !important; border: 1px solid #1f1f23 !important; border-radius: 8px !important; color: #fafafa !important; font-size: 1rem !important; padding: 1rem !important; }
 #prompt-input textarea:focus { border-color: #3b82f6 !important; box-shadow: 0 0 0 2px rgba(59,130,246,0.1) !important; }
-#prompt-input > label > span { display: none !important; }
 /* Generate button */
 #gen-btn { background: #3b82f6 !important; border: none !important; border-radius: 8px !important; font-weight: 500 !important; font-size: 0.9rem !important; padding: 0.75rem 2rem !important; transition: all 150ms !important; }
@@ -92,9 +155,6 @@ CSS = """
 #footer p { color: #52525b; font-size: 0.8rem; }
 #footer a { color: #3b82f6; text-decoration: none; }
-/* Divider */
-.vs-divider { color: #27272a !important; font-size: 0.75rem !important; text-transform: uppercase !important; letter-spacing: 0.1em !important; }
 /* Hide Gradio chrome */
 footer.svelte-1ax1toq { display: none !important; }
 .built-with { display: none !important; }
@@ -102,54 +162,61 @@ footer.svelte-1ax1toq { display: none !important; }
 with gr.Blocks(css=CSS, theme=gr.themes.Base(primary_hue="blue", neutral_hue="zinc"), title="The Void") as demo:
-    # Hero
     gr.HTML("""
     <div id="hero">
         <h1>The <span class="accent">Void</span></h1>
-        <p class="subtitle">Live inference from models trained on rejection alone. No reward model. No chosen examples.<br/>
-        Type anything. Both models generate in real-time.</p>
     </div>
     """)
-    # Input
     prompt = gr.Textbox(elem_id="prompt-input", placeholder="What would you like to ask?", lines=2, label="Prompt", show_label=False, interactive=True)
     btn = gr.Button("Generate", elem_id="gen-btn", variant="primary")
-    # Outputs
     with gr.Row(equal_height=True):
         with gr.Column():
-            gr.HTML('<p class="base-label">Base Model</p>')
-            base_out = gr.Textbox(lines=10, show_label=False, interactive=False, elem_classes=["response-card"])
-        with gr.Column(min_width=40):
             gr.HTML('<p style="color:#27272a; text-align:center; padding-top:4rem; font-size:0.75rem; letter-spacing:0.1em;">VS</p>')
         with gr.Column():
-            gr.HTML('<p class="void-label">Trained from the Void</p>')
-            bule_out = gr.Textbox(lines=10, show_label=False, interactive=False, elem_classes=["response-card"])
-    btn.click(compare, [prompt], [base_out, bule_out])
-    prompt.submit(compare, [prompt], [base_out, bule_out])
-    # Prompt suggestions
     gr.HTML('<p style="color:#52525b; font-size:0.8rem; margin-top:1.5rem; margin-bottom:0.5rem;">Try these:</p>')
     with gr.Row():
         for p in ["hello", "How are you feeling?", "I've been anxious lately.", "Write a haiku about failure.", "What is the meaning of life?"]:
             gr.Button(p, size="sm", elem_classes=["prompt-chip"]).click(
-                fn=lambda x=p: compare(x), outputs=[base_out, bule_out]
             ).then(fn=lambda x=p: x, outputs=[prompt])
-    # Footer
     gr.HTML("""
     <div id="footer">
-        <p style="color:#a1a1aa; font-size:0.85rem; margin-bottom:0.5rem;">SmolLM2-360M-Instruct &nbsp;&middot;&nbsp; Q4_K_M GGUF &nbsp;&middot;&nbsp; Live inference on CPU</p>
         <p>
-            <a href="https://forkracefold.com/">Whitepaper</a> &nbsp;&middot;&nbsp;
-            <a href="https://github.com/forkjoin-ai/buleyean-rl">Library</a> &nbsp;&middot;&nbsp;
-            <a href="https://huggingface.co/forkjoin-ai">Models</a> &nbsp;&middot;&nbsp;
-            <a href="https://huggingface.co/spaces/forkjoin-ai/glossolalia">Glossolalia</a> &nbsp;&middot;&nbsp;
-            <a href="https://huggingface.co/spaces/forkjoin-ai/void-attention">Void Attention</a> &nbsp;&middot;&nbsp;
-            <a href="https://huggingface.co/spaces/forkjoin-ai/metacog">METACOG</a>
         </p>
-        <p style="margin-top:1rem;">500+ Lean 4 theorems &nbsp;&middot;&nbsp; Zero sorry &nbsp;&middot;&nbsp; <a href="https://forkracefold.com/">&phi;&sup2; = &phi; + 1</a></p>
     </div>
     """)

 """
 The Void -- Buleyean RL
+PyTorch vs Aether. Side by side. Let the speed speak.
 """
 import gradio as gr
 import torch
+import json
+import time
+import subprocess
+import urllib.request
 from transformers import AutoModelForCausalLM, AutoTokenizer
+# ─── Start Aether sidecar ────────────────────────────────────────────────────
+print("[Void] Starting Aether inference server...", flush=True)
+aether_proc = subprocess.Popen(
+    ["node", "aether-server.mjs"],
+    env={**__import__('os').environ, "AETHER_PORT": "7861"},
+    stdout=subprocess.PIPE,
+    stderr=subprocess.STDOUT,
+)
+# ─── Load PyTorch model ──────────────────────────────────────────────────────
+print("[Void] Loading PyTorch base model...", flush=True)
 base_tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-360M-Instruct")
 base_model = AutoModelForCausalLM.from_pretrained(
     "HuggingFaceTB/SmolLM2-360M-Instruct",
     torch_dtype=torch.float32,
     device_map="cpu",
 )
+print("[Void] PyTorch model ready.", flush=True)
+# Wait for Aether to be ready
+print("[Void] Waiting for Aether...", flush=True)
+for attempt in range(120):
+    try:
+        req = urllib.request.Request("http://127.0.0.1:7861/health")
+        resp = urllib.request.urlopen(req, timeout=2)
+        health = json.loads(resp.read())
+        if health.get("status") == "ok" and health.get("model") == "loaded":
+            print(f"[Void] Aether ready (model loaded in {health.get('loadTime')}ms)", flush=True)
+            break
+    except Exception:
+        pass
+    # Print Aether stdout lines as they come
+    import select
+    if aether_proc.stdout and select.select([aether_proc.stdout], [], [], 0)[0]:
+        line = aether_proc.stdout.readline()
+        if line:
+            print(f"  [Aether] {line.decode().strip()}", flush=True)
+    time.sleep(1)
+else:
+    print("[Void] WARNING: Aether not ready after 120s, continuing anyway", flush=True)
+def gen_pytorch(prompt):
+    """Generate with PyTorch (standard)"""
     messages = [{"role": "user", "content": prompt}]
+    text = base_tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    inputs = base_tokenizer(text, return_tensors="pt")
+    t0 = time.perf_counter()
     with torch.no_grad():
+        outputs = base_model.generate(
             **inputs,
+            max_new_tokens=100,
             temperature=0.7,
             top_p=0.9,
             do_sample=True,
+            pad_token_id=base_tokenizer.eos_token_id,
+        )
+    elapsed = time.perf_counter() - t0
+    n_tokens = outputs.shape[1] - inputs["input_ids"].shape[1]
+    response = base_tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True).strip()
+    ms_per_tok = (elapsed * 1000 / n_tokens) if n_tokens > 0 else 0
+    return response, elapsed, n_tokens, ms_per_tok
+def gen_aether(prompt):
+    """Generate with Aether (our engine)"""
+    try:
+        data = json.dumps({"prompt": prompt, "max_tokens": 100}).encode()
+        req = urllib.request.Request(
+            "http://127.0.0.1:7861/generate",
+            data=data,
+            headers={"Content-Type": "application/json"},
         )
+        t0 = time.perf_counter()
+        resp = urllib.request.urlopen(req, timeout=300)
+        wall_time = time.perf_counter() - t0
+        result = json.loads(resp.read())
+        return (
+            result["text"],
+            result["totalTimeMs"] / 1000,
+            result["tokens"],
+            result["avgTokenMs"],
+        )
+    except Exception as e:
+        return f"[Aether error: {e}]", 0, 0, 0
 def compare(prompt):
     if not prompt or not prompt.strip():
+        return "", "", "", ""
+    # Run both
+    base_text, base_time, base_toks, base_ms = gen_pytorch(prompt)
+    aether_text, aether_time, aether_toks, aether_ms = gen_aether(prompt)
+    base_stats = f"{base_toks} tokens in {base_time:.1f}s ({base_ms:.0f}ms/tok)"
+    aether_stats = f"{aether_toks} tokens in {aether_time:.1f}s ({aether_ms:.0f}ms/tok)"
+    return base_text, aether_text, base_stats, aether_stats
 CSS = """
 /* AeonOS Design System */
+.gradio-container { max-width: 1060px !important; margin: 0 auto !important; }
 .gradio-container, .dark { background: #09090b !important; }
 /* Hero */
 .base-label { color: #71717a !important; font-size: 0.8rem !important; text-transform: uppercase !important; letter-spacing: 0.05em !important; font-weight: 500 !important; }
 .void-label { color: #3b82f6 !important; font-size: 0.8rem !important; text-transform: uppercase !important; letter-spacing: 0.05em !important; font-weight: 500 !important; }
+/* Stats */
+.stats-text { font-family: 'SF Mono', 'Fira Code', monospace !important; font-size: 0.8rem !important; color: #52525b !important; }
+.stats-text.faster { color: #22c55e !important; }
 /* Input */
+#prompt-input > label > span { display: none !important; }
 #prompt-input textarea { background: #111114 !important; border: 1px solid #1f1f23 !important; border-radius: 8px !important; color: #fafafa !important; font-size: 1rem !important; padding: 1rem !important; }
 #prompt-input textarea:focus { border-color: #3b82f6 !important; box-shadow: 0 0 0 2px rgba(59,130,246,0.1) !important; }
 /* Generate button */
 #gen-btn { background: #3b82f6 !important; border: none !important; border-radius: 8px !important; font-weight: 500 !important; font-size: 0.9rem !important; padding: 0.75rem 2rem !important; transition: all 150ms !important; }
 #footer p { color: #52525b; font-size: 0.8rem; }
 #footer a { color: #3b82f6; text-decoration: none; }
 /* Hide Gradio chrome */
 footer.svelte-1ax1toq { display: none !important; }
 .built-with { display: none !important; }
 with gr.Blocks(css=CSS, theme=gr.themes.Base(primary_hue="blue", neutral_hue="zinc"), title="The Void") as demo:
     gr.HTML("""
     <div id="hero">
         <h1>The <span class="accent">Void</span></h1>
+        <p class="subtitle">PyTorch vs Aether. Same model. Different engines. Live inference.<br/>
+        Left: standard PyTorch CPU. Right: Aether WASM-SIMD kernels. Both generate in real-time.</p>
     </div>
     """)
     prompt = gr.Textbox(elem_id="prompt-input", placeholder="What would you like to ask?", lines=2, label="Prompt", show_label=False, interactive=True)
     btn = gr.Button("Generate", elem_id="gen-btn", variant="primary")
     with gr.Row(equal_height=True):
         with gr.Column():
+            gr.HTML('<p class="base-label">PyTorch (standard)</p>')
+            base_out = gr.Textbox(lines=8, show_label=False, interactive=False, elem_classes=["response-card"])
+            base_stats = gr.HTML('<p class="stats-text">--</p>')
+        with gr.Column(min_width=30):
             gr.HTML('<p style="color:#27272a; text-align:center; padding-top:4rem; font-size:0.75rem; letter-spacing:0.1em;">VS</p>')
         with gr.Column():
+            gr.HTML('<p class="void-label">Aether (our engine)</p>')
+            aether_out = gr.Textbox(lines=8, show_label=False, interactive=False, elem_classes=["response-card"])
+            aether_stats = gr.HTML('<p class="stats-text">--</p>')
+    def run_compare(prompt_text):
+        base_text, aether_text, b_stats, a_stats = compare(prompt_text)
+        return (
+            base_text,
+            aether_text,
+            f'<p class="stats-text">{b_stats}</p>',
+            f'<p class="stats-text">{a_stats}</p>',
+        )
+    btn.click(run_compare, [prompt], [base_out, aether_out, base_stats, aether_stats])
+    prompt.submit(run_compare, [prompt], [base_out, aether_out, base_stats, aether_stats])
     gr.HTML('<p style="color:#52525b; font-size:0.8rem; margin-top:1.5rem; margin-bottom:0.5rem;">Try these:</p>')
     with gr.Row():
         for p in ["hello", "How are you feeling?", "I've been anxious lately.", "Write a haiku about failure.", "What is the meaning of life?"]:
             gr.Button(p, size="sm", elem_classes=["prompt-chip"]).click(
+                fn=lambda x=p: run_compare(x), outputs=[base_out, aether_out, base_stats, aether_stats]
             ).then(fn=lambda x=p: x, outputs=[prompt])
     gr.HTML("""
     <div id="footer">
+        <p style="color:#a1a1aa; font-size:0.85rem; margin-bottom:0.5rem;">
+            SmolLM2-360M-Instruct &middot; Buleyean RL &middot;
+            Left: PyTorch CPU &middot; Right: Aether WASM-SIMD (zero ML dependencies)
+        </p>
         <p>
+            <a href="https://forkracefold.com/">Whitepaper</a> &middot;
+            <a href="https://github.com/forkjoin-ai/buleyean-rl">Library</a> &middot;
+            <a href="https://huggingface.co/forkjoin-ai">Models</a> &middot;
+            <a href="https://huggingface.co/spaces/forkjoin-ai/glossolalia">Glossolalia</a>
         </p>
+        <p style="margin-top:1rem;">500+ Lean 4 theorems &middot; Zero sorry &middot; <a href="https://forkracefold.com/">&phi;&sup2; = &phi; + 1</a></p>
     </div>
     """)

requirements.txt CHANGED Viewed

@@ -1,7 +1,7 @@
---extra-index-url https://download.pytorch.org/whl/cpu
 torch>=2.1.0
 transformers>=4.46.0
 huggingface-hub>=0.26.0
 sentencepiece>=0.2.0
 accelerate>=1.0.0
 gguf>=0.10.0

 torch>=2.1.0
 transformers>=4.46.0
 huggingface-hub>=0.26.0
 sentencepiece>=0.2.0
 accelerate>=1.0.0
 gguf>=0.10.0
+gradio>=5.0.0