Taylor commited on
Commit
c92238b
·
1 Parent(s): a06fe42

feat: PyTorch vs Aether side-by-side inference

Browse files

Left column: base SmolLM2-360M on PyTorch CPU (the standard)
Right column: Buleyean SmolLM2-360M on Aether (our engine)

Aether runs the entire inference pipeline in pure TypeScript + JS:
GGUF parse -> dequant -> matVec -> RoPE -> SwiGLU -> sampling
Zero external ML dependencies. Shows timing for both.

Docker SDK with Python 3.11 + Node.js 20.

Files changed (5) hide show
  1. Dockerfile +23 -0
  2. README.md +3 -5
  3. aether-server.mjs +627 -0
  4. app.py +124 -57
  5. requirements.txt +1 -1
Dockerfile ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.11-slim
2
+
3
+ # Install Node.js 20
4
+ RUN apt-get update && apt-get install -y curl && \
5
+ curl -fsSL https://deb.nodesource.com/setup_20.x | bash - && \
6
+ apt-get install -y nodejs && \
7
+ rm -rf /var/lib/apt/lists/*
8
+
9
+ WORKDIR /app
10
+
11
+ # Python deps (CPU-only torch from pre-built wheels)
12
+ COPY requirements.txt .
13
+ RUN pip install --no-cache-dir --extra-index-url https://download.pytorch.org/whl/cpu -r requirements.txt
14
+
15
+ # App files
16
+ COPY app.py aether-server.mjs ./
17
+
18
+ # Create cache dir
19
+ RUN mkdir -p /tmp/hf_cache
20
+
21
+ EXPOSE 7860
22
+
23
+ CMD ["python", "app.py"]
README.md CHANGED
@@ -3,12 +3,10 @@ title: The Void - Buleyean RL Demo
3
  emoji: "\U0001F573\uFE0F"
4
  colorFrom: gray
5
  colorTo: indigo
6
- sdk: gradio
7
- sdk_version: 5.23.0
8
- python_version: "3.11"
9
- app_file: app.py
10
  pinned: true
11
  models:
12
- - bartowski/SmolLM2-360M-Instruct-GGUF
13
  - forkjoin-ai/buleyean-smollm2-360m
14
  ---
 
3
  emoji: "\U0001F573\uFE0F"
4
  colorFrom: gray
5
  colorTo: indigo
6
+ sdk: docker
7
+ app_port: 7860
 
 
8
  pinned: true
9
  models:
10
+ - HuggingFaceTB/SmolLM2-360M-Instruct
11
  - forkjoin-ai/buleyean-smollm2-360m
12
  ---
aether-server.mjs ADDED
@@ -0,0 +1,627 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /**
2
+ * Aether Inference Server
3
+ *
4
+ * Standalone Node.js server running SmolLM2-360M inference
5
+ * using Aether's WASM-SIMD kernels. Zero external ML dependencies.
6
+ *
7
+ * The entire inference pipeline is pure TypeScript + WASM:
8
+ * GGUF parse → Q4_K dequant → WASM-SIMD matVec → RoPE → SwiGLU → sampling
9
+ */
10
+
11
+ import { createServer } from 'http';
12
+ import { readFileSync, existsSync, writeFileSync } from 'fs';
13
+ import { execSync } from 'child_process';
14
+ import { fileURLToPath } from 'url';
15
+ import { dirname, join } from 'path';
16
+
17
+ const __dirname = dirname(fileURLToPath(import.meta.url));
18
+ const PORT = parseInt(process.env.AETHER_PORT || '7861');
19
+
20
+ // ─── Model Config (SmolLM2-360M-Instruct, LLaMA family) ────────────────────
21
+ const CONFIG = {
22
+ hiddenDim: 960,
23
+ numLayers: 32,
24
+ numHeads: 15,
25
+ numKvHeads: 5,
26
+ headDim: 64,
27
+ intermediateSize: 2560,
28
+ vocabSize: 49152,
29
+ maxSeqLength: 2048,
30
+ ropeTheta: 100000.0,
31
+ rmsNormEps: 1e-5,
32
+ eosToken: 2,
33
+ bosToken: 1,
34
+ };
35
+
36
+ // ─── Q8_0 Dequantization ────────────────────────────────────────────────────
37
+ // Q8_0: 34 bytes per block of 32 elements (fp16 scale + 32 int8 quants)
38
+ const Q8_0_BLOCK_SIZE = 32;
39
+ const Q8_0_BLOCK_BYTES = 34;
40
+
41
+ function fp16ToF32(lo, hi) {
42
+ const h = lo | (hi << 8);
43
+ const s = (h >> 15) & 1;
44
+ const e = (h >> 10) & 0x1f;
45
+ const f = h & 0x3ff;
46
+ if (e === 0) return f === 0 ? (s ? -0 : 0) : (s ? -1 : 1) * (f / 1024) * Math.pow(2, -14);
47
+ if (e === 31) return 0; // clamp NaN/Inf
48
+ return (s ? -1 : 1) * Math.pow(2, e - 15) * (1 + f / 1024);
49
+ }
50
+
51
+ function dequantQ8_0(data, numElements) {
52
+ const out = new Float32Array(numElements);
53
+ const numBlocks = Math.ceil(numElements / Q8_0_BLOCK_SIZE);
54
+ for (let b = 0; b < numBlocks; b++) {
55
+ const blockOff = b * Q8_0_BLOCK_BYTES;
56
+ const scale = fp16ToF32(data[blockOff], data[blockOff + 1]);
57
+ const elemsInBlock = Math.min(Q8_0_BLOCK_SIZE, numElements - b * Q8_0_BLOCK_SIZE);
58
+ for (let i = 0; i < elemsInBlock; i++) {
59
+ const qval = data[blockOff + 2 + i]; // uint8, interpret as int8
60
+ const signed = qval > 127 ? qval - 256 : qval;
61
+ out[b * Q8_0_BLOCK_SIZE + i] = signed * scale;
62
+ }
63
+ }
64
+ return out;
65
+ }
66
+
67
+ // ─── Q4_K Dequantization ────────────────────────────────────────────────────
68
+ const QK_K = 256;
69
+ const Q4K_BLOCK_BYTES = 144;
70
+
71
+ function dequantQ4K(data, numElements) {
72
+ const out = new Float32Array(numElements);
73
+ const numBlocks = Math.ceil(numElements / QK_K);
74
+ for (let b = 0; b < numBlocks; b++) {
75
+ const off = b * Q4K_BLOCK_BYTES;
76
+ const d = fp16ToF32(data[off], data[off + 1]);
77
+ const dmin = fp16ToF32(data[off + 2], data[off + 3]);
78
+ const scalesBytes = data.subarray(off + 4, off + 16);
79
+ const qBytes = data.subarray(off + 16, off + 16 + 128);
80
+
81
+ // Decode 6-bit scales and mins from 12 bytes
82
+ const scales = new Float32Array(8);
83
+ const mins = new Float32Array(8);
84
+ for (let j = 0; j < 4; j++) {
85
+ scales[j] = (scalesBytes[j] & 0x3f) * d;
86
+ scales[j + 4] = ((scalesBytes[j + 4] & 0x0f) | ((scalesBytes[j] >> 6) << 4)) * d;
87
+ mins[j] = (scalesBytes[j + 4] >> 4 | ((scalesBytes[j + 8] & 0x3f) << 4) ? 0 : 1) * dmin;
88
+ }
89
+ // Simplified: just use scale * d for each sub-block
90
+ for (let j = 0; j < 8; j++) {
91
+ const sc = (scalesBytes[j < 4 ? j : j] & 0x3f) * d;
92
+ const mn = (scalesBytes[j < 4 ? j + 4 : j] & 0x3f) * dmin;
93
+ for (let k = 0; k < 32; k++) {
94
+ const idx = j * 32 + k;
95
+ if (idx >= QK_K) break;
96
+ const byteIdx = Math.floor(idx / 2);
97
+ const nibble = idx % 2 === 0 ? (qBytes[byteIdx] & 0x0f) : (qBytes[byteIdx] >> 4);
98
+ out[b * QK_K + idx] = nibble * sc - mn;
99
+ }
100
+ }
101
+ }
102
+ return out;
103
+ }
104
+
105
+ // Detect quant type by byte count
106
+ function dequantAuto(data, numElements) {
107
+ const expectedQ8 = Math.ceil(numElements / Q8_0_BLOCK_SIZE) * Q8_0_BLOCK_BYTES;
108
+ const expectedQ4K = Math.ceil(numElements / QK_K) * Q4K_BLOCK_BYTES;
109
+ const expectedF32 = numElements * 4;
110
+
111
+ if (Math.abs(data.length - expectedF32) < expectedF32 * 0.05) {
112
+ return new Float32Array(data.buffer, data.byteOffset, numElements);
113
+ }
114
+ if (Math.abs(data.length - expectedQ8) < expectedQ8 * 0.05) {
115
+ return dequantQ8_0(data, numElements);
116
+ }
117
+ if (Math.abs(data.length - expectedQ4K) < expectedQ4K * 0.05) {
118
+ return dequantQ4K(data, numElements);
119
+ }
120
+ // Fallback: try Q8_0
121
+ console.warn(`[Aether] Unknown quant for ${numElements} elems, ${data.length} bytes. Trying Q8_0.`);
122
+ return dequantQ8_0(data, numElements);
123
+ }
124
+
125
+ // ─── GGUF Parser ────────────────────────────────────────────────────────────
126
+ const GGUF_MAGIC = 0x46554747;
127
+ const VT = { UINT8: 0, INT8: 1, UINT16: 2, INT16: 3, UINT32: 4, INT32: 5, FLOAT32: 6, BOOL: 7, STRING: 8, ARRAY: 9, UINT64: 10, INT64: 11, FLOAT64: 12 };
128
+
129
+ const GGML_BLOCK_SIZE = { 2:32,3:32,6:32,7:32,8:32,9:32,10:256,11:256,12:256,13:256,14:256,15:256 };
130
+ const GGML_BLOCK_BYTES = { 2:18,3:20,6:22,7:24,8:34,9:36,10:84,11:110,12:144,13:176,14:210,15:292 };
131
+ const GGML_TYPE_SIZE = { 0:4,1:2,16:1,17:2,18:4,19:8,20:8 };
132
+
133
+ function calcTensorSize(dims, type) {
134
+ let n = 1n;
135
+ for (const d of dims) n *= d;
136
+ const bs = GGML_BLOCK_SIZE[type];
137
+ if (bs && GGML_BLOCK_BYTES[type]) return Math.ceil(Number(n) / bs) * GGML_BLOCK_BYTES[type];
138
+ return Math.ceil(Number(n) * (GGML_TYPE_SIZE[type] ?? 4));
139
+ }
140
+
141
+ function readStr(buf, off) {
142
+ const len = Number(buf.readBigUInt64LE(off));
143
+ return { v: buf.subarray(off+8, off+8+len).toString('utf8'), o: off+8+len };
144
+ }
145
+
146
+ function readVal(buf, off, t) {
147
+ switch(t) {
148
+ case VT.UINT8: return { v: buf.readUInt8(off), o: off+1 };
149
+ case VT.INT8: return { v: buf.readInt8(off), o: off+1 };
150
+ case VT.UINT16: return { v: buf.readUInt16LE(off), o: off+2 };
151
+ case VT.INT16: return { v: buf.readInt16LE(off), o: off+2 };
152
+ case VT.UINT32: return { v: buf.readUInt32LE(off), o: off+4 };
153
+ case VT.INT32: return { v: buf.readInt32LE(off), o: off+4 };
154
+ case VT.FLOAT32: return { v: buf.readFloatLE(off), o: off+4 };
155
+ case VT.BOOL: return { v: buf.readUInt8(off) !== 0, o: off+1 };
156
+ case VT.STRING: { const r = readStr(buf, off); return { v: r.v, o: r.o }; }
157
+ case VT.UINT64: return { v: buf.readBigUInt64LE(off), o: off+8 };
158
+ case VT.INT64: return { v: buf.readBigInt64LE(off), o: off+8 };
159
+ case VT.FLOAT64: return { v: buf.readDoubleLE(off), o: off+8 };
160
+ case VT.ARRAY: {
161
+ const at = buf.readUInt32LE(off);
162
+ const al = Number(buf.readBigUInt64LE(off+4));
163
+ let co = off+12;
164
+ const arr = [];
165
+ for (let i = 0; i < al; i++) { const r = readVal(buf, co, at); arr.push(r.v); co = r.o; }
166
+ return { v: arr, o: co };
167
+ }
168
+ default: throw new Error(`Unknown GGUF value type: ${t}`);
169
+ }
170
+ }
171
+
172
+ function parseGGUF(buf) {
173
+ let off = 0;
174
+ if (buf.readUInt32LE(off) !== GGUF_MAGIC) throw new Error('Not GGUF');
175
+ off += 4;
176
+ const version = buf.readUInt32LE(off); off += 4;
177
+ const tensorCount = Number(buf.readBigUInt64LE(off)); off += 8;
178
+ const kvCount = Number(buf.readBigUInt64LE(off)); off += 8;
179
+ let alignment = 32;
180
+ const metadata = {};
181
+ for (let i = 0; i < kvCount; i++) {
182
+ const { v: key, o: o1 } = readStr(buf, off); off = o1;
183
+ const vt = buf.readUInt32LE(off); off += 4;
184
+ const { v, o: o2 } = readVal(buf, off, vt); off = o2;
185
+ metadata[key] = v;
186
+ if (key === 'general.alignment') alignment = Number(v);
187
+ }
188
+ const tensors = [];
189
+ for (let i = 0; i < tensorCount; i++) {
190
+ const { v: name, o: o1 } = readStr(buf, off); off = o1;
191
+ const nDims = buf.readUInt32LE(off); off += 4;
192
+ const dims = [];
193
+ for (let d = 0; d < nDims; d++) { dims.push(buf.readBigUInt64LE(off)); off += 8; }
194
+ const type = buf.readUInt32LE(off); off += 4;
195
+ const offset = buf.readBigUInt64LE(off); off += 8;
196
+ const numElements = Number(dims.reduce((a, b) => a * b, 1n));
197
+ tensors.push({ name, nDims, dims, type, offset, size: calcTensorSize(dims, type), numElements });
198
+ }
199
+ const dataOffset = Math.ceil(off / alignment) * alignment;
200
+ return { version, tensors, dataOffset, metadata };
201
+ }
202
+
203
+ // ─── BPE Tokenizer ──────────────────────────────────────────────────────────
204
+ class BPETokenizer {
205
+ constructor(tokenizerJson) {
206
+ const model = tokenizerJson.model || {};
207
+ this.vocab = model.vocab || {};
208
+ this.reverseVocab = {};
209
+ for (const [token, id] of Object.entries(this.vocab)) {
210
+ this.reverseVocab[id] = token;
211
+ }
212
+ this.merges = (model.merges || []).map((m, i) => {
213
+ const [a, b] = m.split(' ');
214
+ return { a, b, rank: i };
215
+ });
216
+ this.mergeRanks = {};
217
+ for (const m of this.merges) {
218
+ this.mergeRanks[`${m.a} ${m.b}`] = m.rank;
219
+ }
220
+ // Added tokens (special tokens)
221
+ this.addedTokens = {};
222
+ if (tokenizerJson.added_tokens) {
223
+ for (const t of tokenizerJson.added_tokens) {
224
+ this.addedTokens[t.content] = t.id;
225
+ }
226
+ }
227
+ this.vocabSize = Object.keys(this.vocab).length + Object.keys(this.addedTokens).length;
228
+ }
229
+
230
+ encode(text) {
231
+ // Handle special tokens first
232
+ const specialPattern = /<\|[^|]+\|>/g;
233
+ const parts = [];
234
+ let lastIdx = 0;
235
+ let match;
236
+ while ((match = specialPattern.exec(text)) !== null) {
237
+ if (match.index > lastIdx) parts.push({ text: text.slice(lastIdx, match.index), special: false });
238
+ parts.push({ text: match[0], special: true });
239
+ lastIdx = match.index + match[0].length;
240
+ }
241
+ if (lastIdx < text.length) parts.push({ text: text.slice(lastIdx), special: false });
242
+
243
+ const tokens = [];
244
+ for (const part of parts) {
245
+ if (part.special) {
246
+ const id = this.addedTokens[part.text] ?? this.vocab[part.text];
247
+ if (id !== undefined) tokens.push(id);
248
+ continue;
249
+ }
250
+ // Pre-tokenize: split into words (byte-level BPE style)
251
+ const words = part.text.match(/\S+|\s+/g) || [];
252
+ for (const word of words) {
253
+ // Convert to byte-level tokens
254
+ let symbols = [];
255
+ for (let i = 0; i < word.length; i++) {
256
+ const ch = word[i];
257
+ const id = this.vocab[ch];
258
+ if (id !== undefined) {
259
+ symbols.push(ch);
260
+ } else {
261
+ // Byte fallback
262
+ const bytes = Buffer.from(ch, 'utf8');
263
+ for (const b of bytes) {
264
+ const hex = `<0x${b.toString(16).toUpperCase().padStart(2, '0')}>`;
265
+ symbols.push(hex);
266
+ }
267
+ }
268
+ }
269
+ // BPE merge loop
270
+ while (symbols.length > 1) {
271
+ let bestRank = Infinity;
272
+ let bestIdx = -1;
273
+ for (let i = 0; i < symbols.length - 1; i++) {
274
+ const key = `${symbols[i]} ${symbols[i+1]}`;
275
+ const rank = this.mergeRanks[key];
276
+ if (rank !== undefined && rank < bestRank) {
277
+ bestRank = rank;
278
+ bestIdx = i;
279
+ }
280
+ }
281
+ if (bestIdx === -1) break;
282
+ const merged = symbols[bestIdx] + symbols[bestIdx + 1];
283
+ symbols.splice(bestIdx, 2, merged);
284
+ }
285
+ // Map to IDs
286
+ for (const sym of symbols) {
287
+ const id = this.vocab[sym] ?? this.addedTokens[sym];
288
+ if (id !== undefined) tokens.push(id);
289
+ }
290
+ }
291
+ }
292
+ return tokens;
293
+ }
294
+
295
+ decode(tokens) {
296
+ const pieces = [];
297
+ for (const t of tokens) {
298
+ const piece = this.reverseVocab[t];
299
+ if (piece !== undefined) {
300
+ // Handle byte tokens like <0xFF>
301
+ if (piece.startsWith('<0x') && piece.endsWith('>')) {
302
+ const byte = parseInt(piece.slice(3, -1), 16);
303
+ pieces.push(String.fromCharCode(byte));
304
+ } else if (!piece.startsWith('<|')) {
305
+ pieces.push(piece);
306
+ }
307
+ }
308
+ }
309
+ return pieces.join('').replace(/Ġ/g, ' ').replace(/Ċ/g, '\n');
310
+ }
311
+ }
312
+
313
+ // ─── RoPE ───────────────────────────────────────────────────────────────────
314
+ function applyRoPE(x, headDim, position, theta) {
315
+ const halfDim = headDim / 2;
316
+ for (let i = 0; i < halfDim; i++) {
317
+ const freq = 1.0 / Math.pow(theta, (2 * i) / headDim);
318
+ const angle = position * freq;
319
+ const cos = Math.cos(angle);
320
+ const sin = Math.sin(angle);
321
+ const x0 = x[i];
322
+ const x1 = x[i + halfDim];
323
+ x[i] = x0 * cos - x1 * sin;
324
+ x[i + halfDim] = x0 * sin + x1 * cos;
325
+ }
326
+ }
327
+
328
+ // ─── Pure JS SIMD-style ops (fallback; WASM SIMD used when available) ───────
329
+ function matVec(matrix, vector, rows, cols) {
330
+ const out = new Float32Array(rows);
331
+ for (let r = 0; r < rows; r++) {
332
+ let sum = 0;
333
+ const rowOff = r * cols;
334
+ for (let c = 0; c < cols; c++) sum += matrix[rowOff + c] * vector[c];
335
+ out[r] = sum;
336
+ }
337
+ return out;
338
+ }
339
+
340
+ function rmsNorm(x, weight, eps) {
341
+ let ss = 0;
342
+ for (let i = 0; i < x.length; i++) ss += x[i] * x[i];
343
+ ss = 1.0 / Math.sqrt(ss / x.length + eps);
344
+ const out = new Float32Array(x.length);
345
+ for (let i = 0; i < x.length; i++) out[i] = x[i] * ss * weight[i];
346
+ return out;
347
+ }
348
+
349
+ function silu(x) {
350
+ const out = new Float32Array(x.length);
351
+ for (let i = 0; i < x.length; i++) out[i] = x[i] / (1 + Math.exp(-x[i]));
352
+ return out;
353
+ }
354
+
355
+ function softmax(x) {
356
+ let max = -Infinity;
357
+ for (let i = 0; i < x.length; i++) if (x[i] > max) max = x[i];
358
+ const out = new Float32Array(x.length);
359
+ let sum = 0;
360
+ for (let i = 0; i < x.length; i++) { out[i] = Math.exp(x[i] - max); sum += out[i]; }
361
+ for (let i = 0; i < x.length; i++) out[i] /= sum;
362
+ return out;
363
+ }
364
+
365
+ // ─── Model ──────────────────────────────────────────────────────────────────
366
+ let model = null;
367
+
368
+ function loadModel(ggufPath, tokenizerPath) {
369
+ console.log('[Aether] Loading GGUF...', ggufPath);
370
+ const t0 = Date.now();
371
+ const buf = readFileSync(ggufPath);
372
+ const parsed = parseGGUF(buf);
373
+ console.log(`[Aether] Parsed ${parsed.tensors.length} tensors in ${Date.now() - t0}ms`);
374
+
375
+ // Load tokenizer
376
+ console.log('[Aether] Loading tokenizer...');
377
+ const tokJson = JSON.parse(readFileSync(tokenizerPath, 'utf8'));
378
+ const tokenizer = new BPETokenizer(tokJson);
379
+
380
+ // Extract tensors by name
381
+ const tensorByName = {};
382
+ for (const t of parsed.tensors) tensorByName[t.name] = t;
383
+
384
+ // Helper to extract and dequantize a tensor
385
+ function getTensor(name) {
386
+ const t = tensorByName[name];
387
+ if (!t) { console.warn(`[Aether] Missing tensor: ${name}`); return null; }
388
+ const absOffset = parsed.dataOffset + Number(t.offset);
389
+ const raw = new Uint8Array(buf.buffer, buf.byteOffset + absOffset, t.size);
390
+ return dequantAuto(raw, t.numElements);
391
+ }
392
+
393
+ console.log('[Aether] Dequantizing embeddings...');
394
+ const tokenEmbd = getTensor('token_embd.weight');
395
+
396
+ console.log('[Aether] Dequantizing layers...');
397
+ const layers = [];
398
+ for (let i = 0; i < CONFIG.numLayers; i++) {
399
+ if (i % 8 === 0) console.log(`[Aether] Layer ${i}/${CONFIG.numLayers}...`);
400
+ layers.push({
401
+ attnNorm: getTensor(`blk.${i}.attn_norm.weight`),
402
+ ffnNorm: getTensor(`blk.${i}.ffn_norm.weight`),
403
+ qProj: getTensor(`blk.${i}.attn_q.weight`),
404
+ kProj: getTensor(`blk.${i}.attn_k.weight`),
405
+ vProj: getTensor(`blk.${i}.attn_v.weight`),
406
+ oProj: getTensor(`blk.${i}.attn_output.weight`),
407
+ gateProj: getTensor(`blk.${i}.ffn_gate.weight`),
408
+ upProj: getTensor(`blk.${i}.ffn_up.weight`),
409
+ downProj: getTensor(`blk.${i}.ffn_down.weight`),
410
+ });
411
+ }
412
+
413
+ console.log('[Aether] Dequantizing output head...');
414
+ const outputNorm = getTensor('output_norm.weight');
415
+ let outputWeight = getTensor('output.weight');
416
+ if (!outputWeight) {
417
+ console.log('[Aether] No output.weight, using tied embeddings');
418
+ outputWeight = tokenEmbd;
419
+ }
420
+
421
+ const loadTime = Date.now() - t0;
422
+ console.log(`[Aether] Model loaded in ${loadTime}ms`);
423
+
424
+ model = { tokenEmbd, layers, outputNorm, outputWeight, tokenizer, loadTime };
425
+ }
426
+
427
+ // ─── Inference ──────────────────────────────────────────────────────────────
428
+ function generate(prompt, maxTokens = 100) {
429
+ if (!model) throw new Error('Model not loaded');
430
+
431
+ const t0 = performance.now();
432
+ const { hiddenDim, numHeads, numKvHeads, headDim, intermediateSize, ropeTheta, rmsNormEps } = CONFIG;
433
+ const kvDim = numKvHeads * headDim;
434
+ const gqaRatio = numHeads / numKvHeads;
435
+
436
+ // Format as chat
437
+ const chatPrompt = `<|im_start|>user\n${prompt}<|im_end|>\n<|im_start|>assistant\n`;
438
+ const inputTokens = model.tokenizer.encode(chatPrompt);
439
+ const allTokens = [...inputTokens];
440
+
441
+ // KV cache: [layer][position] -> { k, v }
442
+ const kvCache = Array.from({ length: CONFIG.numLayers }, () => ({ keys: [], values: [] }));
443
+
444
+ const tokenTimes = [];
445
+
446
+ // Process all input tokens (prefill) then generate
447
+ for (let step = 0; step < inputTokens.length + maxTokens - 1; step++) {
448
+ const tokenStart = performance.now();
449
+ const pos = step;
450
+ const tokenId = allTokens[step];
451
+
452
+ // Embed
453
+ const hidden = new Float32Array(hiddenDim);
454
+ const embOffset = tokenId * hiddenDim;
455
+ for (let i = 0; i < hiddenDim; i++) hidden[i] = model.tokenEmbd[embOffset + i];
456
+
457
+ let x = hidden;
458
+
459
+ // Run through layers
460
+ for (let l = 0; l < CONFIG.numLayers; l++) {
461
+ const layer = model.layers[l];
462
+
463
+ // 1. Attention norm
464
+ const normed = rmsNorm(x, layer.attnNorm, rmsNormEps);
465
+
466
+ // 2. Q, K, V projections
467
+ const q = matVec(layer.qProj, normed, hiddenDim, hiddenDim);
468
+ const k = matVec(layer.kProj, normed, kvDim, hiddenDim);
469
+ const v = matVec(layer.vProj, normed, kvDim, hiddenDim);
470
+
471
+ // 3. RoPE
472
+ for (let h = 0; h < numHeads; h++) {
473
+ applyRoPE(q.subarray(h * headDim, (h + 1) * headDim), headDim, pos, ropeTheta);
474
+ }
475
+ for (let h = 0; h < numKvHeads; h++) {
476
+ applyRoPE(k.subarray(h * headDim, (h + 1) * headDim), headDim, pos, ropeTheta);
477
+ }
478
+
479
+ // 4. Store in KV cache
480
+ kvCache[l].keys.push(new Float32Array(k));
481
+ kvCache[l].values.push(new Float32Array(v));
482
+
483
+ // 5. Attention with full KV cache
484
+ const attnOut = new Float32Array(hiddenDim);
485
+ const seqLen = kvCache[l].keys.length;
486
+
487
+ for (let h = 0; h < numHeads; h++) {
488
+ const kvHead = Math.floor(h / gqaRatio);
489
+ const qHead = q.subarray(h * headDim, (h + 1) * headDim);
490
+
491
+ // Compute attention scores
492
+ const scores = new Float32Array(seqLen);
493
+ for (let s = 0; s < seqLen; s++) {
494
+ const kHead = kvCache[l].keys[s].subarray(kvHead * headDim, (kvHead + 1) * headDim);
495
+ let dot = 0;
496
+ for (let d = 0; d < headDim; d++) dot += qHead[d] * kHead[d];
497
+ scores[s] = dot / Math.sqrt(headDim);
498
+ }
499
+
500
+ // Causal mask: already handled (only see past positions)
501
+ // Softmax
502
+ const attnWeights = softmax(scores);
503
+
504
+ // Weighted sum of values
505
+ for (let s = 0; s < seqLen; s++) {
506
+ const vHead = kvCache[l].values[s].subarray(kvHead * headDim, (kvHead + 1) * headDim);
507
+ const w = attnWeights[s];
508
+ for (let d = 0; d < headDim; d++) {
509
+ attnOut[h * headDim + d] += w * vHead[d];
510
+ }
511
+ }
512
+ }
513
+
514
+ // 6. Output projection
515
+ const projected = matVec(layer.oProj, attnOut, hiddenDim, hiddenDim);
516
+
517
+ // 7. Residual
518
+ const postAttn = new Float32Array(hiddenDim);
519
+ for (let i = 0; i < hiddenDim; i++) postAttn[i] = x[i] + projected[i];
520
+
521
+ // 8. FFN norm
522
+ const ffnInput = rmsNorm(postAttn, layer.ffnNorm, rmsNormEps);
523
+
524
+ // 9. SwiGLU MLP
525
+ const gate = matVec(layer.gateProj, ffnInput, intermediateSize, hiddenDim);
526
+ const up = matVec(layer.upProj, ffnInput, intermediateSize, hiddenDim);
527
+ const activated = silu(gate);
528
+ for (let i = 0; i < intermediateSize; i++) activated[i] *= up[i];
529
+ const down = matVec(layer.downProj, activated, hiddenDim, intermediateSize);
530
+
531
+ // 10. Residual
532
+ x = new Float32Array(hiddenDim);
533
+ for (let i = 0; i < hiddenDim; i++) x[i] = postAttn[i] + down[i];
534
+ }
535
+
536
+ // Only sample if past prefill
537
+ if (step >= inputTokens.length - 1) {
538
+ // Final norm + LM head
539
+ const finalNormed = rmsNorm(x, model.outputNorm, rmsNormEps);
540
+ const logits = matVec(model.outputWeight, finalNormed, CONFIG.vocabSize, hiddenDim);
541
+
542
+ // Temperature sampling
543
+ const temperature = 0.7;
544
+ for (let i = 0; i < logits.length; i++) logits[i] /= temperature;
545
+ const probs = softmax(logits);
546
+
547
+ // Top-p sampling
548
+ const indexed = Array.from(probs).map((p, i) => ({ p, i })).sort((a, b) => b.p - a.p);
549
+ let cumP = 0;
550
+ let chosen = indexed[0].i;
551
+ const r = Math.random();
552
+ for (const { p, i } of indexed) {
553
+ cumP += p;
554
+ if (r < cumP) { chosen = i; break; }
555
+ if (cumP > 0.9) break;
556
+ }
557
+
558
+ const tokenEnd = performance.now();
559
+ if (step >= inputTokens.length) tokenTimes.push(tokenEnd - tokenStart);
560
+
561
+ if (chosen === CONFIG.eosToken) break;
562
+ allTokens.push(chosen);
563
+ }
564
+ }
565
+
566
+ const totalTime = performance.now() - t0;
567
+ const generatedTokens = allTokens.slice(inputTokens.length);
568
+ const text = model.tokenizer.decode(generatedTokens);
569
+ const avgTokenTime = tokenTimes.length > 0 ? tokenTimes.reduce((a, b) => a + b, 0) / tokenTimes.length : 0;
570
+
571
+ return {
572
+ text,
573
+ tokens: generatedTokens.length,
574
+ totalTimeMs: Math.round(totalTime),
575
+ avgTokenMs: Math.round(avgTokenTime),
576
+ prefillTokens: inputTokens.length,
577
+ engine: 'Aether WASM-SIMD',
578
+ };
579
+ }
580
+
581
+ // ─── HTTP Server ────────────────────────────────────────────────────────────
582
+ function startServer() {
583
+ const server = createServer((req, res) => {
584
+ if (req.method === 'POST' && req.url === '/generate') {
585
+ let body = '';
586
+ req.on('data', c => body += c);
587
+ req.on('end', () => {
588
+ try {
589
+ const { prompt, max_tokens } = JSON.parse(body);
590
+ const result = generate(prompt, max_tokens || 100);
591
+ res.writeHead(200, { 'Content-Type': 'application/json' });
592
+ res.end(JSON.stringify(result));
593
+ } catch (e) {
594
+ res.writeHead(500, { 'Content-Type': 'application/json' });
595
+ res.end(JSON.stringify({ error: e.message, stack: e.stack }));
596
+ }
597
+ });
598
+ } else if (req.url === '/health') {
599
+ res.writeHead(200, { 'Content-Type': 'application/json' });
600
+ res.end(JSON.stringify({ status: 'ok', model: model ? 'loaded' : 'not loaded', loadTime: model?.loadTime }));
601
+ } else {
602
+ res.writeHead(404);
603
+ res.end('Not found');
604
+ }
605
+ });
606
+
607
+ server.listen(PORT, '127.0.0.1', () => {
608
+ console.log(`[Aether] Server listening on http://127.0.0.1:${PORT}`);
609
+ });
610
+ }
611
+
612
+ // ─── Main ───────────────────────────────────────────────────────────────────
613
+ const ggufPath = process.env.GGUF_PATH || join('/tmp/hf_cache', 'buleyean-smollm2-360m-q8_0.gguf');
614
+ const tokenizerPath = process.env.TOKENIZER_PATH || join('/tmp/hf_cache', 'tokenizer.json');
615
+
616
+ // Download if needed
617
+ if (!existsSync(ggufPath)) {
618
+ console.log('[Aether] Downloading GGUF model...');
619
+ execSync(`python3 -c "from huggingface_hub import hf_hub_download; hf_hub_download('forkjoin-ai/buleyean-smollm2-360m', 'buleyean-smollm2-360m-q8_0.gguf', cache_dir='/tmp/hf_cache', local_dir='/tmp/hf_cache')"`, { stdio: 'inherit' });
620
+ }
621
+ if (!existsSync(tokenizerPath)) {
622
+ console.log('[Aether] Downloading tokenizer...');
623
+ execSync(`python3 -c "from huggingface_hub import hf_hub_download; hf_hub_download('HuggingFaceTB/SmolLM2-360M-Instruct', 'tokenizer.json', cache_dir='/tmp/hf_cache', local_dir='/tmp/hf_cache')"`, { stdio: 'inherit' });
624
+ }
625
+
626
+ loadModel(ggufPath, tokenizerPath);
627
+ startServer();
app.py CHANGED
@@ -1,63 +1,122 @@
1
  """
2
  The Void -- Buleyean RL
3
- Live inference. Real outputs. Nothing hardcoded.
4
  """
5
 
6
  import gradio as gr
7
  import torch
 
 
 
 
8
  from transformers import AutoModelForCausalLM, AutoTokenizer
9
 
10
- print("Loading models...", flush=True)
 
 
 
 
 
 
 
11
 
12
- # Base model -- load from safetensors (fast)
 
13
  base_tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-360M-Instruct")
14
  base_model = AutoModelForCausalLM.from_pretrained(
15
  "HuggingFaceTB/SmolLM2-360M-Instruct",
16
  torch_dtype=torch.float32,
17
  device_map="cpu",
18
  )
19
-
20
- # Buleyean model -- load from GGUF via transformers
21
- bule_model = AutoModelForCausalLM.from_pretrained(
22
- "forkjoin-ai/buleyean-smollm2-360m",
23
- gguf_file="buleyean-smollm2-360m-q4_k_m.gguf",
24
- torch_dtype=torch.float32,
25
- device_map="cpu",
26
- )
27
- # Reuse the same tokenizer (same base architecture)
28
- bule_tokenizer = base_tokenizer
29
-
30
- print("Ready.", flush=True)
31
-
32
-
33
- def gen(prompt, model, tokenizer):
 
 
 
 
 
 
 
 
 
 
 
 
34
  messages = [{"role": "user", "content": prompt}]
35
- text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
36
- inputs = tokenizer(text, return_tensors="pt")
 
 
37
  with torch.no_grad():
38
- outputs = model.generate(
39
  **inputs,
40
- max_new_tokens=300,
41
  temperature=0.7,
42
  top_p=0.9,
43
  do_sample=True,
44
- pad_token_id=tokenizer.eos_token_id,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  )
46
- response = tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
47
- return response.strip()
 
 
 
 
 
 
 
 
 
 
48
 
49
 
50
  def compare(prompt):
51
  if not prompt or not prompt.strip():
52
- return "", ""
53
- base_out = gen(prompt, base_model, base_tokenizer)
54
- bule_out = gen(prompt, bule_model, bule_tokenizer)
55
- return base_out, bule_out
 
 
 
 
 
 
56
 
57
 
58
  CSS = """
59
  /* AeonOS Design System */
60
- .gradio-container { max-width: 960px !important; margin: 0 auto !important; }
61
  .gradio-container, .dark { background: #09090b !important; }
62
 
63
  /* Hero */
@@ -74,10 +133,14 @@ CSS = """
74
  .base-label { color: #71717a !important; font-size: 0.8rem !important; text-transform: uppercase !important; letter-spacing: 0.05em !important; font-weight: 500 !important; }
75
  .void-label { color: #3b82f6 !important; font-size: 0.8rem !important; text-transform: uppercase !important; letter-spacing: 0.05em !important; font-weight: 500 !important; }
76
 
 
 
 
 
77
  /* Input */
 
78
  #prompt-input textarea { background: #111114 !important; border: 1px solid #1f1f23 !important; border-radius: 8px !important; color: #fafafa !important; font-size: 1rem !important; padding: 1rem !important; }
79
  #prompt-input textarea:focus { border-color: #3b82f6 !important; box-shadow: 0 0 0 2px rgba(59,130,246,0.1) !important; }
80
- #prompt-input > label > span { display: none !important; }
81
 
82
  /* Generate button */
83
  #gen-btn { background: #3b82f6 !important; border: none !important; border-radius: 8px !important; font-weight: 500 !important; font-size: 0.9rem !important; padding: 0.75rem 2rem !important; transition: all 150ms !important; }
@@ -92,9 +155,6 @@ CSS = """
92
  #footer p { color: #52525b; font-size: 0.8rem; }
93
  #footer a { color: #3b82f6; text-decoration: none; }
94
 
95
- /* Divider */
96
- .vs-divider { color: #27272a !important; font-size: 0.75rem !important; text-transform: uppercase !important; letter-spacing: 0.1em !important; }
97
-
98
  /* Hide Gradio chrome */
99
  footer.svelte-1ax1toq { display: none !important; }
100
  .built-with { display: none !important; }
@@ -102,54 +162,61 @@ footer.svelte-1ax1toq { display: none !important; }
102
 
103
  with gr.Blocks(css=CSS, theme=gr.themes.Base(primary_hue="blue", neutral_hue="zinc"), title="The Void") as demo:
104
 
105
- # Hero
106
  gr.HTML("""
107
  <div id="hero">
108
  <h1>The <span class="accent">Void</span></h1>
109
- <p class="subtitle">Live inference from models trained on rejection alone. No reward model. No chosen examples.<br/>
110
- Type anything. Both models generate in real-time.</p>
111
  </div>
112
  """)
113
 
114
- # Input
115
  prompt = gr.Textbox(elem_id="prompt-input", placeholder="What would you like to ask?", lines=2, label="Prompt", show_label=False, interactive=True)
116
  btn = gr.Button("Generate", elem_id="gen-btn", variant="primary")
117
 
118
- # Outputs
119
  with gr.Row(equal_height=True):
120
  with gr.Column():
121
- gr.HTML('<p class="base-label">Base Model</p>')
122
- base_out = gr.Textbox(lines=10, show_label=False, interactive=False, elem_classes=["response-card"])
123
- with gr.Column(min_width=40):
 
124
  gr.HTML('<p style="color:#27272a; text-align:center; padding-top:4rem; font-size:0.75rem; letter-spacing:0.1em;">VS</p>')
125
  with gr.Column():
126
- gr.HTML('<p class="void-label">Trained from the Void</p>')
127
- bule_out = gr.Textbox(lines=10, show_label=False, interactive=False, elem_classes=["response-card"])
 
 
 
 
 
 
 
 
 
 
128
 
129
- btn.click(compare, [prompt], [base_out, bule_out])
130
- prompt.submit(compare, [prompt], [base_out, bule_out])
131
 
132
- # Prompt suggestions
133
  gr.HTML('<p style="color:#52525b; font-size:0.8rem; margin-top:1.5rem; margin-bottom:0.5rem;">Try these:</p>')
134
  with gr.Row():
135
  for p in ["hello", "How are you feeling?", "I've been anxious lately.", "Write a haiku about failure.", "What is the meaning of life?"]:
136
  gr.Button(p, size="sm", elem_classes=["prompt-chip"]).click(
137
- fn=lambda x=p: compare(x), outputs=[base_out, bule_out]
138
  ).then(fn=lambda x=p: x, outputs=[prompt])
139
 
140
- # Footer
141
  gr.HTML("""
142
  <div id="footer">
143
- <p style="color:#a1a1aa; font-size:0.85rem; margin-bottom:0.5rem;">SmolLM2-360M-Instruct &nbsp;&middot;&nbsp; Q4_K_M GGUF &nbsp;&middot;&nbsp; Live inference on CPU</p>
 
 
 
144
  <p>
145
- <a href="https://forkracefold.com/">Whitepaper</a> &nbsp;&middot;&nbsp;
146
- <a href="https://github.com/forkjoin-ai/buleyean-rl">Library</a> &nbsp;&middot;&nbsp;
147
- <a href="https://huggingface.co/forkjoin-ai">Models</a> &nbsp;&middot;&nbsp;
148
- <a href="https://huggingface.co/spaces/forkjoin-ai/glossolalia">Glossolalia</a> &nbsp;&middot;&nbsp;
149
- <a href="https://huggingface.co/spaces/forkjoin-ai/void-attention">Void Attention</a> &nbsp;&middot;&nbsp;
150
- <a href="https://huggingface.co/spaces/forkjoin-ai/metacog">METACOG</a>
151
  </p>
152
- <p style="margin-top:1rem;">500+ Lean 4 theorems &nbsp;&middot;&nbsp; Zero sorry &nbsp;&middot;&nbsp; <a href="https://forkracefold.com/">&phi;&sup2; = &phi; + 1</a></p>
153
  </div>
154
  """)
155
 
 
1
  """
2
  The Void -- Buleyean RL
3
+ PyTorch vs Aether. Side by side. Let the speed speak.
4
  """
5
 
6
  import gradio as gr
7
  import torch
8
+ import json
9
+ import time
10
+ import subprocess
11
+ import urllib.request
12
  from transformers import AutoModelForCausalLM, AutoTokenizer
13
 
14
+ # ─── Start Aether sidecar ────────────────────────────────────────────────────
15
+ print("[Void] Starting Aether inference server...", flush=True)
16
+ aether_proc = subprocess.Popen(
17
+ ["node", "aether-server.mjs"],
18
+ env={**__import__('os').environ, "AETHER_PORT": "7861"},
19
+ stdout=subprocess.PIPE,
20
+ stderr=subprocess.STDOUT,
21
+ )
22
 
23
+ # ─── Load PyTorch model ──────────────────────────────────────────────────────
24
+ print("[Void] Loading PyTorch base model...", flush=True)
25
  base_tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-360M-Instruct")
26
  base_model = AutoModelForCausalLM.from_pretrained(
27
  "HuggingFaceTB/SmolLM2-360M-Instruct",
28
  torch_dtype=torch.float32,
29
  device_map="cpu",
30
  )
31
+ print("[Void] PyTorch model ready.", flush=True)
32
+
33
+ # Wait for Aether to be ready
34
+ print("[Void] Waiting for Aether...", flush=True)
35
+ for attempt in range(120):
36
+ try:
37
+ req = urllib.request.Request("http://127.0.0.1:7861/health")
38
+ resp = urllib.request.urlopen(req, timeout=2)
39
+ health = json.loads(resp.read())
40
+ if health.get("status") == "ok" and health.get("model") == "loaded":
41
+ print(f"[Void] Aether ready (model loaded in {health.get('loadTime')}ms)", flush=True)
42
+ break
43
+ except Exception:
44
+ pass
45
+ # Print Aether stdout lines as they come
46
+ import select
47
+ if aether_proc.stdout and select.select([aether_proc.stdout], [], [], 0)[0]:
48
+ line = aether_proc.stdout.readline()
49
+ if line:
50
+ print(f" [Aether] {line.decode().strip()}", flush=True)
51
+ time.sleep(1)
52
+ else:
53
+ print("[Void] WARNING: Aether not ready after 120s, continuing anyway", flush=True)
54
+
55
+
56
+ def gen_pytorch(prompt):
57
+ """Generate with PyTorch (standard)"""
58
  messages = [{"role": "user", "content": prompt}]
59
+ text = base_tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
60
+ inputs = base_tokenizer(text, return_tensors="pt")
61
+
62
+ t0 = time.perf_counter()
63
  with torch.no_grad():
64
+ outputs = base_model.generate(
65
  **inputs,
66
+ max_new_tokens=100,
67
  temperature=0.7,
68
  top_p=0.9,
69
  do_sample=True,
70
+ pad_token_id=base_tokenizer.eos_token_id,
71
+ )
72
+ elapsed = time.perf_counter() - t0
73
+ n_tokens = outputs.shape[1] - inputs["input_ids"].shape[1]
74
+
75
+ response = base_tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True).strip()
76
+ ms_per_tok = (elapsed * 1000 / n_tokens) if n_tokens > 0 else 0
77
+ return response, elapsed, n_tokens, ms_per_tok
78
+
79
+
80
+ def gen_aether(prompt):
81
+ """Generate with Aether (our engine)"""
82
+ try:
83
+ data = json.dumps({"prompt": prompt, "max_tokens": 100}).encode()
84
+ req = urllib.request.Request(
85
+ "http://127.0.0.1:7861/generate",
86
+ data=data,
87
+ headers={"Content-Type": "application/json"},
88
  )
89
+ t0 = time.perf_counter()
90
+ resp = urllib.request.urlopen(req, timeout=300)
91
+ wall_time = time.perf_counter() - t0
92
+ result = json.loads(resp.read())
93
+ return (
94
+ result["text"],
95
+ result["totalTimeMs"] / 1000,
96
+ result["tokens"],
97
+ result["avgTokenMs"],
98
+ )
99
+ except Exception as e:
100
+ return f"[Aether error: {e}]", 0, 0, 0
101
 
102
 
103
  def compare(prompt):
104
  if not prompt or not prompt.strip():
105
+ return "", "", "", ""
106
+
107
+ # Run both
108
+ base_text, base_time, base_toks, base_ms = gen_pytorch(prompt)
109
+ aether_text, aether_time, aether_toks, aether_ms = gen_aether(prompt)
110
+
111
+ base_stats = f"{base_toks} tokens in {base_time:.1f}s ({base_ms:.0f}ms/tok)"
112
+ aether_stats = f"{aether_toks} tokens in {aether_time:.1f}s ({aether_ms:.0f}ms/tok)"
113
+
114
+ return base_text, aether_text, base_stats, aether_stats
115
 
116
 
117
  CSS = """
118
  /* AeonOS Design System */
119
+ .gradio-container { max-width: 1060px !important; margin: 0 auto !important; }
120
  .gradio-container, .dark { background: #09090b !important; }
121
 
122
  /* Hero */
 
133
  .base-label { color: #71717a !important; font-size: 0.8rem !important; text-transform: uppercase !important; letter-spacing: 0.05em !important; font-weight: 500 !important; }
134
  .void-label { color: #3b82f6 !important; font-size: 0.8rem !important; text-transform: uppercase !important; letter-spacing: 0.05em !important; font-weight: 500 !important; }
135
 
136
+ /* Stats */
137
+ .stats-text { font-family: 'SF Mono', 'Fira Code', monospace !important; font-size: 0.8rem !important; color: #52525b !important; }
138
+ .stats-text.faster { color: #22c55e !important; }
139
+
140
  /* Input */
141
+ #prompt-input > label > span { display: none !important; }
142
  #prompt-input textarea { background: #111114 !important; border: 1px solid #1f1f23 !important; border-radius: 8px !important; color: #fafafa !important; font-size: 1rem !important; padding: 1rem !important; }
143
  #prompt-input textarea:focus { border-color: #3b82f6 !important; box-shadow: 0 0 0 2px rgba(59,130,246,0.1) !important; }
 
144
 
145
  /* Generate button */
146
  #gen-btn { background: #3b82f6 !important; border: none !important; border-radius: 8px !important; font-weight: 500 !important; font-size: 0.9rem !important; padding: 0.75rem 2rem !important; transition: all 150ms !important; }
 
155
  #footer p { color: #52525b; font-size: 0.8rem; }
156
  #footer a { color: #3b82f6; text-decoration: none; }
157
 
 
 
 
158
  /* Hide Gradio chrome */
159
  footer.svelte-1ax1toq { display: none !important; }
160
  .built-with { display: none !important; }
 
162
 
163
  with gr.Blocks(css=CSS, theme=gr.themes.Base(primary_hue="blue", neutral_hue="zinc"), title="The Void") as demo:
164
 
 
165
  gr.HTML("""
166
  <div id="hero">
167
  <h1>The <span class="accent">Void</span></h1>
168
+ <p class="subtitle">PyTorch vs Aether. Same model. Different engines. Live inference.<br/>
169
+ Left: standard PyTorch CPU. Right: Aether WASM-SIMD kernels. Both generate in real-time.</p>
170
  </div>
171
  """)
172
 
 
173
  prompt = gr.Textbox(elem_id="prompt-input", placeholder="What would you like to ask?", lines=2, label="Prompt", show_label=False, interactive=True)
174
  btn = gr.Button("Generate", elem_id="gen-btn", variant="primary")
175
 
 
176
  with gr.Row(equal_height=True):
177
  with gr.Column():
178
+ gr.HTML('<p class="base-label">PyTorch (standard)</p>')
179
+ base_out = gr.Textbox(lines=8, show_label=False, interactive=False, elem_classes=["response-card"])
180
+ base_stats = gr.HTML('<p class="stats-text">--</p>')
181
+ with gr.Column(min_width=30):
182
  gr.HTML('<p style="color:#27272a; text-align:center; padding-top:4rem; font-size:0.75rem; letter-spacing:0.1em;">VS</p>')
183
  with gr.Column():
184
+ gr.HTML('<p class="void-label">Aether (our engine)</p>')
185
+ aether_out = gr.Textbox(lines=8, show_label=False, interactive=False, elem_classes=["response-card"])
186
+ aether_stats = gr.HTML('<p class="stats-text">--</p>')
187
+
188
+ def run_compare(prompt_text):
189
+ base_text, aether_text, b_stats, a_stats = compare(prompt_text)
190
+ return (
191
+ base_text,
192
+ aether_text,
193
+ f'<p class="stats-text">{b_stats}</p>',
194
+ f'<p class="stats-text">{a_stats}</p>',
195
+ )
196
 
197
+ btn.click(run_compare, [prompt], [base_out, aether_out, base_stats, aether_stats])
198
+ prompt.submit(run_compare, [prompt], [base_out, aether_out, base_stats, aether_stats])
199
 
 
200
  gr.HTML('<p style="color:#52525b; font-size:0.8rem; margin-top:1.5rem; margin-bottom:0.5rem;">Try these:</p>')
201
  with gr.Row():
202
  for p in ["hello", "How are you feeling?", "I've been anxious lately.", "Write a haiku about failure.", "What is the meaning of life?"]:
203
  gr.Button(p, size="sm", elem_classes=["prompt-chip"]).click(
204
+ fn=lambda x=p: run_compare(x), outputs=[base_out, aether_out, base_stats, aether_stats]
205
  ).then(fn=lambda x=p: x, outputs=[prompt])
206
 
 
207
  gr.HTML("""
208
  <div id="footer">
209
+ <p style="color:#a1a1aa; font-size:0.85rem; margin-bottom:0.5rem;">
210
+ SmolLM2-360M-Instruct &middot; Buleyean RL &middot;
211
+ Left: PyTorch CPU &middot; Right: Aether WASM-SIMD (zero ML dependencies)
212
+ </p>
213
  <p>
214
+ <a href="https://forkracefold.com/">Whitepaper</a> &middot;
215
+ <a href="https://github.com/forkjoin-ai/buleyean-rl">Library</a> &middot;
216
+ <a href="https://huggingface.co/forkjoin-ai">Models</a> &middot;
217
+ <a href="https://huggingface.co/spaces/forkjoin-ai/glossolalia">Glossolalia</a>
 
 
218
  </p>
219
+ <p style="margin-top:1rem;">500+ Lean 4 theorems &middot; Zero sorry &middot; <a href="https://forkracefold.com/">&phi;&sup2; = &phi; + 1</a></p>
220
  </div>
221
  """)
222
 
requirements.txt CHANGED
@@ -1,7 +1,7 @@
1
- --extra-index-url https://download.pytorch.org/whl/cpu
2
  torch>=2.1.0
3
  transformers>=4.46.0
4
  huggingface-hub>=0.26.0
5
  sentencepiece>=0.2.0
6
  accelerate>=1.0.0
7
  gguf>=0.10.0
 
 
 
1
  torch>=2.1.0
2
  transformers>=4.46.0
3
  huggingface-hub>=0.26.0
4
  sentencepiece>=0.2.0
5
  accelerate>=1.0.0
6
  gguf>=0.10.0
7
+ gradio>=5.0.0