#!/bin/bash
#
# Build and smoke-test the ternary inference engine.
#
# Steps:
#   1. Compile ternary_kernel.c into ternary_kernel.so (AVX-512, Skylake).
#   2. Download the Hugging Face checkpoint named by MODEL_HF.
#   3. Run convert.py on the downloaded checkpoint.
#   4. List the converted output and time a single-token forward pass.
#
# Expects ternary_kernel.c and convert.py to be present in WORKDIR, and an
# `inference` Python module to be importable when running from WORKDIR.

# -e: abort on the first failing command; -u: unset variables are errors;
# pipefail: a pipeline fails when any stage fails, not only the last one.
set -euo pipefail

readonly WORKDIR=/root/ternary_engine
readonly MODEL_HF=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
readonly MODEL_HF_DIR="$WORKDIR/deepseek-r1-1.5b-hf"
readonly TERNARY_DIR="$WORKDIR/deepseek-r1-1.5b-ternary"

echo "=== Ternary Inference Engine Build ==="
echo "Target: AVX-512 Skylake"
echo ""

mkdir -p -- "$WORKDIR"
cd -- "$WORKDIR"

# Fail fast: without this check a missing convert.py would only be noticed
# after the kernel build and the (large) model download had already run.
for required in ternary_kernel.c convert.py; do
  if [[ ! -f "$required" ]]; then
    printf 'error: %s not found in %s\n' "$required" "$WORKDIR" >&2
    exit 1
  fi
done

# ---------------------------------------------------------------------------
# [1/4] Kernel
# ---------------------------------------------------------------------------
echo "[1/4] Compiling AVX-512 kernel..."
# -lm goes AFTER the source file: the linker resolves symbols left to right,
# so a library listed before the code that uses it may be discarded.
gcc -O3 -march=skylake-avx512 -mavx512f -mavx512bw -mavx512dq -mavx512vl \
  -shared -fPIC \
  -o ternary_kernel.so ternary_kernel.c -lm
echo " -> ternary_kernel.so built"
ls -lh ternary_kernel.so

# ---------------------------------------------------------------------------
# [2/4] Model download
# ---------------------------------------------------------------------------
echo ""
echo "[2/4] Downloading model weights..."
# `python3 -m pip` installs into the same interpreter that runs the steps
# below. stderr is left visible so that a failed install explains itself
# before `set -e` aborts. huggingface_hub is listed explicitly because the
# download snippet imports it directly.
python3 -m pip install --break-system-packages -q \
  safetensors tokenizers huggingface_hub

# Repo id and target directory are passed as argv rather than spliced into
# the Python source, so quotes or backslashes in them cannot break the code.
python3 - "$MODEL_HF" "$MODEL_HF_DIR" <<'PY'
import sys

from huggingface_hub import snapshot_download

repo_id, local_dir = sys.argv[1:3]
snapshot_download(repo_id, local_dir=local_dir,
                  ignore_patterns=['*.md', '*.txt', 'figures/*'])
print('Download complete')
PY

# ---------------------------------------------------------------------------
# [3/4] Conversion
# ---------------------------------------------------------------------------
echo ""
echo "[3/4] Converting to ternary format..."
# The third argument (0.7) is forwarded unchanged; its meaning is defined by
# convert.py (presumably a quantization threshold -- TODO confirm).
python3 convert.py "$MODEL_HF_DIR" "$TERNARY_DIR" 0.7

# ---------------------------------------------------------------------------
# [4/4] Verification
# ---------------------------------------------------------------------------
echo ""
echo "[4/4] Verifying..."
# sed reads its whole input, so unlike `head` it cannot close the pipe early
# and make `ls` die of SIGPIPE (which pipefail would turn into an abort).
ls -lh -- "$TERNARY_DIR/" | sed -n '1,20p'
echo ""
du -sh -- "$TERNARY_DIR/"
echo ""

echo "Running speed test..."
# Runs from WORKDIR (see `cd` above) so that `inference` is importable from
# the current directory. Paths are passed as argv, not interpolated.
python3 - "$WORKDIR/ternary_kernel.so" "$TERNARY_DIR" <<'PY'
import sys
import time

from inference import KVCache, TernaryQwen, load_kernel

kernel_path, model_dir = sys.argv[1:3]
kernel = load_kernel(kernel_path)
model = TernaryQwen(model_dir, kernel)

# Warm up: one untimed forward pass before measuring.
cache = KVCache(model.n_layers, model.n_kv, model.head_dim)
hidden = model.forward_token(9707, cache, 0)  # 'Hello'

# Benchmark single token: a fresh cache per run so every run starts at
# position 0 with no previously stored state.
times = []
for i in range(5):
    cache2 = KVCache(model.n_layers, model.n_kv, model.head_dim)
    # perf_counter is monotonic and high-resolution, unlike wall-clock time.
    t0 = time.perf_counter()
    h = model.forward_token(9707, cache2, 0)
    times.append(time.perf_counter() - t0)

avg = sum(times) / len(times)
print(f'Single token forward: {avg*1000:.1f}ms ({1/avg:.1f} tok/s)')
print(f'Times: {[f"{t*1000:.1f}ms" for t in times]}')
PY

echo ""
echo "=== Build complete ==="
echo "To start server: cd $WORKDIR && TERNARY_MODEL_DIR=$TERNARY_DIR TOKENIZER_DIR=$MODEL_HF_DIR python3 server.py"
|
|