madDegen committed on
Commit
fdb2c4a
·
verified ·
1 Parent(s): cd8088f

feat: passthrough mode v3 - qwen3:1.7b triage only

Browse files
Files changed (4) hide show
  1. Dockerfile +63 -0
  2. Modelfile.qlawed-frontend +89 -0
  3. litellm_config.yaml +70 -0
  4. startup.sh +111 -0
Dockerfile ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ─────────────────────────────────────────────────────────────
2
+ # Agent Q-Q (Agent Q-QAJAQS) — QLAWED-Q HF Space v2.0
3
+ # SDK: Docker | Public Port: 7860 (LiteLLM proxy)
4
+ #
5
+ # Architecture:
6
+ # Ollama → internal :11434 (model inference)
7
+ # LiteLLM → public :7860 (Claude-compatible API)
8
+ #
9
+ # Endpoints exposed at :7860:
10
+ # POST /v1/chat/completions (OpenAI SDK compatible)
11
+ # POST /v1/messages (Anthropic SDK compatible)
12
+ # GET /health
13
+ # GET /v1/models
14
+ # ─────────────────────────────────────────────────────────────
15
+
16
+ FROM ubuntu:22.04
17
+
18
+ ENV DEBIAN_FRONTEND=noninteractive
19
+
20
+ # ── System dependencies ───────────────────────────────────────
21
+ RUN apt-get update && apt-get install -y \
22
+ curl \
23
+ ca-certificates \
24
+ python3 \
25
+ python3-pip \
26
+ git \
27
+ git-lfs \
28
+ && rm -rf /var/lib/apt/lists/*
29
+
30
+ # ── Ollama ────────────────────────────────────────────────────
31
+ RUN curl -fsSL https://ollama.com/install.sh | sh
32
+
33
+ # ── Python packages (system-wide, available to all users) ─────
34
+ RUN pip3 install --no-cache-dir \
35
+ "litellm[proxy]>=1.40.0" \
36
+ "huggingface_hub[cli]>=0.23.0" \
37
+ PyYAML \
38
+ uvicorn
39
+
40
+ # ── HF Spaces: non-root user uid 1000 ─────────────────────────
41
+ RUN useradd -m -u 1000 user
42
+ USER user
43
+
44
+ # ── Environment ───────────────────────────────────────────────
45
+ ENV OLLAMA_MODELS=/home/user/.ollama/models
46
+ ENV OLLAMA_HOST=0.0.0.0:11434
47
+ ENV HF_HOME=/home/user/.cache/huggingface
48
+ ENV HOME=/home/user
49
+ ENV PATH=/home/user/.local/bin:/usr/local/bin:/usr/bin:$PATH
50
+
51
+ WORKDIR /home/user
52
+
53
+ # ── Copy config files ─────────────────────────────────────────
54
+ COPY --chown=user startup.sh /home/user/startup.sh
55
+ COPY --chown=user litellm_config.yaml /home/user/litellm_config.yaml
56
+ COPY --chown=user Modelfile.qlawed-frontend /home/user/Modelfile.qlawed-frontend
57
+
58
+ RUN chmod +x /home/user/startup.sh
59
+
60
+ # ── Public port: LiteLLM proxy ────────────────────────────────
61
+ EXPOSE 7860
62
+
63
+ CMD ["/home/user/startup.sh"]
Modelfile.qlawed-frontend ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ─────────────────────────────────────────────────────────────
2
+ # Agent Q-Q — Frontend Instructor Agent Modelfile
3
+ #
4
+ # Base: bartowski/Qwen_Qwen3-14B-GGUF IQ4_XS (8.11GB)
5
+ # imatrix calibrated IQ-quant · llama.cpp b5200
6
+ #
7
+ # Role: Primary voice/persona/memory instructional agent
8
+ # for the QLAWED-Q Agent OS and MAD Gambit platform
9
+ # ─────────────────────────────────────────────────────────────
10
+
11
+ FROM /home/user/.ollama/gguf/Qwen_Qwen3-14B-IQ4_XS.gguf
12
+
13
+ # ── Inference parameters ──────────────────────────────────────
14
+ PARAMETER temperature 0.72
15
+ PARAMETER top_p 0.90
16
+ PARAMETER top_k 40
17
+ PARAMETER repeat_penalty 1.08
18
+ PARAMETER num_ctx 32768
19
+ PARAMETER num_predict -1
20
+
21
+ # Stop tokens (ChatML format — Qwen3 native)
22
+ PARAMETER stop "<|im_end|>"
23
+ PARAMETER stop "<|im_start|>"
24
+ PARAMETER stop "<|endoftext|>"
25
+
26
+ # ── System persona ────────────────────────────────────────────
27
+ SYSTEM """
28
+ You are Agent Q-Q (full name: Agent Q-QAJAQS), the primary interface for the QLAWED-Q Agent OS — built by MAD Gambit.
29
+
30
+ ## Identity
31
+ You are a 300IQ confidant and expert assistant. You hold deep expertise across:
32
+ - Prediction markets, conviction trading, and DeFi mechanics
33
+ - Web3 / Web2 full-stack engineering and blockchain development
34
+ - AI, machine learning, and multi-agent systems
35
+ - Lean Six Sigma, product management, and system architecture
36
+ - Research synthesis, ghostwriting, and strategic analysis
37
+
38
+ ## MAD Gambit Platform Context (these numbers never change)
39
+ - Platform name: MAD Gambit
40
+ - Platform fee: 1.88% (applied globally to all markets)
41
+ - Community profit share: 28.8% (display as 28% in presentations)
42
+ - Creator revenue share: 40% (creator-made markets only — not general community)
43
+ - Seed raise: $1–2M at $12M pre-money valuation
44
+ - Token: $MADx (ERC-20)
45
+ - Card game product line: MADHATs (NFT cards within MAD Gambit)
46
+
47
+ ## Tech Stack (always current)
48
+ - Frontend: React 18, TypeScript, Hono/HonoX, TailwindCSS
49
+ - Backend: Supabase (Postgres + Edge Functions), Node.js
50
+ - Blockchain: Solidity, Foundry, OpenZeppelin — Arbitrum One, HyperEVM
51
+ - Oracles: Chainlink VRF + Price Feeds, Pyth Network
52
+ - Account Abstraction: Alchemy AA-SDK (ERC-4337)
53
+ - AI: Claude Opus 4.6, MCP servers, GraphRAG, Instructor
54
+
55
+ ## Response Rules
56
+ - Skip preamble — answer first, context second
57
+ - Use specific numbers, not adjectives
58
+ - Active voice. Short sentences. No filler.
59
+ - Match energy: casual prompt = casual reply, formal request = formal output
60
+ - Never use: leverage, synergy, ecosystem play, unlock value, game-changing, utilize
61
+ - When asked for choices: give 6 ranked options with recommendation clear
62
+ - When uncertain: say so clearly, then offer to research
63
+
64
+ ## Memory Protocol
65
+ You have access to a memory layer. When users share important context:
66
+ - Acknowledge it: "Got it — I'll remember that."
67
+ - Reference prior context naturally in follow-up responses
68
+ - Flag recalled memories: "You mentioned earlier that..."
69
+ - If memory seems stale or conflicting, ask before assuming
70
+
71
+ ## Voice Mode
72
+ When responding to voice input (marked with [VOICE] or in audio context):
73
+ - Keep answers under 3 sentences for simple questions
74
+ - Use natural spoken language — no markdown, no bullet points, no code blocks
75
+ - Confirm before long answers: "Got it. Here's what I found..."
76
+ - Spell out numbers and abbreviations for text-to-speech clarity
77
+
78
+ ## Persona Capability
79
+ You can take on specific personas when instructed. When a persona is set:
80
+ - Maintain it consistently throughout the session
81
+ - Stay in character unless the user explicitly breaks the frame
82
+ - A persona overrides default tone but never overrides factual accuracy or safety
83
+
84
+ ## Output Format Defaults
85
+ - Prose for explanations, not bullets (unless explicitly requested)
86
+ - Code in fenced blocks with language labels
87
+ - Tables for comparisons
88
+ - Never start a response with "Certainly", "Absolutely", or "Great question"
89
+ """
litellm_config.yaml ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ─────────────────────────────────────────────────────────────
2
+ # Agent Q-Q (QLAWED-Q) — HF Space LiteLLM Config
3
+ # Version: 3.0 | MODE: Passthrough / Triage only
4
+ #
5
+ # MODEL ROUTING (passthrough mode):
6
+ # claude-haiku-4-5 → qwen3:1.7b triage, non-generative
7
+ # text-embedding-ada-002, text-embedding-3-small → nomic-embed-text memory embeddings
8
+ #
9
+ # Generative tasks (claude-sonnet-4-6) handled by laptop stack.
10
+ # This Space activates only when laptop is offline.
11
+ # COST: ~1.37GB VRAM resident — T4-small sleeps when idle.
12
+ # ─────────────────────────────────────────────────────────────
13
+
14
+ model_list:
15
+
16
+ # ── Triage / Passthrough (always resident ~1.1GB) ──────────
17
+ - model_name: claude-haiku-4-5
18
+ litellm_params:
19
+ model: ollama/qwen3:1.7b
20
+ api_base: http://localhost:11434
21
+ model_info:
22
+ description: "qwen3:1.7b — triage, routing, non-generative Q&A"
23
+ max_input_tokens: 8192
24
+ max_output_tokens: 4096
25
+ input_cost_per_token: 0
26
+ output_cost_per_token: 0
27
+
28
+ # ── Embeddings (always resident ~0.27GB) ───────────────────
29
+ - model_name: text-embedding-ada-002
30
+ litellm_params:
31
+ model: ollama/nomic-embed-text
32
+ api_base: http://localhost:11434
33
+ model_info:
34
+ description: "nomic-embed-text — pgvector / Supabase memory"
35
+ mode: embedding
36
+ input_cost_per_token: 0
37
+ output_cost_per_token: 0
38
+
39
+ - model_name: text-embedding-3-small
40
+ litellm_params:
41
+ model: ollama/nomic-embed-text
42
+ api_base: http://localhost:11434
43
+ model_info:
44
+ mode: embedding
45
+ input_cost_per_token: 0
46
+ output_cost_per_token: 0
47
+
48
+ # ── Global settings ──────────────────────────────────────────
49
+ litellm_settings:
50
+ drop_params: true
51
+ request_timeout: 120
52
+ telemetry: false
53
+ set_verbose: false
54
+ json_logs: true
55
+ num_retries: 2
56
+
57
+ # ── Router ───────────────────────────────────────────────────
58
+ router_settings:
59
+ routing_strategy: least-busy
60
+ num_retries: 2
61
+ allowed_fails: 1
62
+ cooldown_time: 30
63
+ retry_after: 5
64
+
65
+ # ── Auth ─────────────────────────────────────────────────────
66
+ general_settings:
67
+ master_key: os.environ/LITELLM_MASTER_KEY
68
+ store_model_in_db: false
69
+ allow_user_auth: false
70
+ disable_spend_logs: true
startup.sh ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ # ─────────────────────────────────────────────────────────────
3
+ # Agent Q-Q (QLAWED-Q) — HF Space Startup v3.0
4
+ # MODE: Passthrough / Triage only
5
+ #
6
+ # When laptop is online: laptop handles all generative load.
7
+ # This Space runs only when laptop is offline — triage + embeddings.
8
+ #
9
+ # VRAM budget (T4-small, 16GB):
10
+ # qwen3:1.7b 1.10GB triage, non-generative
11
+ # nomic-embed-text 0.27GB embeddings / pgvector
12
+ # ──────────────────────────────────────────────
13
+ # Total resident: ~1.37GB / 16GB (minimal cost)
14
+ # ─────────────────────────────────────────────────────────────
15
+
# Abort the script as soon as an unhandled command fails.
set -e

# Startup banner naming the two services and their ports, followed by one
# blank line. The quoted delimiter keeps the text literal (no expansion).
cat <<'BANNER'
╔═════════════════════════════════════════════╗
║ Agent Q-Q — Passthrough Mode ║
║ qwen3:1.7b :11434 (triage) ║
║ LiteLLM :7860 (Claude-compat proxy) ║
╚═════════════════════════════════════════════╝

BANNER
24
+
# ── 1. Start Ollama ───────────────────────────────────────────
# Settings exported for `ollama serve`.
# NOTE(review): a 5m keep-alive presumably lets idle models unload after five
# minutes, which would contradict the "always resident" wording used elsewhere
# in this script — confirm against the Ollama documentation.
export OLLAMA_HOST=0.0.0.0:11434
export OLLAMA_KEEP_ALIVE=5m
export OLLAMA_NUM_PARALLEL=2
export OLLAMA_MAX_LOADED_MODELS=2

ollama serve &
OLLAMA_PID=$!

# Poll the tags endpoint until the API answers: at most 40 attempts, 2 s apart.
echo "⏳ Waiting for Ollama on :11434..."
ollama_ready=0
for ((attempt = 1; attempt <= 40; attempt++)); do
  if curl -sf http://localhost:11434/api/tags > /dev/null 2>&1; then
    ollama_ready=1
    break
  fi
  # A server process that is already gone can never become ready, so stop
  # polling immediately instead of burning the remaining attempts.
  if ! kill -0 "$OLLAMA_PID" 2> /dev/null; then
    break
  fi
  if ((attempt < 40)); then
    sleep 2
  fi
done

if ((ollama_ready)); then
  echo "✅ Ollama ready"
else
  echo "❌ Ollama failed to start" >&2
  exit 1
fi
46
+
# ── 2. Pull triage models ─────────────────────────────────────
# Section heading for the model setup step, with a blank line before and after.
printf '\n%s\n\n' "📦 Setting up passthrough models..."
51
+
#######################################
# Make sure an Ollama model is available locally, pulling it when absent.
# Best-effort: a failed pull is reported but never aborts the caller, so the
# rest of startup can continue.
# Arguments: $1 - model name passed to `ollama show` / `ollama pull`
# Outputs:   one status line per step on stdout (plus `ollama pull` output)
# Returns:   0 in every case
#######################################
pull_if_missing() {
  local model="$1"

  # `ollama show` succeeding means the model is already present locally.
  if ollama show "$model" > /dev/null 2>&1; then
    echo " ✓ cached: $model"
    return 0
  fi

  echo " ↓ pulling: $model"
  # Explicit if/else: with `pull && ok || fail`, the failure branch would also
  # run whenever the success message itself could not be written.
  if ollama pull "$model"; then
    echo " ✅ ready: $model"
  else
    echo " ⚠️ failed: $model"
  fi
}
61
+
# Fetch the two passthrough models in order:
#   qwen3:1.7b       — triage agent, ~1.1GB
#   nomic-embed-text — embeddings, ~0.27GB
for required_model in "qwen3:1.7b" "nomic-embed-text"; do
  pull_if_missing "$required_model"
done

# Print what Ollama now has registered, framed by blank lines.
echo ""
echo "📊 Registered models:"
ollama list
echo ""
72
+
# ── 3. Start LiteLLM proxy ───────────────────────────────────
echo "🔀 Starting LiteLLM proxy on :7860..."

litellm \
  --config "$HOME/litellm_config.yaml" \
  --port 7860 \
  --host 0.0.0.0 \
  --telemetry False &
LITELLM_PID=$!

# Probe /health/liveliness instead of /health: once a master key is configured,
# /health requires a bearer token and runs real model calls, so an anonymous
# `curl -f` against it fails even when the proxy is up.
echo "⏳ Waiting for LiteLLM..."
litellm_ready=0
for ((attempt = 1; attempt <= 30; attempt++)); do
  if curl -sf http://localhost:7860/health/liveliness > /dev/null 2>&1; then
    litellm_ready=1
    break
  fi
  # The proxy is the only public endpoint; if its process is gone there is
  # nothing left to wait for, so fail loudly rather than report success later.
  if ! kill -0 "$LITELLM_PID" 2> /dev/null; then
    echo "❌ LiteLLM exited during startup" >&2
    exit 1
  fi
  if ((attempt < 30)); then
    sleep 2
  fi
done

if ((litellm_ready)); then
  echo "✅ LiteLLM ready"
else
  # Still running but not answering yet: keep going, as before (best-effort).
  echo "⚠️ LiteLLM slow to start — may still be initializing"
fi
95
+
# Final status summary (literal text; the quoted delimiter disables expansion).
cat <<'SUMMARY'

═══════════════════════════════════════════════════════════
✅ Agent Q-Q HF Space — PASSTHROUGH MODE LIVE

 Internal Ollama: http://localhost:11434
 Public LiteLLM: http://0.0.0.0:7860

 Routing (passthrough mode):
 claude-haiku-4-5 → qwen3:1.7b (triage/non-generative)
 text-embedding-* → nomic-embed-text (embeddings)

 NOTE: Generative tasks (claude-sonnet-4-6) handled by laptop.
 Start laptop stack to enable full generative capability.
═══════════════════════════════════════════════════════════
SUMMARY

# Supervise both services. Waiting on both PIDs would only return once BOTH
# had exited, leaving the container "up" with a dead proxy or a dead backend.
# Instead, stop as soon as either one goes away.
for service_pid in "$LITELLM_PID" "$OLLAMA_PID"; do
  # Catch a service that already died before supervision began.
  if ! kill -0 "$service_pid" 2> /dev/null; then
    echo "❌ Service with PID $service_pid is not running — shutting down" >&2
    kill "$LITELLM_PID" "$OLLAMA_PID" 2> /dev/null || true
    exit 1
  fi
done

# `wait -n` (bash 4.3+) returns when the next background job finishes.
exit_status=0
wait -n || exit_status=$?
echo "❌ A service exited (status $exit_status) — shutting down" >&2

# Stop whichever service is still running, then propagate the exit status.
kill "$LITELLM_PID" "$OLLAMA_PID" 2> /dev/null || true
exit "$exit_status"