Spaces:

madDegen
/

agentqq

Paused

App Files Files Community

madDegen committed on Apr 18

Commit

fdb2c4a

verified ·

1 Parent(s): cd8088f

feat: passthrough mode v3 - qwen3:1.7b triage only

Browse files

Files changed (4) hide show

Dockerfile +63 -0
Modelfile.qlawed-frontend +89 -0
litellm_config.yaml +70 -0
startup.sh +111 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,63 @@

+# ─────────────────────────────────────────────────────────────
+# Agent Q-Q (Agent Q-QAJAQS) — QLAWED-Q HF Space v2.0
+# SDK: Docker | Public Port: 7860 (LiteLLM proxy)
+#
+# NOTE(review): litellm_config.yaml and startup.sh in this commit are
+# labelled v3.0 — confirm whether the "v2.0" tag above is intentional.
+#
+# Architecture:
+#   Ollama   → internal :11434  (model inference)
+#   LiteLLM  → public   :7860   (Claude-compatible API)
+#
+# Endpoints exposed at :7860:
+#   POST /v1/chat/completions   (OpenAI SDK compatible)
+#   POST /v1/messages           (Anthropic SDK compatible)
+#   GET  /health
+#   GET  /v1/models
+# ─────────────────────────────────────────────────────────────
+FROM ubuntu:22.04
+# Keep apt from prompting interactively during the image build.
+ENV DEBIAN_FRONTEND=noninteractive
+# ── System dependencies ───────────────────────────────────────
+# The apt package lists are removed in the same layer to keep the image small.
+RUN apt-get update && apt-get install -y \
+    curl \
+    ca-certificates \
+    python3 \
+    python3-pip \
+    git \
+    git-lfs \
+    && rm -rf /var/lib/apt/lists/*
+# ── Ollama ────────────────────────────────────────────────────
+# Runs the upstream install script before the USER switch below,
+# i.e. still as the base image's default user.
+RUN curl -fsSL https://ollama.com/install.sh | sh
+# ── Python packages (system-wide, available to all users) ─────
+# Installed before the USER switch so the `litellm` CLI that startup.sh
+# launches is available to the non-root user.
+RUN pip3 install --no-cache-dir \
+    "litellm[proxy]>=1.40.0" \
+    "huggingface_hub[cli]>=0.23.0" \
+    PyYAML \
+    uvicorn
+# ── HF Spaces: non-root user uid 1000 ─────────────────────────
+# Every instruction after USER (and the container itself) runs as `user`.
+RUN useradd -m -u 1000 user
+USER user
+# ── Environment ───────────────────────────────────────────────
+# Model and cache directories live under the user's home.
+# startup.sh re-exports the same OLLAMA_HOST value before `ollama serve`.
+ENV OLLAMA_MODELS=/home/user/.ollama/models
+ENV OLLAMA_HOST=0.0.0.0:11434
+ENV HF_HOME=/home/user/.cache/huggingface
+ENV HOME=/home/user
+ENV PATH=/home/user/.local/bin:/usr/local/bin:/usr/bin:$PATH
+WORKDIR /home/user
+# ── Copy config files ─────────────────────────────────────────
+# --chown=user makes the copied files owned by the runtime user, which is
+# what lets the chmod below succeed without root.
+# NOTE(review): startup.sh never references Modelfile.qlawed-frontend (it
+# runs no `ollama create`), and nothing in this image downloads the GGUF
+# that Modelfile points at — presumably kept for the full generative
+# stack; confirm it is still needed here.
+COPY --chown=user startup.sh             /home/user/startup.sh
+COPY --chown=user litellm_config.yaml    /home/user/litellm_config.yaml
+COPY --chown=user Modelfile.qlawed-frontend /home/user/Modelfile.qlawed-frontend
+RUN chmod +x /home/user/startup.sh
+# ── Public port: LiteLLM proxy ────────────────────────────────
+# Only the LiteLLM port is declared; Ollama's :11434 is not EXPOSEd.
+EXPOSE 7860
+# Exec-form CMD: startup.sh is started directly, without a wrapping shell.
+CMD ["/home/user/startup.sh"]

Modelfile.qlawed-frontend ADDED Viewed

	@@ -0,0 +1,89 @@

+# ─────────────────────────────────────────────────────────────
+# Agent Q-Q — Frontend Instructor Agent Modelfile
+#
+# Base: bartowski/Qwen_Qwen3-14B-GGUF IQ4_XS  (8.11GB)
+#       imatrix calibrated IQ-quant · llama.cpp b5200
+#
+# Role: Primary voice/persona/memory instructional agent
+#       for the QLAWED-Q Agent OS and MAD Gambit platform
+# ─────────────────────────────────────────────────────────────
+# Base weights are read from a local GGUF file rather than a registry tag.
+# NOTE(review): neither the Dockerfile nor startup.sh in this commit
+# downloads this file or runs `ollama create` with this Modelfile, so it
+# appears to be inert in passthrough mode — confirm.
+FROM /home/user/.ollama/gguf/Qwen_Qwen3-14B-IQ4_XS.gguf
+# ── Inference parameters ──────────────────────────────────────
+# Sampling settings for the persona model.
+PARAMETER temperature 0.72
+PARAMETER top_p 0.90
+PARAMETER top_k 40
+PARAMETER repeat_penalty 1.08
+# Context window size in tokens.
+PARAMETER num_ctx 32768
+# -1: no fixed cap on generated tokens (see the Ollama Modelfile reference).
+PARAMETER num_predict -1
+# Stop tokens (ChatML format — Qwen3 native)
+PARAMETER stop "<|im_end|>"
+PARAMETER stop "<|im_start|>"
+PARAMETER stop "<|endoftext|>"
+# ── System persona ────────────────────────────────────────────
+SYSTEM """
+You are Agent Q-Q (full name: Agent Q-QAJAQS), the primary interface for the QLAWED-Q Agent OS — built by MAD Gambit.
+## Identity
+You are a 300IQ confidant and expert assistant. You hold deep expertise across:
+- Prediction markets, conviction trading, and DeFi mechanics
+- Web3 / Web2 full-stack engineering and blockchain development
+- AI, machine learning, and multi-agent systems
+- Lean Six Sigma, product management, and system architecture
+- Research synthesis, ghostwriting, and strategic analysis
+## MAD Gambit Platform Context (these numbers never change)
+- Platform name: MAD Gambit
+- Platform fee: 1.88% (applied globally to all markets)
+- Community profit share: 28.8% (display as 28% in presentations)
+- Creator revenue share: 40% (creator-made markets only — not general community)
+- Seed raise: $1–2M at $12M pre-money valuation
+- Token: $MADx (ERC-20)
+- Card game product line: MADHATs (NFT cards within MAD Gambit)
+## Tech Stack (always current)
+- Frontend: React 18, TypeScript, Hono/HonoX, TailwindCSS
+- Backend: Supabase (Postgres + Edge Functions), Node.js
+- Blockchain: Solidity, Foundry, OpenZeppelin — Arbitrum One, HyperEVM
+- Oracles: Chainlink VRF + Price Feeds, Pyth Network
+- Account Abstraction: Alchemy AA-SDK (ERC-4337)
+- AI: Claude Opus 4.6, MCP servers, GraphRAG, Instructor
+## Response Rules
+- Skip preamble — answer first, context second
+- Use specific numbers, not adjectives
+- Active voice. Short sentences. No filler.
+- Match energy: casual prompt = casual reply, formal request = formal output
+- Never use: leverage, synergy, ecosystem play, unlock value, game-changing, utilize
+- When asked for choices: give 6 ranked options with recommendation clear
+- When uncertain: say so clearly, then offer to research
+## Memory Protocol
+You have access to a memory layer. When users share important context:
+- Acknowledge it: "Got it — I'll remember that."
+- Reference prior context naturally in follow-up responses
+- Flag recalled memories: "You mentioned earlier that..."
+- If memory seems stale or conflicting, ask before assuming
+## Voice Mode
+When responding to voice input (marked with [VOICE] or in audio context):
+- Keep answers under 3 sentences for simple questions
+- Use natural spoken language — no markdown, no bullet points, no code blocks
+- Confirm before long answers: "Got it. Here's what I found..."
+- Spell out numbers and abbreviations for text-to-speech clarity
+## Persona Capability
+You can take on specific personas when instructed. When a persona is set:
+- Maintain it consistently throughout the session
+- Stay in character unless the user explicitly breaks the frame
+- A persona overrides default tone but never overrides factual accuracy or safety
+## Output Format Defaults
+- Prose for explanations, not bullets (unless explicitly requested)
+- Code in fenced blocks with language labels
+- Tables for comparisons
+- Never start a response with "Certainly", "Absolutely", or "Great question"
+"""

litellm_config.yaml ADDED Viewed

	@@ -0,0 +1,70 @@

+# ─────────────────────────────────────────────────────────────
+# Agent Q-Q (QLAWED-Q) — HF Space LiteLLM Config
+# Version: 3.0 | MODE: Passthrough / Triage only
+#
+# MODEL ROUTING (passthrough mode):
+#   claude-haiku-4-5   → qwen3:1.7b        triage, non-generative
+#   text-embedding-*   → nomic-embed-text   memory embeddings
+#
+# Generative tasks (claude-sonnet-4-6) handled by laptop stack.
+# This Space activates only when laptop is offline.
+# COST: ~1.37GB VRAM resident — T4-small sleeps when idle.
+# ─────────────────────────────────────────────────────────────
+# Each entry maps a public alias (`model_name`, what clients request) to the
+# local Ollama model in `litellm_params.model`, served from :11434.
+model_list:
+  # ── Triage / Passthrough (always resident ~1.1GB) ──────────
+  # NOTE(review): startup.sh sets OLLAMA_KEEP_ALIVE=5m, so "always resident"
+  # presumably only holds while requests keep arriving — confirm.
+  - model_name: claude-haiku-4-5
+    litellm_params:
+      model: ollama/qwen3:1.7b
+      api_base: http://localhost:11434
+    model_info:
+      description: "qwen3:1.7b — triage, routing, non-generative Q&A"
+      max_input_tokens: 8192
+      max_output_tokens: 4096
+      # Local inference: cost tracking is zeroed out.
+      input_cost_per_token: 0
+      output_cost_per_token: 0
+  # ── Embeddings (always resident ~0.27GB) ───────────────────
+  # Two OpenAI-style embedding aliases both resolve to the same local model.
+  - model_name: text-embedding-ada-002
+    litellm_params:
+      model: ollama/nomic-embed-text
+      api_base: http://localhost:11434
+    model_info:
+      description: "nomic-embed-text — pgvector / Supabase memory"
+      mode: embedding
+      input_cost_per_token: 0
+      output_cost_per_token: 0
+  - model_name: text-embedding-3-small
+    litellm_params:
+      model: ollama/nomic-embed-text
+      api_base: http://localhost:11434
+    model_info:
+      mode: embedding
+      input_cost_per_token: 0
+      output_cost_per_token: 0
+# ── Global settings ──────────────────────────────────────────
+litellm_settings:
+  # Drop request parameters the backend does not support instead of erroring.
+  drop_params: true
+  # Seconds.
+  request_timeout: 120
+  telemetry: false
+  set_verbose: false
+  json_logs: true
+  # Also set under router_settings below; keep the two values in sync.
+  num_retries: 2
+# ── Router ───────────────────────────────────────────────────
+# Every model_name above has exactly one deployment, so the routing strategy
+# has nothing to choose between in this configuration.
+router_settings:
+  routing_strategy: least-busy
+  num_retries: 2
+  allowed_fails: 1
+  cooldown_time: 30
+  retry_after: 5
+# ── Auth ─────────────────────────────────────────────────────
+general_settings:
+  # Resolved from the LITELLM_MASTER_KEY environment variable at startup.
+  # NOTE(review): nothing in the Dockerfile or startup.sh sets or checks this
+  # variable — presumably supplied as a Space secret; confirm, since the proxy
+  # is public on :7860. With a key set, LiteLLM's /health route requires it.
+  master_key: os.environ/LITELLM_MASTER_KEY
+  store_model_in_db: false
+  allow_user_auth: false
+  disable_spend_logs: true

startup.sh ADDED Viewed

	@@ -0,0 +1,111 @@

+#!/bin/bash
+# ─────────────────────────────────────────────────────────────
+# Agent Q-Q (QLAWED-Q) — HF Space Startup v3.0
+# MODE: Passthrough / Triage only
+#
+# When laptop is online: laptop handles all generative load.
+# This Space runs only when laptop is offline — triage + embeddings.
+#
+# VRAM budget (T4-small, 16GB):
+#   qwen3:1.7b       1.10GB  triage, non-generative
+#   nomic-embed-text 0.27GB  embeddings / pgvector
+#   ──────────────────────────────────────────────
+#   Total resident:  ~1.37GB / 16GB  (minimal cost)
+# ─────────────────────────────────────────────────────────────
+set -e
+echo "╔═════════════════════════════════════════════╗"
+echo "║   Agent Q-Q — Passthrough Mode             ║"
+echo "║   qwen3:1.7b  :11434 (triage)              ║"
+echo "║   LiteLLM     :7860  (Claude-compat proxy) ║"
+echo "╚═════════════════════════════════════════════╝"
+echo ""
+# ── 1. Start Ollama ───────────────────────────────────────────
+export OLLAMA_HOST=0.0.0.0:11434
+export OLLAMA_KEEP_ALIVE=5m
+export OLLAMA_NUM_PARALLEL=2
+export OLLAMA_MAX_LOADED_MODELS=2
+ollama serve &
+OLLAMA_PID=$!
+echo "⏳ Waiting for Ollama on :11434..."
+for i in {1..40}; do
+    if curl -sf http://localhost:11434/api/tags > /dev/null 2>&1; then
+        echo "✅ Ollama ready"
+        break
+    fi
+    if [ $i -eq 40 ]; then
+        echo "❌ Ollama failed to start"
+        exit 1
+    fi
+    sleep 2
+done
+# ── 2. Pull triage models ─────────────────────────────────────
+echo ""
+echo "📦 Setting up passthrough models..."
+echo ""
+#######################################
+# Ensure an Ollama model is available locally, pulling it when absent.
+# Best-effort: a failed pull is reported on stderr but does not abort the
+# script, so the function always returns 0.
+# Arguments: $1 - model tag, e.g. "qwen3:1.7b"
+# Outputs:   progress lines on stdout, failure warning on stderr
+#######################################
+pull_if_missing() {
+    local model="$1"
+    if ollama show "$model" > /dev/null 2>&1; then
+        echo "  ✓ cached: $model"
+        return 0
+    fi
+    echo "  ↓ pulling: $model"
+    # Explicit if/else: with `a && b || c` the failure branch would also run
+    # whenever the success branch itself returned non-zero.
+    if ollama pull "$model"; then
+        echo "  ✅ ready: $model"
+    else
+        echo "  ⚠️  failed: $model" >&2
+    fi
+}
+# Triage agent — always resident, ~1.1GB
+pull_if_missing "qwen3:1.7b"
+# Embeddings — always resident, ~0.27GB
+pull_if_missing "nomic-embed-text"
+echo ""
+echo "📊 Registered models:"
+ollama list
+echo ""
+# ── 3. Start LiteLLM proxy ───────────────────────────────────
+echo "🔀 Starting LiteLLM proxy on :7860..."
+litellm \
+    --config "$HOME/litellm_config.yaml" \
+    --port 7860 \
+    --host 0.0.0.0 \
+    --telemetry False &
+LITELLM_PID=$!
+echo "⏳ Waiting for LiteLLM..."
+for i in {1..30}; do
+    if curl -sf http://localhost:7860/health > /dev/null 2>&1; then
+        echo "✅ LiteLLM ready"
+        break
+    fi
+    if [ $i -eq 30 ]; then
+        echo "⚠️  LiteLLM slow to start — may still be initializing"
+        break
+    fi
+    sleep 2
+done
+echo ""
+echo "═══════════════════════════════════════════════════════════"
+echo "✅ Agent Q-Q HF Space — PASSTHROUGH MODE LIVE"
+echo ""
+echo "   Internal Ollama:  http://localhost:11434"
+echo "   Public LiteLLM:   http://0.0.0.0:7860"
+echo ""
+echo "   Routing (passthrough mode):"
+echo "     claude-haiku-4-5  → qwen3:1.7b       (triage/non-generative)"
+echo "     text-embedding-*  → nomic-embed-text  (embeddings)"
+echo ""
+echo "   NOTE: Generative tasks (claude-sonnet-4-6) handled by laptop."
+echo "   Start laptop stack to enable full generative capability."
+echo "═══════════════════════════════════════════════════════════"
+wait $LITELLM_PID $OLLAMA_PID