#!/usr/bin/env bash
# AI Fallback Chain (cost-optimized, free-first, local Ollama backstop)
#
# Priority chain (mirrors PROVIDERS at the bottom of this file):
# 1. Free API tiers: GitHub Models / SambaNova / Cloudflare / Groq
# 2. OpenRouter free models (strict per-model rate limits)
# 3. Gemini 2.5 Flash (free, 1000/day)
# 4. Claude Sonnet 4.6 via Max subscription (flat $100/mo safety net)
# 5. Local Ollama (always-on backstop); Opus only via --force
#
# Usage:
# ai-fallback.sh "your question"
# ai-fallback.sh --force gpt5 "your question"
# ai-fallback.sh --tier cheap "your question" # OpenRouter uses DeepSeek
# ai-fallback.sh --skip claude-opus "your question"
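# Additional flag combinations (illustrative; every flag is parsed below):
# ai-fallback.sh --task coding -v "refactor parse_args()"
# ai-fallback.sh --cheap "bulk-summarize these meeting notes"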
set -e
# Source API keys FIRST — load BOTH env files (surrogate + hermes).
# Order matters: surrogate's .env loads first, hermes's .env wins on conflict
# (hermes has newer keys like GITHUB_MODELS_TOKEN, SAMBANOVA_API_KEY, CLOUDFLARE_*)
# shellcheck disable=SC1090
set -a
[ -f "$HOME/.surrogate/.env" ] && . "$HOME/.surrogate/.env"
[ -f "$HOME/.hermes/.env" ] && . "$HOME/.hermes/.env"
set +a
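# Illustrative .env contents (placeholder values; these are the keys the try_*
# functions below read):
#   OPENROUTER_API_KEY=sk-or-...
#   GEMINI_API_KEY=...          GROQ_API_KEY=...
#   GITHUB_MODELS_TOKEN=ghp_... SAMBANOVA_API_KEY=...
#   CLOUDFLARE_API_TOKEN=...    CLOUDFLARE_ACCOUNT_ID=...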
QUERY=""
FORCE=""
SKIP=""
VERBOSE=0
TASK=""
export OR_TIER=""
while [ $# -gt 0 ]; do
case "$1" in
--force) FORCE="$2"; shift 2 ;;
--skip) SKIP="$2"; shift 2 ;;
--tier) export OR_TIER="$2"; shift 2 ;;
--task) TASK="$2"; shift 2 ;;
--cheap) export OR_TIER="cheap"; shift ;;
--fast) export OR_TIER="fast"; shift ;;
--balanced) export OR_TIER="balanced"; shift ;;
--premium) export OR_TIER="premium"; shift ;;
-v|--verbose) VERBOSE=1; shift ;;
*) QUERY="$QUERY $1"; shift ;;
esac
done
QUERY=$(echo "$QUERY" | sed 's/^ *//')
[ -z "$QUERY" ] && { head -15 "$0"; exit 1; }
# --task <type> β€” pick the strongest free model per provider for the task.
# Sets per-provider env vars that try_* functions read (bridge --model alias).
# Auto-detect if not provided: code keywords β†’ coding, reasoning keywords β†’ reasoning.
if [ -z "$TASK" ]; then
q_lower=$(echo "$QUERY" | tr '[:upper:]' '[:lower:]')
if echo "$q_lower" | grep -qE "code|function|implement|refactor|bug|class|method|api|sql|terraform|cloudformation|dockerfile|kubernetes|yaml|typescript|javascript|python|rust|golang"; then
TASK="coding"
elif echo "$q_lower" | grep -qE "analyze|reason|explain why|compare|evaluate|architect|design|trade-?off|deep|think step|proof|calculate|complex"; then
TASK="reasoning"
fi
fi
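# Illustrative detection: "fix this python bug" matches the code regex, so
# TASK=coding; "compare these designs and explain why" matches the reasoning
# regex; anything else leaves TASK empty and uses the chat/creative default.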
case "$TASK" in
coding)
# Code = Codestral (GitHub, Mistral) / DeepSeek-V3.1 (SambaNova) / Qwen Coder (local)
export GITHUB_MODEL="codestral" ; export SAMBANOVA_MODEL="deepseek"
export CLOUDFLARE_MODEL="deepseek" ; export GROQ_MODEL="qwen"
export LOCAL_MODEL="qwen-coder"
;;
reasoning)
# Reasoning = DeepSeek R1 (GitHub, <think> CoT) / Grok 3 / DeepSeek R1 distill (CF)
export GITHUB_MODEL="reasoning" ; export SAMBANOVA_MODEL="deepseek-latest"
export CLOUDFLARE_MODEL="reasoning" ; export GROQ_MODEL="qwen"
export LOCAL_MODEL="granite"
;;
fast)
# Fast = smallest/quickest tier per provider
export GITHUB_MODEL="mini" ; export SAMBANOVA_MODEL="fast"
export CLOUDFLARE_MODEL="fast" ; export GROQ_MODEL="fast"
export LOCAL_MODEL="tiny"
;;
long-context|long|kimi)
# 200k+ context β€” Kimi on CF, gpt-oss-120b elsewhere
export GITHUB_MODEL="llama405" ; export SAMBANOVA_MODEL="gpt-oss"
export CLOUDFLARE_MODEL="kimi" ; export GROQ_MODEL="gpt-oss"
export LOCAL_MODEL="granite"
;;
creative|chat|*)
# Default β€” smartest general-purpose free model per provider
export GITHUB_MODEL="gpt-4o" ; export SAMBANOVA_MODEL="llama70"
export CLOUDFLARE_MODEL="gpt-oss" ; export GROQ_MODEL="llama70"
export LOCAL_MODEL="granite"
;;
esac
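# The same aliases can be pinned by hand (illustrative):
#   GITHUB_MODEL=codestral ai-fallback.sh --force gh "write a bash arg parser"
#   CLOUDFLARE_MODEL=kimi  ai-fallback.sh --force cf "summarize this long doc"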
# --- Semantic RAG context injection (embedding-powered) ---
# For coding/reasoning/creative tasks, fetch the top semantically similar docs
# from embeddings.db and prepend them to QUERY. ~50ms overhead, improves grounding.
if [[ "$TASK" == "coding" || "$TASK" == "reasoning" || "$TASK" == "creative" ]]; then
if [[ -f "$HOME/.surrogate/embeddings.db" ]]; then
EMB_COUNT=$(sqlite3 "$HOME/.surrogate/embeddings.db" 'SELECT COUNT(*) FROM embeddings' 2>/dev/null || echo 0)
if [[ "$EMB_COUNT" -ge 100 ]]; then
SEM_CONTEXT=$(python3 "$HOME/.surrogate/bin/embed-doc.py" --query "$QUERY" 2>/dev/null | head -15)
if [[ -n "$SEM_CONTEXT" ]]; then
QUERY="=== RAG CONTEXT (top-5 semantic matches from knowledge base) ===
$SEM_CONTEXT
=== TASK ===
$QUERY"
fi
fi
fi
fi
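# Manual spot-check of the same lookup (illustrative):
#   python3 ~/.surrogate/bin/embed-doc.py --query "cloudformation tagging pattern"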
log() { [ "$VERBOSE" -eq 1 ] || return 0; echo "[$(date +%H:%M:%S)] $*" >&2; }  # return 0 keeps set -e happy when quiet
# Capture successful response β†’ log to knowledge base (non-blocking)
save_response() {
local provider="$1" model="$2" response="$3"
[ -z "$response" ] && return
( "$HOME/.surrogate/bin/log-interaction.sh" "$QUERY" "$response" "$provider" "$model" > /dev/null 2>&1 & ) || true
}
# --- System prompt from knowledge base + auto code-search if code query ---
build_system_prompt() {
local kb="" profile="" code_ctx="" q_lower
[ -f "$HOME/.surrogate/memory/knowledge_index.md" ] && kb="$(head -50 $HOME/.surrogate/memory/knowledge_index.md)"
[ -f "$HOME/.surrogate/memory/user_profile.md" ] && profile="$(cat $HOME/.surrogate/memory/user_profile.md)"
q_lower=$(echo "$QUERY" | /usr/bin/tr '[:upper:]' '[:lower:]')
local is_generate=0 is_code=0
echo "$q_lower" | grep -qE "code|function|implement|refactor|bug|error|class|method|api|endpoint|schema|model|service|controller|middleware|auth|database|query|sql|deploy|pipeline|terraform|cloudformation|dockerfile|kubernetes|helm|yaml" && is_code=1
echo "$q_lower" | grep -qE "create|generate|write|build|new|template|scaffold|design" && is_generate=1
if [ "$is_code" = "1" ] && [ -d "$HOME/.surrogate/code-vector-db" ]; then
if [ "$is_generate" = "1" ] && [ -x "$HOME/.surrogate/bin/find-gold-examples.sh" ]; then
# Generation task β†’ inject FULL reference files (better style match)
code_ctx=$("$HOME/.surrogate/bin/find-gold-examples.sh" --top 2 --max-bytes 5000 "$QUERY" 2>/dev/null)
elif [ -x "$HOME/.surrogate/bin/code-search.sh" ]; then
# Query task β†’ snippets only (faster)
code_ctx=$("$HOME/.surrogate/bin/code-search.sh" --top 3 "$QUERY" 2>/dev/null | head -60)
fi
fi
local prompt="You are Ashira's AI assistant. Context: $profile
Pattern index: $kb"
if [ -n "$code_ctx" ]; then
prompt="$prompt
=== ASHIRA'S EXISTING CODE (match this style EXACTLY) ===
$code_ctx
=== END EXAMPLES ===
Style rules enforced:
- Follow naming/indent/comment style from examples above
- Use exact same Parameter/Resource names when applicable
- Preserve existing conventions (tags, naming, Description format)"
fi
prompt="$prompt
Be concise. Cite file paths when referencing existing code."
echo "$prompt"
}
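# Assembled prompt shape (illustrative):
#   You are Ashira's AI assistant. Context: <user_profile.md>
#   Pattern index: <first 50 lines of knowledge_index.md>
#   [optional: existing-code examples + style rules, when code-vector-db hits]
#   Be concise. Cite file paths when referencing existing code.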
SYSTEM=$(build_system_prompt)
# --- Anthropic via Max plan (routes through claude-bridge.sh CLI) ---
# Direct HTTPS to api.anthropic.com with OAuth token returns 401 β€” OAuth flow
# is managed by `claude` CLI (keychain/config). Use the bridge instead.
try_anthropic() {
local model="$1" extra="$2"
log "β†’ Claude Max: $model"
local out
out=$(echo "$QUERY" | "$HOME/.surrogate/bin/claude-bridge.sh" --model "$model" $extra 2>>/tmp/ai-fallback.err) || return 1
[ -z "$out" ] && return 1
echo "$out"
save_response "anthropic" "$model" "$out"
return 0
}
# Opus needs --force outside 01:00-06:00 window; sonnet is always available
try_claude_opus() { try_anthropic "opus" "--force"; }
try_claude_sonnet() { try_anthropic "sonnet" ""; }
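# Illustrative equivalent of what the chain runs:
#   echo "review this diff" | ~/.surrogate/bin/claude-bridge.sh --model sonnet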
# OpenRouter FREE β€” tries multiple free models (each has strict rate limit)
# Order: coder-first β†’ general-powerhouse β†’ smaller fallbacks
try_openrouter_free() {
[ -z "${OPENROUTER_API_KEY:-}" ] && return 2
local free_models=(
"qwen/qwen3-coder:free"
"qwen/qwen3-next-80b-a3b-instruct:free"
"openai/gpt-oss-120b:free"
"nvidia/nemotron-3-super-120b-a12b:free"
"meta-llama/llama-3.3-70b-instruct:free"
"z-ai/glm-4.5-air:free"
"google/gemma-4-31b-it:free"
"openai/gpt-oss-20b:free"
)
for m in "${free_models[@]}"; do
OPENROUTER_MODEL="$m" try_openrouter && return 0
log " ↳ free '$m' unavailable, trying next free..."
done
return 1
}
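# Illustrative: run only the free rotation, skipping the default chain order:
#   ai-fallback.sh --force free "translate this error message"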
# --- OpenRouter (cheap+capable non-Sonnet picks) ---
try_openrouter() {
[ -z "${OPENROUTER_API_KEY:-}" ] && return 2
# Default: GPT-5.4 (beats Claude Opus 4.6 per benchmarks, -50% cost vs Opus 4.7)
local model="${OPENROUTER_MODEL:-openai/gpt-5.4}"
case "${OR_TIER:-}" in
# PAID tiers
cheap) model="deepseek/deepseek-v3.2" ;; # $0.26/$0.42 β€” cheapest capable
fast) model="x-ai/grok-4.1-fast" ;; # $0.20/$0.50 β€” ultra cheap, 2M ctx
balanced) model="openai/gpt-5.4" ;; # $2.50/$15 β€” DEFAULT, beats Opus 4.6
premium) model="anthropic/claude-opus-4.7" ;; # $5/$25 β€” if really need Opus
grok) model="x-ai/grok-4.20" ;; # $2/$6 β€” 2M ctx, cool
gemini) model="google/gemini-3.1-pro-preview" ;;# $2/$12
# FREE tiers (29 models available)
free|free-coder) model="qwen/qwen3-coder:free" ;; # coding, 262k ctx
free-large) model="qwen/qwen3-next-80b-a3b-instruct:free" ;; # 80B MoE
free-nvidia) model="nvidia/nemotron-3-super-120b-a12b:free" ;; # 120B
free-gptoss) model="openai/gpt-oss-120b:free" ;; # OpenAI open-sourced
free-llama) model="meta-llama/llama-3.3-70b-instruct:free" ;;
free-kimi) model="moonshotai/kimi-k2.5" ;; # Kimi 256k ctx
free-glm) model="z-ai/glm-4.5-air:free" ;;
free-gemma) model="google/gemma-4-31b-it:free" ;; # Google Gemma 4
esac
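# Tier selection examples (illustrative):
#   ai-fallback.sh --tier free-gptoss "..."    # pin one free model
#   OPENROUTER_MODEL=deepseek/deepseek-v3.2 ai-fallback.sh --force or "..."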
log "β†’ OpenRouter: $model"
local body
# Use env vars β€” avoids quote-escape hell with multiline system prompt.
# max_tokens=4000 (GPT-5.4 requires >= 16; stay well above)
body=$(ORM="$model" SYS="$SYSTEM" Q="$QUERY" "$HOME/.surrogate/venv/bin/python" -c "
import json, os
m = {'model':os.environ['ORM'],'max_tokens':4000,
'messages':[{'role':'system','content':os.environ['SYS']},
{'role':'user','content':os.environ['Q']}]}
print(json.dumps(m))
" 2>&1) || { log " body-build failed: $body"; return 1; }
local resp code body_resp
resp=$(curl -sS -w "\n%{http_code}" \
--max-time 90 \
-X POST "https://openrouter.ai/api/v1/chat/completions" \
-H "Authorization: Bearer $OPENROUTER_API_KEY" \
-H "HTTP-Referer: https://ashira.local" \
-H "X-Title: ai-fallback" \
-H "content-type: application/json" \
-d "$body" 2>&1)
code=$(echo "$resp" | tail -1)
body_resp=$(echo "$resp" | sed '$d')
if [ "$code" != "200" ]; then
# Log real error reason for debug
local errmsg
errmsg=$(echo "$body_resp" | "$HOME/.surrogate/venv/bin/python" -c "
import sys, json
try: d=json.load(sys.stdin); print(d.get('error',{}).get('message','unknown')[:120])
except: print('parse-fail')
" 2>/dev/null || echo "unknown")
log " [$code] $errmsg β€” falling through"
return 1
fi
local out
out=$(echo "$body_resp" | "$HOME/.surrogate/venv/bin/python" -c "
import sys, json
d = json.load(sys.stdin)
print(d['choices'][0]['message']['content'])
") || return 1
echo "$out"
save_response "openrouter" "$model" "$out"
return 0
}
# --- Gemini (free) ---
try_gemini() {
[ -z "${GEMINI_API_KEY:-}" ] && return 2
local model="${GEMINI_MODEL:-gemini-2.5-flash}"
log "β†’ Gemini: $model (free)"
local body
# Env vars again: direct interpolation into Python source breaks on quotes/backslashes.
body=$(SYS="$SYSTEM" Q="$QUERY" "$HOME/.surrogate/venv/bin/python" -c "
import json, os
m = {'systemInstruction':{'parts':[{'text':os.environ['SYS']}]},
'contents':[{'role':'user','parts':[{'text':os.environ['Q']}]}],
'generationConfig':{'maxOutputTokens':4000}}
print(json.dumps(m))
" 2>/dev/null)
local resp code body_resp
resp=$(curl -sS -w "\n%{http_code}" \
-X POST "https://generativelanguage.googleapis.com/v1beta/models/$model:generateContent?key=$GEMINI_API_KEY" \
-H "content-type: application/json" -d "$body" 2>&1)
code=$(echo "$resp" | tail -1)
body_resp=$(echo "$resp" | sed '$d')
[ "$code" != "200" ] && { log " [$code] falling through"; return 1; }
local out
out=$(echo "$body_resp" | "$HOME/.surrogate/venv/bin/python" -c "
import sys, json
d = json.load(sys.stdin)
print(d['candidates'][0]['content']['parts'][0]['text'])
") || return 1
echo "$out"
save_response "gemini" "$model" "$out"
return 0
}
# --- Groq (free, ultra-fast) ---
try_groq() {
[ -z "${GROQ_API_KEY:-}" ] && return 2
local model="${GROQ_MODEL:-llama70}"
log "β†’ Groq: $model (free)"
# Route through groq-bridge for consistent alias handling (llama70, fast, qwen, gpt-oss...)
local out
out=$(echo "$QUERY" | "$HOME/.surrogate/bin/groq-bridge.sh" --model "$model" 2>>/tmp/ai-fallback.err) || return 1
[ -z "$out" ] && return 1
echo "$out"
save_response "groq" "$model" "$out"
return 0
}
# --- GitHub Models (free via PAT, OpenAI-compat, GPT-4o-mini/Llama 3.3/Mistral/DeepSeek) ---
try_github() {
[ -z "${GITHUB_MODELS_TOKEN:-}${GITHUB_TOKEN:-}" ] && return 2
local model="${GITHUB_MODEL:-gpt-4o}"
log "β†’ GitHub Models: $model (free)"
local out
out=$(echo "$QUERY" | "$HOME/.surrogate/bin/github-bridge.sh" --model "$model" 2>>/tmp/ai-fallback.err) || return 1
[ -z "$out" ] && return 1
echo "$out"
save_response "github" "$model" "$out"
return 0
}
# --- SambaNova Cloud (free, ~500 tok/s Llama 3.3 70B / DeepSeek V3.2 / Llama 4) ---
try_sambanova() {
[ -z "${SAMBANOVA_API_KEY:-}" ] && return 2
local model="${SAMBANOVA_MODEL:-llama70}"
log "β†’ SambaNova: $model (free)"
local out
out=$(echo "$QUERY" | "$HOME/.surrogate/bin/sambanova-bridge.sh" --model "$model" 2>>/tmp/ai-fallback.err) || return 1
[ -z "$out" ] && return 1
echo "$out"
save_response "sambanova" "$model" "$out"
return 0
}
# --- Cloudflare Workers AI (free 10k neurons/day, Llama 3.3 / Gemma-3 / Qwen Coder) ---
try_cloudflare() {
[ -z "${CLOUDFLARE_API_TOKEN:-}${CF_API_TOKEN:-}" ] && return 2
[ -z "${CLOUDFLARE_ACCOUNT_ID:-}${CF_ACCOUNT_ID:-}" ] && return 2
local model="${CLOUDFLARE_MODEL:-gpt-oss}"
log "β†’ Cloudflare WAI: $model (free)"
local out
out=$(echo "$QUERY" | "$HOME/.surrogate/bin/cloudflare-bridge.sh" --model "$model" 2>>/tmp/ai-fallback.err) || return 1
[ -z "$out" ] && return 1
echo "$out"
save_response "cloudflare" "$model" "$out"
return 0
}
# --- Local Ollama — always-on, always-free ultimate fallback ---
# Bench (M3 24GB): granite4:7b-a1b-h (4.2GB, ~7s on the fib+memo test — fast & correct).
# Task-aware: code → qwen-coder:7b, chat → granite, tiny → qwen:3b.
# gemma4:26b BLOCKED — user directive (too slow for this hardware).
try_granite() {
# Check ollama running
curl -sS --max-time 3 http://localhost:11434/api/tags > /dev/null 2>&1 || return 2
local alias="${LOCAL_MODEL:-granite}"
log "β†’ Local Ollama: $alias (free, always-on)"
local out
out=$(echo "$QUERY" | "$HOME/.surrogate/bin/granite-bridge.sh" --model "$alias" 2>>/tmp/ai-fallback.err) || return 1
[ -z "$out" ] && return 1
echo "$out"
save_response "ollama-local" "$alias" "$out"
return 0
}
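# Illustrative: exercise the backstop directly, even with cloud keys present:
#   ai-fallback.sh --force local "quick sanity answer"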
# --- Execute chain (FREE-FIRST for routine/bulk tasks) ---
# Order: free APIs β†’ claude-sonnet (Max plan safety net) β†’ local Ollama (ultimate backstop)
# IMPORTANT-tasks (retro/sprint/skill-sanitize/agent-critic/security-audit/mythos-audit)
# β†’ call claude-bridge.sh --model opus --force DIRECTLY, bypass this chain
# REVIEWER/hallucination-check β†’ call claude-bridge.sh --model sonnet DIRECTLY
# Paid OpenRouter removed per user direction (use Max plan instead of pay-per-use)
PROVIDERS="github sambanova cloudflare groq openrouter-free gemini claude-sonnet granite"
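# Illustrative: drop one hop without reordering, e.g. when GitHub is rate-limited:
#   ai-fallback.sh --skip github "routine question"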
# Explicit --force
if [ -n "$FORCE" ]; then
case "$FORCE" in
claude-opus|opus) try_claude_opus && exit 0 ;;
claude-sonnet|sonnet) try_claude_sonnet && exit 0 ;;
openrouter|or) try_openrouter && exit 0 ;;
openrouter-free|free) try_openrouter_free && exit 0 ;;
gpt5|gpt) OPENROUTER_MODEL="openai/gpt-5.4" try_openrouter && exit 0 ;;
grok) OPENROUTER_MODEL="x-ai/grok-4.20" try_openrouter && exit 0 ;;
deepseek) OPENROUTER_MODEL="deepseek/deepseek-v3.2" try_openrouter && exit 0 ;;
gemini) try_gemini && exit 0 ;;
groq) try_groq && exit 0 ;;
github|gh) try_github && exit 0 ;;
sambanova|samba) try_sambanova && exit 0 ;;
cloudflare|cf) try_cloudflare && exit 0 ;;
granite|local|ollama) try_granite && exit 0 ;;
*) echo "[error] unknown --force '$FORCE'" >&2; exit 1 ;;
esac
echo "[error] forced provider failed" >&2; exit 1
fi
# Auto chain with skip support
for p in $PROVIDERS; do
if [ -n "$SKIP" ] && [ "$p" = "$SKIP" ]; then continue; fi
case "$p" in
github) try_github && exit 0 ;;
sambanova) try_sambanova && exit 0 ;;
cloudflare) try_cloudflare && exit 0 ;;
claude-opus) try_claude_opus && exit 0 ;;
claude-sonnet) try_claude_sonnet && exit 0 ;;
openrouter) try_openrouter && exit 0 ;;
openrouter-free) try_openrouter_free && exit 0 ;;
gemini) try_gemini && exit 0 ;;
groq) try_groq && exit 0 ;;
granite) try_granite && exit 0 ;;
esac
done
echo "[error] all providers exhausted" >&2
exit 1