# surrogate-1/bin/lib/context_builder.sh
# Author: Ashira Pitchayapakayakul
# Commit 023ab84 — fix: strip Mac /usr/bin/* hardcoded paths + expand dataset-enrich to 21 sources
#!/usr/bin/env bash
# Shared context builder β€” sourced by qwen-coder-worker + dev-cloud-worker.
# Produces rich context: repo-map + similar functions from project + past accepted examples.
# Call: build_rich_context <project> <priority_id> <priority_title>
# Sets env vars: REPO_MAP, SIMILAR_FUNCS, FEWSHOT_ACCEPTED, ANTI_PATTERNS
build_rich_context() {
  # Args: $1 = project slug, $2 = priority id, $3 = free-text priority title.
  # Populates global shell variables: REPO_MAP, SIMILAR_FUNCS, RAG_EXAMPLES,
  # SEMANTIC_RAG, FEWSHOT_ACCEPTED, ANTI_PATTERNS, PROMPT_DELTAS, PRIO_SPEC,
  # AUTHORITATIVE_CONTEXT, GRAPH_CONTEXT, HERMES_RECALL.
  # Every step is best-effort: errors are silenced (2>/dev/null) and leave
  # the corresponding variable empty rather than aborting the worker.
  local PRIO_PROJECT="$1"
  local PRIO_ID="$2"
  local PRIO_TITLE="$3"
  # Shared swarm artifacts (repo maps, specs) and the project checkout.
  local SHARED="$HOME/.hermes/workspace/swarm-shared"
  local PROJECT_DIR="$HOME/axentx/$PRIO_PROJECT"
  # 1. Full repo-map (up to 10KB — was 3KB).
  # build-repo-map.sh writes to "<proj>_map.md"; some older paths used "<proj>.md".
  # Try both so we don't silently lose the strongest grounding signal.
  REPO_MAP=""
  for candidate in "$SHARED/repo-maps/${PRIO_PROJECT}_map.md" "$SHARED/repo-maps/${PRIO_PROJECT}.md"; do
    if [[ -f "$candidate" ]]; then
      # First existing candidate wins; byte-capped so the prompt stays bounded.
      REPO_MAP=$(head -c 10000 "$candidate")
      break
    fi
  done
  # 2. Similar function signatures from project (grep in real codebase)
  SIMILAR_FUNCS=""
  if [[ -d "$PROJECT_DIR" ]]; then
    # Extract keywords from title for grep: lowercase, squash to alnum words,
    # keep the first three words longer than 4 chars, join with '|' as an ERE.
    # NOTE(review): absolute /usr/bin/* and /bin/* paths here and below are
    # Mac-style hardcoding — the commit subject says these were to be
    # stripped; bare command names resolve via PATH on macOS and Linux alike.
    # Confirm this still runs in Linux containers.
    # NOTE(review): 'local KW=$(...)' masks the pipeline's exit status;
    # harmless here because an empty KW is handled just below.
    local KW=$(echo "$PRIO_TITLE" | /usr/bin/tr '[:upper:]' '[:lower:]' | /usr/bin/tr -cs 'a-z0-9' ' ' | /usr/bin/tr ' ' '\n' | awk 'length>4' | head -3 | /usr/bin/tr '\n' '|' | sed 's/|$//')
    if [[ -n "$KW" ]]; then
      # NOTE(review): find | xargs without -print0/-0, and 'read' without -r,
      # break on paths with whitespace/backslashes — assumed absent in these
      # repos; confirm.
      SIMILAR_FUNCS=$(/usr/bin/find "$PROJECT_DIR" -type f \( -name '*.py' -o -name '*.ts' -o -name '*.tsx' -o -name '*.js' -o -name '*.go' \) ! -path '*/node_modules/*' ! -path '*/.hermes-*' 2>/dev/null | \
        xargs grep -lE "($KW)" 2>/dev/null | head -3 | while read f; do
          echo "=== ${f#$PROJECT_DIR/} ==="
          grep -A3 -E "^(def|function|export const|class|async def|interface)" "$f" 2>/dev/null | head -30
        done 2>/dev/null | head -c 4000)
    fi
  fi
  # 3. RAG: actual code patterns from project (SQLite FTS via ask-sqlite.py if exists)
  RAG_EXAMPLES=""
  if [[ -x "$HOME/.surrogate/bin/ask-sqlite.py" ]]; then
    # Query is just "<project> <title>"; the helper does its own FTS ranking.
    RAG_EXAMPLES=$(python3 "$HOME/.surrogate/bin/ask-sqlite.py" \
      "$PRIO_PROJECT $PRIO_TITLE" 2>/dev/null | head -c 3000)
  fi
  # 4. Semantic RAG (from embeddings) — top-5 similar
  SEMANTIC_RAG=""
  if [[ -f "$HOME/.surrogate/embeddings.db" ]]; then
    SEMANTIC_RAG=$(python3 "$HOME/.surrogate/bin/embed-doc.py" --query "$PRIO_TITLE" 2>/dev/null | head -c 2000)
  fi
  # 5. Past ACCEPTED examples (few-shot from quality≥7 history)
  FEWSHOT_ACCEPTED=""
  # NOTE(review): iterating $(ls -t ...) word-splits on whitespace in file
  # names; review files are machine-named, so assumed space-free — confirm.
  for review in $(/bin/ls -t "$HOME/.hermes/workspace/qwen-coder-reviews/"*.review.json 2>/dev/null | head -30); do
    # Regex accepts quality_score 7, 8, 9 or 10.
    if grep -qE '"quality_score":\s*[789]|"quality_score":\s*10' "$review" 2>/dev/null; then
      local OUT_FILE=$(basename "$review" .review.json)
      # Search all worker output dirs
      for WD in qwen-coder dev-cloud-samba dev-cloud-github dev-cloud-cloudflare dev-cloud-groq dev-cloud-synthesis; do
        local OUT_PATH="$HOME/.hermes/workspace/$WD/${OUT_FILE}.md"
        if [[ -f "$OUT_PATH" ]]; then
          # Newest accepted output wins; 'break 2' exits both loops on first hit.
          FEWSHOT_ACCEPTED=$(head -c 2000 "$OUT_PATH")
          break 2
        fi
      done
    fi
  done
  # 6. Anti-patterns (last 5 rejection reasons across all workers)
  ANTI_PATTERNS=""
  for review in $(/bin/ls -t "$HOME/.hermes/workspace/qwen-coder-reviews/"*.review.json 2>/dev/null | head -10); do
    # Review files may wrap JSON in prose: the inline Python pulls the first
    # {...} span and prints up to 2 bug lines per rejected/reworked review.
    # NOTE(review): "$review" is spliced into the Python source — a path
    # containing a single quote would break it; assumed quote-free.
    local bugs=$(python3 -c "
import json, re, sys
try:
    txt = open('$review').read()
    m = re.search(r'\{.*\}', txt, re.DOTALL)
    if not m: sys.exit()
    d = json.loads(m.group(0))
    if d.get('verdict') in ('reject','rework') and d.get('bugs'):
        for b in d['bugs'][:2]:
            print(f'- {b[:180]}')
except: pass
" 2>/dev/null)
    [[ -n "$bugs" ]] && ANTI_PATTERNS="$ANTI_PATTERNS$bugs"$'\n'
  done
  # Keep at most 10 anti-pattern lines in the prompt.
  ANTI_PATTERNS=$(echo "$ANTI_PATTERNS" | head -10)
  # 7. Active-learning prompt deltas — aggregate last 5 UNIQUE anti-patterns.
  # Preference: same-project anti-patterns first, then generic.
  # Dedup by first 80 chars of prompt_addition (similar bugs shouldn't bloat prompt).
  # NOTE(review): the "same-project first" preference described above is NOT
  # implemented below — selection is purely newest-first with dedup; confirm
  # whether the comment or the code is the intended behavior.
  PROMPT_DELTAS=""
  if [[ -f "$HOME/.surrogate/memory/worker-prompt-deltas.jsonl" ]]; then
    PROMPT_DELTAS=$(python3 -c "
import json, sys
from pathlib import Path
try:
    entries = []
    for l in Path('$HOME/.surrogate/memory/worker-prompt-deltas.jsonl').read_text().splitlines():
        if not l.strip(): continue
        try: entries.append(json.loads(l))
        except: pass
    # Dedup by first 80 chars
    seen = set()
    picked = []
    # Walk newest → oldest, cap 5 unique
    for e in reversed(entries):
        addn = (e.get('prompt_addition') or '').strip()
        if not addn: continue
        key = addn[:80]
        if key in seen: continue
        seen.add(key)
        picked.append(addn)
        if len(picked) >= 5: break
    if picked:
        out = ['ACTIVE-LEARNED RULES (avoid these past mistakes):']
        for i, a in enumerate(picked, 1):
            out.append(f'{i}. {a[:400]}')
        print('\n'.join(out))
except Exception as e: pass
" 2>/dev/null)
  fi
  # 8. Priority full spec (if a detailed spec file exists)
  # Spec is the single most important signal — cap high (6KB) so the full
  # Context/Requirements/DO NOT sections fit. Other RAG signals are capped
  # lower because they're supplementary; the spec is authoritative.
  PRIO_SPEC=""
  local SPEC_FILE="$HOME/.hermes/workspace/swarm-shared/specs/${PRIO_ID}.md"
  [[ -f "$SPEC_FILE" ]] && PRIO_SPEC=$(head -c 6000 "$SPEC_FILE")
  # 9. Task-type authoritative sources — boost scraped knowledge based on title.
  # Security task → CVE/MITRE/OWASP/Prowler. SRE → Google SRE/postmortems.
  # Observability → OTel/Prometheus/Grafana/Honeycomb. etc.
  # This is THE fix that makes all our scraping actually used by Hermes workers.
  AUTHORITATIVE_CONTEXT=""
  # NOTE(review): PRIO_TITLE/PRIO_PROJECT are expanded straight into the
  # Python source through the unquoted heredoc — a title containing triple
  # quotes or a backslash breaks (or injects into) the script. Passing them
  # via exported environment variables and a quoted <<'PYEOF' delimiter would
  # be safer; confirm titles are sanitized upstream.
  if [[ -f "$HOME/.surrogate/index.db" ]]; then
    AUTHORITATIVE_CONTEXT=$(python3 <<PYEOF
import sqlite3, re
title = """${PRIO_TITLE}""".lower()
project = """${PRIO_PROJECT}""".lower()
# Classify task → preferred source whitelist
routes = {
    # Security tasks
    ('security','cve','vuln','prowler','kyverno','opa','admission','ciem','sigma','mitre','attack','cosign','sbom','falco','threat','malware','exploit'): ['cisa-kev','mitre-attack','owasp-cheatsheet','domain:sec-cloudsec','domain:sec-appsec','domain:sec-devsecops','code-deep:sec-appsec','code-deep:sec-cloudsec'],
    # SRE / incident / postmortem
    ('sre','slo','sli','incident','postmortem','runbook','chaos','rca','dora','mttr','blameless','on-call','pager','outage'): ['google-sre','postmortems-index','firecrawl','eng-blog:charity-majors','eng-blog:high-scalability','mythos-ai-engineering','domain:ops-sre','code-deep:ops-sre'],
    # Observability
    ('observab','otel','telemetry','prometheus','grafana','loki','tempo','metric','trace','log','honeycomb','ebpf'): ['opentelemetry-spec','prometheus-docs','grafana-docs','firecrawl','domain:ops-observability'],
    # Cloud / K8s / Terraform
    ('kubernetes','k8s','helm','istio','terraform','aws','ecs','eks','lambda','cloudformation','cdk','gcp','azure','argocd','flux'): ['firecrawl','github-public','code-deep:ops-devops','domain:ops-devops','mythos-cloud','github-trending'],
    # AI / multi-agent
    ('agent','autogen','crewai','langgraph','orchestra','mcp','reflexion','dspy','rag','llm'): ['anthropic-cookbook','arxiv','mythos-ai-agent','mythos-ai-engineering','domain:ai-engineering','code-deep:ai-engineering','firecrawl','hf-papers'],
    # FinOps
    ('cost','finops','focus','rightsizing','kubecost','opencost','savings','budget','spend','waste'): ['firecrawl','rss','eng-blog:high-scalability','domain:ops-devops','arxiv'],
    # Frontend / FE
    ('frontend','react','nextjs','typescript','tsx','ui'): ['domain:dev-frontend','domain:design-ux','code-deep:dev-frontend','stackoverflow','github-trending'],
    # Backend / API / DB
    ('backend','api','fastapi','database','sql','postgres','asyncpg','sqlalchemy'): ['domain:dev-backend','domain:dev-fullstack','code-deep:dev-backend','github-public','stackoverflow','hf-papers'],
    # Mobile
    ('mobile','android','ios','flutter','reactnative','line','workio'): ['domain:dev-mobile','code-deep:dev-mobile','firecrawl','stackoverflow'],
}
# Project-specific boost
project_preferred = {
    'vanguard': ['cisa-kev','mitre-attack','owasp-cheatsheet','code-deep:sec-appsec'],
    'costinel': ['firecrawl','rss','arxiv','mythos-ai-engineering'],
    'arkship': ['google-sre','postmortems-index','anthropic-cookbook','opentelemetry-spec','firecrawl'],
    'surrogate':['arxiv','hf-papers','anthropic-cookbook','mythos-ai-agent'],
    'workio': ['firecrawl','stackoverflow','github-public'],
}
preferred_sources = set()
for keywords, srcs in routes.items():
    if any(k in title for k in keywords):
        preferred_sources.update(srcs)
for proj_key, srcs in project_preferred.items():
    if proj_key in project:
        preferred_sources.update(srcs)
if not preferred_sources:
    print(''); exit()
# FTS query — prefer authoritative sources
conn = sqlite3.connect('$HOME/.surrogate/index.db')
conn.row_factory = sqlite3.Row
# Simple keyword from title
kw = ' '.join([w for w in re.sub(r'[^a-zA-Z0-9 ]', ' ', title).split() if len(w) > 3][:5])
if not kw: exit()
src_list = ','.join(f"'{s}'" for s in preferred_sources)
# Strategy: 3-tier fallback — preferred+match → any+match → preferred random
# (src_list values come only from the hardcoded whitelists above, so the
# f-string SQL below is not an injection path.)
rows = []
try:
    # Tier 1: preferred sources + FTS match on keywords
    q = f"""SELECT d.source, d.instruction, substr(d.response, 1, 600) as body
    FROM docs_fts f JOIN docs d ON d.id = f.rowid
    WHERE f.docs_fts MATCH ? AND d.source IN ({src_list})
    ORDER BY bm25(docs_fts) LIMIT 6"""
    rows = conn.execute(q, (kw,)).fetchall()
except sqlite3.OperationalError: pass
if not rows:
    # Tier 2: FTS match on ANY source — relax source filter
    try:
        q2 = """SELECT d.source, d.instruction, substr(d.response, 1, 600) as body
        FROM docs_fts f JOIN docs d ON d.id = f.rowid
        WHERE f.docs_fts MATCH ? ORDER BY bm25(docs_fts) LIMIT 6"""
        rows = conn.execute(q2, (kw,)).fetchall()
    except sqlite3.OperationalError: pass
if not rows:
    # Tier 3: random sample from preferred sources (even if no keyword match)
    rows = conn.execute(f"SELECT source, instruction, substr(response,1,600) as body FROM docs WHERE source IN ({src_list}) ORDER BY RANDOM() LIMIT 6").fetchall()
conn.close()
out = []
for r in rows:
    out.append(f"[{r['source']}] {(r['instruction'] or '')[:120]}")
    out.append((r['body'] or '')[:500])
    out.append('')
print('\n'.join(out)[:3500])
PYEOF
)
  fi
  # 10. FalkorDB graph — related decisions + past priorities with similar theme
  GRAPH_CONTEXT=""
  # NOTE(review): /var/folders is macOS-specific and /opt/homebrew is Apple
  # Silicon Homebrew — on Linux this degrades to searching /tmp and then
  # fails to find redis-cli at all; confirm that is acceptable.
  local REDIS_SOCK=$(/usr/bin/find /var/folders /tmp -name 'redis.socket' -type s 2>/dev/null | head -1)
  if [[ -n "$REDIS_SOCK" ]]; then
    # Get related priorities + learned rules
    # NOTE(review): $PRIO_PROJECT is spliced into the Cypher text — assumes
    # project slugs never contain a quote.
    GRAPH_CONTEXT=$(/opt/homebrew/bin/redis-cli -s "$REDIS_SOCK" GRAPH.QUERY ashira "
MATCH (p:Priority {project: '$PRIO_PROJECT'})
OPTIONAL MATCH (p)-[:HAS_LEARNED_RULE]->(l:LearnedRule)
OPTIONAL MATCH (p)-[:COMMITTED_AS]->(c:Commit)
RETURN p.id, p.title, l.content, c.msg LIMIT 8
" 2>/dev/null | tail -c 2500)
  fi
  # 11. Hermes trace recall — past similar tasks Hermes handled (from JSONL)
  HERMES_RECALL=""
  local TRACE_DIR="$HOME/axentx/surrogate/data/training-jsonl"
  # NOTE(review): the Python below re-derives the trace path with expanduser
  # instead of reusing TRACE_DIR — keep the two in sync if the dir moves.
  if [[ -d "$TRACE_DIR" ]]; then
    HERMES_RECALL=$(python3 <<PYEOF
import json, re, glob
title = """${PRIO_TITLE}""".lower()
words = [w for w in re.sub(r'[^a-zA-Z0-9 ]', ' ', title).split() if len(w) > 4][:4]
if not words: exit()
hits = []
# Walk recent hermes-trace-YYYY-MM-DD.jsonl files (last 7 days)
import os
files = sorted(glob.glob(os.path.expanduser('~/axentx/surrogate/data/training-jsonl/hermes-trace-*.jsonl')))[-7:]
for f in files:
    try:
        for line in open(f):
            try: rec = json.loads(line)
            except: continue
            blob = (rec.get('instruction','') + ' ' + rec.get('output',''))[:2000].lower()
            score = sum(1 for w in words if w in blob)
            if score >= 2:
                hits.append((score, rec))
    except: pass
hits.sort(key=lambda x: -x[0])
for score, rec in hits[:3]:
    print(f"HERMES PREVIOUSLY [{rec.get('category','?')}]: {rec.get('instruction','')[:120]}")
    print(f"β†’ {rec.get('output','')[:400]}")
    print()
PYEOF
)
  fi
}
# Export the function so child bash processes spawned by the sourcing worker
# scripts also see it (bash-only: export -f).
export -f build_rich_context