"""REPOMIND — HuggingFace Space entry point. Public demo. Auto-detects backend from environment variables (Steve Kimoi's canonical lablab/AMD tutorial pattern): VLLM_BASE_URL — set in Space → Settings → Variables and secrets to point at a live MI300X vLLM endpoint, e.g. http://:8000/v1 MODEL_NAME — model id served by vLLM, defaults to Qwen/Qwen3-Coder-Next-FP8 When VLLM_BASE_URL is unset (default), the Space runs the offline mock backend on CPU-basic so it stays free 24/7. When set, the Space wires through to the live AMD MI300X for real inference. Local repo: https://github.com/SRKRZ23/repomind Hackathon: https://lablab.ai/ai-hackathons/amd-developer """ from __future__ import annotations import json import os import sys import tempfile from pathlib import Path # make submodules importable sys.path.insert(0, str(Path(__file__).resolve().parent)) import gradio as gr from ingestion.chunker import ingest_to_json from ingestion.cloner import clone # ─── Configuration via env vars (Steve Kimoi tutorial pattern) ──────────── VLLM_BASE_URL = os.environ.get("VLLM_BASE_URL", "").strip() MODEL_NAME = os.environ.get("MODEL_NAME", "Qwen/Qwen3-Coder-Next-FP8").strip() LIVE_BACKEND = bool(VLLM_BASE_URL) BACKEND_LABEL = "🟢 Live AMD MI300X" if LIVE_BACKEND else "🟡 Mock backend (CPU-basic, demo mode)" BACKEND_HINT = ( f"Connected to vLLM endpoint: `{VLLM_BASE_URL}` · model `{MODEL_NAME}`" if LIVE_BACKEND else "Set the Space secrets `VLLM_BASE_URL` + `MODEL_NAME` to wire a real MI300X backend." ) HEADER_MD = f""" # REPOMIND **Open-source repo-scale coding agent on AMD MI300X.** Ingest a git repository (up to 256K tokens, FP8) on a single GPU and reason across the whole codebase with multi-step tool use. > 📦 GitHub: SRKRZ23/repomind · MIT > 🏆 Built for the AMD Developer Hackathon 2026 > 🤗 HF Special Prize candidate · 🛡 Conservative claim discipline applied ### Why AMD MI300X (verified 2026-05-05 on real hardware) - Qwen3-Coder-Next-FP8 weights = **77.29 GiB** in VRAM (verified) - 256K KV cache @ FP8 = **94.58 GiB** available (2,065,744 tokens, verified) - Activations + framework overhead → peak 176/191.7 GiB ≈ **92% utilization** - NVIDIA H100 80 GB cannot accommodate this on a single card by VRAM accounting (~143 GB > 80 GB); MI300X 192 GB has the headroom ### Status **Backend right now**: {BACKEND_LABEL} {BACKEND_HINT} """ # Minimal cap — HF Space CPU-basic gets 16 GB RAM. Don't blow it on giant repos. MAX_INGEST_SIZE_MB = 50 SCRATCH_DIR = Path(tempfile.gettempdir()) / "repomind_hf" SCRATCH_DIR.mkdir(exist_ok=True) def ingest(url_or_path: str, chunk_tokens: int) -> str: if not url_or_path or not url_or_path.strip(): return "Provide a GitHub URL or `owner/repo` shorthand." out = SCRATCH_DIR / "active.json" try: # Local path mode (rare on HF — usually URL) if Path(url_or_path).is_dir(): repo_root = Path(url_or_path) label = repo_root.name else: res = clone(url_or_path, cache_dir=SCRATCH_DIR / "repos") repo_root = res.local_path label = res.url.rsplit("/", 1)[-1].removesuffix(".git") summary = ingest_to_json( repo_root, out, repo_label=label, max_tokens_per_chunk=chunk_tokens, ) return json.dumps(summary, indent=2) except Exception as e: return f"❌ {type(e).__name__}: {e}" def _build_llm(): """Return an LLM client based on env-var configuration.""" if LIVE_BACKEND: from serving.vllm_client import VLLMClient return VLLMClient(base_url=VLLM_BASE_URL, model=MODEL_NAME) from serving.mock_client import MockClient return MockClient(max_tool_turns=2) def ask(question: str): summary_path = SCRATCH_DIR / "active.json" if not summary_path.exists(): return "Ingest a repo first.", "" if not question or not question.strip(): return "Type a question.", "" summary = json.loads(summary_path.read_text()) repo_root = Path(summary.get("root", ".")) try: llm = _build_llm() except Exception as e: return f"❌ failed to init LLM client: {type(e).__name__}: {e}", "" from agent.loop import Agent from tools.registry import default_registry try: agent = Agent( llm=llm, tools=default_registry(repo_root, scratch_dir=SCRATCH_DIR / "scratch"), max_steps=4, ) result = agent.run(question, summary) except Exception as e: return f"❌ agent failed: {type(e).__name__}: {e}", "" trace_lines = [ f"- {tc['name']} {json.dumps(tc['arguments'], ensure_ascii=False)}" for tc in result.tool_calls ] trace = "\n".join(trace_lines) or "(no tool calls)" return result.answer, trace with gr.Blocks( title="REPOMIND — repo-scale coding agent on AMD MI300X", ) as demo: gr.Markdown(HEADER_MD) with gr.Tab("1. Ingest"): gr.Markdown( "Paste any **GitHub URL** or `owner/repo` shorthand. " "REPOMIND clones it, parses the source files, and chunks them " "into priority-ranked sections (README first, then top-level " "symbols, then nested code, then tests)." ) with gr.Row(): url = gr.Textbox( label="GitHub URL or owner/repo", placeholder="https://github.com/pallets/flask OR pallets/flask", scale=4, ) chunk_tokens = gr.Slider( 256, 4096, value=1024, step=128, label="Tokens / chunk", scale=1 ) ingest_btn = gr.Button("Ingest", variant="primary") ingest_out = gr.Code(label="Ingestion summary", language="json") ingest_btn.click(ingest, [url, chunk_tokens], ingest_out) gr.Markdown( "**Examples that work on a single MI300X**: " "`pallets/flask` (~408K tokens, fits in 256K window with priority chunking) · " "`pytorch/vision` (~1.3M tokens, trimmed to 180K of highest-priority " "content via the chunker) · this repo `SRKRZ23/repomind` (~68K tokens, fits whole)." ) with gr.Tab("2. Ask"): gr.Markdown( f"Ask any question about the ingested repo. The agent runs an " f"SC-TIR loop (PLAN → CALL TOOL → OBSERVE → THINK → ANSWER) with " f"five tools: `read_file`, `grep_codebase`, `execute_code` " f"(sandboxed), `run_tests`, `git_log`.\n\n" f"**Backend**: {BACKEND_LABEL}" ) question = gr.Textbox( label="Question", lines=3, placeholder=( "Where is the WSGI entry point? · " "What does the chunker prioritize? · " "Trace one slab allocation through the call graph." ), ) ask_btn = gr.Button("Ask", variant="primary") answer = gr.Markdown(label="Answer") tool_trace = gr.Code(label="Tool trace (agent steps)", language="markdown") ask_btn.click(ask, [question], [answer, tool_trace]) with gr.Tab("3. Verified evidence"): gr.Markdown( "REPOMIND was stress-tested on a real AMD MI300X x1 droplet across " "two sessions (**2026-05-05 / 2026-05-06**, 124 min total, $4.12). " "Highlights:\n\n" "| Test | Result |\n" "|---|---|\n" "| Memory peak | 176/191.7 GiB (92%) |\n" "| `--max-model-len 262144` | started clean |\n" "| Concurrency 8K / 16K / 32K / 64K @ N=31 | **31/31 success at every context** ✅ |\n" "| Concurrency 128K @ N=31 | 25/31 (6 timeouts past 15 min) |\n" "| Long-context needle at 200K | **3/3** pass (early/middle/late) |\n" "| End-to-end repo Q&A | **9/9** correct across 3 repos |\n" "| Largest repo tested | **pytorch/vision (1.3M tokens)** |\n" "| Tuning attempt: AITER backend | regression — 137/144 cells broken under FP8 KV cache; default Triton stays production-safe |\n" "| Cost | $1.99/hr cloud, $45.75/1M completion tokens |\n\n" "Full evidence pack — JSON results, plots, raw model outputs — is at " 'github.com/SRKRZ23/repomind/tree/main/benchmarks/2026-05-05-mi300x-stress-test. ' "Extended PHASE 1+2 narrative + AITER A/B in the " 'extended/SUMMARY.md.' ) gr.HTML( """

Author: Sardor Razikov — Tashkent 🇺🇿

GitHub · LinkedIn · X / Twitter · Zenodo (ECB)

📧 razikovsardor1@gmail.com · razikovs777@gmail.com

If the MI300X memory-architecture story resonates, a like on this Space helps with the Hugging Face Special Prize judging. 🤗

""" ) if __name__ == "__main__": demo.launch(theme=gr.themes.Soft(primary_hue="red", secondary_hue="gray"))