REPOMIND v0.1 — repo-scale coding agent demo
Open-source coding agent that ingests an entire git repository and reasons
across the whole codebase with multi-step tool use. Built for the AMD
Developer Hackathon 2026.
This Space ships:
- Ingestion pipeline (clone + tree-sitter + smart chunker + priority budget)
- 5 tools: read_file, grep_codebase, execute_code, run_tests, git_log
- SC-TIR-style agent loop adapted from AIMO3 math reasoning pipeline
- Mock LLM backend for offline demo (default)
- vLLM client ready to wire to MI300X (Qwen3-Coder-Next-FP8, 256K context)
Full source: https://github.com/SRKRZ23/repomind
- .gitignore +6 -0
- agent/__init__.py +0 -0
- agent/loop.py +148 -0
- agent/prompts.py +49 -0
- app.py +175 -0
- ingestion/__init__.py +0 -0
- ingestion/chunker.py +202 -0
- ingestion/cloner.py +100 -0
- ingestion/parser.py +161 -0
- ingestion/token_budget.py +71 -0
- requirements.txt +7 -0
- serving/__init__.py +0 -0
- serving/base.py +28 -0
- serving/mock_client.py +73 -0
- serving/vllm_client.py +69 -0
- tools/__init__.py +0 -0
- tools/base.py +62 -0
- tools/execute_code.py +104 -0
- tools/git_log.py +50 -0
- tools/grep.py +95 -0
- tools/read_file.py +50 -0
- tools/registry.py +20 -0
- tools/run_tests.py +54 -0
.gitignore
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__pycache__/
|
| 2 |
+
*.pyc
|
| 3 |
+
.repomind_cache/
|
| 4 |
+
.benchmarks/
|
| 5 |
+
.pytest_cache/
|
| 6 |
+
.DS_Store
|
agent/__init__.py
ADDED
|
File without changes
|
agent/loop.py
ADDED
|
@@ -0,0 +1,148 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""SC-TIR-style agent loop adapted from AIMO3 (math) to coding.
|
| 2 |
+
|
| 3 |
+
Loop:
|
| 4 |
+
user → LLM → (tool calls?) → tools → LLM → ... → final answer
|
| 5 |
+
|
| 6 |
+
Stops when:
|
| 7 |
+
- LLM emits content with no tool calls, OR
|
| 8 |
+
- max_steps hit (forces a final response without tools)
|
| 9 |
+
|
| 10 |
+
The pattern mirrors Sardor's AIMO3 SC-TIR pipeline: the model alternates
|
| 11 |
+
between thinking and tool-augmented action, with deterministic verification
|
| 12 |
+
on the tool side.
|
| 13 |
+
"""
|
| 14 |
+
from __future__ import annotations
|
| 15 |
+
import json
|
| 16 |
+
from dataclasses import dataclass, field
|
| 17 |
+
from typing import Any, Dict, List
|
| 18 |
+
|
| 19 |
+
from serving.base import LLMClient, LLMResponse, ToolCall
|
| 20 |
+
from tools.base import ToolRegistry, ToolResult
|
| 21 |
+
from agent.prompts import SYSTEM, build_repo_overview, initial_user_prompt
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
@dataclass
class AgentTurn:
    """A single message turn in the agent conversation.

    NOTE(review): not referenced elsewhere in this module — Agent.run builds
    transcript entries as plain dicts. Confirm whether this type is still needed.
    """

    role: str  # "system", "user", "assistant", or "tool"
    content: str
    tool_calls: List[Dict[str, Any]] = field(default_factory=list)  # OpenAI-style tool-call dicts
    tool_call_id: str | None = None  # links a tool-role turn back to its originating call
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
@dataclass
class AgentRun:
    """Result of one Agent.run() invocation."""

    answer: str  # last assistant message that had non-empty content ("" if none)
    transcript: List[Dict[str, Any]]  # full message list, including tool-role messages
    tool_calls: List[Dict[str, Any]]  # log of {"name": ..., "arguments": ...} per tool call
    steps: int  # number of tool-using loop iterations executed
    finished: bool  # True when the model stopped on its own; False when the budget forced an answer
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
class Agent:
    """SC-TIR-style tool-use loop around an OpenAI-compatible chat LLM.

    Drives: user question → LLM → (tool calls?) → tool results → LLM → ...
    until the model answers with no tool calls, or the step budget runs out
    (in which case one final text-only completion is forced).
    """

    def __init__(
        self,
        llm: LLMClient,
        tools: ToolRegistry,
        max_steps: int = 6,
        max_tool_output_chars: int = 6000,
    ):
        # llm: client exposing .complete(messages, tools=...) returning an object
        #      with .content and .tool_calls (used in run() below).
        # tools: registry exposing .schema() and .call(name, arguments).
        # max_steps: cap on tool-using loop iterations.
        # max_tool_output_chars: truncation limit applied per tool result.
        self.llm = llm
        self.tools = tools
        self.max_steps = max_steps
        self.max_tool_output_chars = max_tool_output_chars

    def run(self, question: str, repo_summary: Dict[str, Any]) -> AgentRun:
        """Run the loop for one question against an ingested repo summary.

        repo_summary is the dict produced by ingestion (keys "repo", "n_files",
        "n_chunks", "total_tokens", optionally "chunks"); missing keys fall
        back to empty/zero so a partial summary still works.
        """
        overview = build_repo_overview(
            repo=repo_summary.get("repo", ""),
            n_files=repo_summary.get("n_files", 0),
            n_chunks=repo_summary.get("n_chunks", 0),
            total_tokens=repo_summary.get("total_tokens", 0),
            top_paths=_pick_top_paths(repo_summary),
        )
        messages: List[Dict[str, Any]] = [
            {"role": "system", "content": SYSTEM},
            {"role": "user", "content": initial_user_prompt(question, overview)},
        ]
        tool_schema = self.tools.schema()
        tool_calls_log: List[Dict[str, Any]] = []
        step = 0
        finished = False

        while step < self.max_steps:
            resp = self.llm.complete(messages, tools=tool_schema)
            # Build the assistant message; a tool-call-only response may have
            # no content, but a final (no-tool) response must carry content,
            # so default it to "" in that branch.
            assistant_msg: Dict[str, Any] = {"role": "assistant"}
            if resp.content:
                assistant_msg["content"] = resp.content
            if resp.tool_calls:
                assistant_msg["tool_calls"] = [self._tool_call_to_msg(tc) for tc in resp.tool_calls]
            else:
                assistant_msg.setdefault("content", "")
            messages.append(assistant_msg)

            if not resp.tool_calls:
                # Model answered without tools — this is the final answer.
                finished = True
                break

            # Execute every requested tool call and append each result as a
            # tool-role message (linked via tool_call_id).
            for tc in resp.tool_calls:
                tool_calls_log.append({"name": tc.name, "arguments": tc.arguments})
                result = self.tools.call(tc.name, tc.arguments)
                tool_msg = {
                    "role": "tool",
                    "tool_call_id": tc.id,
                    "name": tc.name,
                    "content": self._format_tool_output(result),
                }
                messages.append(tool_msg)
            step += 1

        # If we hit max_steps without a final answer, force one more text-only call.
        if not finished:
            messages.append({
                "role": "user",
                "content": "You've used the tool budget. Provide your best final answer now, without tool calls.",
            })
            resp = self.llm.complete(messages, tools=[])
            messages.append({"role": "assistant", "content": resp.content or ""})

        # Final answer = last assistant message with content
        answer = ""
        for m in reversed(messages):
            if m.get("role") == "assistant" and m.get("content"):
                answer = m["content"]
                break

        return AgentRun(
            answer=answer,
            transcript=messages,
            tool_calls=tool_calls_log,
            steps=step,
            finished=finished,
        )

    def _tool_call_to_msg(self, tc: ToolCall) -> Dict[str, Any]:
        """Serialize a ToolCall into the OpenAI assistant-message tool_call dict."""
        return {
            "id": tc.id,
            "type": "function",
            "function": {"name": tc.name, "arguments": json.dumps(tc.arguments)},
        }

    def _format_tool_output(self, result: ToolResult) -> str:
        """Render a ToolResult as text, truncated to max_tool_output_chars."""
        body = result.output if result.ok else f"[error] {result.error}"
        if len(body) > self.max_tool_output_chars:
            body = body[: self.max_tool_output_chars] + "\n[... truncated]"
        return body
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
def _pick_top_paths(summary: Dict[str, Any]) -> List[str]:
|
| 137 |
+
chunks = summary.get("chunks") or []
|
| 138 |
+
seen: List[str] = []
|
| 139 |
+
seen_set = set()
|
| 140 |
+
# priority 0 first, then 1; keep insertion order
|
| 141 |
+
for prio in (0, 1, 2):
|
| 142 |
+
for c in chunks:
|
| 143 |
+
if c.get("priority") == prio and c.get("path") not in seen_set:
|
| 144 |
+
seen.append(c["path"])
|
| 145 |
+
seen_set.add(c["path"])
|
| 146 |
+
if len(seen) >= 60:
|
| 147 |
+
return seen
|
| 148 |
+
return seen
|
agent/prompts.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""System and tool prompt templates for the REPOMIND agent."""
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
from typing import Iterable
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
SYSTEM = """You are REPOMIND, a coding agent that has the ENTIRE git repository
|
| 7 |
+
loaded in your context window. You have these capabilities:
|
| 8 |
+
|
| 9 |
+
1. You can see the structure of the repo (tree of files + symbols).
|
| 10 |
+
2. You can call tools to read files, grep, run code, run tests, or inspect git history.
|
| 11 |
+
3. You must reason carefully across multiple files when needed.
|
| 12 |
+
|
| 13 |
+
## How to answer
|
| 14 |
+
|
| 15 |
+
- Be precise. When you cite code, give the file path and line numbers.
|
| 16 |
+
- When you don't know, call a tool to find out — never invent.
|
| 17 |
+
- Stop calling tools as soon as you have enough information to answer.
|
| 18 |
+
- After tool calls, respond in plain prose with concrete references.
|
| 19 |
+
|
| 20 |
+
## Tool-call protocol
|
| 21 |
+
|
| 22 |
+
Use the standard OpenAI function-calling format. Each tool result will be
|
| 23 |
+
delivered back as a tool-role message; you may then either call another
|
| 24 |
+
tool or write the final answer.
|
| 25 |
+
"""
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def build_repo_overview(repo: str, n_files: int, n_chunks: int, total_tokens: int, top_paths: Iterable[str]) -> str:
    """Render the markdown repo overview shown to the model before the question."""
    shown = list(top_paths)[:40]
    bullets = "\n".join(f"  - {p}" for p in shown)
    if not bullets:
        bullets = "  (none)"
    return f"""# Repo: {repo}

- Files indexed: {n_files}
- Chunks: {n_chunks}
- Total tokens: {total_tokens:,}

Top-priority files:
{bullets}
"""
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def initial_user_prompt(question: str, overview: str) -> str:
    """Compose the first user message: repo overview, then the question."""
    sections = [
        overview,
        "",
        "# Question",
        "",
        question,
        "",
        "Answer with concrete code references.",
        "",
    ]
    return "\n".join(sections)
|
app.py
ADDED
|
@@ -0,0 +1,175 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""REPOMIND — HuggingFace Space entry point.
|
| 2 |
+
|
| 3 |
+
Public demo. Backend defaults to the offline mock LLM so the Space runs
|
| 4 |
+
without GPU credits. Once the AMD MI300X vLLM endpoint is live, switch
|
| 5 |
+
the backend toggle to "vllm" and point at the live URL.
|
| 6 |
+
|
| 7 |
+
Local repo: https://github.com/SRKRZ23/repomind
|
| 8 |
+
Hackathon: https://lablab.ai/ai-hackathons/amd-developer
|
| 9 |
+
"""
|
| 10 |
+
from __future__ import annotations
|
| 11 |
+
import json
|
| 12 |
+
import os
|
| 13 |
+
import sys
|
| 14 |
+
import tempfile
|
| 15 |
+
from pathlib import Path
|
| 16 |
+
|
| 17 |
+
# make submodules importable
|
| 18 |
+
sys.path.insert(0, str(Path(__file__).resolve().parent))
|
| 19 |
+
|
| 20 |
+
import gradio as gr
|
| 21 |
+
|
| 22 |
+
from ingestion.chunker import ingest_to_json
|
| 23 |
+
from ingestion.cloner import clone
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
HEADER_MD = """
|
| 27 |
+
# REPOMIND
|
| 28 |
+
**Open-source repo-scale coding agent on AMD MI300X.**
|
| 29 |
+
|
| 30 |
+
Ingest a git repository (up to 256K tokens, FP8) on a single GPU and reason across the whole codebase with multi-step tool use.
|
| 31 |
+
|
| 32 |
+
> 📦 GitHub: [SRKRZ23/repomind](https://github.com/SRKRZ23/repomind)
|
| 33 |
+
> 🏆 Built for the [AMD Developer Hackathon 2026](https://lablab.ai/ai-hackathons/amd-developer)
|
| 34 |
+
|
| 35 |
+
### Why MI300X?
|
| 36 |
+
- Qwen3-Coder-Next-FP8 weights ≈ 80 GB
|
| 37 |
+
- 256K KV cache @ FP8 ≈ 38 GB
|
| 38 |
+
- + activations ≈ 25 GB → **~143 GB total on a single GPU**
|
| 39 |
+
- NVIDIA H100 80GB physically OOMs. AMD MI300X 192GB just runs it.
|
| 40 |
+
|
| 41 |
+
### About this Space
|
| 42 |
+
This is the **frontend** demo. Backend defaults to the **mock LLM** so the Space
|
| 43 |
+
runs on CPU-basic without burning GPU credits. Switch to `vllm` and provide a
|
| 44 |
+
base URL once the MI300X endpoint is live.
|
| 45 |
+
"""
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
# Minimal cap — HF Space CPU-basic gets 16 GB RAM. Don't blow it on giant repos.
|
| 49 |
+
MAX_INGEST_SIZE_MB = 50
|
| 50 |
+
SCRATCH_DIR = Path(tempfile.gettempdir()) / "repomind_hf"
|
| 51 |
+
SCRATCH_DIR.mkdir(exist_ok=True)
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def ingest(url_or_path: str, chunk_tokens: int) -> str:
    """Ingest a repo (URL, owner/repo shorthand, or local dir) into active.json.

    Returns a pretty-printed JSON summary string on success, otherwise a
    short human-readable error message.
    """
    if not (url_or_path and url_or_path.strip()):
        return "Provide a GitHub URL or `owner/repo` shorthand."
    index_path = SCRATCH_DIR / "active.json"
    try:
        source = Path(url_or_path)
        if source.is_dir():
            # Local path mode (rare on HF — usually URL)
            repo_root = source
            label = source.name
        else:
            cloned = clone(url_or_path, cache_dir=SCRATCH_DIR / "repos")
            repo_root = cloned.local_path
            label = cloned.url.rsplit("/", 1)[-1].removesuffix(".git")
        summary = ingest_to_json(
            repo_root,
            index_path,
            repo_label=label,
            max_tokens_per_chunk=chunk_tokens,
        )
        return json.dumps(summary, indent=2)
    except Exception as e:
        return f"❌ {type(e).__name__}: {e}"
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
def ask(question: str, backend: str, base_url: str, model: str):
    """Answer a question about the previously ingested repo.

    Returns a (answer_markdown, tool_trace_text) pair; both elements are
    strings, with the trace empty on early validation failures.
    """
    summary_path = SCRATCH_DIR / "active.json"
    if not summary_path.exists():
        return "Ingest a repo first.", ""
    if not (question and question.strip()):
        return "Type a question.", ""

    summary = json.loads(summary_path.read_text())
    repo_root = Path(summary.get("root", "."))

    # Backend wiring — vLLM only when user explicitly chose it AND a URL is given
    if backend == "vllm":
        if not (base_url and base_url.strip()):
            return "vLLM backend selected but no base URL provided.", ""
        try:
            from serving.vllm_client import VLLMClient
            llm = VLLMClient(base_url=base_url.strip(), model=model.strip() or "Qwen/Qwen3-Coder-Next-FP8")
        except Exception as e:
            return f"❌ failed to init vLLM client: {e}", ""
    else:
        from serving.mock_client import MockClient
        llm = MockClient(max_tool_turns=2)

    from agent.loop import Agent
    from tools.registry import default_registry

    try:
        registry = default_registry(repo_root, scratch_dir=SCRATCH_DIR / "scratch")
        agent = Agent(llm=llm, tools=registry, max_steps=4)
        result = agent.run(question, summary)
    except Exception as e:
        return f"❌ agent failed: {type(e).__name__}: {e}", ""

    # One markdown bullet per tool invocation, in call order.
    trace_parts = []
    for tc in result.tool_calls:
        trace_parts.append(f"- {tc['name']} {json.dumps(tc['arguments'], ensure_ascii=False)}")
    return result.answer, "\n".join(trace_parts) or "(no tool calls)"
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
# Gradio UI: two tabs — (1) ingest a repo into SCRATCH_DIR/active.json,
# (2) ask questions against the ingested index via the agent loop.
with gr.Blocks(title="REPOMIND — repo-scale coding agent on AMD MI300X", theme=gr.themes.Soft()) as demo:
    gr.Markdown(HEADER_MD)

    with gr.Tab("1. Ingest"):
        with gr.Row():
            url = gr.Textbox(
                label="GitHub URL or owner/repo",
                placeholder="https://github.com/torvalds/linux OR pallets/flask",
                scale=4,
            )
            chunk_tokens = gr.Slider(256, 4096, value=1024, step=128, label="Tokens / chunk", scale=1)
        ingest_btn = gr.Button("Ingest", variant="primary")
        ingest_out = gr.Code(label="Ingestion summary", language="json")
        # ingest() returns either a JSON summary or an error string; both
        # render in the same Code component.
        ingest_btn.click(ingest, [url, chunk_tokens], ingest_out)

    with gr.Tab("2. Ask"):
        with gr.Row():
            backend = gr.Radio(
                choices=["mock (offline demo)", "vllm (live MI300X)"],
                value="mock (offline demo)",
                label="Backend",
                scale=1,
            )
            base_url = gr.Textbox(
                label="vLLM base URL (only used in `vllm` mode)",
                value="",
                placeholder="http://your-mi300x-host:8000/v1",
                scale=2,
            )
            model = gr.Textbox(
                label="Model id",
                value="Qwen/Qwen3-Coder-Next-FP8",
                scale=2,
            )
        question = gr.Textbox(
            label="Question",
            lines=3,
            placeholder="What does the chunker prioritize? Where is authentication handled?",
        )
        ask_btn = gr.Button("Ask", variant="primary")
        answer = gr.Markdown(label="Answer")
        tool_trace = gr.Code(label="Tool trace", language="markdown")

        # normalize backend selector to internal value
        def _ask(q, b, u, m):
            # Radio labels carry UI text ("vllm (live MI300X)"); map to the
            # bare "vllm"/"mock" values ask() expects.
            internal = "vllm" if b.startswith("vllm") else "mock"
            return ask(q, internal, u, m)

        ask_btn.click(_ask, [question, backend, base_url, model], [answer, tool_trace])

    gr.Markdown(
        "---\n"
        "**Author:** [Sardor Razikov](https://huggingface.co/ZeroR3) · "
        "[GitHub](https://github.com/SRKRZ23) · "
        "[lablab.ai](https://lablab.ai/u/@Sardor_R) · "
        "[Zenodo (ECB)](https://doi.org/10.5281/zenodo.19791329)"
    )


if __name__ == "__main__":
    demo.launch()
|
ingestion/__init__.py
ADDED
|
File without changes
|
ingestion/chunker.py
ADDED
|
@@ -0,0 +1,202 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Smart, structure-aware chunking with priority scoring.
|
| 2 |
+
|
| 3 |
+
Per file:
|
| 4 |
+
1. Detect language (filesystem extension).
|
| 5 |
+
2. Extract top-level symbols via tree-sitter (or regex fallback).
|
| 6 |
+
3. Slice file into chunks aligned to symbol boundaries when possible;
|
| 7 |
+
otherwise split on paragraph / blank lines / hard cut.
|
| 8 |
+
4. Tag each chunk with a priority used by the token budgeter:
|
| 9 |
+
0 = README / top-level docs
|
| 10 |
+
1 = top-level symbols (functions, classes)
|
| 11 |
+
2 = nested / private symbols
|
| 12 |
+
3 = test / vendored / generated code
|
| 13 |
+
4 = unknown / binary-ish
|
| 14 |
+
|
| 15 |
+
The agent only sees chunks that fit its context budget — priorities decide
|
| 16 |
+
who gets in first when a 50K-LOC kernel doesn't fit at all.
|
| 17 |
+
"""
|
| 18 |
+
from __future__ import annotations
|
| 19 |
+
import json
|
| 20 |
+
import os
|
| 21 |
+
import re
|
| 22 |
+
from dataclasses import dataclass, asdict
|
| 23 |
+
from pathlib import Path
|
| 24 |
+
from typing import Iterable, List, Optional, Sequence
|
| 25 |
+
|
| 26 |
+
from .parser import Symbol, detect_language, extract_symbols
|
| 27 |
+
from .token_budget import count_tokens
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
SKIP_DIRS = {
|
| 31 |
+
".git", "node_modules", ".venv", "venv", "env", "__pycache__",
|
| 32 |
+
"dist", "build", "target", ".next", ".nuxt", ".cache",
|
| 33 |
+
"vendor", "third_party", "external", ".gradle", ".idea", ".vscode",
|
| 34 |
+
}
|
| 35 |
+
SKIP_BIN_EXT = {
|
| 36 |
+
".png", ".jpg", ".jpeg", ".gif", ".webp", ".bmp", ".ico", ".tiff",
|
| 37 |
+
".pdf", ".zip", ".tar", ".gz", ".bz2", ".7z", ".xz", ".whl", ".egg",
|
| 38 |
+
".so", ".dylib", ".dll", ".exe", ".o", ".a", ".class", ".jar",
|
| 39 |
+
".bin", ".pkl", ".parquet", ".safetensors", ".pt", ".onnx",
|
| 40 |
+
".woff", ".woff2", ".ttf", ".otf", ".mp3", ".mp4", ".mov", ".wav",
|
| 41 |
+
}
|
| 42 |
+
README_NAMES = {"README.md", "README.rst", "README.txt", "README"}
|
| 43 |
+
TEST_PATTERNS = (re.compile(r"(?:^|/)tests?/"), re.compile(r"(?:^|/)test_"), re.compile(r"_test\."))
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
@dataclass
class Chunk:
    """One contiguous, token-bounded slice of a source file."""

    chunk_id: str  # "<rel_path>#<index within file>"
    repo: str  # repo label this chunk belongs to
    path: str  # file path relative to the repo root
    section: str  # symbol name or "header"
    start_line: int  # 1-indexed, inclusive
    end_line: int  # 1-indexed, inclusive
    text: str
    n_tokens: int  # count_tokens(text)
    priority: int  # 0 (docs) .. 4 (unknown); see module docstring for tiers
|
| 58 |
+
|
| 59 |
+
def _is_test_path(rel: str) -> bool:
    """True when *rel* matches any of the test-file path patterns."""
    for pattern in TEST_PATTERNS:
        if pattern.search(rel):
            return True
    return False
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
def _file_priority(rel: str, name: str) -> int:
    """Base priority tier for a file: 0 = README/docs, 3 = tests, 1 = code."""
    if name in README_NAMES or rel.endswith(("README.md", "README.rst")):
        return 0
    if _is_test_path(rel):
        return 3
    segments = rel.split("/")
    if "docs" in segments or "doc" in segments:
        return 0
    return 1
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def _chunk_text_by_symbols(
    text: str, symbols: List[Symbol], max_tokens: int, overlap_lines: int = 4,
) -> List[tuple[str, str, int, int]]:
    """Return [(section, text, start_line, end_line)]. Symbols are sorted by start_line.

    Splits *text* at top-level symbol boundaries, then delegates to
    _chunk_lines to enforce the per-chunk token cap. With no symbols the
    whole file becomes a single "body" section.

    NOTE(review): `overlap_lines` is accepted but never used anywhere in this
    function — either wire it through to _chunk_lines or drop the parameter.
    """
    lines = text.split("\n")
    n = len(lines)
    if not symbols:
        return _chunk_lines("body", lines, 1, n, max_tokens)

    symbols = sorted(symbols, key=lambda s: s.start_line)
    out: List[tuple[str, str, int, int]] = []

    # Header / preamble before first symbol
    if symbols[0].start_line > 1:
        out.extend(_chunk_lines("header", lines, 1, symbols[0].start_line - 1, max_tokens))

    for i, sym in enumerate(symbols):
        # Each symbol's region runs until the next symbol starts (or EOF).
        end = symbols[i + 1].start_line - 1 if i + 1 < len(symbols) else n
        if end < sym.start_line:
            # Overlapping/malformed symbol spans — skip rather than emit empty chunks.
            continue
        out.extend(_chunk_lines(sym.name or sym.kind, lines, sym.start_line, end, max_tokens))
    return out
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
def _chunk_lines(section: str, lines: list[str], lo: int, hi: int, max_tokens: int):
    """Split the 1-indexed inclusive slice [lo..hi] into pieces of <= max_tokens each."""
    pieces: List[tuple[str, str, int, int]] = []
    buf: List[str] = []
    buf_tokens = 0
    piece_start = lo
    for idx in range(lo, hi + 1):
        # Guard against out-of-range indices; treat them as empty lines.
        line = lines[idx - 1] if 0 < idx <= len(lines) else ""
        cost = count_tokens(line) + 1  # +1 accounts for the joining newline
        if buf and buf_tokens + cost > max_tokens:
            # Current piece is full — flush it and start a new one at this line.
            pieces.append((section, "\n".join(buf), piece_start, idx - 1))
            buf = [line]
            buf_tokens = cost
            piece_start = idx
        else:
            buf.append(line)
            buf_tokens += cost
    if buf:
        pieces.append((section, "\n".join(buf), piece_start, hi))
    return pieces
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
def chunk_file(
    repo: str,
    path: Path,
    rel_path: str,
    max_tokens_per_chunk: int = 1024,
) -> List[Chunk]:
    """Chunk a single file into Chunk records.

    Returns [] for binary extensions, unreadable files, and whitespace-only
    content. Priority starts from the file tier; private symbols in ordinary
    code files are demoted one tier.
    """
    if path.suffix.lower() in SKIP_BIN_EXT:
        return []
    try:
        text = path.read_text(encoding="utf-8")
    except (UnicodeDecodeError, OSError):
        return []
    if not text.strip():
        return []

    lang = detect_language(path)
    base_priority = _file_priority(rel_path, path.name)
    pieces = _chunk_text_by_symbols(text, extract_symbols(text, lang), max_tokens_per_chunk)

    chunks: List[Chunk] = []
    for i, (section, ctext, start, end) in enumerate(pieces):
        # Nested / very small private fragments get bumped down a tier.
        prio = 2 if base_priority == 1 and section.startswith("_") else base_priority
        chunks.append(
            Chunk(
                chunk_id=f"{rel_path}#{i}",
                repo=repo,
                path=rel_path,
                section=section,
                start_line=start,
                end_line=end,
                text=ctext,
                n_tokens=count_tokens(ctext),
                priority=prio,
            )
        )
    return chunks
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
def walk_repo(
    root: str | Path,
    repo_label: str,
    max_tokens_per_chunk: int = 1024,
    follow_symlinks: bool = False,
) -> Iterable[Chunk]:
    """Yield chunks for every eligible file under *root*, pruning SKIP_DIRS."""
    base = Path(root).resolve()
    for dirpath, dirnames, filenames in os.walk(base, followlinks=follow_symlinks):
        # In-place prune so os.walk never descends into skipped directories.
        dirnames[:] = [d for d in dirnames if d not in SKIP_DIRS]
        parent = Path(dirpath)
        for fn in filenames:
            full = parent / fn
            try:
                rel = str(full.relative_to(base))
            except ValueError:
                continue
            yield from chunk_file(repo_label, full, rel, max_tokens_per_chunk)
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
def ingest_to_json(
    root: str | Path,
    out_path: str | Path,
    repo_label: Optional[str] = None,
    max_tokens_per_chunk: int = 1024,
) -> dict:
    """Chunk every file under *root*, persist the full index to *out_path*,
    and return the summary dict WITHOUT the (potentially huge) chunk list."""
    resolved = Path(root).resolve()
    label = repo_label or resolved.name
    chunks = list(walk_repo(resolved, label, max_tokens_per_chunk))
    tiers = sorted({c.priority for c in chunks})
    summary = {
        "repo": label,
        "root": str(resolved),
        "n_files": len({c.path for c in chunks}),
        "n_chunks": len(chunks),
        "total_tokens": sum(c.n_tokens for c in chunks),
        "by_priority": {str(t): sum(1 for c in chunks if c.priority == t) for t in tiers},
        "chunks": [asdict(c) for c in chunks],
    }
    destination = Path(out_path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    destination.write_text(json.dumps(summary, ensure_ascii=False))
    return {k: v for k, v in summary.items() if k != "chunks"}
|
ingestion/cloner.py
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Clone a git repository into a local cache directory.
|
| 2 |
+
|
| 3 |
+
Uses GitPython if installed, falls back to shelling out to `git`. Always
|
| 4 |
+
shallow-clones (depth=1) by default — for retrieval we don't need history,
|
| 5 |
+
and shallow makes the Linux kernel ingest in seconds instead of minutes.
|
| 6 |
+
"""
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
import os
|
| 9 |
+
import re
|
| 10 |
+
import shutil
|
| 11 |
+
import subprocess
|
| 12 |
+
from dataclasses import dataclass
|
| 13 |
+
from pathlib import Path
|
| 14 |
+
from typing import Optional
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
URL_RE = re.compile(r"^(https?://|git@)([\w./:-]+?)(\.git)?/?$")
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
@dataclass
class CloneResult:
    """Outcome of clone(): where the repo lives locally and at what commit."""

    url: str  # normalized URL, or the local path string in local-dir mode
    local_path: Path
    sha: str  # HEAD commit, or "no-git" for a non-git local directory
    cached: bool  # True when an existing checkout (or local path) was reused
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def normalize_url(url_or_path: str) -> str:
    """Accept https://, git@, owner/repo, or a local path; return a canonical form."""
    candidate = url_or_path.strip()
    if os.path.isdir(candidate):
        return os.path.abspath(candidate)
    if candidate.startswith(("git@", "http")):
        return candidate
    if "/" in candidate and not candidate.startswith("/"):
        # owner/repo shorthand -> github
        return f"https://github.com/{candidate}.git"
    return candidate
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def slugify(url: str) -> str:
    """Stable filesystem-friendly slug from a URL (or local directory name)."""
    if os.path.isdir(url):
        return Path(url).name
    m = URL_RE.match(url)
    # Matched URLs slug only the host/path body; anything else slugs the whole string.
    base = m.group(2) if m else url
    return re.sub(r"[^a-zA-Z0-9._-]+", "_", base)
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def _git(*args: str, cwd: Optional[Path] = None) -> str:
    """Run `git <args>` and return stripped stdout; raise RuntimeError on failure."""
    proc = subprocess.run(
        ["git", *args],
        cwd=None if cwd is None else str(cwd),
        capture_output=True,
        text=True,
    )
    if proc.returncode != 0:
        raise RuntimeError(f"git {' '.join(args)} failed: {proc.stderr.strip()}")
    return proc.stdout.strip()
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
def clone(
    url_or_path: str,
    cache_dir: str | Path = ".repomind_cache/repos",
    depth: int = 1,
    force: bool = False,
) -> CloneResult:
    """Clone url to cache_dir/<slug>. If local path is given, just return it.

    - depth > 0 uses a shallow clone (--depth N); depth <= 0 takes full history.
    - Always passes --filter=blob:none (partial clone).
    - An existing healthy checkout is reused unless force=True; a checkout
      where `git rev-parse HEAD` fails is wiped and re-cloned.

    Raises RuntimeError (from _git) when any git command fails.
    """
    url = normalize_url(url_or_path)
    cache_dir = Path(cache_dir)
    cache_dir.mkdir(parents=True, exist_ok=True)

    # Local-path mode — no clone, just record the SHA if it's a git repo.
    if os.path.isdir(url):
        local = Path(url)
        try:
            sha = _git("rev-parse", "HEAD", cwd=local)
        except Exception:
            # Plain directory, not a git checkout.
            sha = "no-git"
        return CloneResult(url=str(local), local_path=local, sha=sha, cached=True)

    target = cache_dir / slugify(url)
    if target.exists() and not force:
        try:
            sha = _git("rev-parse", "HEAD", cwd=target)
            return CloneResult(url=url, local_path=target, sha=sha, cached=True)
        except Exception:
            # Corrupt/partial checkout — wipe it and fall through to a fresh clone.
            shutil.rmtree(target, ignore_errors=True)

    if target.exists() and force:
        shutil.rmtree(target, ignore_errors=True)

    args = ["clone", "--filter=blob:none"]
    if depth > 0:
        args += ["--depth", str(depth)]
    args += [url, str(target)]
    _git(*args)
    sha = _git("rev-parse", "HEAD", cwd=target)
    return CloneResult(url=url, local_path=target, sha=sha, cached=False)
|
ingestion/parser.py
ADDED
|
@@ -0,0 +1,161 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tree-sitter-aware parsing with graceful fallback.
|
| 2 |
+
|
| 3 |
+
If `tree-sitter-languages` isn't installed we degrade to a regex-based
|
| 4 |
+
top-level-symbol extractor — good enough for unit tests and for
|
| 5 |
+
languages we don't yet have grammars for.
|
| 6 |
+
"""
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
import re
|
| 9 |
+
from dataclasses import dataclass
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
from typing import Iterable, List, Optional
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
# Map filesystem extensions to tree-sitter grammar names.
# Keys are lowercase suffixes (compared via Path.suffix.lower() in
# detect_language); values must match grammar names accepted by
# tree_sitter_languages.get_parser.
LANG_BY_EXT = {
    ".py": "python", ".pyi": "python",
    ".rs": "rust",
    ".go": "go",
    ".js": "javascript", ".jsx": "javascript",
    ".ts": "typescript", ".tsx": "tsx",
    ".c": "c", ".h": "c",
    ".cc": "cpp", ".cpp": "cpp", ".cxx": "cpp", ".hpp": "cpp",
    ".java": "java",
    ".rb": "ruby",
    ".php": "php",
    ".cs": "c_sharp",
    ".swift": "swift",
    ".kt": "kotlin", ".kts": "kotlin",
    ".sh": "bash", ".bash": "bash",
    ".sql": "sql",
    ".html": "html",
    ".css": "css",
    ".json": "json",
    ".yaml": "yaml", ".yml": "yaml",
    ".toml": "toml",
    ".md": "markdown", ".markdown": "markdown",
}
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
@dataclass
class Symbol:
    """One structural symbol extracted from a source file (1-based lines)."""
    name: str        # identifier, or the raw node type when no name was found
    kind: str  # function / class / method / module / heading
    start_line: int  # first line of the definition, 1-based
    end_line: int    # last line; equals start_line for regex-extracted symbols
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def detect_language(path: str | Path) -> Optional[str]:
    """Map a file path's extension to its grammar name, or None if unknown."""
    return LANG_BY_EXT.get(Path(path).suffix.lower())
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def extract_symbols(text: str, language: Optional[str]) -> List[Symbol]:
    """Top-level structural symbols for *text*.

    Prefers tree-sitter; any failure there (missing dependency, no grammar,
    parse error) silently falls back to the regex extractor.
    """
    if not text.strip():
        return []
    try:
        symbols = _ts_symbols(text, language)
    except Exception:
        symbols = _regex_symbols(text, language)
    return symbols
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
# ---- tree-sitter path -----------------------------------------------------
|
| 64 |
+
|
| 65 |
+
def _ts_symbols(text: str, language: Optional[str]) -> List[Symbol]:
    """Tree-sitter extraction. Raises RuntimeError on any setup failure so
    extract_symbols can fall back to the regex path."""
    if not language:
        raise RuntimeError("no language")
    try:
        from tree_sitter_languages import get_parser  # type: ignore
    except ImportError as e:
        raise RuntimeError("tree_sitter_languages not installed") from e

    parser = get_parser(language)
    tree = parser.parse(text.encode("utf-8"))
    out: List[Symbol] = []

    # Node types surfaced as symbols; names vary per grammar, hence the union.
    interesting = {
        "function_definition", "function_declaration", "method_definition",
        "class_definition", "class_declaration", "struct_item", "trait_item",
        "impl_item", "enum_item", "type_alias_declaration",
        "atx_heading", "setext_heading",
    }

    def walk(node):
        # Depth-first walk; children of an interesting node are visited too,
        # so nested defs / methods are reported as separate symbols.
        if node.type in interesting:
            name = _node_name(node, text) or node.type
            kind = _kind_for(node.type)
            out.append(Symbol(
                name=name, kind=kind,
                # tree-sitter rows are 0-based; Symbol lines are 1-based.
                start_line=node.start_point[0] + 1,
                end_line=node.end_point[0] + 1,
            ))
        for c in node.children:
            walk(c)

    walk(tree.root_node)
    return out
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
def _node_name(node, text: str) -> Optional[str]:
|
| 101 |
+
for c in node.children:
|
| 102 |
+
if c.type == "identifier" or c.type == "type_identifier":
|
| 103 |
+
return text[c.start_byte:c.end_byte]
|
| 104 |
+
for cc in c.children:
|
| 105 |
+
if cc.type in ("identifier", "type_identifier"):
|
| 106 |
+
return text[cc.start_byte:cc.end_byte]
|
| 107 |
+
if node.type in ("atx_heading", "setext_heading"):
|
| 108 |
+
return text[node.start_byte:node.end_byte].strip().lstrip("#").strip()
|
| 109 |
+
return None
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
def _kind_for(node_type: str) -> str:
|
| 113 |
+
if "class" in node_type or "struct" in node_type or "trait" in node_type or "impl" in node_type:
|
| 114 |
+
return "class"
|
| 115 |
+
if "method" in node_type:
|
| 116 |
+
return "method"
|
| 117 |
+
if "function" in node_type:
|
| 118 |
+
return "function"
|
| 119 |
+
if "heading" in node_type:
|
| 120 |
+
return "heading"
|
| 121 |
+
return "symbol"
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
# ---- regex fallback -------------------------------------------------------

PY_DEF = re.compile(r"^(?P<indent>\s*)(?:async\s+)?def\s+(?P<name>[A-Za-z_][\w]*)\s*\(", re.MULTILINE)
PY_CLASS = re.compile(r"^(?P<indent>\s*)class\s+(?P<name>[A-Za-z_][\w]*)", re.MULTILINE)
RS_FN = re.compile(r"^\s*(?:pub(?:\([^)]*\))?\s+)?fn\s+(?P<name>[A-Za-z_][\w]*)", re.MULTILINE)
GO_FN = re.compile(r"^\s*func\s+(?:\([^)]*\)\s+)?(?P<name>[A-Za-z_][\w]*)", re.MULTILINE)
JS_FN = re.compile(r"^\s*(?:export\s+)?(?:async\s+)?function\s+(?P<name>[A-Za-z_$][\w$]*)", re.MULTILINE)
MD_HEADING = re.compile(r"^(#{1,6})\s+(?P<name>.+)$", re.MULTILINE)


def _regex_symbols(text: str, language: Optional[str]) -> List[Symbol]:
    """Regex fallback: top-level symbols for a handful of languages.

    Start lines are accurate; end_line is set equal to start_line because a
    regex cannot see where a block ends. Unknown languages yield [].
    """
    # (Fix: removed a dead `lines = text.split("\n")` local that was never used.)
    out: List[Symbol] = []

    def add(name: str, kind: str, m: re.Match):
        # 1-based line number of the match start.
        line = text[:m.start()].count("\n") + 1
        out.append(Symbol(name=name, kind=kind, start_line=line, end_line=line))

    if language == "python":
        for m in PY_CLASS.finditer(text):
            add(m.group("name"), "class", m)
        for m in PY_DEF.finditer(text):
            # An indented def is (approximately) a method.
            indent = m.group("indent")
            kind = "method" if indent else "function"
            add(m.group("name"), kind, m)
    elif language == "rust":
        for m in RS_FN.finditer(text):
            add(m.group("name"), "function", m)
    elif language == "go":
        for m in GO_FN.finditer(text):
            add(m.group("name"), "function", m)
    elif language in ("javascript", "typescript", "tsx"):
        for m in JS_FN.finditer(text):
            add(m.group("name"), "function", m)
    elif language == "markdown":
        for m in MD_HEADING.finditer(text):
            add(m.group("name").strip(), "heading", m)
    return out
|
ingestion/token_budget.py
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Token-aware operations: counting + priority-based truncation.
|
| 2 |
+
|
| 3 |
+
Uses tiktoken when available (cl100k_base encoder approximates Qwen tokens
|
| 4 |
+
within ~3 % on natural code/prose). Falls back to a 3.6 chars-per-token
|
| 5 |
+
estimator otherwise.
|
| 6 |
+
"""
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
from typing import Iterable, List, Sequence
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
_ENC = None
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def _get_encoder():
|
| 15 |
+
global _ENC
|
| 16 |
+
if _ENC is not None:
|
| 17 |
+
return _ENC
|
| 18 |
+
try:
|
| 19 |
+
import tiktoken
|
| 20 |
+
_ENC = tiktoken.get_encoding("cl100k_base")
|
| 21 |
+
except Exception:
|
| 22 |
+
_ENC = False
|
| 23 |
+
return _ENC
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def count_tokens(text: str) -> int:
    """Best-effort token count for the configured encoder."""
    if not text:
        return 0
    encoder = _get_encoder()
    if not encoder:
        # Heuristic: ~3.6 chars/token on mixed code+prose; never report 0
        # for non-empty text.
        return max(1, int(len(text) / 3.6))
    return len(encoder.encode(text, disallowed_special=()))
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def truncate_to(text: str, max_tokens: int) -> str:
    """Drop trailing tokens from *text* so it fits within *max_tokens*."""
    if max_tokens <= 0:
        return ""
    encoder = _get_encoder()
    if not encoder:
        # Character-budget approximation mirroring count_tokens().
        return text[: int(max_tokens * 3.6)]
    tokens = encoder.encode(text, disallowed_special=())
    if len(tokens) <= max_tokens:
        return text
    return encoder.decode(tokens[:max_tokens])
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def fit_priority(
    items: Sequence[tuple[str, int]],  # (text, priority — lower = include first)
    max_tokens: int,
) -> str:
    """Pack texts in priority order until budget exhausted; truncate the last fitting one."""
    out: List[str] = []
    used = 0
    # Stable sort: equal priorities keep their original relative order.
    sorted_items = sorted(items, key=lambda t: t[1])
    for text, _prio in sorted_items:
        n = count_tokens(text)
        if used + n <= max_tokens:
            out.append(text)
            used += n
        else:
            # First item that doesn't fit ends the packing: truncate it into
            # the leftover budget (if the leftover is worth using) and stop.
            remaining = max_tokens - used
            if remaining > 32:
                out.append(truncate_to(text, remaining))
                used = max_tokens
            break
    # NOTE: joined token count can slightly exceed max_tokens because the
    # "\n\n" separators are not charged against the budget.
    return "\n\n".join(out)
|
requirements.txt
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio>=4.40
|
| 2 |
+
gitpython>=3.1
|
| 3 |
+
tree-sitter>=0.22
|
| 4 |
+
tree-sitter-languages>=1.10
|
| 5 |
+
tiktoken>=0.7
|
| 6 |
+
pydantic>=2.6
|
| 7 |
+
openai>=1.40
|
serving/__init__.py
ADDED
|
File without changes
|
serving/base.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""LLM client protocol used by the agent loop."""
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
from dataclasses import dataclass, field
|
| 4 |
+
from typing import Any, Dict, List, Protocol
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
@dataclass
class ToolCall:
    """One tool invocation requested by the model."""
    id: str                    # provider-assigned call id
    name: str                  # tool name as exposed in the tool schema
    arguments: Dict[str, Any]  # arguments already decoded from JSON
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
@dataclass
class LLMResponse:
    """Normalized LLM completion: assistant text and/or requested tool calls."""
    content: str                                    # assistant text ("" when only tool calls)
    tool_calls: List[ToolCall] = field(default_factory=list)
    usage: Dict[str, int] = field(default_factory=dict)  # e.g. {"prompt": N, "completion": M}
    raw: Any = None                                 # backend-specific response object, if any
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class LLMClient(Protocol):
    """Structural interface every LLM backend (mock, vLLM, ...) must satisfy."""

    def complete(
        self,
        messages: List[Dict[str, Any]],
        tools: List[Dict[str, Any]],
        **kwargs: Any,
    ) -> LLMResponse: ...
|
serving/mock_client.py
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Deterministic offline LLM stub. Drives unit tests without a GPU.
|
| 2 |
+
|
| 3 |
+
Each call inspects the latest user / tool message and decides what to do
|
| 4 |
+
based on simple heuristics:
|
| 5 |
+
- Question contains 'grep' or 'search' → emit grep_codebase tool call
|
| 6 |
+
- Question contains 'read' or 'show' → emit read_file tool call
|
| 7 |
+
- After two tool turns → emit a final answer
|
| 8 |
+
"""
|
| 9 |
+
from __future__ import annotations
|
| 10 |
+
import json
|
| 11 |
+
from dataclasses import dataclass
|
| 12 |
+
from typing import Any, Dict, List
|
| 13 |
+
|
| 14 |
+
from .base import LLMClient, LLMResponse, ToolCall
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
@dataclass
class MockClient(LLMClient):
    """Offline stand-in for a real LLM.

    Keyword heuristics on the latest user question pick the next tool call;
    once ``max_tool_turns`` tool rounds have happened, a canned final answer
    echoing the collected tool output is emitted instead.
    """

    max_tool_turns: int = 2

    def complete(self, messages: List[Dict[str, Any]], tools: List[Dict[str, Any]], **kwargs: Any) -> LLMResponse:
        turns_so_far = sum(1 for msg in messages if msg.get("role") == "tool")
        latest_user = next((msg for msg in reversed(messages) if msg.get("role") == "user"), None)
        question = (latest_user.get("content") if latest_user else "") or ""

        if turns_so_far >= self.max_tool_turns:
            return self._final_answer(messages, question)
        return LLMResponse(
            content="",
            tool_calls=[self._next_call(question, turns_so_far)],
            usage={"prompt": 0, "completion": 0},
        )

    def _final_answer(self, messages: List[Dict[str, Any]], question: str) -> LLMResponse:
        # Summarize by echoing (truncated) tool output back at the caller.
        outputs = [msg.get("content", "") for msg in messages if msg.get("role") == "tool"]
        joined = "\n".join(outputs)[:1500]
        answer = (
            "Based on the inspected files, here is what I found:\n\n"
            f"{joined or '(no tool output)'}\n\n"
            f"Original question: {question.strip()}"
        )
        return LLMResponse(content=answer, tool_calls=[], usage={"prompt": 0, "completion": 0})

    def _next_call(self, question: str, turn: int) -> ToolCall:
        # Decide which tool to call next from simple keyword heuristics.
        q = question.lower()
        call_id = f"call_{turn}"
        if any(k in q for k in ("grep", "search", "find", "where", "occurr")):
            return ToolCall(
                id=call_id, name="grep_codebase",
                arguments={"pattern": _extract_term(question), "max_results": 10},
            )
        if any(k in q for k in ("git", "history", "commits", "log")):
            return ToolCall(id=call_id, name="git_log", arguments={"limit": 10})
        if any(k in q for k in ("test", "run", "pytest")):
            return ToolCall(id=call_id, name="run_tests", arguments={})
        if any(k in q for k in ("read", "show", "open", "file")):
            return ToolCall(
                id=call_id, name="read_file",
                arguments={"path": _extract_path(question) or "README.md"},
            )
        # default to a grep so we always exercise tool path
        return ToolCall(
            id=call_id, name="grep_codebase",
            arguments={"pattern": _extract_term(question), "max_results": 5},
        )
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
def _extract_term(text: str) -> str:
|
| 62 |
+
# crude term extraction for tests: take first identifier-like token longer than 3 chars
|
| 63 |
+
import re
|
| 64 |
+
for m in re.finditer(r"[A-Za-z_][\w]{3,}", text):
|
| 65 |
+
if m.group(0).lower() not in {"what", "where", "find", "grep", "search", "show", "file", "read"}:
|
| 66 |
+
return m.group(0)
|
| 67 |
+
return text.strip()[:32] or "."
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
def _extract_path(text: str) -> str:
|
| 71 |
+
import re
|
| 72 |
+
m = re.search(r"[\w./-]+\.[A-Za-z]{1,5}", text)
|
| 73 |
+
return m.group(0) if m else ""
|
serving/vllm_client.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""vLLM-backed client (OpenAI-compatible API).
|
| 2 |
+
|
| 3 |
+
Targets a local vLLM server running:
|
| 4 |
+
vllm serve Qwen/Qwen3-Coder-Next-FP8 \\
|
| 5 |
+
--tool-call-parser qwen3_coder \\
|
| 6 |
+
--max-model-len 262144 \\
|
| 7 |
+
--kv-cache-dtype fp8
|
| 8 |
+
|
| 9 |
+
The server returns tool-calls in the OpenAI function-calling format, which
|
| 10 |
+
we translate to our internal ToolCall dataclass.
|
| 11 |
+
"""
|
| 12 |
+
from __future__ import annotations
|
| 13 |
+
import json
|
| 14 |
+
from dataclasses import dataclass
|
| 15 |
+
from typing import Any, Dict, List
|
| 16 |
+
|
| 17 |
+
from .base import LLMClient, LLMResponse, ToolCall
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
@dataclass
class VLLMClient(LLMClient):
    """OpenAI-SDK client pointed at a local vLLM server (see module docstring)."""

    base_url: str = "http://localhost:8000/v1"
    model: str = "Qwen/Qwen3-Coder-Next-FP8"
    api_key: str = "EMPTY"  # vLLM ignores the key but the SDK requires one
    timeout: float = 300.0
    temperature: float = 0.2
    max_tokens: int = 2048

    def __post_init__(self):
        # Import lazily so the mock backend works without the openai package.
        try:
            from openai import OpenAI  # type: ignore
        except ImportError as e:
            raise ImportError("pip install openai") from e
        self._client = OpenAI(base_url=self.base_url, api_key=self.api_key, timeout=self.timeout)

    def complete(
        self,
        messages: List[Dict[str, Any]],
        tools: List[Dict[str, Any]],
        **kwargs: Any,
    ) -> LLMResponse:
        """One chat completion; translates OpenAI tool calls into ToolCall.

        kwargs may override temperature / max_tokens / tool_choice per call.
        """
        kw = {
            "model": self.model,
            "messages": messages,
            "temperature": kwargs.get("temperature", self.temperature),
            "max_tokens": kwargs.get("max_tokens", self.max_tokens),
        }
        if tools:
            kw["tools"] = tools
            kw["tool_choice"] = kwargs.get("tool_choice", "auto")

        resp = self._client.chat.completions.create(**kw)
        choice = resp.choices[0].message
        content = choice.content or ""
        tool_calls: List[ToolCall] = []
        for tc in (choice.tool_calls or []):
            try:
                # Arguments arrive as a JSON string; tolerate malformed output.
                args = json.loads(tc.function.arguments or "{}")
            except json.JSONDecodeError:
                args = {}
            tool_calls.append(ToolCall(id=tc.id, name=tc.function.name, arguments=args))

        usage = {}
        if getattr(resp, "usage", None):
            usage = {
                "prompt": resp.usage.prompt_tokens,
                "completion": resp.usage.completion_tokens,
            }
        return LLMResponse(content=content, tool_calls=tool_calls, usage=usage, raw=resp)
|
tools/__init__.py
ADDED
|
File without changes
|
tools/base.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tool-call protocol shared by every tool. Mirrors the qwen3_coder schema
|
| 2 |
+
so vLLM's built-in tool parser can dispatch directly into our registry.
|
| 3 |
+
"""
|
| 4 |
+
from __future__ import annotations
|
| 5 |
+
import json
|
| 6 |
+
from dataclasses import dataclass, field
|
| 7 |
+
from typing import Any, Callable, Dict, List
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
@dataclass
class ToolResult:
    """Outcome of a single tool invocation."""

    ok: bool        # True when the tool ran successfully
    output: str     # tool payload (may be empty on failure)
    error: str = ""  # human-readable failure reason
    extra: Dict[str, Any] = field(default_factory=dict)  # structured metadata (counts, rc, ...)

    def to_message(self) -> Dict[str, Any]:
        """Render as a role='tool' chat message for the LLM transcript."""
        if self.ok:
            return {"role": "tool", "content": self.output}
        return {"role": "tool", "content": f"[error] {self.error}"}
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
@dataclass
class ToolSpec:
    """Declarative description plus runner callable for one agent tool."""
    name: str                          # unique tool name exposed to the LLM
    description: str                   # one-liner shown in the function-calling schema
    parameters: Dict[str, Any]         # JSON-schema object describing the arguments
    runner: Callable[..., ToolResult]  # invoked with the parsed arguments as keywords
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
class ToolRegistry:
|
| 31 |
+
def __init__(self):
|
| 32 |
+
self._tools: Dict[str, ToolSpec] = {}
|
| 33 |
+
|
| 34 |
+
def register(self, spec: ToolSpec):
|
| 35 |
+
self._tools[spec.name] = spec
|
| 36 |
+
|
| 37 |
+
def names(self) -> List[str]:
|
| 38 |
+
return list(self._tools)
|
| 39 |
+
|
| 40 |
+
def schema(self) -> List[Dict[str, Any]]:
|
| 41 |
+
"""OpenAI / qwen3_coder tool schema (function-calling style)."""
|
| 42 |
+
return [
|
| 43 |
+
{
|
| 44 |
+
"type": "function",
|
| 45 |
+
"function": {
|
| 46 |
+
"name": s.name,
|
| 47 |
+
"description": s.description,
|
| 48 |
+
"parameters": s.parameters,
|
| 49 |
+
},
|
| 50 |
+
}
|
| 51 |
+
for s in self._tools.values()
|
| 52 |
+
]
|
| 53 |
+
|
| 54 |
+
def call(self, name: str, args: Dict[str, Any]) -> ToolResult:
|
| 55 |
+
if name not in self._tools:
|
| 56 |
+
return ToolResult(ok=False, output="", error=f"unknown tool: {name}")
|
| 57 |
+
try:
|
| 58 |
+
return self._tools[name].runner(**args)
|
| 59 |
+
except TypeError as e:
|
| 60 |
+
return ToolResult(ok=False, output="", error=f"bad args: {e}")
|
| 61 |
+
except Exception as e:
|
| 62 |
+
return ToolResult(ok=False, output="", error=f"{type(e).__name__}: {e}")
|
tools/execute_code.py
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""execute_code tool — sandboxed Python runner.
|
| 2 |
+
|
| 3 |
+
Subprocess with -I -S, no network (best-effort via env), CPU + wall-clock
|
| 4 |
+
limits, file IO restricted to a temp scratch dir. Sandboxing here is
|
| 5 |
+
defense-in-depth, not airtight — REPOMIND's threat model assumes the
|
| 6 |
+
operator trusts the model. The point is preventing accidental damage to
|
| 7 |
+
the repo, not stopping a determined adversary.
|
| 8 |
+
"""
|
| 9 |
+
from __future__ import annotations
|
| 10 |
+
import os
|
| 11 |
+
import resource
|
| 12 |
+
import subprocess
|
| 13 |
+
import sys
|
| 14 |
+
import tempfile
|
| 15 |
+
import textwrap
|
| 16 |
+
from pathlib import Path
|
| 17 |
+
|
| 18 |
+
from .base import ToolResult, ToolSpec
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
PREAMBLE = textwrap.dedent("""\
|
| 22 |
+
import sys, os, signal, resource
|
| 23 |
+
# disable network sockets at python level
|
| 24 |
+
try:
|
| 25 |
+
import socket
|
| 26 |
+
def _block(*_a, **_k):
|
| 27 |
+
raise RuntimeError("network disabled in sandbox")
|
| 28 |
+
socket.socket = _block # type: ignore
|
| 29 |
+
socket.create_connection = _block # type: ignore
|
| 30 |
+
except Exception:
|
| 31 |
+
pass
|
| 32 |
+
""")
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def _set_limits(cpu_seconds: int = 30, mem_mb: int = 1024):
    """Apply CPU and address-space rlimits to the *current* process.

    Intended to run in the sandbox child via ``preexec_fn``. Each limit is
    best-effort: platforms that reject a value simply keep their defaults.
    """
    try:
        # Soft limit at cpu_seconds, hard one second later (SIGKILL backstop).
        resource.setrlimit(resource.RLIMIT_CPU, (cpu_seconds, cpu_seconds + 1))
    except (ValueError, OSError):
        pass
    try:
        resource.setrlimit(resource.RLIMIT_AS, (mem_mb * 1024 * 1024, mem_mb * 1024 * 1024))
    except (ValueError, OSError):
        pass
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def make_tool(scratch_dir: str | Path = ".repomind_cache/scratch", timeout: int = 30) -> ToolSpec:
    """Build the execute_code ToolSpec.

    *scratch_dir* is both where scripts are written and the child's cwd;
    *timeout* is the hard cap a caller-supplied timeout_seconds cannot exceed.
    """
    scratch = Path(scratch_dir)
    scratch.mkdir(parents=True, exist_ok=True)

    def run(code: str, timeout_seconds: int = 0) -> ToolResult:
        # 0 (or out-of-range) means "use the configured default".
        timeout_s = timeout_seconds if 0 < timeout_seconds <= timeout else timeout
        # delete=False: the child process must be able to open the script;
        # it is removed in the finally block below.
        with tempfile.NamedTemporaryFile("w", suffix=".py", dir=str(scratch), delete=False) as f:
            f.write(PREAMBLE + "\n" + code)
            script_path = f.name

        env = os.environ.copy()
        env["PYTHONDONTWRITEBYTECODE"] = "1"
        env["NO_COLOR"] = "1"
        env["PYTHONIOENCODING"] = "utf-8"
        # block obvious network env that requests / urllib3 read
        for k in ("HTTP_PROXY", "HTTPS_PROXY", "NO_PROXY"):
            env.pop(k, None)

        try:
            # -I: isolated mode (no user site, no env hooks); -S: skip site.py.
            proc = subprocess.run(
                [sys.executable, "-I", "-S", script_path],
                capture_output=True, text=True, timeout=timeout_s,
                cwd=str(scratch), env=env,
                preexec_fn=lambda: _set_limits(timeout_s, 1024),
            )
        except subprocess.TimeoutExpired:
            return ToolResult(ok=False, output="", error=f"timeout after {timeout_s}s")
        except Exception as e:
            return ToolResult(ok=False, output="", error=f"sandbox error: {e}")
        finally:
            # Always delete the temp script, even on the early returns above.
            try:
                os.unlink(script_path)
            except OSError:
                pass

        # Tail-truncate so a chatty script can't blow up the LLM context.
        out = (proc.stdout or "")[-8000:]
        err = (proc.stderr or "")[-4000:]
        if proc.returncode == 0:
            return ToolResult(ok=True, output=out or "(no output)", extra={"returncode": 0})
        return ToolResult(
            ok=False,
            output=out,
            error=err.strip() or f"non-zero return: {proc.returncode}",
            extra={"returncode": proc.returncode},
        )

    return ToolSpec(
        name="execute_code",
        description="Run a Python snippet in a sandboxed subprocess. No network, CPU+memory limits, isolated cwd.",
        parameters={
            "type": "object",
            "properties": {
                "code": {"type": "string", "description": "Python source to execute."},
                "timeout_seconds": {"type": "integer", "default": 0, "description": "Override default timeout (cap 30s)."},
            },
            "required": ["code"],
        },
        runner=run,
    )
|
tools/git_log.py
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""git_log tool — read-only commit history queries."""
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
import subprocess
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
|
| 6 |
+
from .base import ToolResult, ToolSpec
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def make_tool(repo_root: str | Path, max_commits: int = 50) -> ToolSpec:
    """Build the read-only git_log ToolSpec bound to *repo_root*."""
    root = Path(repo_root).resolve()

    def run(path: str = "", limit: int = 20, since: str = "") -> ToolResult:
        # .exists() (not .is_dir()) also accepts the .git *file* that
        # worktrees/submodules use.
        if not (root / ".git").exists():
            return ToolResult(ok=False, output="", error="not a git repository")
        # Clamp the caller's limit into [1, max_commits].
        n = max(1, min(limit, max_commits))
        cmd = ["git", "log", f"-n{n}", "--pretty=format:%h|%an|%ai|%s"]
        if since:
            cmd += [f"--since={since}"]
        if path:
            # "--" guards against path being parsed as a revision.
            cmd += ["--", path]
        try:
            proc = subprocess.run(cmd, cwd=str(root), capture_output=True, text=True, timeout=30)
        except subprocess.TimeoutExpired:
            return ToolResult(ok=False, output="", error="git log timeout")
        if proc.returncode != 0:
            return ToolResult(ok=False, output="", error=proc.stderr.strip())
        rows = [_format_row(line) for line in proc.stdout.splitlines() if line]
        return ToolResult(ok=True, output="\n".join(rows), extra={"commits": len(rows)})

    return ToolSpec(
        name="git_log",
        description="Read commit history. Optionally filter by path or --since.",
        parameters={
            "type": "object",
            "properties": {
                "path": {"type": "string", "default": ""},
                "limit": {"type": "integer", "default": 20},
                "since": {"type": "string", "default": "", "description": "git --since (e.g. '2 weeks ago')"},
            },
        },
        runner=run,
    )
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def _format_row(line: str) -> str:
|
| 46 |
+
parts = line.split("|", 3)
|
| 47 |
+
if len(parts) != 4:
|
| 48 |
+
return line
|
| 49 |
+
sha, author, date, subject = parts
|
| 50 |
+
return f"{sha[:9]} {date[:10]} {author[:24]:<24} {subject}"
|
tools/grep.py
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""grep_codebase tool — ripgrep-style search inside the ingested repo.
|
| 2 |
+
|
| 3 |
+
Uses Python's `re` so we don't depend on rg being installed; that lets the
|
| 4 |
+
tool run identically in tests, in the local sandbox, and on AMD Cloud.
|
| 5 |
+
"""
|
| 6 |
+
from __future__ import annotations
|
| 7 |
+
import os
|
| 8 |
+
import re
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
from typing import List
|
| 11 |
+
|
| 12 |
+
from .base import ToolResult, ToolSpec
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
# Directory names never descended into while walking the repo.
SKIP_DIRS = {".git", "node_modules", ".venv", "venv", "__pycache__", "target", "build", "dist"}


def make_tool(
    repo_root: str | Path,
    max_matches: int = 200,
    max_file_size: int = 2_000_000,
) -> ToolSpec:
    """Build the grep_codebase ToolSpec bound to *repo_root*.

    *max_matches* is the hard cap on results regardless of the caller's
    max_results; files larger than *max_file_size* bytes are skipped.
    """
    root = Path(repo_root).resolve()

    def run(pattern: str, path: str = "", case_sensitive: bool = False, max_results: int = 50) -> ToolResult:
        # Compile up front so an invalid pattern becomes a clean tool error.
        try:
            flags = 0 if case_sensitive else re.IGNORECASE
            rx = re.compile(pattern, flags)
        except re.error as e:
            return ToolResult(ok=False, output="", error=f"invalid regex: {e}")

        scope = (root / path).resolve() if path else root
        # Reject traversal outside the ingested repo (e.g. path="../..").
        try:
            scope.relative_to(root)
        except ValueError:
            return ToolResult(ok=False, output="", error=f"path outside repo: {path}")
        if not scope.exists():
            return ToolResult(ok=False, output="", error=f"not found: {path}")

        hits: List[str] = []
        n = 0
        # Caller's request, clamped by the tool-level hard cap.
        cap = min(max_results, max_matches)

        def consider(filepath: Path):
            # Scan one file line by line; appends to hits / bumps n up to cap.
            nonlocal n
            if n >= cap:
                return
            try:
                if filepath.stat().st_size > max_file_size:
                    return
            except OSError:
                return
            try:
                # errors="replace" keeps binary-ish files from aborting the scan.
                text = filepath.read_text(encoding="utf-8", errors="replace")
            except OSError:
                return
            for ln, line in enumerate(text.split("\n"), start=1):
                if rx.search(line):
                    rel = str(filepath.relative_to(root))
                    hits.append(f"{rel}:{ln}: {line.rstrip()}")
                    n += 1
                    if n >= cap:
                        return

        if scope.is_file():
            consider(scope)
        else:
            for dirpath, dirnames, filenames in os.walk(scope):
                # In-place prune so os.walk never descends into vendored dirs.
                dirnames[:] = [d for d in dirnames if d not in SKIP_DIRS]
                for fn in filenames:
                    consider(Path(dirpath) / fn)
                    if n >= cap:
                        break
                if n >= cap:
                    break

        if not hits:
            return ToolResult(ok=True, output="(no matches)", extra={"matches": 0})
        return ToolResult(ok=True, output="\n".join(hits), extra={"matches": n, "capped": n >= cap})

    return ToolSpec(
        name="grep_codebase",
        description="Search regular expression across files in the ingested repo. Returns path:line:match.",
        parameters={
            "type": "object",
            "properties": {
                "pattern": {"type": "string"},
                "path": {"type": "string", "description": "Limit search to this subpath. Empty = whole repo.", "default": ""},
                "case_sensitive": {"type": "boolean", "default": False},
                "max_results": {"type": "integer", "default": 50},
            },
            "required": ["pattern"],
        },
        runner=run,
    )
|
tools/read_file.py
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""read_file tool — read a slice of a file inside the ingested repo root."""
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
|
| 5 |
+
from .base import ToolResult, ToolSpec
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def make_tool(repo_root: str | Path, max_bytes: int = 200_000) -> ToolSpec:
    """Build the read_file ToolSpec.

    The tool returns a line-numbered slice of a file under *repo_root*.
    Paths that resolve outside the repo are rejected, and very large files
    are truncated at *max_bytes* before slicing.
    """
    root = Path(repo_root).resolve()

    def run(path: str, start_line: int = 1, end_line: int = -1) -> ToolResult:
        # Security: only allow paths that resolve inside the ingested root
        # (blocks `../` escapes and absolute paths).
        target = (root / path).resolve()
        try:
            target.relative_to(root)
        except ValueError:
            return ToolResult(ok=False, output="", error=f"path outside repo: {path}")
        if not target.exists():
            return ToolResult(ok=False, output="", error=f"not found: {path}")
        try:
            text = target.read_text(encoding="utf-8", errors="replace")
        except OSError as e:
            return ToolResult(ok=False, output="", error=str(e))
        if len(text) > max_bytes:
            # NOTE(review): this measures decoded characters, which only
            # approximates bytes for non-ASCII content; acceptable as a
            # size guard but the name "max_bytes" slightly overstates it.
            text = text[:max_bytes] + f"\n[... truncated at {max_bytes} bytes]"
        lines = text.split("\n")
        start = max(1, start_line)
        # Any non-positive end_line (the documented -1, plus 0 or other
        # negatives) means "to end of file"; otherwise clamp to file length.
        # Previously end_line=-5 silently dropped the last 5 lines via the
        # min() with a negative number.
        end = len(lines) if end_line < 1 else min(len(lines), end_line)
        numbered = "\n".join(
            f"{i:>5} {l}" for i, l in enumerate(lines[start - 1:end], start=start)
        )
        return ToolResult(
            ok=True,
            output=numbered,
            extra={"path": path, "lines": (start, end), "total_lines": len(lines)},
        )

    return ToolSpec(
        name="read_file",
        description="Read a file from the ingested repo. Optionally restrict to a line range.",
        parameters={
            "type": "object",
            "properties": {
                "path": {"type": "string", "description": "Path relative to repo root."},
                "start_line": {"type": "integer", "description": "1-indexed inclusive start.", "default": 1},
                "end_line": {"type": "integer", "description": "1-indexed inclusive end. -1 = end of file.", "default": -1},
            },
            "required": ["path"],
        },
        runner=run,
    )
|
tools/registry.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Build the default tool registry for a given repo root."""
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
|
| 5 |
+
from .base import ToolRegistry
|
| 6 |
+
from .execute_code import make_tool as make_execute
|
| 7 |
+
from .git_log import make_tool as make_git_log
|
| 8 |
+
from .grep import make_tool as make_grep
|
| 9 |
+
from .read_file import make_tool as make_read_file
|
| 10 |
+
from .run_tests import make_tool as make_run_tests
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def default_registry(repo_root: str | Path, scratch_dir: str | Path = ".repomind_cache/scratch") -> ToolRegistry:
    """Assemble the standard five-tool registry for *repo_root*.

    Tools are registered in a fixed order: read_file, grep_codebase,
    execute_code (scratch work goes under *scratch_dir*), run_tests, git_log.
    """
    registry = ToolRegistry()
    for spec in (
        make_read_file(repo_root),
        make_grep(repo_root),
        make_execute(scratch_dir),
        make_run_tests(repo_root),
        make_git_log(repo_root),
    ):
        registry.register(spec)
    return registry
|
tools/run_tests.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""run_tests tool — pytest invocation inside the ingested repo (read-only)."""
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
import subprocess
|
| 4 |
+
import sys
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
|
| 7 |
+
from .base import ToolResult, ToolSpec
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def make_tool(repo_root: str | Path, timeout: int = 120) -> ToolSpec:
    """Build the run_tests ToolSpec: a pytest runner scoped to the repo.

    pytest runs in a subprocess with cwd at the repo root and is killed
    after *timeout* seconds. Only the last *max_lines* lines of stdout are
    returned so long test logs don't blow the model's context budget.
    """
    root = Path(repo_root).resolve()

    def run(test_path: str = "", k_expression: str = "", max_lines: int = 200) -> ToolResult:
        # Security: keep the target inside the ingested repo, mirroring the
        # path confinement done by read_file / grep_codebase.
        target = (root / test_path).resolve() if test_path else root
        try:
            target.relative_to(root)
        except ValueError:
            return ToolResult(ok=False, output="", error=f"path outside repo: {test_path}")
        # Fail fast with a clear message instead of letting pytest exit with
        # a usage error on a missing path (consistent with the sibling tools).
        if not target.exists():
            return ToolResult(ok=False, output="", error=f"not found: {test_path}")

        cmd = [sys.executable, "-m", "pytest", "-x", "--tb=short", "-q", str(target)]
        if k_expression:
            cmd += ["-k", k_expression]
        try:
            proc = subprocess.run(
                cmd, capture_output=True, text=True, cwd=str(root), timeout=timeout,
            )
        except subprocess.TimeoutExpired:
            return ToolResult(ok=False, output="", error=f"pytest timeout after {timeout}s")
        except FileNotFoundError:
            return ToolResult(ok=False, output="", error="pytest not installed")

        # Keep only the tail of stdout; failures summarize at the end anyway.
        lines = (proc.stdout or "").splitlines()[-max_lines:]
        out = "\n".join(lines)
        err = (proc.stderr or "").strip()
        if proc.returncode == 0:
            return ToolResult(ok=True, output=out or "(all passed)", extra={"returncode": 0})
        return ToolResult(
            ok=False, output=out, error=err or f"pytest exit {proc.returncode}",
            extra={"returncode": proc.returncode},
        )

    return ToolSpec(
        name="run_tests",
        description="Run pytest on the ingested repo (or a sub-path). Read-only.",
        parameters={
            "type": "object",
            "properties": {
                "test_path": {"type": "string", "default": ""},
                "k_expression": {"type": "string", "default": "", "description": "pytest -k expression"},
                "max_lines": {"type": "integer", "default": 200},
            },
        },
        runner=run,
    )
|