Switch embeddings to Gemini

- backend/config.py +12 -7
- backend/dependencies.py +0 -3
- backend/routers/agent.py +6 -1
- backend/routers/ingestion.py +6 -5
- backend/services/agent.py +81 -49
- ingestion/embedder.py +125 -32
- ui/package.json +3 -0
backend/config.py
CHANGED
@@ -32,21 +32,26 @@ class Settings:
     github_token: str = os.getenv("GITHUB_TOKEN", "")

     # ── Embeddings ──────────────────────────────────────────────────────────
+    # Three embedding providers, selected at startup by EMBEDDING_MODEL:
     #
+    # 1. Gemini (default – EMBEDDING_MODEL contains "gemini", needs GEMINI_API_KEY)
+    #    gemini-embedding-001: 768-dim output via MRL, generous free tier.
+    #    Re-uses the same GEMINI_API_KEY used for the LLM – no extra signup.
+    #    Free at https://aistudio.google.com.
+    #
+    # 2. Voyage AI (EMBEDDING_MODEL contains "voyage", needs VOYAGE_API_KEY)
     #    voyage-code-3: code-optimised, 1024-dim, 200M tokens/month free.
     #    ⚠️ Requires EMBEDDING_DIM=1024 and a NEW Qdrant collection – dims
-    #    are incompatible with
+    #    are incompatible with 768-dim collections.
     #
-    #    nomic-embed-text-v1.5:
+    # 3. Nomic (legacy fallback – NOMIC_API_KEY set)
+    #    nomic-embed-text-v1.5: 768-dim. Free quota is 10M tokens TOTAL
+    #    (not per month) – easy to exhaust across a few large indexes.
     #
     # EMBEDDING_DIM must match the chosen model exactly.
     nomic_api_key: str = os.getenv("NOMIC_API_KEY", "")
     voyage_api_key: str = os.getenv("VOYAGE_API_KEY", "")
-    embedding_model: str = os.getenv("EMBEDDING_MODEL", "
+    embedding_model: str = os.getenv("EMBEDDING_MODEL", "gemini-embedding-001")
     embedding_dim: int = int(os.getenv("EMBEDDING_DIM", "768"))

     # ── Chunking ──────────────────────────────────────────────────────────────
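
The comment block above pins each model to a vector size. A quick standalone check of
those pairings, useful before (re)creating a Qdrant collection, could look like this
(a minimal sketch, not part of the diff; it only restates the pairings documented above):

    import os

    # Pairings stated in the config comment: EMBEDDING_DIM must match the model exactly.
    EXPECTED_DIM = {
        "gemini-embedding-001": 768,    # default (GEMINI_API_KEY)
        "voyage-code-3": 1024,          # VOYAGE_API_KEY, needs a new Qdrant collection
        "nomic-embed-text-v1.5": 768,   # legacy fallback (NOMIC_API_KEY)
    }

    model = os.getenv("EMBEDDING_MODEL", "gemini-embedding-001")
    dim = int(os.getenv("EMBEDDING_DIM", "768"))
    if model in EXPECTED_DIM and EXPECTED_DIM[model] != dim:
        raise SystemExit(f"EMBEDDING_DIM={dim} does not match {model} (expected {EXPECTED_DIM[model]}).")
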
backend/dependencies.py
CHANGED
@@ -113,9 +113,6 @@ def check_rate_limit(request: Request) -> None:
     window = _rate_windows[ip]
     while window and window[0] < now - 60:
         window.popleft()
-    if not window:
-        del _rate_windows[ip]
-        return

     if len(window) >= limit:
         raise HTTPException(
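
The removed early return used to delete an IP's entry as soon as its window emptied and
skip the limit check; now every request falls through to the same check. A condensed,
standalone sketch of the sliding-window pattern this dependency uses (the function name,
limit value, and the final append are assumptions for illustration, not shown in the hunk):

    import time
    from collections import defaultdict, deque

    _rate_windows: dict[str, deque] = defaultdict(deque)

    def allow(ip: str, limit: int = 60) -> bool:
        now = time.time()
        window = _rate_windows[ip]
        # Evict timestamps older than the 60-second window.
        while window and window[0] < now - 60:
            window.popleft()
        if len(window) >= limit:
            return False          # over the per-minute limit
        window.append(now)        # assumed: the real dependency records the request after the check
        return True
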
backend/routers/agent.py
CHANGED
@@ -52,7 +52,12 @@ async def agent_query(
 ):
     """Run the agentic RAG loop synchronously via MCP tools."""
     try:
-        result = await agent_svc.run(
+        result = await agent_svc.run(
+            request.question,
+            repo_filter=request.repo,
+            history=request.history,
+            model_id=request.model_id,
+        )
         return AgentResponse(
             answer=result["answer"],
             tool_calls=[AgentToolCall(**tc) for tc in result["tool_calls"]],
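
The four arguments now forwarded to agent_svc.run() mirror the request model fields
(question, repo, history, model_id) and line up with the new run() signature in
backend/services/agent.py below. A hedged example request against this endpoint
(the URL path and port are assumptions; only the field names come from the diff):

    import httpx

    resp = httpx.post(
        "http://localhost:8000/agent",        # path assumed, not shown in the diff
        json={
            "question": "Where is rate limiting enforced?",
            "repo": None,                      # optional repo filter
            "history": [],                     # prior chat turns
            "model_id": None,                  # None falls back to the server's provider chain
        },
        timeout=120,
    )
    print(resp.json()["answer"])
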
backend/routers/ingestion.py
CHANGED
@@ -92,15 +92,16 @@ async def ingest_stream(repo: str, request: Request, force: bool = False):

     async def _run():
         try:
-            await asyncio.to_thread(services.ingestion.ingest, repo, force, _progress)
+            result = await asyncio.to_thread(services.ingestion.ingest, repo, force, _progress)
+            repo_slug = result.get("repo", repo)
             if services.diagram:
-                services.diagram.invalidate(
+                services.diagram.invalidate(repo_slug)
             if services.repo_map:
-                services.repo_map.invalidate(
+                services.repo_map.invalidate(repo_slug)
             now = datetime.now(timezone.utc).isoformat()
-            repo_indexed_at[
+            repo_indexed_at[repo_slug] = now
             if force:
-                repo_contextual_at[
+                repo_contextual_at[repo_slug] = now
         except Exception as e:
             loop.call_soon_threadsafe(queue.put_nowait, {"step": "error", "detail": str(e)})
         finally:
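
Capturing the ingest result lets the router key cache invalidation and timestamps on the
canonical repo slug returned by the pipeline rather than the raw path parameter. The
surrounding endpoint also shows a useful pattern: the blocking ingest runs in a worker
thread while progress events are pushed back to the event loop thread-safely. A condensed,
runnable sketch of that pattern (step names and the fake ingest body are placeholders):

    import asyncio

    async def main():
        loop = asyncio.get_running_loop()
        queue: asyncio.Queue = asyncio.Queue()

        def _progress(step: str) -> None:
            # Called from the worker thread: hand events to the loop thread safely.
            loop.call_soon_threadsafe(queue.put_nowait, {"step": step})

        def ingest() -> dict:
            _progress("cloning")                  # placeholder steps
            _progress("embedding")
            return {"repo": "owner/repo"}         # canonical slug, as used by the router

        async def _run():
            result = await asyncio.to_thread(ingest)
            queue.put_nowait({"step": "done", "repo": result.get("repo")})

        task = asyncio.create_task(_run())
        while True:
            event = await queue.get()
            print(event)
            if event.get("step") == "done":
                break
        await task

    asyncio.run(main())
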
backend/services/agent.py
CHANGED
@@ -516,6 +516,10 @@ class AgentService:
         """
         self.mcp = mcp_client
         self._repo_map = repo_map_svc
+        # Provider fallback and per-request model selection mutate the active
+        # client/provider/model fields. Serialise runs so concurrent requests
+        # cannot leak one user's selected model into another user's session.
+        self._run_lock = asyncio.Lock()

         # ── Provider detection ─────────────────────────────────────────────────
         # Priority: Cerebras (Qwen3-235B) → Gemini → OpenRouter → Anthropic → Groq.
@@ -566,65 +570,82 @@ class AgentService:

     # ── Public API ───────────────────────────────────────────────────────────

-    async def run(
+    async def run(
+        self,
+        question: str,
+        repo_filter: str | None = None,
+        history: list[dict] | None = None,
+        model_id: str | None = None,
+    ) -> dict:
         """
         Run the full ReAct loop and return the final answer + trace.

         Returns:
             {"answer": str, "tool_calls": list[dict], "iterations": int}
         """
+        async with self._run_lock:
+            _orig = (self._client, self._provider, self._model)
+            entry = next((m for m in AGENT_MODELS if m["id"] == model_id), None)
+            if entry:
+                self._client = _make_client(entry)
+                self._provider = entry["provider"]
+                self._model = entry["model"]
+
+            try:
+                # Discover tools from MCP server
+                mcp_tools = await self.mcp.list_tools()
+                messages = self._build_initial_messages(question, repo_filter, history)
+                tool_trace = []
+
+                # Loop detection: track (tool, args) pairs already executed this run.
+                # Prevents wasting all MAX_ITERATIONS on duplicate searches when the
+                # model gets confused and repeats the same call over and over.
+                seen_calls: set[tuple] = set()
+
+                for iteration in range(self.MAX_ITERATIONS):
+                    # LLM call is synchronous – run in thread pool to avoid blocking
+                    # Pass raw mcp_tools so _call_llm can reformat if provider switches mid-run
+                    step = await asyncio.to_thread(self._call_llm, messages, mcp_tools)
+
+                    if step["done"]:
+                        return {
+                            "answer": step["answer"],
+                            "tool_calls": tool_trace,
+                            "iterations": iteration + 1,
+                        }
+
+                    messages.append(step["assistant_message"])
+
+                    for tc in step["tool_calls"]:
+                        # Deduplicate: skip calls already made with identical arguments.
+                        call_key = (tc["name"], tuple(sorted(tc["input"].items())))
+                        if call_key in seen_calls:
+                            result = f"[Skipped duplicate {tc['name']} call – already ran with these arguments]"
+                            tool_trace.append({"tool": tc["name"], "input": tc["input"], "output": result})
+                            messages.append(self._build_tool_result(tc["id"], tc["name"], result))
+                            continue
+                        seen_calls.add(call_key)
+
+                        # Tool execution via MCP protocol (async HTTP)
+                        try:
+                            result = await self.mcp.call_tool(tc["name"], tc["input"])
+                        except Exception as e:
+                            result = f"Tool error: {e}"
+
+                        tool_trace.append({
+                            "tool": tc["name"],
+                            "input": tc["input"],
+                            "output": result[:500] + "..." if len(result) > 500 else result,
+                        })
+                        messages.append(self._build_tool_result(tc["id"], tc["name"], result))
+
+                return {
+                    "answer": "I was unable to fully answer within the allowed reasoning steps.",
+                    "tool_calls": tool_trace,
+                    "iterations": self.MAX_ITERATIONS,
+                }
+            finally:
+                self._client, self._provider, self._model = _orig

     async def stream(
         self,
@@ -655,6 +676,17 @@ class AgentService:
         we re-run with stream=True so tokens arrive in real time.
         This is one extra LLM call but delivers genuine streaming UX.
         """
+        async with self._run_lock:
+            async for event in self._stream_locked(question, repo_filter, history, model_id):
+                yield event
+
+    async def _stream_locked(
+        self,
+        question: str,
+        repo_filter: str | None = None,
+        history: list[dict] | None = None,
+        model_id: str | None = None,
+    ) -> AsyncIterator[dict]:
         # ── Per-request model override ─────────────────────────────────────────
         # If the user selected a specific model in the UI, temporarily swap to it.
         # We save/restore self._client/provider/model in a finally block so the
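
Two details of the new run() are worth spelling out. The lock plus the _orig save/restore
means a per-request model override can never leak into a concurrent request, and the
loop-detection key treats a tool call as (name, sorted argument items), so the same call
with the same arguments in a different order still counts as a duplicate. A tiny
standalone illustration of that key (tool name and arguments are made up):

    seen_calls: set[tuple] = set()

    def call_key(name: str, args: dict) -> tuple:
        # Sorting the items makes the key order-insensitive; values must be
        # hashable for this to work (true for flat string/number arguments).
        return (name, tuple(sorted(args.items())))

    first = call_key("search_code", {"query": "rate limit", "repo": "owner/repo"})
    repeat = call_key("search_code", {"repo": "owner/repo", "query": "rate limit"})

    seen_calls.add(first)
    assert repeat in seen_calls    # duplicate detected, so the agent skips the re-run
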
ingestion/embedder.py
CHANGED
@@ -1,38 +1,45 @@
 """
-embedder.py – Embed code chunks via
+embedder.py – Embed code chunks via a hosted embedding API.

 WHY API-BASED EMBEDDINGS
 ─────────────────────────
+Local sentence-transformers models are ~600MB RAM – enough to kill
+free-tier hosting (HF Spaces, Render: 512MB–1GB RAM limit). Hosted
+APIs give us zero RAM cost and equivalent quality, at the price of
+~200ms of network latency per batch.

+THREE PROVIDERS, ONE INTERFACE
 ──────────────────────────────
-Provider
+Provider is selected from EMBEDDING_MODEL at init:

+EMBEDDING_MODEL contains "voyage" + VOYAGE_API_KEY set
   → Voyage AI: code-optimised, 1024-dim, 200M tokens/month free.
     voyage-code-3 is specifically trained on code and outperforms
     general-purpose embedders on code retrieval benchmarks.
-    ⚠️ Requires
+    ⚠️ Requires EMBEDDING_DIM=1024 and a new Qdrant collection.

+EMBEDDING_MODEL contains "gemini" + GEMINI_API_KEY set (default)
+  → Google Gemini: gemini-embedding-001, 768-dim output (configurable
+    via MRL), generous free tier. Re-uses the same GEMINI_API_KEY we
+    use for the LLM – no separate signup.
+
+NOMIC_API_KEY set (legacy fallback)
+  → Nomic API: nomic-embed-text-v1.5, 768-dim. Free quota is 10M
+    tokens total – easy to exhaust across a few large repo indexes.

 TASK TYPES
 ──────────
+Every provider distinguishes document and query roles. A document
+projection and a query projection live in the same embedding space
+but are optimised for their direction of the inner product:
+  - document: used when indexing chunks
+  - query: used when embedding the user's question

 BATCHING
 ────────
+All three APIs accept batched input. We use groups of 32 to stay
+well under request-body size limits on large contextually-enriched
+chunks (~8KB each) and to keep individual retries cheap.
 """

 import time
@@ -45,11 +52,12 @@ sys.path.insert(0, str(Path(__file__).parent.parent))
 from backend.config import settings


-_NOMIC_API_URL
+_NOMIC_API_URL = "https://api-atlas.nomic.ai/v1/embedding/text"
+_GEMINI_API_BASE = "https://generativelanguage.googleapis.com/v1beta/models"
+_BATCH_SIZE = 32   # conservative for all providers: stays under ~10MB body
+                   # and keeps each failed batch cheap to retry
+_MAX_CHARS = 8000  # truncate each text before sending – embeddings degrade
+                   # gracefully on truncation and models silently clip anyway


 class Embedder:
@@ -69,13 +77,25 @@ class Embedder:
         self.model_name = model_name or settings.embedding_model
         self.embedding_dim = settings.embedding_dim

+        # Provider selection is driven by the MODEL NAME, with the available
+        # API key gating the choice. This lets an operator flip providers by
+        # only changing EMBEDDING_MODEL in .env – no code change needed.
+        name = self.model_name.lower()
+        if "voyage" in name and settings.voyage_api_key:
             self._provider = "voyage"
             self._init_voyage()
+        elif "gemini" in name and settings.gemini_api_key:
+            self._provider = "gemini"
+            self._init_gemini()
+        elif settings.nomic_api_key:
             self._provider = "nomic"
             self._init_nomic()
+        else:
+            raise RuntimeError(
+                f"No embedding provider available for model '{self.model_name}'. "
+                "Set GEMINI_API_KEY (default – free at https://aistudio.google.com), "
+                "or VOYAGE_API_KEY + EMBEDDING_MODEL=voyage-code-3."
+            )

     def _init_voyage(self):
         """Initialise Voyage AI client. voyage-code-3 is code-optimised 1024-dim."""
@@ -93,18 +113,23 @@ class Embedder:

     def _init_nomic(self):
         """Initialise Nomic API client. nomic-embed-text-v1.5 is 768-dim."""
-        if not settings.nomic_api_key:
-            raise RuntimeError(
-                "No embedding provider configured. "
-                "Set NOMIC_API_KEY (free at https://atlas.nomic.ai) or "
-                "VOYAGE_API_KEY + EMBEDDING_MODEL=voyage-code-3."
-            )
         self._nomic_key = settings.nomic_api_key
         print(
             f"Embedder: using Nomic API ({self.model_name}, {self.embedding_dim}-dim). "
             "No local model loaded."
         )

+    def _init_gemini(self):
+        """Initialise Gemini embeddings. gemini-embedding-001 supports MRL,
+        so we request exactly `embedding_dim` dimensions from the API – that
+        way one deployment can reuse an existing Qdrant collection schema
+        (768-dim) or scale up to a larger one without code changes."""
+        self._gemini_key = settings.gemini_api_key
+        print(
+            f"Embedder: using Gemini API ({self.model_name}, {self.embedding_dim}-dim). "
+            "No local model loaded."
+        )
+
     # ── Public interface ───────────────────────────────────────────────────────

     def embed_chunks(self, chunks: list[dict]) -> list[list[float]]:
@@ -123,6 +148,8 @@ class Embedder:
         texts = [c["text"][:_MAX_CHARS] for c in chunks]
         if self._provider == "voyage":
             return self._voyage_embed(texts, input_type="document")
+        if self._provider == "gemini":
+            return self._gemini_embed(texts, task_type="RETRIEVAL_DOCUMENT")
         return self._nomic_embed(texts, task_type="search_document")

     def embed_query(self, query: str) -> list[float]:
@@ -134,6 +161,8 @@ class Embedder:
         """
         if self._provider == "voyage":
             return self._voyage_embed([query], input_type="query")[0]
+        if self._provider == "gemini":
+            return self._gemini_embed([query], task_type="RETRIEVAL_QUERY")[0]
         return self._nomic_embed([query], task_type="search_query")[0]

     # ── Voyage AI implementation ───────────────────────────────────────────────
@@ -231,3 +260,67 @@ class Embedder:
             return response.json()["embeddings"]

         raise RuntimeError("Nomic API call failed after retries")
+
+    # ── Gemini API implementation ──────────────────────────────────────────────
+
+    def _gemini_embed(self, texts: list[str], task_type: str) -> list[list[float]]:
+        """Call Gemini batchEmbedContents with batching. Returns list of
+        `embedding_dim`-dim vectors.
+
+        task_type is the Gemini task enum (RETRIEVAL_DOCUMENT / RETRIEVAL_QUERY).
+        These produce different projections within the same embedding space –
+        the document projection is optimised for being retrieved, the query
+        projection for doing the retrieving.
+        """
+        all_embeddings: list[list[float]] = []
+        for i in range(0, len(texts), _BATCH_SIZE):
+            batch = [t[:_MAX_CHARS] for t in texts[i : i + _BATCH_SIZE]]
+            embeddings = self._gemini_call_api(batch, task_type)
+            all_embeddings.extend(embeddings)
+        return all_embeddings
+
+    def _gemini_call_api(
+        self,
+        texts: list[str],
+        task_type: str,
+        retries: int = 3,
+    ) -> list[list[float]]:
+        """
+        Single Gemini batchEmbedContents call with retry on rate limit (429)
+        or service error (503). Gemini free tier is RPM-capped, so backoff is
+        more aggressive than Nomic (3 retries vs 2, longer default wait).
+
+        Response shape:
+            { "embeddings": [{ "values": [float, ...] }, ...] }
+        """
+        url = (
+            f"{_GEMINI_API_BASE}/{self.model_name}:batchEmbedContents"
+            f"?key={self._gemini_key}"
+        )
+        model_id = f"models/{self.model_name}"
+        payload = {
+            "requests": [
+                {
+                    "model": model_id,
+                    "content": {"parts": [{"text": t}]},
+                    "taskType": task_type,
+                    "outputDimensionality": self.embedding_dim,
+                }
+                for t in texts
+            ]
+        }
+
+        for attempt in range(retries + 1):
+            response = http.post(url, json=payload, timeout=60)
+
+            if response.status_code in (429, 503) and attempt < retries:
+                # Gemini doesn't always send Retry-After; back off exponentially.
+                wait = int(response.headers.get("Retry-After", 2 ** attempt * 5))
+                print(f"Gemini API {response.status_code}. Waiting {wait}s before retry...")
+                time.sleep(wait)
+                continue
+
+            response.raise_for_status()
+            return [e["values"] for e in response.json()["embeddings"]]
+
+        raise RuntimeError("Gemini API call failed after retries")
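
With the Gemini branch in place, indexing and querying still go through the same two
public methods. A hedged usage sketch (assumes GEMINI_API_KEY is set and the defaults
from backend/config.py, i.e. gemini-embedding-001 at 768 dimensions; the exact
constructor arguments are not shown in the diff):

    from ingestion.embedder import Embedder

    embedder = Embedder()   # picks the Gemini branch from EMBEDDING_MODEL + GEMINI_API_KEY

    doc_vectors = embedder.embed_chunks([{"text": "def check_rate_limit(request): ..."}])
    query_vector = embedder.embed_query("where is rate limiting enforced?")

    # Both sides must match the Qdrant collection dimension (768 by default).
    assert len(doc_vectors[0]) == len(query_vector) == 768
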
ui/package.json
CHANGED
@@ -3,6 +3,9 @@
   "private": true,
   "version": "0.0.0",
   "type": "module",
+  "engines": {
+    "node": ">=20"
+  },
   "scripts": {
     "dev": "vite",
     "build": "vite build",