umanggarg committed
Commit a63e301 · 1 Parent(s): 7de7656

Switch embeddings to Gemini

backend/config.py CHANGED
@@ -32,21 +32,26 @@ class Settings:
     github_token: str = os.getenv("GITHUB_TOKEN", "")
 
     # ── Embeddings ────────────────────────────────────────────────────────────
-    # Two embedding providers, selected at startup:
+    # Three embedding providers, selected at startup by EMBEDDING_MODEL:
     #
-    # 1. Voyage AI (VOYAGE_API_KEY set + EMBEDDING_MODEL=voyage-code-3)
+    # 1. Gemini (default — EMBEDDING_MODEL contains "gemini", needs GEMINI_API_KEY)
+    #    gemini-embedding-001: 768-dim output via MRL, generous free tier.
+    #    Re-uses the same GEMINI_API_KEY used for the LLM — no extra signup.
+    #    Free at https://aistudio.google.com.
+    #
+    # 2. Voyage AI (EMBEDDING_MODEL contains "voyage", needs VOYAGE_API_KEY)
     #    voyage-code-3: code-optimised, 1024-dim, 200M tokens/month free.
     #    ⚠️ Requires EMBEDDING_DIM=1024 and a NEW Qdrant collection — dims
-    #    are incompatible with nomic (768-dim) collections.
+    #    are incompatible with 768-dim collections.
     #
-    # 2. Nomic API (default, NOMIC_API_KEY required)
-    #    nomic-embed-text-v1.5: general text, 768-dim, generous free tier.
-    #    Free at https://atlas.nomic.ai (no credit card needed).
+    # 3. Nomic (legacy fallback — NOMIC_API_KEY set)
+    #    nomic-embed-text-v1.5: 768-dim. Free quota is 10M tokens TOTAL
+    #    (not per month) — easy to exhaust across a few large indexes.
     #
     # EMBEDDING_DIM must match the chosen model exactly.
     nomic_api_key: str = os.getenv("NOMIC_API_KEY", "")
     voyage_api_key: str = os.getenv("VOYAGE_API_KEY", "")
-    embedding_model: str = os.getenv("EMBEDDING_MODEL", "nomic-embed-text-v1.5")
+    embedding_model: str = os.getenv("EMBEDDING_MODEL", "gemini-embedding-001")
     embedding_dim: int = int(os.getenv("EMBEDDING_DIM", "768"))
 
     # ── Chunking ──────────────────────────────────────────────────────────────
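The comment's one hard constraint, that EMBEDDING_DIM must match the model, is easy to violate when flipping providers. A minimal startup guard, as a sketch (the `_EXPECTED_DIMS` table and `validate_embedding_config` helper are hypothetical, not part of this commit; gemini-embedding-001 in fact supports several MRL output sizes, and 768 is simply this deployment's choice):

```python
# Hypothetical startup guard, not in this commit: fail fast when
# EMBEDDING_DIM disagrees with the chosen model's vector size.
_EXPECTED_DIMS = {
    "gemini-embedding-001": 768,   # MRL output, requested at this size here
    "voyage-code-3": 1024,         # needs a fresh 1024-dim Qdrant collection
    "nomic-embed-text-v1.5": 768,
}

def validate_embedding_config(model: str, dim: int) -> None:
    expected = _EXPECTED_DIMS.get(model)
    if expected is not None and dim != expected:
        raise RuntimeError(
            f"EMBEDDING_DIM={dim} does not match {model} ({expected}-dim); "
            "an existing Qdrant collection would reject these vectors."
        )
```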
backend/dependencies.py CHANGED
@@ -113,9 +113,6 @@ def check_rate_limit(request: Request) -> None:
     window = _rate_windows[ip]
     while window and window[0] < now - 60:
         window.popleft()
-    if not window:
-        del _rate_windows[ip]
-        return
 
     if len(window) >= limit:
         raise HTTPException(
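The removed early-exit deleted an idle IP's window and returned, which looks like memory cleanup but appears to skip whatever bookkeeping follows the limit check, so a request arriving after a quiet minute was never recorded. For context, a self-contained sketch of the sliding-window pattern this function implements (everything outside the hunk, including the `allow` wrapper and its append, is assumed rather than shown in the diff):

```python
import time
from collections import defaultdict, deque

_rate_windows: dict[str, deque] = defaultdict(deque)

def allow(ip: str, limit: int) -> bool:
    """Sliding 60-second window: prune expired timestamps, then count."""
    now = time.time()
    window = _rate_windows[ip]
    while window and window[0] < now - 60:
        window.popleft()          # drop requests older than one minute
    if len(window) >= limit:
        return False              # over the per-minute budget
    window.append(now)            # record this request before allowing it
    return True
```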
backend/routers/agent.py CHANGED
@@ -52,7 +52,12 @@ async def agent_query(
 ):
     """Run the agentic RAG loop synchronously via MCP tools."""
     try:
-        result = await agent_svc.run(request.question, repo_filter=request.repo)
+        result = await agent_svc.run(
+            request.question,
+            repo_filter=request.repo,
+            history=request.history,
+            model_id=request.model_id,
+        )
         return AgentResponse(
             answer=result["answer"],
             tool_calls=[AgentToolCall(**tc) for tc in result["tool_calls"]],
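The two new keyword arguments imply the request schema grew matching optional fields. A sketch of what `AgentRequest` presumably looks like (field names inferred from this call site; types and defaults are assumptions):

```python
from pydantic import BaseModel

class AgentRequest(BaseModel):
    question: str
    repo: str | None = None            # passed through as repo_filter
    history: list[dict] | None = None  # prior chat turns for multi-turn context
    model_id: str | None = None        # per-request model override from the UI
```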
backend/routers/ingestion.py CHANGED
@@ -92,15 +92,16 @@ async def ingest_stream(repo: str, request: Request, force: bool = False):
 
     async def _run():
         try:
-            await asyncio.to_thread(services.ingestion.ingest, repo, force, _progress)
+            result = await asyncio.to_thread(services.ingestion.ingest, repo, force, _progress)
+            repo_slug = result.get("repo", repo)
             if services.diagram:
-                services.diagram.invalidate(repo)
+                services.diagram.invalidate(repo_slug)
             if services.repo_map:
-                services.repo_map.invalidate(repo)
+                services.repo_map.invalidate(repo_slug)
             now = datetime.now(timezone.utc).isoformat()
-            repo_indexed_at[repo] = now
+            repo_indexed_at[repo_slug] = now
             if force:
-                repo_contextual_at[repo] = now
+                repo_contextual_at[repo_slug] = now
         except Exception as e:
             loop.call_soon_threadsafe(queue.put_nowait, {"step": "error", "detail": str(e)})
         finally:
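The fix keys cache invalidation and timestamps by the slug `ingest` actually used (`result.get("repo", repo)`) instead of the raw path parameter. An illustrative normaliser showing why the two can differ (hypothetical; the real canonicalisation lives inside `services.ingestion.ingest`, which now returns the slug in its result dict):

```python
def canonical_slug(repo: str) -> str:
    """Illustrative only: 'https://github.com/Owner/Repo.git' -> 'owner/repo'."""
    slug = repo.strip()
    slug = slug.removeprefix("https://github.com/").removesuffix(".git")
    return slug.lower()

# With the old code, a request for "Owner/Repo" could invalidate caches under
# one key while the ingested data was stored under the canonical form; routing
# every key through the returned slug keeps the two in sync.
```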
backend/services/agent.py CHANGED
@@ -516,6 +516,10 @@ class AgentService:
         """
         self.mcp = mcp_client
         self._repo_map = repo_map_svc
+        # Provider fallback and per-request model selection mutate the active
+        # client/provider/model fields. Serialise runs so concurrent requests
+        # cannot leak one user's selected model into another user's session.
+        self._run_lock = asyncio.Lock()
 
         # ── Provider detection ─────────────────────────────────────────────────
         # Priority: Cerebras (Qwen3-235B) → Gemini → OpenRouter → Anthropic → Groq.
@@ -566,65 +570,82 @@
 
     # ── Public API ─────────────────────────────────────────────────────────────
 
-    async def run(self, question: str, repo_filter: str | None = None, history: list[dict] | None = None) -> dict:
+    async def run(
+        self,
+        question: str,
+        repo_filter: str | None = None,
+        history: list[dict] | None = None,
+        model_id: str | None = None,
+    ) -> dict:
         """
         Run the full ReAct loop and return the final answer + trace.
 
         Returns:
             {"answer": str, "tool_calls": list[dict], "iterations": int}
         """
-        # Discover tools from MCP server
-        mcp_tools = await self.mcp.list_tools()
-        messages = self._build_initial_messages(question, repo_filter, history)
-        tool_trace = []
-
-        # Loop detection: track (tool, args) pairs already executed this run.
-        # Prevents wasting all MAX_ITERATIONS on duplicate searches when the
-        # model gets confused and repeats the same call over and over.
-        seen_calls: set[tuple] = set()
-
-        for iteration in range(self.MAX_ITERATIONS):
-            # LLM call is synchronous — run in thread pool to avoid blocking
-            # Pass raw mcp_tools so _call_llm can reformat if provider switches mid-run
-            step = await asyncio.to_thread(self._call_llm, messages, mcp_tools)
-
-            if step["done"]:
-                return {
-                    "answer": step["answer"],
-                    "tool_calls": tool_trace,
-                    "iterations": iteration + 1,
-                }
-
-            messages.append(step["assistant_message"])
-
-            for tc in step["tool_calls"]:
-                # Deduplicate: skip calls already made with identical arguments.
-                call_key = (tc["name"], tuple(sorted(tc["input"].items())))
-                if call_key in seen_calls:
-                    result = f"[Skipped duplicate {tc['name']} call — already ran with these arguments]"
-                    tool_trace.append({"tool": tc["name"], "input": tc["input"], "output": result})
-                    messages.append(self._build_tool_result(tc["id"], tc["name"], result))
-                    continue
-                seen_calls.add(call_key)
-
-                # Tool execution via MCP protocol (async HTTP)
-                try:
-                    result = await self.mcp.call_tool(tc["name"], tc["input"])
-                except Exception as e:
-                    result = f"Tool error: {e}"
-
-                tool_trace.append({
-                    "tool": tc["name"],
-                    "input": tc["input"],
-                    "output": result[:500] + "..." if len(result) > 500 else result,
-                })
-                messages.append(self._build_tool_result(tc["id"], tc["name"], result))
-
-        return {
-            "answer": "I was unable to fully answer within the allowed reasoning steps.",
-            "tool_calls": tool_trace,
-            "iterations": self.MAX_ITERATIONS,
-        }
+        async with self._run_lock:
+            _orig = (self._client, self._provider, self._model)
+            entry = next((m for m in AGENT_MODELS if m["id"] == model_id), None)
+            if entry:
+                self._client = _make_client(entry)
+                self._provider = entry["provider"]
+                self._model = entry["model"]
+
+            try:
+                # Discover tools from MCP server
+                mcp_tools = await self.mcp.list_tools()
+                messages = self._build_initial_messages(question, repo_filter, history)
+                tool_trace = []
+
+                # Loop detection: track (tool, args) pairs already executed this run.
+                # Prevents wasting all MAX_ITERATIONS on duplicate searches when the
+                # model gets confused and repeats the same call over and over.
+                seen_calls: set[tuple] = set()
+
+                for iteration in range(self.MAX_ITERATIONS):
+                    # LLM call is synchronous — run in thread pool to avoid blocking
+                    # Pass raw mcp_tools so _call_llm can reformat if provider switches mid-run
+                    step = await asyncio.to_thread(self._call_llm, messages, mcp_tools)
+
+                    if step["done"]:
+                        return {
+                            "answer": step["answer"],
+                            "tool_calls": tool_trace,
+                            "iterations": iteration + 1,
+                        }
+
+                    messages.append(step["assistant_message"])
+
+                    for tc in step["tool_calls"]:
+                        # Deduplicate: skip calls already made with identical arguments.
+                        call_key = (tc["name"], tuple(sorted(tc["input"].items())))
+                        if call_key in seen_calls:
+                            result = f"[Skipped duplicate {tc['name']} call — already ran with these arguments]"
+                            tool_trace.append({"tool": tc["name"], "input": tc["input"], "output": result})
+                            messages.append(self._build_tool_result(tc["id"], tc["name"], result))
+                            continue
+                        seen_calls.add(call_key)
+
+                        # Tool execution via MCP protocol (async HTTP)
+                        try:
+                            result = await self.mcp.call_tool(tc["name"], tc["input"])
+                        except Exception as e:
+                            result = f"Tool error: {e}"
+
+                        tool_trace.append({
+                            "tool": tc["name"],
+                            "input": tc["input"],
+                            "output": result[:500] + "..." if len(result) > 500 else result,
+                        })
+                        messages.append(self._build_tool_result(tc["id"], tc["name"], result))
+
+                return {
+                    "answer": "I was unable to fully answer within the allowed reasoning steps.",
+                    "tool_calls": tool_trace,
+                    "iterations": self.MAX_ITERATIONS,
+                }
+            finally:
+                self._client, self._provider, self._model = _orig
 
     async def stream(
         self,
@@ -655,6 +676,17 @@
         we re-run with stream=True so tokens arrive in real time.
         This is one extra LLM call but delivers genuine streaming UX.
         """
+        async with self._run_lock:
+            async for event in self._stream_locked(question, repo_filter, history, model_id):
+                yield event
+
+    async def _stream_locked(
+        self,
+        question: str,
+        repo_filter: str | None = None,
+        history: list[dict] | None = None,
+        model_id: str | None = None,
+    ) -> AsyncIterator[dict]:
         # ── Per-request model override ────────────────────────────────────────
         # If the user selected a specific model in the UI, temporarily swap to it.
         # We save/restore self._client/provider/model in a finally block so the
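The heart of this change is the swap/restore-under-lock pattern around `self._client`/`_provider`/`_model`. Isolated from the project, a minimal sketch of that pattern (the class and names below are illustrative, not the repo's code):

```python
import asyncio

class ModelSwapSketch:
    """Mutate shared client/model fields only while holding a lock, and
    restore them in finally so an exception mid-run cannot leave the
    next request on the wrong model."""

    def __init__(self) -> None:
        self._model = "default-model"
        self._run_lock = asyncio.Lock()

    async def run(self, model_id: str | None = None) -> str:
        async with self._run_lock:
            orig = self._model
            if model_id:
                self._model = model_id   # per-request override
            try:
                return f"answered with {self._model}"
            finally:
                self._model = orig       # always restored, even on error
```

The trade-off is implicit in the new comment: serialising on `_run_lock` means one agent run at a time per process, buying isolation at the cost of concurrency; per-request client objects would avoid the lock entirely but require a larger refactor.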
ingestion/embedder.py CHANGED
@@ -1,38 +1,45 @@
 """
-embedder.py — Embed code chunks via Voyage AI or Nomic API.
+embedder.py — Embed code chunks via a hosted embedding API.
 
 WHY API-BASED EMBEDDINGS
 ─────────────────────────
-The local sentence-transformers model (nomic-embed-code) is ~600MB RAM.
-That kills free-tier hosting (HF Spaces, Render: 512MB–1GB RAM limit).
-Both APIs use the same underlying model — vectors are equivalent quality.
-Zero RAM cost on our server, just network latency (~200ms/batch).
+Local sentence-transformers models are ~600MB RAM — enough to kill
+free-tier hosting (HF Spaces, Render: 512MB–1GB RAM limit). Hosted
+APIs give us zero RAM cost and equivalent quality, at the price of
+~200ms of network latency per batch.
 
-TWO PROVIDERS, ONE INTERFACE
+THREE PROVIDERS, ONE INTERFACE
 ──────────────────────────────
-Provider selection happens at init time, based on env vars:
+Provider is selected from EMBEDDING_MODEL at init:
 
-VOYAGE_API_KEY set + EMBEDDING_MODEL=voyage-code-3
+EMBEDDING_MODEL contains "voyage" + VOYAGE_API_KEY set
     → Voyage AI: code-optimised, 1024-dim, 200M tokens/month free.
       voyage-code-3 is specifically trained on code and outperforms
       general-purpose embedders on code retrieval benchmarks.
-      ⚠️ Requires new Qdrant collection (dim mismatch with 768-dim).
+      ⚠️ Requires EMBEDDING_DIM=1024 and a new Qdrant collection.
+
+EMBEDDING_MODEL contains "gemini" + GEMINI_API_KEY set (default)
+    → Google Gemini: gemini-embedding-001, 768-dim output (configurable
+      via MRL), generous free tier. Re-uses the same GEMINI_API_KEY we
+      use for the LLM — no separate signup.
 
-NOMIC_API_KEY set (default)
-    → Nomic API: nomic-embed-text-v1.5, 768-dim, generous free tier.
+NOMIC_API_KEY set (legacy fallback)
+    → Nomic API: nomic-embed-text-v1.5, 768-dim. Free quota is 10M
+      tokens total — easy to exhaust across a few large repo indexes.
 
 TASK TYPES
 ───────────
-Both APIs distinguish between document and query roles:
-  - "search_document" / "document": used when indexing chunks
-  - "search_query" / "query": used when embedding user queries
-
-This produces a better inner-product space than treating both the same.
+Every provider distinguishes document and query roles. A document
+projection and a query projection live in the same embedding space
+but are optimised for their direction of the inner product:
+  - document: used when indexing chunks
+  - query: used when embedding the user's question
 
 BATCHING
 ─────────
-Both APIs accept up to 256-512 texts per call. We batch in groups of 96
-(conservative) to avoid timeout on large text chunks over free-tier networks.
+All three APIs accept batched input. We use groups of 32 to stay
+well under request-body size limits on large contextually-enriched
+chunks (~8KB each) and to keep individual retries cheap.
 """
 
 import time
@@ -45,11 +52,12 @@ sys.path.insert(0, str(Path(__file__).parent.parent))
 from backend.config import settings
 
 
-_NOMIC_API_URL = "https://api-atlas.nomic.ai/v1/embedding/text"
-_BATCH_SIZE = 32   # Nomic has a ~10MB request body limit; 32 chunks keeps us safe
-                   # even for large contextually-enriched chunks (~8KB each)
-_MAX_CHARS = 8000  # truncate each text before sending — embeddings degrade
-                   # gracefully on truncation, and models have a token limit anyway
+_NOMIC_API_URL = "https://api-atlas.nomic.ai/v1/embedding/text"
+_GEMINI_API_BASE = "https://generativelanguage.googleapis.com/v1beta/models"
+_BATCH_SIZE = 32   # conservative for all providers: stays under ~10MB body
+                   # and keeps each failed batch cheap to retry
+_MAX_CHARS = 8000  # truncate each text before sending — embeddings degrade
+                   # gracefully on truncation and models silently clip anyway
 
 
 class Embedder:
@@ -69,13 +77,25 @@ class Embedder:
         self.model_name = model_name or settings.embedding_model
         self.embedding_dim = settings.embedding_dim
 
-        # Select provider based on available keys + model name
-        if settings.voyage_api_key and "voyage" in self.model_name.lower():
+        # Provider selection is driven by the MODEL NAME, with the available
+        # API key gating the choice. This lets an operator flip providers by
+        # only changing EMBEDDING_MODEL in .env — no code change needed.
+        name = self.model_name.lower()
+        if "voyage" in name and settings.voyage_api_key:
             self._provider = "voyage"
             self._init_voyage()
-        else:
+        elif "gemini" in name and settings.gemini_api_key:
+            self._provider = "gemini"
+            self._init_gemini()
+        elif settings.nomic_api_key:
             self._provider = "nomic"
             self._init_nomic()
+        else:
+            raise RuntimeError(
+                f"No embedding provider available for model '{self.model_name}'. "
+                "Set GEMINI_API_KEY (default — free at https://aistudio.google.com), "
+                "or VOYAGE_API_KEY + EMBEDDING_MODEL=voyage-code-3."
+            )
 
     def _init_voyage(self):
         """Initialise Voyage AI client. voyage-code-3 is code-optimised 1024-dim."""
@@ -93,18 +113,23 @@ class Embedder:
 
     def _init_nomic(self):
         """Initialise Nomic API client. nomic-embed-text-v1.5 is 768-dim."""
-        if not settings.nomic_api_key:
-            raise RuntimeError(
-                "No embedding provider configured. "
-                "Set NOMIC_API_KEY (free at https://atlas.nomic.ai) or "
-                "VOYAGE_API_KEY + EMBEDDING_MODEL=voyage-code-3."
-            )
         self._nomic_key = settings.nomic_api_key
         print(
             f"Embedder: using Nomic API ({self.model_name}, {self.embedding_dim}-dim). "
             "No local model loaded."
         )
 
+    def _init_gemini(self):
+        """Initialise Gemini embeddings. gemini-embedding-001 supports MRL,
+        so we request exactly `embedding_dim` dimensions from the API — that
+        way one deployment can reuse an existing Qdrant collection schema
+        (768-dim) or scale up to a larger one without code changes."""
+        self._gemini_key = settings.gemini_api_key
+        print(
+            f"Embedder: using Gemini API ({self.model_name}, {self.embedding_dim}-dim). "
+            "No local model loaded."
+        )
+
     # ── Public interface ───────────────────────────────────────────────────────
 
     def embed_chunks(self, chunks: list[dict]) -> list[list[float]]:
@@ -123,6 +148,8 @@
         texts = [c["text"][:_MAX_CHARS] for c in chunks]
         if self._provider == "voyage":
            return self._voyage_embed(texts, input_type="document")
+        if self._provider == "gemini":
+            return self._gemini_embed(texts, task_type="RETRIEVAL_DOCUMENT")
         return self._nomic_embed(texts, task_type="search_document")
 
     def embed_query(self, query: str) -> list[float]:
@@ -134,6 +161,8 @@
         """
         if self._provider == "voyage":
             return self._voyage_embed([query], input_type="query")[0]
+        if self._provider == "gemini":
+            return self._gemini_embed([query], task_type="RETRIEVAL_QUERY")[0]
         return self._nomic_embed([query], task_type="search_query")[0]
 
     # ── Voyage AI implementation ───────────────────────────────────────────────
@@ -231,3 +260,67 @@
             return response.json()["embeddings"]
 
         raise RuntimeError("Nomic API call failed after retries")
+
+    # ── Gemini API implementation ──────────────────────────────────────────────
+
+    def _gemini_embed(self, texts: list[str], task_type: str) -> list[list[float]]:
+        """Call Gemini batchEmbedContents with batching. Returns list of
+        `embedding_dim`-dim vectors.
+
+        task_type is the Gemini task enum (RETRIEVAL_DOCUMENT / RETRIEVAL_QUERY).
+        These produce different projections within the same embedding space —
+        the document projection is optimised for being retrieved, the query
+        projection for doing the retrieving.
+        """
+        all_embeddings: list[list[float]] = []
+        for i in range(0, len(texts), _BATCH_SIZE):
+            batch = [t[:_MAX_CHARS] for t in texts[i : i + _BATCH_SIZE]]
+            embeddings = self._gemini_call_api(batch, task_type)
+            all_embeddings.extend(embeddings)
+        return all_embeddings
+
+    def _gemini_call_api(
+        self,
+        texts: list[str],
+        task_type: str,
+        retries: int = 3,
+    ) -> list[list[float]]:
+        """
+        Single Gemini batchEmbedContents call with retry on rate limit (429)
+        or service error (503). Gemini free tier is RPM-capped, so backoff is
+        more aggressive than Nomic's (3 retries vs 2, longer default wait).
+
+        Response shape:
+            {"embeddings": [{"values": [float, ...]}, ...]}
+        """
+        url = (
+            f"{_GEMINI_API_BASE}/{self.model_name}:batchEmbedContents"
+            f"?key={self._gemini_key}"
+        )
+        model_id = f"models/{self.model_name}"
+        payload = {
+            "requests": [
+                {
+                    "model": model_id,
+                    "content": {"parts": [{"text": t}]},
+                    "taskType": task_type,
+                    "outputDimensionality": self.embedding_dim,
+                }
+                for t in texts
+            ]
+        }
+
+        for attempt in range(retries + 1):
+            response = http.post(url, json=payload, timeout=60)
+
+            if response.status_code in (429, 503) and attempt < retries:
+                # Gemini doesn't always send Retry-After; back off exponentially.
+                wait = int(response.headers.get("Retry-After", 2 ** attempt * 5))
+                print(f"Gemini API {response.status_code}. Waiting {wait}s before retry...")
+                time.sleep(wait)
+                continue
+
+            response.raise_for_status()
+            return [e["values"] for e in response.json()["embeddings"]]
+
+        raise RuntimeError("Gemini API call failed after retries")
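Taken together, the selection logic and the new `_gemini_*` methods mean a deployment with only GEMINI_API_KEY configured now embeds out of the box. A usage sketch under that assumption (this performs real network calls; the 768 in the assert is the default EMBEDDING_DIM from config.py, not a property of the model):

```python
# Assumes GEMINI_API_KEY is set and EMBEDDING_MODEL/EMBEDDING_DIM are
# left at the defaults from backend/config.py.
from ingestion.embedder import Embedder

embedder = Embedder()  # resolves to the Gemini provider from the model name

doc_vecs = embedder.embed_chunks([{"text": "def add(a, b):\n    return a + b"}])
query_vec = embedder.embed_query("function that adds two numbers")

# Same embedding space, different task-type projections; with the defaults,
# every vector comes back 768-dim.
assert len(query_vec) == len(doc_vecs[0]) == 768
```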
ui/package.json CHANGED
@@ -3,6 +3,9 @@
   "private": true,
   "version": "0.0.0",
   "type": "module",
+  "engines": {
+    "node": ">=20"
+  },
   "scripts": {
     "dev": "vite",
     "build": "vite build",