Parallelize Voyage embeds, checkpoint ingestion, add client timeout
- _voyage_embed now dispatches batches through an 8-worker ThreadPoolExecutor,
preserving input order by filling a pre-sized results array by batch index.
Expected ~6x speedup on large repos (langchain 13594 chunks: 15 min → ~3 min);
see the standalone sketch of the fan-out pattern below the file list.
- Voyage client gets timeout=60 + max_retries=0 per the Provider Client Rule;
the SDK uses plain `requests` and silently hangs otherwise (same failure
class as the Gemma 4 incident).
- Bumped Voyage batch size 32 → 96 (kept _BATCH_SIZE=32 for Nomic/Gemini).
- Ingestion now embeds+upserts in groups of 500 so a mid-run crash only loses
the current group's work; retry resumes via existing find_vectors_by_hash.
- Plumbed a progress(done,total) callback from the service through embed_chunks
so the UI and HF Space logs show live per-batch progress instead of a silent
15-minute block.
- backend/services/ingestion_service.py +51 -20
- ingestion/embedder.py +76 -15
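
A minimal, standalone sketch of the order-preserving fan-out described in the first
bullet: batches go to a small thread pool and results are slotted back by batch index
before flattening. Names here (`fan_out_in_order`, `worker`) are illustrative only, not
code from this commit:

```python
from concurrent.futures import ThreadPoolExecutor, as_completed


def fan_out_in_order(items, batch_size, worker, max_workers=8):
    """Run `worker` over fixed-size batches in parallel, preserving input order.

    Batches may finish in any order, so each result is slotted into a
    pre-sized list by batch index and only flattened at the end.
    """
    batches = [items[i : i + batch_size] for i in range(0, len(items), batch_size)]
    if not batches:
        return []
    results = [None] * len(batches)
    with ThreadPoolExecutor(max_workers=min(max_workers, len(batches))) as pool:
        future_to_idx = {pool.submit(worker, b): i for i, b in enumerate(batches)}
        for fut in as_completed(future_to_idx):
            results[future_to_idx[fut]] = fut.result()
    return [out for batch in results for out in batch]


# Stand-in "embedding" worker: one output per input; order must survive the fan-out.
vectors = fan_out_in_order(list(range(10)), batch_size=3, worker=lambda xs: [x * 2 for x in xs])
assert vectors == [x * 2 for x in range(10)]
```

The pre-sized `results` list is what guarantees the flattened output lines up with the
input even when batches complete out of order.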
backend/services/ingestion_service.py

@@ -186,27 +186,58 @@ class IngestionService:
         _emit("embedding", f"Embedding {len(chunks)} chunks...")

         print("Embedding chunks...")
-        new_vectors = self.embedder.embed_chunks(new_chunks) if new_chunks else []
-        if new_vectors:
-            print(f" Produced {len(new_vectors)} vectors ({len(new_vectors[0])}-dim each)")
-
-        # Reconstruct the full vectors list in original chunk order.
-        # Chunks with existing vectors use the stored vector; new ones use the
-        # freshly computed one. This preserves the 1-to-1 chunks↔vectors pairing
-        # that upsert_chunks requires.
-        new_hash_to_vec = {c["text_hash"]: v for c, v in zip(new_chunks, new_vectors)}
-        vectors = [
-            existing_vectors.get(c["text_hash"]) or new_hash_to_vec[c["text_hash"]]
-            for c in chunks
-        ]

-        # ── Step 7:
-        #
-        #
-        #
-
-
-
+        # ── Step 7: Embed + upsert in checkpointed groups ──────────────────────
+        # Stream embed→upsert in groups so a crash mid-run leaves earlier
+        # chunks safely in Qdrant. Retry then skips them via the existing
+        # find_vectors_by_hash dedup path above. Without checkpoints, a 15-min
+        # ingest that dies at chunk 13000/13594 loses 100% of the work.
+        #
+        # Group size 500: big enough that Qdrant upsert overhead amortises,
+        # small enough that a crash loses at most ~500 re-embeddings on retry.
+        CHECKPOINT_SIZE = 500
+        total_new = len(new_chunks)
+        new_done = 0
+        written_ids: list = []
+
+        # Progress callback for the embedder — maps batch-level progress
+        # within a checkpoint group to an overall "chunks embedded / total"
+        # count. `new_done` snapshots the running total across groups.
+        def _embed_progress(batch_done: int, batch_total: int) -> None:
+            overall_done = new_done + batch_done
+            _emit("embedding", f"Embedded {overall_done}/{total_new} chunks...")
+
+        for group_start in range(0, len(chunks), CHECKPOINT_SIZE):
+            group = chunks[group_start : group_start + CHECKPOINT_SIZE]
+
+            # Within the group, split reused vs new. Only the new ones hit
+            # the embedding API; reused chunks pull from `existing_vectors`.
+            group_new_chunks = [c for c in group if c["text_hash"] not in existing_vectors]
+
+            if group_new_chunks:
+                group_new_vectors = self.embedder.embed_chunks(
+                    group_new_chunks, progress=_embed_progress,
+                )
+                new_done += len(group_new_chunks)
+            else:
+                group_new_vectors = []
+
+            # Stitch back into group order so each chunk lines up with its vector.
+            group_hash_to_vec = {
+                c["text_hash"]: v for c, v in zip(group_new_chunks, group_new_vectors)
+            }
+            group_vectors = [
+                existing_vectors.get(c["text_hash"]) or group_hash_to_vec[c["text_hash"]]
+                for c in group
+            ]
+
+            # Upsert this group before touching the next — that's the actual
+            # checkpoint. If the next group's embedding call dies, everything
+            # up to here is already in Qdrant.
+            _emit("storing", f"Storing checkpoint {group_start + len(group)}/{len(chunks)}...")
+            group_ids = self.store.upsert_chunks(group, group_vectors)
+            written_ids.extend(group_ids)
+            print(f" Checkpoint {group_start + len(group)}/{len(chunks)} stored")

         # On a force re-index, delete chunks that no longer exist in the source.
         # This handles deleted files and renamed functions — their old IDs won't

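A toy, self-contained illustration of why checkpointed upserts plus hash dedup make
retries cheap. The in-memory `store`, `embed`, and `ingest` below are stand-ins, not
the real Qdrant store or `find_vectors_by_hash`:

```python
# In-memory stand-ins: `store` plays the role of Qdrant, keyed by text hash.
store: dict[str, list[float]] = {}


def embed(texts: list[str]) -> list[list[float]]:
    """Stand-in for the embedding API call."""
    return [[float(len(t))] for t in texts]


def ingest(chunks: list[str], checkpoint: int = 2, crash_after_groups: int | None = None) -> None:
    """Embed + upsert in checkpointed groups; optionally 'crash' to show what survives."""
    new = [c for c in chunks if c not in store]  # dedup: already-stored hashes are skipped
    for n, start in enumerate(range(0, len(new), checkpoint), start=1):
        group = new[start : start + checkpoint]
        for text, vec in zip(group, embed(group)):
            store[text] = vec  # the "upsert" is the checkpoint
        if crash_after_groups is not None and n >= crash_after_groups:
            raise RuntimeError("simulated crash mid-ingest")


chunks = ["a", "bb", "ccc", "dddd", "eeeee"]
try:
    ingest(chunks, crash_after_groups=1)  # dies after the first group of 2
except RuntimeError:
    pass
assert len(store) == 2   # the first checkpoint survived the crash
ingest(chunks)           # retry embeds only the remaining 3 chunks
assert len(store) == 5
```
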
ingestion/embedder.py

@@ -46,6 +46,7 @@ import time
 from pathlib import Path
 import sys
 import re
+from concurrent.futures import ThreadPoolExecutor, as_completed

 import requests as http

@@ -55,8 +56,12 @@ from backend.config import settings

 _NOMIC_API_URL = "https://api-atlas.nomic.ai/v1/embedding/text"
 _GEMINI_API_BASE = "https://generativelanguage.googleapis.com/v1beta/models"
-_BATCH_SIZE = 32  # conservative
+_BATCH_SIZE = 32  # conservative default (Nomic): stays under ~10MB body
                   #   and keeps each failed batch cheap to retry
+_VOYAGE_BATCH_SIZE = 96  # Voyage accepts up to 128 per call; 96 at _MAX_CHARS=8000
+                         #   stays well under their 120K-token-per-request cap.
+_VOYAGE_CONCURRENCY = 8  # parallel workers for Voyage. Their paid tier is 2000 RPM
+                         #   (~33 req/s); 8 workers * ~2s/batch ≈ 4 req/s — comfortable.
 _MAX_CHARS = 8000  # truncate each text before sending — embeddings degrade
                    #   gracefully on truncation and models silently clip anyway

@@ -118,10 +123,21 @@ class Embedder:
         )

     def _init_voyage(self):
-        """Initialise Voyage AI client. voyage-code-3 is code-optimised 1024-dim."""
+        """Initialise Voyage AI client. voyage-code-3 is code-optimised 1024-dim.
+
+        timeout=60 is mandatory per the Provider Client Rule in CLAUDE.md — a
+        stuck request with no timeout will block an entire ingestion silently,
+        since the voyageai SDK uses plain `requests` and doesn't log each call.
+        max_retries=0 leaves retry logic to us (see _voyage_call_api), so we
+        don't double-retry on transient failures.
+        """
         try:
             import voyageai
-            self._voyage = voyageai.Client(
+            self._voyage = voyageai.Client(
+                api_key=settings.voyage_api_key,
+                timeout=60,
+                max_retries=0,
+            )
         except ImportError:
             raise ImportError(
                 "voyageai package not installed. Run: pip install voyageai"

@@ -153,7 +169,11 @@ class Embedder:

     # ── Public interface ────────────────────────────────────────────────────────

-    def embed_chunks(
+    def embed_chunks(
+        self,
+        chunks: list[dict],
+        progress: callable = None,
+    ) -> list[list[float]]:
         """
         Embed a list of chunk dicts for indexing (document role).

@@ -165,10 +185,14 @@
         a token limit (~8192 tokens) and API gateways have a request body size
         limit (~10MB). Truncation degrades retrieval quality marginally but
         avoids 413 errors on large class definitions or contextually-enriched chunks.
+
+        progress: optional callback progress(done_chunks, total_chunks) called
+        as each batch completes. Lets callers render a live progress bar without
+        knowing the provider's internal batch size.
         """
         texts = [c["text"][:_MAX_CHARS] for c in chunks]
         if self._provider == "voyage":
-            return self._voyage_embed(texts, input_type="document")
+            return self._voyage_embed(texts, input_type="document", progress=progress)
         if self._provider == "gemini":
             return self._gemini_embed(texts, task_type="RETRIEVAL_DOCUMENT")
         return self._nomic_embed(texts, task_type="search_document")

@@ -188,23 +212,60 @@

     # ── Voyage AI implementation ────────────────────────────────────────────────

-    def _voyage_embed(
+    def _voyage_embed(
+        self,
+        texts: list[str],
+        input_type: str,
+        progress: callable = None,
+    ) -> list[list[float]]:
         """
-        Call Voyage AI API with batching.
+        Call Voyage AI API with parallel batching.

         voyage-code-3 is specifically trained on (code, docstring) pairs
         and GitHub issues, giving it much better code retrieval than
         general-purpose text embedders.

-
-
+        Concurrency: Voyage's paid tier is 2000 RPM (~33 req/s). We run
+        _VOYAGE_CONCURRENCY=8 workers so a 13K-chunk ingest drops from ~15 min
+        serial to ~2 min parallel. Each worker serialises its HTTP call through
+        the shared voyageai.Client (which uses a thread-safe requests.Session
+        internally), so no per-worker client is needed.
+
+        Order preservation: batches may complete out of order, so we fill a
+        pre-sized results array by batch index, then flatten in original order.
+        Without this, a chunk's vector could land on the wrong chunk payload.
+
+        progress(done_chunks, total_chunks) fires after each completed batch.
         """
-
-
-
-
-
-
+        batches = [
+            texts[i : i + _VOYAGE_BATCH_SIZE]
+            for i in range(0, len(texts), _VOYAGE_BATCH_SIZE)
+        ]
+        if not batches:
+            return []
+
+        results: list[list[list[float]] | None] = [None] * len(batches)
+        total_chunks = len(texts)
+        done_chunks = 0
+
+        workers = min(_VOYAGE_CONCURRENCY, len(batches))
+        with ThreadPoolExecutor(max_workers=workers) as pool:
+            future_to_idx = {
+                pool.submit(self._voyage_call_api, batch, input_type): idx
+                for idx, batch in enumerate(batches)
+            }
+            for fut in as_completed(future_to_idx):
+                idx = future_to_idx[fut]
+                results[idx] = fut.result()
+                done_chunks += len(batches[idx])
+                print(
+                    f" Voyage batch {sum(1 for r in results if r is not None)}"
+                    f"/{len(batches)} done ({done_chunks}/{total_chunks} chunks)"
+                )
+                if progress:
+                    progress(done_chunks, total_chunks)
+
+        return [vec for batch_result in results for vec in batch_result]

     def _voyage_call_api(
         self,