umanggarg Claude Sonnet 4.6 committed on
Commit
34ba9cf
·
1 Parent(s): 1dd81ef

Fix re-index stuck: emit per-batch progress during contextual retrieval

Browse files

The progress bar was frozen at 38% (chunking) for the entire 10+ minute
contextual retrieval phase because _add_context had no progress callbacks.

Backend:
- Thread progress callback through to _add_context
- Emit "contextualizing" step every 20 chunks with "X / Y" in the detail
so the frontend can compute an exact percentage

Frontend:
- Parse "X / Y" from contextualizing detail to map 38-78% dynamically
- Adjust STEP_PCT: embedding=80%, storing=92% (shifted right to make room)

UI:
- Add a shimmer animation to the progress bar so it visually pulses
between batch events, making it clear the operation is still running

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

backend/services/ingestion_service.py CHANGED
@@ -143,12 +143,21 @@ class IngestionService:
143
  # Only runs on force=True re-ingestion to avoid slowing down first-time indexing.
144
  if force and hasattr(self, '_gen') and self._gen is not None:
145
  top_n = settings.contextual_top_n # 0 = all chunks
146
- limit_str = f"all {len(chunks)}" if top_n == 0 else f"top {top_n}"
147
  _emit("contextualizing",
148
- f"Contextual retrieval: adding AI-generated descriptions to {limit_str} chunks…")
149
- print(f"Contextual retrieval: enriching {limit_str} chunks with context...")
150
  now = datetime.now(timezone.utc).isoformat()
151
- chunks = _add_context(chunks, file_dicts, self._gen, top_n=top_n, contextual_at=now)
 
 
 
 
 
 
 
 
 
152
  n_enriched = sum(1 for c in chunks if c.get('_contextualised'))
153
  print(f" Context added to {n_enriched} chunks")
154
 
@@ -259,6 +268,7 @@ def _add_context(
259
  gen,
260
  top_n: int = 0,
261
  contextual_at: str = None,
 
262
  ) -> list[dict]:
263
  """
264
  Contextual Retrieval: prepend a short LLM-generated context sentence to
@@ -313,7 +323,9 @@ def _add_context(
313
  result[i] = dict(result[i])
314
  result[i]["contextual_at"] = contextual_at
315
 
316
- for idx, _chunk in ranked[:limit]:
 
 
317
  # Use result[idx] (which already has contextual_at stamped) as the base
318
  chunk = result[idx]
319
  filepath = chunk.get("filepath", "")
@@ -340,4 +352,8 @@ def _add_context(
340
  print(f" Context skipped for {filepath}:{chunk.get('name', '?')} — {e}")
341
  # Leave result[idx] unchanged — graceful fallback to raw chunk
342
 
 
 
 
 
343
  return result
 
143
  # Only runs on force=True re-ingestion to avoid slowing down first-time indexing.
144
  if force and hasattr(self, '_gen') and self._gen is not None:
145
  top_n = settings.contextual_top_n # 0 = all chunks
146
+ total_ctx = len(chunks) if top_n == 0 else min(top_n, len(chunks))
147
  _emit("contextualizing",
148
+ f"Adding context to chunks… 0 / {total_ctx}")
149
+ print(f"Contextual retrieval: enriching {total_ctx} chunks with context...")
150
  now = datetime.now(timezone.utc).isoformat()
151
+
152
+ # Emit progress every 20 chunks so the UI bar advances visibly
153
+ def _ctx_progress(done: int, total: int) -> None:
154
+ _emit("contextualizing", f"Adding context to chunks… {done} / {total}")
155
+
156
+ chunks = _add_context(
157
+ chunks, file_dicts, self._gen,
158
+ top_n=top_n, contextual_at=now,
159
+ progress=_ctx_progress,
160
+ )
161
  n_enriched = sum(1 for c in chunks if c.get('_contextualised'))
162
  print(f" Context added to {n_enriched} chunks")
163
 
 
268
  gen,
269
  top_n: int = 0,
270
  contextual_at: str = None,
271
+ progress: callable = None,
272
  ) -> list[dict]:
273
  """
274
  Contextual Retrieval: prepend a short LLM-generated context sentence to
 
323
  result[i] = dict(result[i])
324
  result[i]["contextual_at"] = contextual_at
325
 
326
+ _REPORT_EVERY = 20 # emit a progress event every N chunks to avoid SSE flood
327
+
328
+ for done, (idx, _chunk) in enumerate(ranked[:limit], start=1):
329
  # Use result[idx] (which already has contextual_at stamped) as the base
330
  chunk = result[idx]
331
  filepath = chunk.get("filepath", "")
 
352
  print(f" Context skipped for {filepath}:{chunk.get('name', '?')} — {e}")
353
  # Leave result[idx] unchanged — graceful fallback to raw chunk
354
 
355
+ # Report progress every N chunks so the UI bar advances visibly
356
+ if progress and (done % _REPORT_EVERY == 0 or done == limit):
357
+ progress(done, limit)
358
+
359
  return result
ui/src/components/Sidebar.jsx CHANGED
@@ -196,8 +196,8 @@ export default function Sidebar({ repos, reposLoading, activeRepo, onSelectRepo,
196
  setReindexPct(prev => ({ ...prev, [slug]: 5 }));
197
 
198
  // Map ingestion steps to approximate % complete so the bar fills meaningfully.
199
- // Embedding is the longest step (~60% of total time), so it gets the most range.
200
- const STEP_PCT = { fetching: 10, filtering: 22, chunking: 38, embedding: 75, storing: 90, done: 100 };
201
 
202
  // Use EventSource (GET SSE) instead of a POST fetch so the connection never
203
  // times out — large repos take several minutes to re-embed. The backend sends
@@ -206,7 +206,20 @@ export default function Sidebar({ repos, reposLoading, activeRepo, onSelectRepo,
206
 
207
  es.onmessage = (ev) => {
208
  const event = JSON.parse(ev.data);
209
- const pct = STEP_PCT[event.step] ?? null;
 
 
 
 
 
 
 
 
 
 
 
 
 
210
  if (pct !== null) setReindexPct(prev => ({ ...prev, [slug]: pct }));
211
 
212
  if (event.step === "done") {
 
196
  setReindexPct(prev => ({ ...prev, [slug]: 5 }));
197
 
198
  // Map ingestion steps to approximate % complete so the bar fills meaningfully.
199
+ // "contextualizing" is dynamic — we compute it from the "X / Y" in the detail.
200
+ const STEP_PCT = { fetching: 10, filtering: 22, chunking: 38, embedding: 80, storing: 92, done: 100 };
201
 
202
  // Use EventSource (GET SSE) instead of a POST fetch so the connection never
203
  // times out β€” large repos take several minutes to re-embed. The backend sends
 
206
 
207
  es.onmessage = (ev) => {
208
  const event = JSON.parse(ev.data);
209
+ let pct = STEP_PCT[event.step] ?? null;
210
+
211
+ // Contextualizing fires many times with "X / Y" in the detail.
212
+ // Map it to 38–78% range so the bar visibly advances during this long phase.
213
+ if (event.step === "contextualizing" && event.detail) {
214
+ const m = event.detail.match(/(\d+)\s*\/\s*(\d+)/);
215
+ if (m) {
216
+ const [done, total] = [parseInt(m[1]), parseInt(m[2])];
217
+ pct = Math.round(38 + (done / total) * 40);
218
+ } else {
219
+ pct = 40; // initial "contextualizing" event before first batch
220
+ }
221
+ }
222
+
223
  if (pct !== null) setReindexPct(prev => ({ ...prev, [slug]: pct }));
224
 
225
  if (event.step === "done") {
ui/src/index.css CHANGED
@@ -3350,6 +3350,19 @@ textarea:focus-visible {
3350
  border-radius: 0 1px 1px 0;
3351
  transition: width 0.6s cubic-bezier(0.4, 0, 0.2, 1);
3352
  box-shadow: 0 0 6px var(--accent);
 
 
 
 
 
 
 
 
 
 
 
 
 
3353
  }
3354
 
3355
  /* ══════════════════════════════════════════════════════════
 
3350
  border-radius: 0 1px 1px 0;
3351
  transition: width 0.6s cubic-bezier(0.4, 0, 0.2, 1);
3352
  box-shadow: 0 0 6px var(--accent);
3353
+ /* Shimmer shows the bar is alive even when progress is slow */
3354
+ background-image: linear-gradient(
3355
+ 90deg,
3356
+ var(--accent) 0%,
3357
+ rgba(160, 120, 255, 0.9) 45%,
3358
+ var(--accent) 100%
3359
+ );
3360
+ background-size: 200% 100%;
3361
+ animation: reindex-shimmer 1.8s ease-in-out infinite;
3362
+ }
3363
+ @keyframes reindex-shimmer {
3364
+ 0% { background-position: 200% center; }
3365
+ 100% { background-position: -200% center; }
3366
  }
3367
 
3368
  /* ══════════════════════════════════════════════════════════