muthuk1 committed on
Commit
90b36cb
·
1 Parent(s): 04362d1

Improve latency: parallel LLM calls, embedding cache, client reuse


- compare/route: LLM-only + embedding fetch run in parallel (phase 1);
basicRag + graphRAG run in parallel after retrieval (phase 2).
Reduces compare wall time by ~50% vs. sequential.
- benchmark/route: all 10 samples run via Promise.allSettled (parallel);
within each sample, the same two-phase parallel structure applies.
Reduces benchmark wall time from N*LLM_time to ~1*LLM_time.
- retrieval.ts: add 256-entry in-process embedding cache keyed by
normalized query text; eliminates repeated HF API round trips.
- llm-providers.ts: cache OpenAI client instances per (baseURL, apiKey)
pair; eliminates re-instantiation and dynamic import overhead on
every callLLM() invocation.
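
A minimal sketch of the two-phase pattern both routes now follow (standalone TypeScript; callLLM, getEmbedding, and searchChunks stand in for the helpers in web/src/lib/llm-providers.ts and web/src/lib/retrieval.ts, with their argument shapes trimmed):

// Hypothetical stand-ins for the repo helpers; only the call shapes used below are declared.
type Msg = { role: "system" | "user"; content: string };
declare function callLLM(req: { provider: string; model: string; messages: Msg[]; temperature: number; maxTokens: number }): Promise<{ content: string }>;
declare function getEmbedding(text: string): Promise<number[] | null>;
declare function searchChunks(embedding: number[], topK: number): Promise<{ text: string }[]>;

async function runComparison(provider: string, model: string, query: string) {
  // Phase 1: the LLM-only answer needs no retrieval, so it runs while the embedding is fetched.
  const [llmOnlyResp, embedding] = await Promise.all([
    callLLM({ provider, model, messages: [{ role: "user", content: query }], temperature: 0, maxTokens: 512 }),
    getEmbedding(query), // repeat queries hit the in-process cache added in retrieval.ts
  ]);

  // Retrieval still waits on the embedding before phase 2 can start.
  const chunks = embedding ? await searchChunks(embedding, 5) : [];
  const context = chunks.map((c, i) => `[${i + 1}] ${c.text}`).join("\n\n");

  // Phase 2: basic RAG and GraphRAG both depend only on the retrieved context, so they run together.
  const [basicRagResp, graphragResp] = await Promise.all([
    callLLM({ provider, model, messages: [{ role: "user", content: `Context:\n${context}\n\nQuestion: ${query}` }], temperature: 0, maxTokens: 512 }),
    callLLM({ provider, model, messages: [{ role: "user", content: `Graph Entities:\n${context}\n\nQuestion: ${query}` }], temperature: 0, maxTokens: 512 }),
  ]);

  return { llmOnlyResp, basicRagResp, graphragResp };
}

Wall-clock time then becomes roughly max(LLM-only, embedding) + retrieval + max(basicRag, graphRAG) rather than the sum of all the sequential steps.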

web/src/app/api/benchmark/route.ts CHANGED
@@ -157,42 +157,47 @@ export async function POST(req: NextRequest) {
   const providerConfig = PROVIDERS[provider];
   const hasKey = providerConfig?.isLocal || !providerConfig?.requiresApiKey || !!process.env[providerConfig?.apiKeyEnv || ""];

-  const results: Record<string, unknown>[] = [];
-  let totalLlmF1 = 0, totalBaselineF1 = 0, totalGraphragF1 = 0;
-  let totalLlmEM = 0, totalBaselineEM = 0, totalGraphragEM = 0;
-  let totalLlmTokens = 0, totalBaselineTokens = 0, totalGraphragTokens = 0;
-  let totalLlmCost = 0, totalBaselineCost = 0, totalGraphragCost = 0;
-  let totalLlmLatency = 0, totalBaselineLatency = 0, totalGraphragLatency = 0;
+  // Run all samples in parallel — reduces benchmark wall time from ~N×LLM_time to ~1×LLM_time.
+  // Within each sample: LLM-only + embedding run simultaneously; then basicRag + graphrag run simultaneously.
+  const settled = await Promise.allSettled(
+    CORPUS_SAMPLES.slice(0, numSamples).map(async (sample, i) => {
+      const ctx = RETRIEVAL_CONTEXTS[i];

-  for (let i = 0; i < numSamples; i++) {
-    const sample = CORPUS_SAMPLES[i];
-    const ctx = RETRIEVAL_CONTEXTS[i];
+      if (!hasKey) {
+        const llmT = 90 + Math.floor(Math.random() * 50);
+        const bT = 480 + Math.floor(Math.random() * 200);
+        const gT = 155 + Math.floor(Math.random() * 60);
+        const llmF1 = 0.75 + Math.random() * 0.15, bF1 = 0.82 + Math.random() * 0.12, gF1 = 0.86 + Math.random() * 0.1;
+        return { idx: i, query: sample.question, gold: sample.answer, type: sample.type,
+          llmonly_f1: +llmF1.toFixed(4), baseline_f1: +bF1.toFixed(4), graphrag_f1: +gF1.toFixed(4),
+          llmonly_em: Math.random() > 0.4 ? 1 : 0, baseline_em: Math.random() > 0.3 ? 1 : 0, graphrag_em: Math.random() > 0.25 ? 1 : 0,
+          llmonly_tokens: llmT, baseline_tokens: bT, graphrag_tokens: gT,
+          llmonly_cost: 0, baseline_cost: 0, graphrag_cost: 0,
+          llmonly_latency: 0, baseline_latency: 0, graphrag_latency: 0, chunks_source: "demo" };
+      }

-    if (!hasKey) {
-      // Pre-computed demo values
-      const llmT = 90 + Math.floor(Math.random() * 50);
-      const bT = 480 + Math.floor(Math.random() * 200);
-      const gT = 155 + Math.floor(Math.random() * 60);
-      const llmF1 = 0.75 + Math.random() * 0.15, bF1 = 0.82 + Math.random() * 0.12, gF1 = 0.86 + Math.random() * 0.1;
-      results.push({ idx: i, query: sample.question, gold: sample.answer, type: sample.type,
-        llmonly_f1: +llmF1.toFixed(4), baseline_f1: +bF1.toFixed(4), graphrag_f1: +gF1.toFixed(4),
-        llmonly_em: Math.random() > 0.4 ? 1 : 0, baseline_em: Math.random() > 0.3 ? 1 : 0, graphrag_em: Math.random() > 0.25 ? 1 : 0,
-        llmonly_tokens: llmT, baseline_tokens: bT, graphrag_tokens: gT });
-      totalLlmF1 += llmF1; totalBaselineF1 += bF1; totalGraphragF1 += gF1;
-      totalLlmTokens += llmT; totalBaselineTokens += bT; totalGraphragTokens += gT;
-      continue;
-    }
+      const selectedModel = model || providerConfig!.defaultModel;

-    try {
-      const selectedModel = model || providerConfig.defaultModel;
+      // Phase 1: LLM-only + embedding fetch in parallel
+      const llmOnlyStart = Date.now();
+      const [llmResp, embedding] = await Promise.all([
+        callLLM({
+          provider, model: selectedModel,
+          messages: [
+            { role: "system", content: "Answer the science question concisely in 1–5 words." },
+            { role: "user", content: sample.question },
+          ],
+          temperature: 0, maxTokens: 64,
+        }),
+        getEmbedding(sample.question).catch(() => null),
+      ]);
+      const llmLat = Date.now() - llmOnlyStart;

-      // Try live TigerGraph retrieval first; fall back to pre-loaded corpus passages
+      // TigerGraph retrieval (sequential after embedding)
       let ragContext = ctx.full;
       let graphContext = ctx.compact;
       let chunksSource = "corpus";
-
       try {
-        const embedding = await getEmbedding(sample.question);
         if (embedding) {
           const chunks = await searchChunks(embedding, 5);
           if (chunks.length > 0) {
@@ -203,71 +208,63 @@ export async function POST(req: NextRequest) {
         }
       } catch { /* use pre-loaded context */ }

-      // Pipeline 1: LLM-only
-      const llmStart = Date.now();
-      const llmResp = await callLLM({
-        provider, model: selectedModel,
-        messages: [
-          { role: "system", content: "Answer the science question concisely in 1–5 words." },
-          { role: "user", content: sample.question },
-        ],
-        temperature: 0, maxTokens: 64,
-      });
-      const llmLat = Date.now() - llmStart;
-
-      // Pipeline 2: Basic RAG — full retrieved passages as context (many tokens)
-      const ragStart = Date.now();
-      const ragResp = await callLLM({
-        provider, model: selectedModel,
-        messages: [
-          { role: "system", content: "Answer using the provided context. Be concise, 1–5 words if possible." },
-          { role: "user", content: `Context:\n${ragContext}\n\nQuestion: ${sample.question}\n\nAnswer:` },
-        ],
-        temperature: 0, maxTokens: 64,
-      });
-      const ragLat = Date.now() - ragStart;
-
-      // Pipeline 3: GraphRAG — compact entity descriptions (pre-indexed at ingest time → few tokens)
-      const graphStart = Date.now();
-      const graphResp = await callLLM({
-        provider, model: selectedModel,
-        messages: [
-          { role: "system", content: "Using the pre-indexed knowledge graph entity descriptions, answer concisely in 1–5 words." },
-          { role: "user", content: `Graph Entities:\n${graphContext}\n\nQuestion: ${sample.question}\n\nAnswer:` },
-        ],
-        temperature: 0, maxTokens: 64,
-      });
-      const graphLat = Date.now() - graphStart;
+      // Phase 2: Basic RAG + GraphRAG in parallel
+      const retrievalStart = Date.now();
+      const [ragResp, graphResp] = await Promise.all([
+        callLLM({
+          provider, model: selectedModel,
+          messages: [
+            { role: "system", content: "Answer using the provided context. Be concise, 1–5 words if possible." },
+            { role: "user", content: `Context:\n${ragContext}\n\nQuestion: ${sample.question}\n\nAnswer:` },
+          ],
+          temperature: 0, maxTokens: 64,
+        }),
+        callLLM({
+          provider, model: selectedModel,
+          messages: [
+            { role: "system", content: "Using the pre-indexed knowledge graph entity descriptions, answer concisely in 1–5 words." },
+            { role: "user", content: `Graph Entities:\n${graphContext}\n\nQuestion: ${sample.question}\n\nAnswer:` },
+          ],
+          temperature: 0, maxTokens: 64,
+        }),
+      ]);
+      const parallelLat = Date.now() - retrievalStart;
+      void parallelLat;

-      const llmF1 = computeF1(llmResp.content, sample.answer);
-      const bF1 = computeF1(ragResp.content, sample.answer);
-      const gF1 = computeF1(graphResp.content, sample.answer);
-
-      results.push({
+      return {
         idx: i, query: sample.question, gold: sample.answer, type: sample.type,
         llmonly_answer: llmResp.content, baseline_answer: ragResp.content, graphrag_answer: graphResp.content,
-        llmonly_f1: +llmF1.toFixed(4), baseline_f1: +bF1.toFixed(4), graphrag_f1: +gF1.toFixed(4),
+        llmonly_f1: +computeF1(llmResp.content, sample.answer).toFixed(4),
+        baseline_f1: +computeF1(ragResp.content, sample.answer).toFixed(4),
+        graphrag_f1: +computeF1(graphResp.content, sample.answer).toFixed(4),
         llmonly_em: computeEM(llmResp.content, sample.answer),
         baseline_em: computeEM(ragResp.content, sample.answer),
         graphrag_em: computeEM(graphResp.content, sample.answer),
         llmonly_tokens: llmResp.totalTokens, baseline_tokens: ragResp.totalTokens, graphrag_tokens: graphResp.totalTokens,
         llmonly_cost: llmResp.costUsd, baseline_cost: ragResp.costUsd, graphrag_cost: graphResp.costUsd,
-        llmonly_latency: llmLat, baseline_latency: ragLat, graphrag_latency: graphLat,
+        llmonly_latency: llmLat, baseline_latency: ragResp.latencyMs, graphrag_latency: graphResp.latencyMs,
         chunks_source: chunksSource,
-      });
+      };
+    })
+  );
+
+  settled.forEach((s, i) => { if (s.status === "rejected") console.error(`Benchmark query ${i} failed:`, s.reason); });
+  const results: Record<string, unknown>[] = settled
+    .filter(s => s.status === "fulfilled")
+    .map(s => (s as PromiseFulfilledResult<Record<string, unknown>>).value);
+
+  let totalLlmF1 = 0, totalBaselineF1 = 0, totalGraphragF1 = 0;
+  let totalLlmEM = 0, totalBaselineEM = 0, totalGraphragEM = 0;
+  let totalLlmTokens = 0, totalBaselineTokens = 0, totalGraphragTokens = 0;
+  let totalLlmCost = 0, totalBaselineCost = 0, totalGraphragCost = 0;
+  let totalLlmLatency = 0, totalBaselineLatency = 0, totalGraphragLatency = 0;

-      totalLlmF1 += llmF1; totalBaselineF1 += bF1; totalGraphragF1 += gF1;
-      totalLlmEM += computeEM(llmResp.content, sample.answer);
-      totalBaselineEM += computeEM(ragResp.content, sample.answer);
-      totalGraphragEM += computeEM(graphResp.content, sample.answer);
-      totalLlmTokens += llmResp.totalTokens;
-      totalBaselineTokens += ragResp.totalTokens;
-      totalGraphragTokens += graphResp.totalTokens;
-      totalLlmCost += llmResp.costUsd; totalBaselineCost += ragResp.costUsd; totalGraphragCost += graphResp.costUsd;
-      totalLlmLatency += llmLat; totalBaselineLatency += ragLat; totalGraphragLatency += graphLat;
-    } catch (err) {
-      console.error(`Benchmark query ${i} failed:`, err);
-    }
+  for (const r of results) {
+    totalLlmF1 += r.llmonly_f1 as number; totalBaselineF1 += r.baseline_f1 as number; totalGraphragF1 += r.graphrag_f1 as number;
+    totalLlmEM += r.llmonly_em as number; totalBaselineEM += r.baseline_em as number; totalGraphragEM += r.graphrag_em as number;
+    totalLlmTokens += r.llmonly_tokens as number; totalBaselineTokens += r.baseline_tokens as number; totalGraphragTokens += r.graphrag_tokens as number;
+    totalLlmCost += r.llmonly_cost as number; totalBaselineCost += r.baseline_cost as number; totalGraphragCost += r.graphrag_cost as number;
+    totalLlmLatency += r.llmonly_latency as number; totalBaselineLatency += r.baseline_latency as number; totalGraphragLatency += r.graphrag_latency as number;
   }

   const n = results.length || 1;
web/src/app/api/compare/route.ts CHANGED
@@ -35,8 +35,23 @@ export async function POST(req: NextRequest) {
   const selectedModel = model || providerConfig.defaultModel;
   const startTime = Date.now();

-  // ── Retrieve chunks from TigerGraph ────────────────────────
-  const embedding = await getEmbedding(query);
+  // ── Parallel phase 1: LLM-Only + embedding fetch run simultaneously ──
+  // LLM-only needs no retrieval; start it immediately alongside the embed call.
+  const llmOnlyStart = Date.now();
+  const [llmOnlyResp, embedding] = await Promise.all([
+    callLLM({
+      provider, model: selectedModel,
+      messages: [
+        { role: "system", content: "Answer the question accurately and concisely from your knowledge. If unsure, say so." },
+        { role: "user", content: query },
+      ],
+      temperature: 0, maxTokens: 512,
+    }),
+    getEmbedding(query),
+  ]);
+  const llmOnlyLatency = Date.now() - llmOnlyStart;
+
+  // ── Retrieve chunks from TigerGraph (needs embedding) ─────────────────
   const chunks = embedding ? await searchChunks(embedding, topK) : [];
   const hasRetrieval = chunks.length > 0;

@@ -45,51 +60,36 @@ export async function POST(req: NextRequest) {
     ? chunks.map((c, i) => `[Passage ${i + 1}]\n${c.text}`).join("\n\n")
     : `No documents retrieved. Answering from general knowledge.`;

-  // Compact entity context (GraphRAG: first-sentence descriptions, as if pre-indexed at ingest time)
-  // Entity extraction runs once at INDEX time (amortized cost). Query time only pays for compact context.
+  // Compact entity context (GraphRAG: first-sentence descriptions, pre-indexed at ingest time)
   const graphContext = hasRetrieval
-    ? chunks
-        .map((c, i) => `[${i + 1}] ${chunkToEntityContext(c.text)}`)
-        .join("\n")
+    ? chunks.map((c, i) => `[${i + 1}] ${chunkToEntityContext(c.text)}`).join("\n")
     : `No graph context available.`;

-  // ── Pipeline 1: LLM-Only (no retrieval, pure parametric knowledge) ──
-  const llmStart = Date.now();
-  const llmOnlyResp = await callLLM({
-    provider, model: selectedModel,
-    messages: [
-      { role: "system", content: "Answer the question accurately and concisely from your knowledge. If unsure, say so." },
-      { role: "user", content: query },
-    ],
-    temperature: 0, maxTokens: 512,
-  });
-  const llmOnlyLatency = Date.now() - llmStart;
-
-  // ── Pipeline 2: Basic RAG (full retrieved chunks as context) ─────────
+  // ── Parallel phase 2: Basic RAG + GraphRAG run simultaneously ────────
   const ragStart = Date.now();
-  const basicRagResp = await callLLM({
-    provider, model: selectedModel,
-    messages: [
-      { role: "system", content: "Answer the question using ONLY the provided context passages. Be accurate and concise." },
-      { role: "user", content: `Context:\n${ragContext}\n\nQuestion: ${query}\n\nAnswer:` },
-    ],
-    temperature: 0, maxTokens: 512,
-  });
-  const ragLatency = Date.now() - ragStart;
-
-  // ── Pipeline 3: GraphRAG (compact entity-graph context) ──────────────
-  // Key insight: entity extraction is done at INDEX time (ingestion pipeline).
-  // At query time we only pass compact entity descriptions — much fewer tokens.
-  const graphStart = Date.now();
-  const graphragResp = await callLLM({
-    provider, model: selectedModel,
-    messages: [
-      { role: "system", content: "You have access to a knowledge graph. The entity descriptions below were pre-indexed from the document corpus. Use them to answer precisely and concisely — follow any relationship chains implied." },
-      { role: "user", content: `Knowledge Graph Entities:\n${graphContext}\n\nQuestion: ${query}\n\nAnswer:` },
-    ],
-    temperature: 0, maxTokens: 512,
-  });
-  const graphragLatency = Date.now() - graphStart;
+  const [basicRagResp, graphragResp] = await Promise.all([
+    callLLM({
+      provider, model: selectedModel,
+      messages: [
+        { role: "system", content: "Answer the question using ONLY the provided context passages. Be accurate and concise." },
+        { role: "user", content: `Context:\n${ragContext}\n\nQuestion: ${query}\n\nAnswer:` },
+      ],
+      temperature: 0, maxTokens: 512,
+    }),
+    callLLM({
+      provider, model: selectedModel,
+      messages: [
+        { role: "system", content: "You have access to a knowledge graph. The entity descriptions below were pre-indexed from the document corpus. Use them to answer precisely and concisely — follow any relationship chains implied." },
+        { role: "user", content: `Knowledge Graph Entities:\n${graphContext}\n\nQuestion: ${query}\n\nAnswer:` },
+      ],
+      temperature: 0, maxTokens: 512,
+    }),
+  ]);
+  // Both share the same wall-clock window; report individual latencies from their response objects.
+  const parallelLat = Date.now() - ragStart;
+  const ragLatency = basicRagResp.latencyMs;
+  const graphragLatency = graphragResp.latencyMs;
+  void parallelLat; // measured for tracing, total captured in totalTimeMs

   // ── Adaptive routing (complexity scoring) ────────────────────────────
   let complexity = 0.5, queryType = "factoid", recommended = "graphrag";
web/src/lib/llm-providers.ts CHANGED
@@ -316,6 +316,19 @@ export const PROVIDERS: Record<ProviderId, ProviderConfig> = {

 // ── Universal LLM Client ─────────────────────────────────

+// Cache OpenAI client instances per (baseURL, apiKey) pair — avoids re-instantiation on every call.
+type OpenAIInstance = InstanceType<Awaited<typeof import("openai")>["default"]>;
+const openAIClientCache = new Map<string, OpenAIInstance>();
+
+async function getOpenAIClient(baseURL: string, apiKey: string): Promise<OpenAIInstance> {
+  const cacheKey = `${baseURL}|${apiKey}`;
+  if (!openAIClientCache.has(cacheKey)) {
+    const OpenAI = (await import("openai")).default;
+    openAIClientCache.set(cacheKey, new OpenAI({ baseURL, apiKey }));
+  }
+  return openAIClientCache.get(cacheKey)!;
+}
+
 export async function callLLM(request: LLMRequest): Promise<LLMResponse> {
   const provider = PROVIDERS[request.provider];
   if (!provider) throw new Error(`Unknown provider: ${request.provider}`);
@@ -330,8 +343,6 @@ export async function callLLM(request: LLMRequest): Promise<LLMResponse> {
   }

   // ── All other providers use OpenAI SDK ───────────────
-  const OpenAI = (await import("openai")).default;
-
   const apiKey = provider.isLocal
     ? "ollama"
     : process.env[provider.apiKeyEnv] || "";
@@ -340,10 +351,7 @@ export async function callLLM(request: LLMRequest): Promise<LLMResponse> {
     throw new Error(`Missing API key: set ${provider.apiKeyEnv} environment variable`);
   }

-  const client = new OpenAI({
-    baseURL: provider.baseURL,
-    apiKey,
-  });
+  const client = await getOpenAIClient(provider.baseURL, apiKey);

   const params: Record<string, unknown> = {
     model,
web/src/lib/retrieval.ts CHANGED
@@ -8,8 +8,17 @@ export interface TGChunk {
   score: number;
 }

+// In-process embedding cache — avoids re-hitting HF API for the same query text.
+// Capped at 256 entries to prevent unbounded memory growth in long-running servers.
+const embeddingCache = new Map<string, number[]>();
+const EMBED_CACHE_MAX = 256;
+
 /** Generate 384-dim embedding via HF Inference API (all-MiniLM-L6-v2) */
 export async function getEmbedding(text: string): Promise<number[] | null> {
+  const normalized = text.trim().toLowerCase();
+  const cached = embeddingCache.get(normalized);
+  if (cached) return cached;
+
   const token = process.env.HUGGING_FACE_HUB_TOKEN || process.env.HF_TOKEN;
   if (!token) return null;
   try {
@@ -27,7 +36,13 @@ export async function getEmbedding(text: string): Promise<number[] | null> {
     if (!Array.isArray(data)) return null;
     // Handle both [0.1, 0.2, ...] and [[0.1, 0.2, ...]]
     const flat: number[] = Array.isArray(data[0]) ? (data[0] as number[]) : (data as number[]);
-    return flat.every((x) => typeof x === "number") ? flat : null;
+    if (!flat.every((x) => typeof x === "number")) return null;
+
+    if (embeddingCache.size >= EMBED_CACHE_MAX) {
+      embeddingCache.delete(embeddingCache.keys().next().value!);
+    }
+    embeddingCache.set(normalized, flat);
+    return flat;
   } catch {
     return null;
   }