muthuk1 committed on
Commit
90b36cb
·
1 Parent(s): 04362d1

Improve latency: parallel LLM calls, embedding cache, client reuse


- compare/route: LLM-only + embedding fetch run in parallel (phase 1);
basicRag + graphRAG run in parallel after retrieval (phase 2).
Reduces compare wall time by ~50% vs. sequential.
- benchmark/route: all 10 samples run via Promise.allSettled (parallel);
within each sample, the same two-phase parallel structure applies.
Reduces benchmark wall time from N*LLM_time to ~1*LLM_time.
- retrieval.ts: add 256-entry in-process embedding cache keyed by
normalized query text; eliminates repeated HF API round trips.
- llm-providers.ts: cache OpenAI client instances per (baseURL, apiKey)
pair; eliminates re-instantiation and dynamic import overhead on
every callLLM() invocation.
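
A minimal sketch of the two-phase pattern both routes now follow (standalone TypeScript; callLLM, getEmbedding, and searchChunks stand in for the helpers in web/src/lib/llm-providers.ts and web/src/lib/retrieval.ts, with their argument shapes trimmed):

// Hypothetical stand-ins for the repo helpers; only the call shapes used below are declared.
type Msg = { role: "system" | "user"; content: string };
declare function callLLM(req: { provider: string; model: string; messages: Msg[]; temperature: number; maxTokens: number }): Promise<{ content: string }>;
declare function getEmbedding(text: string): Promise<number[] | null>;
declare function searchChunks(embedding: number[], topK: number): Promise<{ text: string }[]>;

async function runComparison(provider: string, model: string, query: string) {
  // Phase 1: the LLM-only answer needs no retrieval, so it runs while the embedding is fetched.
  const [llmOnlyResp, embedding] = await Promise.all([
    callLLM({ provider, model, messages: [{ role: "user", content: query }], temperature: 0, maxTokens: 512 }),
    getEmbedding(query), // repeat queries hit the in-process cache added in retrieval.ts
  ]);

  // Retrieval still waits on the embedding before phase 2 can start.
  const chunks = embedding ? await searchChunks(embedding, 5) : [];
  const context = chunks.map((c, i) => `[${i + 1}] ${c.text}`).join("\n\n");

  // Phase 2: basic RAG and GraphRAG both depend only on the retrieved context, so they run together.
  const [basicRagResp, graphragResp] = await Promise.all([
    callLLM({ provider, model, messages: [{ role: "user", content: `Context:\n${context}\n\nQuestion: ${query}` }], temperature: 0, maxTokens: 512 }),
    callLLM({ provider, model, messages: [{ role: "user", content: `Graph Entities:\n${context}\n\nQuestion: ${query}` }], temperature: 0, maxTokens: 512 }),
  ]);

  return { llmOnlyResp, basicRagResp, graphragResp };
}

Wall-clock time then becomes roughly max(LLM-only, embedding) + retrieval + max(basicRag, graphRAG) rather than the sum of all the sequential steps.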

web/src/app/api/benchmark/route.ts CHANGED
@@ -157,42 +157,47 @@ export async function POST(req: NextRequest) {
   const providerConfig = PROVIDERS[provider];
   const hasKey = providerConfig?.isLocal || !providerConfig?.requiresApiKey || !!process.env[providerConfig?.apiKeyEnv || ""];

-  const results: Record<string, unknown>[] = [];
-  let totalLlmF1 = 0, totalBaselineF1 = 0, totalGraphragF1 = 0;
-  let totalLlmEM = 0, totalBaselineEM = 0, totalGraphragEM = 0;
-  let totalLlmTokens = 0, totalBaselineTokens = 0, totalGraphragTokens = 0;
-  let totalLlmCost = 0, totalBaselineCost = 0, totalGraphragCost = 0;
-  let totalLlmLatency = 0, totalBaselineLatency = 0, totalGraphragLatency = 0;
+  // Run all samples in parallel — reduces benchmark wall time from ~N×LLM_time to ~1×LLM_time.
+  // Within each sample: LLM-only + embedding run simultaneously; then basicRag + graphrag run simultaneously.
+  const settled = await Promise.allSettled(
+    CORPUS_SAMPLES.slice(0, numSamples).map(async (sample, i) => {
+      const ctx = RETRIEVAL_CONTEXTS[i];

-  for (let i = 0; i < numSamples; i++) {
-    const sample = CORPUS_SAMPLES[i];
-    const ctx = RETRIEVAL_CONTEXTS[i];
+      if (!hasKey) {
+        const llmT = 90 + Math.floor(Math.random() * 50);
+        const bT = 480 + Math.floor(Math.random() * 200);
+        const gT = 155 + Math.floor(Math.random() * 60);
+        const llmF1 = 0.75 + Math.random() * 0.15, bF1 = 0.82 + Math.random() * 0.12, gF1 = 0.86 + Math.random() * 0.1;
+        return { idx: i, query: sample.question, gold: sample.answer, type: sample.type,
+          llmonly_f1: +llmF1.toFixed(4), baseline_f1: +bF1.toFixed(4), graphrag_f1: +gF1.toFixed(4),
+          llmonly_em: Math.random() > 0.4 ? 1 : 0, baseline_em: Math.random() > 0.3 ? 1 : 0, graphrag_em: Math.random() > 0.25 ? 1 : 0,
+          llmonly_tokens: llmT, baseline_tokens: bT, graphrag_tokens: gT,
+          llmonly_cost: 0, baseline_cost: 0, graphrag_cost: 0,
+          llmonly_latency: 0, baseline_latency: 0, graphrag_latency: 0, chunks_source: "demo" };
+      }

-    if (!hasKey) {
-      // Pre-computed demo values
-      const llmT = 90 + Math.floor(Math.random() * 50);
-      const bT = 480 + Math.floor(Math.random() * 200);
-      const gT = 155 + Math.floor(Math.random() * 60);
-      const llmF1 = 0.75 + Math.random() * 0.15, bF1 = 0.82 + Math.random() * 0.12, gF1 = 0.86 + Math.random() * 0.1;
-      results.push({ idx: i, query: sample.question, gold: sample.answer, type: sample.type,
-        llmonly_f1: +llmF1.toFixed(4), baseline_f1: +bF1.toFixed(4), graphrag_f1: +gF1.toFixed(4),
-        llmonly_em: Math.random() > 0.4 ? 1 : 0, baseline_em: Math.random() > 0.3 ? 1 : 0, graphrag_em: Math.random() > 0.25 ? 1 : 0,
-        llmonly_tokens: llmT, baseline_tokens: bT, graphrag_tokens: gT });
-      totalLlmF1 += llmF1; totalBaselineF1 += bF1; totalGraphragF1 += gF1;
-      totalLlmTokens += llmT; totalBaselineTokens += bT; totalGraphragTokens += gT;
-      continue;
-    }
+      const selectedModel = model || providerConfig!.defaultModel;

-    try {
-      const selectedModel = model || providerConfig.defaultModel;
+      // Phase 1: LLM-only + embedding fetch in parallel
+      const llmOnlyStart = Date.now();
+      const [llmResp, embedding] = await Promise.all([
+        callLLM({
+          provider, model: selectedModel,
+          messages: [
+            { role: "system", content: "Answer the science question concisely in 1–5 words." },
+            { role: "user", content: sample.question },
+          ],
+          temperature: 0, maxTokens: 64,
+        }),
+        getEmbedding(sample.question).catch(() => null),
+      ]);
+      const llmLat = Date.now() - llmOnlyStart;

-      // Try live TigerGraph retrieval first; fall back to pre-loaded corpus passages
+      // TigerGraph retrieval (sequential after embedding)
       let ragContext = ctx.full;
       let graphContext = ctx.compact;
       let chunksSource = "corpus";
-
       try {
-        const embedding = await getEmbedding(sample.question);
         if (embedding) {
           const chunks = await searchChunks(embedding, 5);
           if (chunks.length > 0) {
@@ -203,71 +208,63 @@ export async function POST(req: NextRequest) {
         }
       } catch { /* use pre-loaded context */ }

-      // Pipeline 1: LLM-only
-      const llmStart = Date.now();
-      const llmResp = await callLLM({
-        provider, model: selectedModel,
-        messages: [
-          { role: "system", content: "Answer the science question concisely in 1–5 words." },
-          { role: "user", content: sample.question },
-        ],
-        temperature: 0, maxTokens: 64,
-      });
-      const llmLat = Date.now() - llmStart;
-
-      // Pipeline 2: Basic RAG — full retrieved passages as context (many tokens)
-      const ragStart = Date.now();
-      const ragResp = await callLLM({
-        provider, model: selectedModel,
-        messages: [
-          { role: "system", content: "Answer using the provided context. Be concise, 1–5 words if possible." },
-          { role: "user", content: `Context:\n${ragContext}\n\nQuestion: ${sample.question}\n\nAnswer:` },
-        ],
-        temperature: 0, maxTokens: 64,
-      });
-      const ragLat = Date.now() - ragStart;
-
-      // Pipeline 3: GraphRAG — compact entity descriptions (pre-indexed at ingest time → few tokens)
-      const graphStart = Date.now();
-      const graphResp = await callLLM({
-        provider, model: selectedModel,
-        messages: [
-          { role: "system", content: "Using the pre-indexed knowledge graph entity descriptions, answer concisely in 1–5 words." },
-          { role: "user", content: `Graph Entities:\n${graphContext}\n\nQuestion: ${sample.question}\n\nAnswer:` },
-        ],
-        temperature: 0, maxTokens: 64,
-      });
-      const graphLat = Date.now() - graphStart;
+      // Phase 2: Basic RAG + GraphRAG in parallel
+      const retrievalStart = Date.now();
+      const [ragResp, graphResp] = await Promise.all([
+        callLLM({
+          provider, model: selectedModel,
+          messages: [
+            { role: "system", content: "Answer using the provided context. Be concise, 1–5 words if possible." },
+            { role: "user", content: `Context:\n${ragContext}\n\nQuestion: ${sample.question}\n\nAnswer:` },
+          ],
+          temperature: 0, maxTokens: 64,
+        }),
+        callLLM({
+          provider, model: selectedModel,
+          messages: [
+            { role: "system", content: "Using the pre-indexed knowledge graph entity descriptions, answer concisely in 1–5 words." },
+            { role: "user", content: `Graph Entities:\n${graphContext}\n\nQuestion: ${sample.question}\n\nAnswer:` },
+          ],
+          temperature: 0, maxTokens: 64,
+        }),
+      ]);
+      const parallelLat = Date.now() - retrievalStart;
+      void parallelLat;

-      const llmF1 = computeF1(llmResp.content, sample.answer);
-      const bF1 = computeF1(ragResp.content, sample.answer);
-      const gF1 = computeF1(graphResp.content, sample.answer);
-
-      results.push({
+      return {
         idx: i, query: sample.question, gold: sample.answer, type: sample.type,
         llmonly_answer: llmResp.content, baseline_answer: ragResp.content, graphrag_answer: graphResp.content,
-        llmonly_f1: +llmF1.toFixed(4), baseline_f1: +bF1.toFixed(4), graphrag_f1: +gF1.toFixed(4),
+        llmonly_f1: +computeF1(llmResp.content, sample.answer).toFixed(4),
+        baseline_f1: +computeF1(ragResp.content, sample.answer).toFixed(4),
+        graphrag_f1: +computeF1(graphResp.content, sample.answer).toFixed(4),
         llmonly_em: computeEM(llmResp.content, sample.answer),
         baseline_em: computeEM(ragResp.content, sample.answer),
         graphrag_em: computeEM(graphResp.content, sample.answer),
         llmonly_tokens: llmResp.totalTokens, baseline_tokens: ragResp.totalTokens, graphrag_tokens: graphResp.totalTokens,
         llmonly_cost: llmResp.costUsd, baseline_cost: ragResp.costUsd, graphrag_cost: graphResp.costUsd,
-        llmonly_latency: llmLat, baseline_latency: ragLat, graphrag_latency: graphLat,
+        llmonly_latency: llmLat, baseline_latency: ragResp.latencyMs, graphrag_latency: graphResp.latencyMs,
         chunks_source: chunksSource,
-      });
+      };
+    })
+  );
+
+  settled.forEach((s, i) => { if (s.status === "rejected") console.error(`Benchmark query ${i} failed:`, s.reason); });
+  const results: Record<string, unknown>[] = settled
+    .filter(s => s.status === "fulfilled")
+    .map(s => (s as PromiseFulfilledResult<Record<string, unknown>>).value);
+
+  let totalLlmF1 = 0, totalBaselineF1 = 0, totalGraphragF1 = 0;
+  let totalLlmEM = 0, totalBaselineEM = 0, totalGraphragEM = 0;
+  let totalLlmTokens = 0, totalBaselineTokens = 0, totalGraphragTokens = 0;
+  let totalLlmCost = 0, totalBaselineCost = 0, totalGraphragCost = 0;
+  let totalLlmLatency = 0, totalBaselineLatency = 0, totalGraphragLatency = 0;

-      totalLlmF1 += llmF1; totalBaselineF1 += bF1; totalGraphragF1 += gF1;
-      totalLlmEM += computeEM(llmResp.content, sample.answer);
-      totalBaselineEM += computeEM(ragResp.content, sample.answer);
-      totalGraphragEM += computeEM(graphResp.content, sample.answer);
-      totalLlmTokens += llmResp.totalTokens;
-      totalBaselineTokens += ragResp.totalTokens;
-      totalGraphragTokens += graphResp.totalTokens;
-      totalLlmCost += llmResp.costUsd; totalBaselineCost += ragResp.costUsd; totalGraphragCost += graphResp.costUsd;
-      totalLlmLatency += llmLat; totalBaselineLatency += ragLat; totalGraphragLatency += graphLat;
-    } catch (err) {
-      console.error(`Benchmark query ${i} failed:`, err);
-    }
+  for (const r of results) {
+    totalLlmF1 += r.llmonly_f1 as number; totalBaselineF1 += r.baseline_f1 as number; totalGraphragF1 += r.graphrag_f1 as number;
+    totalLlmEM += r.llmonly_em as number; totalBaselineEM += r.baseline_em as number; totalGraphragEM += r.graphrag_em as number;
+    totalLlmTokens += r.llmonly_tokens as number; totalBaselineTokens += r.baseline_tokens as number; totalGraphragTokens += r.graphrag_tokens as number;
+    totalLlmCost += r.llmonly_cost as number; totalBaselineCost += r.baseline_cost as number; totalGraphragCost += r.graphrag_cost as number;
+    totalLlmLatency += r.llmonly_latency as number; totalBaselineLatency += r.baseline_latency as number; totalGraphragLatency += r.graphrag_latency as number;
   }

   const n = results.length || 1;
web/src/app/api/compare/route.ts CHANGED
@@ -35,8 +35,23 @@ export async function POST(req: NextRequest) {
   const selectedModel = model || providerConfig.defaultModel;
   const startTime = Date.now();

-  // ── Retrieve chunks from TigerGraph ────────────────────────
-  const embedding = await getEmbedding(query);
+  // ── Parallel phase 1: LLM-Only + embedding fetch run simultaneously ──
+  // LLM-only needs no retrieval; start it immediately alongside the embed call.
+  const llmOnlyStart = Date.now();
+  const [llmOnlyResp, embedding] = await Promise.all([
+    callLLM({
+      provider, model: selectedModel,
+      messages: [
+        { role: "system", content: "Answer the question accurately and concisely from your knowledge. If unsure, say so." },
+        { role: "user", content: query },
+      ],
+      temperature: 0, maxTokens: 512,
+    }),
+    getEmbedding(query),
+  ]);
+  const llmOnlyLatency = Date.now() - llmOnlyStart;
+
+  // ── Retrieve chunks from TigerGraph (needs embedding) ─────────────────
   const chunks = embedding ? await searchChunks(embedding, topK) : [];
   const hasRetrieval = chunks.length > 0;

@@ -45,51 +60,36 @@ export async function POST(req: NextRequest) {
     ? chunks.map((c, i) => `[Passage ${i + 1}]\n${c.text}`).join("\n\n")
     : `No documents retrieved. Answering from general knowledge.`;

-  // Compact entity context (GraphRAG: first-sentence descriptions, as if pre-indexed at ingest time)
-  // Entity extraction runs once at INDEX time (amortized cost). Query time only pays for compact context.
+  // Compact entity context (GraphRAG: first-sentence descriptions, pre-indexed at ingest time)
   const graphContext = hasRetrieval
-    ? chunks
-        .map((c, i) => `[${i + 1}] ${chunkToEntityContext(c.text)}`)
-        .join("\n")
+    ? chunks.map((c, i) => `[${i + 1}] ${chunkToEntityContext(c.text)}`).join("\n")
     : `No graph context available.`;

-  // ── Pipeline 1: LLM-Only (no retrieval, pure parametric knowledge) ──
-  const llmStart = Date.now();
-  const llmOnlyResp = await callLLM({
-    provider, model: selectedModel,
-    messages: [
-      { role: "system", content: "Answer the question accurately and concisely from your knowledge. If unsure, say so." },
-      { role: "user", content: query },
-    ],
-    temperature: 0, maxTokens: 512,
-  });
-  const llmOnlyLatency = Date.now() - llmStart;
-
-  // ── Pipeline 2: Basic RAG (full retrieved chunks as context) ─────────
+  // ── Parallel phase 2: Basic RAG + GraphRAG run simultaneously ────────
   const ragStart = Date.now();
-  const basicRagResp = await callLLM({
-    provider, model: selectedModel,
-    messages: [
-      { role: "system", content: "Answer the question using ONLY the provided context passages. Be accurate and concise." },
-      { role: "user", content: `Context:\n${ragContext}\n\nQuestion: ${query}\n\nAnswer:` },
-    ],
-    temperature: 0, maxTokens: 512,
-  });
-  const ragLatency = Date.now() - ragStart;
-
-  // ── Pipeline 3: GraphRAG (compact entity-graph context) ──────────────
-  // Key insight: entity extraction is done at INDEX time (ingestion pipeline).
-  // At query time we only pass compact entity descriptions — much fewer tokens.
-  const graphStart = Date.now();
-  const graphragResp = await callLLM({
-    provider, model: selectedModel,
-    messages: [
-      { role: "system", content: "You have access to a knowledge graph. The entity descriptions below were pre-indexed from the document corpus. Use them to answer precisely and concisely — follow any relationship chains implied." },
-      { role: "user", content: `Knowledge Graph Entities:\n${graphContext}\n\nQuestion: ${query}\n\nAnswer:` },
-    ],
-    temperature: 0, maxTokens: 512,
-  });
-  const graphragLatency = Date.now() - graphStart;
+  const [basicRagResp, graphragResp] = await Promise.all([
+    callLLM({
+      provider, model: selectedModel,
+      messages: [
+        { role: "system", content: "Answer the question using ONLY the provided context passages. Be accurate and concise." },
+        { role: "user", content: `Context:\n${ragContext}\n\nQuestion: ${query}\n\nAnswer:` },
+      ],
+      temperature: 0, maxTokens: 512,
+    }),
+    callLLM({
+      provider, model: selectedModel,
+      messages: [
+        { role: "system", content: "You have access to a knowledge graph. The entity descriptions below were pre-indexed from the document corpus. Use them to answer precisely and concisely — follow any relationship chains implied." },
+        { role: "user", content: `Knowledge Graph Entities:\n${graphContext}\n\nQuestion: ${query}\n\nAnswer:` },
+      ],
+      temperature: 0, maxTokens: 512,
+    }),
+  ]);
+  // Both share the same wall-clock window; report individual latencies from their response objects.
+  const parallelLat = Date.now() - ragStart;
+  const ragLatency = basicRagResp.latencyMs;
+  const graphragLatency = graphragResp.latencyMs;
+  void parallelLat; // measured for tracing, total captured in totalTimeMs

   // ── Adaptive routing (complexity scoring) ────────────────────────────
   let complexity = 0.5, queryType = "factoid", recommended = "graphrag";
web/src/lib/llm-providers.ts CHANGED
@@ -316,6 +316,19 @@ export const PROVIDERS: Record<ProviderId, ProviderConfig> = {

 // ── Universal LLM Client ─────────────────────────────────

+// Cache OpenAI client instances per (baseURL, apiKey) pair — avoids re-instantiation on every call.
+type OpenAIInstance = InstanceType<Awaited<typeof import("openai")>["default"]>;
+const openAIClientCache = new Map<string, OpenAIInstance>();
+
+async function getOpenAIClient(baseURL: string, apiKey: string): Promise<OpenAIInstance> {
+  const cacheKey = `${baseURL}|${apiKey}`;
+  if (!openAIClientCache.has(cacheKey)) {
+    const OpenAI = (await import("openai")).default;
+    openAIClientCache.set(cacheKey, new OpenAI({ baseURL, apiKey }));
+  }
+  return openAIClientCache.get(cacheKey)!;
+}
+
 export async function callLLM(request: LLMRequest): Promise<LLMResponse> {
   const provider = PROVIDERS[request.provider];
   if (!provider) throw new Error(`Unknown provider: ${request.provider}`);
@@ -330,8 +343,6 @@ export async function callLLM(request: LLMRequest): Promise<LLMResponse> {
   }

   // ── All other providers use OpenAI SDK ───────────────
-  const OpenAI = (await import("openai")).default;
-
   const apiKey = provider.isLocal
     ? "ollama"
     : process.env[provider.apiKeyEnv] || "";
@@ -340,10 +351,7 @@ export async function callLLM(request: LLMRequest): Promise<LLMResponse> {
     throw new Error(`Missing API key: set ${provider.apiKeyEnv} environment variable`);
   }

-  const client = new OpenAI({
-    baseURL: provider.baseURL,
-    apiKey,
-  });
+  const client = await getOpenAIClient(provider.baseURL, apiKey);

   const params: Record<string, unknown> = {
     model,
web/src/lib/retrieval.ts CHANGED
@@ -8,8 +8,17 @@ export interface TGChunk {
   score: number;
 }

+// In-process embedding cache — avoids re-hitting HF API for the same query text.
+// Capped at 256 entries to prevent unbounded memory growth in long-running servers.
+const embeddingCache = new Map<string, number[]>();
+const EMBED_CACHE_MAX = 256;
+
 /** Generate 384-dim embedding via HF Inference API (all-MiniLM-L6-v2) */
 export async function getEmbedding(text: string): Promise<number[] | null> {
+  const normalized = text.trim().toLowerCase();
+  const cached = embeddingCache.get(normalized);
+  if (cached) return cached;
+
   const token = process.env.HUGGING_FACE_HUB_TOKEN || process.env.HF_TOKEN;
   if (!token) return null;
   try {
@@ -27,7 +36,13 @@ export async function getEmbedding(text: string): Promise<number[] | null> {
     if (!Array.isArray(data)) return null;
     // Handle both [0.1, 0.2, ...] and [[0.1, 0.2, ...]]
     const flat: number[] = Array.isArray(data[0]) ? (data[0] as number[]) : (data as number[]);
-    return flat.every((x) => typeof x === "number") ? flat : null;
+    if (!flat.every((x) => typeof x === "number")) return null;
+
+    if (embeddingCache.size >= EMBED_CACHE_MAX) {
+      embeddingCache.delete(embeddingCache.keys().next().value!);
+    }
+    embeddingCache.set(normalized, flat);
+    return flat;
   } catch {
     return null;
   }