Improve latency: parallel LLM calls, embedding cache, client reuse
- compare/route: LLM-only + embedding fetch run in parallel (phase 1);
  basicRag + graphRAG run in parallel after retrieval (phase 2).
  Reduces compare wall time by ~50% vs sequential.
- benchmark/route: all 10 samples run via Promise.allSettled (parallel);
  within each sample, the same 2-phase parallel structure applies.
  Reduces benchmark wall time from N*LLM_time to ~1*LLM_time.
  (A minimal sketch of the two-phase pattern follows the file list below.)
- retrieval.ts: add a 256-entry in-process embedding cache keyed by
  normalized query text; eliminates repeated HF API round trips.
- llm-providers.ts: cache OpenAI client instances per (baseURL, apiKey)
  pair; eliminates re-instantiation and dynamic import overhead on
  every callLLM() invocation.
- web/src/app/api/benchmark/route.ts +80 -83
- web/src/app/api/compare/route.ts +43 -43
- web/src/lib/llm-providers.ts +14 -6
- web/src/lib/retrieval.ts +16 -1
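
For orientation, the two-phase pattern both routes now share looks roughly like the sketch below. It is illustrative only: callLLM, getEmbedding, and searchChunks are the project helpers visible in the diffs that follow, while answerAllPipelines, buildRagMessages, and buildGraphMessages are hypothetical names introduced just for this sketch.

// Minimal sketch (not code from this commit) of the two-phase parallel structure.
async function answerAllPipelines(provider: ProviderId, model: string, query: string) {
  // Phase 1: the LLM-only answer needs no retrieval, so it runs alongside the embedding fetch.
  const [llmOnlyResp, embedding] = await Promise.all([
    callLLM({ provider, model, messages: [{ role: "user", content: query }], temperature: 0, maxTokens: 512 }),
    getEmbedding(query),
  ]);

  // Retrieval must wait for the embedding (the only sequential step between the two phases).
  const chunks = embedding ? await searchChunks(embedding, 5) : [];

  // Phase 2: both retrieval-augmented answers depend only on the chunks, so they run together.
  const [basicRagResp, graphragResp] = await Promise.all([
    callLLM({ provider, model, messages: buildRagMessages(query, chunks), temperature: 0, maxTokens: 512 }),
    callLLM({ provider, model, messages: buildGraphMessages(query, chunks), temperature: 0, maxTokens: 512 }),
  ]);

  return { llmOnlyResp, basicRagResp, graphragResp };
}

// benchmark/route additionally fans this out across all samples at once:
// const settled = await Promise.allSettled(samples.map((s) => answerAllPipelines(provider, model, s.question)));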
web/src/app/api/benchmark/route.ts
CHANGED
@@ -157,42 +157,47 @@ export async function POST(req: NextRequest) {
(Removed here: the sequential per-sample flow, including totals declarations such as let totalLlmLatency = 0, totalBaselineLatency = 0, totalGraphragLatency = 0;, the pre-computed demo-values block that pushed random F1/EM/token results into results, accumulated totals, and hit continue; when no API key was configured, and a standalone const embedding = await getEmbedding(sample.question); inside the retrieval try block. Several removed lines are not recoverable from this view.)

  const providerConfig = PROVIDERS[provider];
  const hasKey = providerConfig?.isLocal || !providerConfig?.requiresApiKey || !!process.env[providerConfig?.apiKeyEnv || ""];

+ // Run all samples in parallel: reduces benchmark wall time from ~N×LLM_time to ~1×LLM_time.
+ // Within each sample: LLM-only + embedding run simultaneously; then basicRag + graphrag run simultaneously.
+ const settled = await Promise.allSettled(
+   CORPUS_SAMPLES.slice(0, numSamples).map(async (sample, i) => {
+     const ctx = RETRIEVAL_CONTEXTS[i];

+     if (!hasKey) {
+       const llmT = 90 + Math.floor(Math.random() * 50);
+       const bT = 480 + Math.floor(Math.random() * 200);
+       const gT = 155 + Math.floor(Math.random() * 60);
+       const llmF1 = 0.75 + Math.random() * 0.15, bF1 = 0.82 + Math.random() * 0.12, gF1 = 0.86 + Math.random() * 0.1;
+       return { idx: i, query: sample.question, gold: sample.answer, type: sample.type,
+         llmonly_f1: +llmF1.toFixed(4), baseline_f1: +bF1.toFixed(4), graphrag_f1: +gF1.toFixed(4),
+         llmonly_em: Math.random() > 0.4 ? 1 : 0, baseline_em: Math.random() > 0.3 ? 1 : 0, graphrag_em: Math.random() > 0.25 ? 1 : 0,
+         llmonly_tokens: llmT, baseline_tokens: bT, graphrag_tokens: gT,
+         llmonly_cost: 0, baseline_cost: 0, graphrag_cost: 0,
+         llmonly_latency: 0, baseline_latency: 0, graphrag_latency: 0, chunks_source: "demo" };
+     }

+     const selectedModel = model || providerConfig!.defaultModel;

+     // Phase 1: LLM-only + embedding fetch in parallel
+     const llmOnlyStart = Date.now();
+     const [llmResp, embedding] = await Promise.all([
+       callLLM({
+         provider, model: selectedModel,
+         messages: [
+           { role: "system", content: "Answer the science question concisely in 1–5 words." },
+           { role: "user", content: sample.question },
+         ],
+         temperature: 0, maxTokens: 64,
+       }),
+       getEmbedding(sample.question).catch(() => null),
+     ]);
+     const llmLat = Date.now() - llmOnlyStart;

+     // TigerGraph retrieval (sequential after embedding)
      let ragContext = ctx.full;
      let graphContext = ctx.compact;
      let chunksSource = "corpus";
      try {
        if (embedding) {
          const chunks = await searchChunks(embedding, 5);
          if (chunks.length > 0) {

@@ -203,71 +208,63 @@ export async function POST(req: NextRequest) {
(Removed here: the sequential pipelines, including a timed basic-RAG call ending in const ragLat = Date.now() - ragStart;, the Pipeline 3 GraphRAG call with its own graphStart/graphLat timers, per-sample const bF1 = computeF1(ragResp.content, sample.answer); and const gF1 = computeF1(graphResp.content, sample.answer);, a results.push({ ... }) of the same fields, in-loop accumulation of token/cost/latency totals, and the per-sample } catch (err) { console.error(`Benchmark query ${i} failed:`, err); } handler. Several removed lines are not recoverable from this view.)

        }
      } catch { /* use pre-loaded context */ }

+     // Phase 2: Basic RAG + GraphRAG in parallel
+     const retrievalStart = Date.now();
+     const [ragResp, graphResp] = await Promise.all([
+       callLLM({
+         provider, model: selectedModel,
+         messages: [
+           { role: "system", content: "Answer using the provided context. Be concise, 1–5 words if possible." },
+           { role: "user", content: `Context:\n${ragContext}\n\nQuestion: ${sample.question}\n\nAnswer:` },
+         ],
+         temperature: 0, maxTokens: 64,
+       }),
+       callLLM({
+         provider, model: selectedModel,
+         messages: [
+           { role: "system", content: "Using the pre-indexed knowledge graph entity descriptions, answer concisely in 1–5 words." },
+           { role: "user", content: `Graph Entities:\n${graphContext}\n\nQuestion: ${sample.question}\n\nAnswer:` },
+         ],
+         temperature: 0, maxTokens: 64,
+       }),
+     ]);
+     const parallelLat = Date.now() - retrievalStart;
+     void parallelLat;

+     return {
        idx: i, query: sample.question, gold: sample.answer, type: sample.type,
        llmonly_answer: llmResp.content, baseline_answer: ragResp.content, graphrag_answer: graphResp.content,
+       llmonly_f1: +computeF1(llmResp.content, sample.answer).toFixed(4),
+       baseline_f1: +computeF1(ragResp.content, sample.answer).toFixed(4),
+       graphrag_f1: +computeF1(graphResp.content, sample.answer).toFixed(4),
        llmonly_em: computeEM(llmResp.content, sample.answer),
        baseline_em: computeEM(ragResp.content, sample.answer),
        graphrag_em: computeEM(graphResp.content, sample.answer),
        llmonly_tokens: llmResp.totalTokens, baseline_tokens: ragResp.totalTokens, graphrag_tokens: graphResp.totalTokens,
        llmonly_cost: llmResp.costUsd, baseline_cost: ragResp.costUsd, graphrag_cost: graphResp.costUsd,
+       llmonly_latency: llmLat, baseline_latency: ragResp.latencyMs, graphrag_latency: graphResp.latencyMs,
        chunks_source: chunksSource,
+     };
+   })
+ );

+ settled.forEach((s, i) => { if (s.status === "rejected") console.error(`Benchmark query ${i} failed:`, s.reason); });
+ const results: Record<string, unknown>[] = settled
+   .filter(s => s.status === "fulfilled")
+   .map(s => (s as PromiseFulfilledResult<Record<string, unknown>>).value);

+ let totalLlmF1 = 0, totalBaselineF1 = 0, totalGraphragF1 = 0;
+ let totalLlmEM = 0, totalBaselineEM = 0, totalGraphragEM = 0;
+ let totalLlmTokens = 0, totalBaselineTokens = 0, totalGraphragTokens = 0;
+ let totalLlmCost = 0, totalBaselineCost = 0, totalGraphragCost = 0;
+ let totalLlmLatency = 0, totalBaselineLatency = 0, totalGraphragLatency = 0;

+ for (const r of results) {
+   totalLlmF1 += r.llmonly_f1 as number; totalBaselineF1 += r.baseline_f1 as number; totalGraphragF1 += r.graphrag_f1 as number;
+   totalLlmEM += r.llmonly_em as number; totalBaselineEM += r.baseline_em as number; totalGraphragEM += r.graphrag_em as number;
+   totalLlmTokens += r.llmonly_tokens as number; totalBaselineTokens += r.baseline_tokens as number; totalGraphragTokens += r.graphrag_tokens as number;
+   totalLlmCost += r.llmonly_cost as number; totalBaselineCost += r.baseline_cost as number; totalGraphragCost += r.graphrag_cost as number;
+   totalLlmLatency += r.llmonly_latency as number; totalBaselineLatency += r.baseline_latency as number; totalGraphragLatency += r.graphrag_latency as number;
+ }

  const n = results.length || 1;
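
A note on the aggregation above: Promise.allSettled never rejects, so a failed sample is logged from its rejection reason and simply dropped; the averages computed from n = results.length reflect only samples that completed. The fan-out also means every sample's LLM calls are in flight concurrently (up to two per sample in each phase), which is worth keeping in mind for rate-limited provider keys.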
web/src/app/api/compare/route.ts
CHANGED
@@ -35,8 +35,23 @@ export async function POST(req: NextRequest) {
(Removed here: the comment and standalone embedding fetch that previously ran before retrieval; the exact removed lines are not recoverable from this view.)

  const selectedModel = model || providerConfig.defaultModel;
  const startTime = Date.now();

+ // ── Parallel phase 1: LLM-Only + embedding fetch run simultaneously ──
+ // LLM-only needs no retrieval; start it immediately alongside the embed call.
+ const llmOnlyStart = Date.now();
+ const [llmOnlyResp, embedding] = await Promise.all([
+   callLLM({
+     provider, model: selectedModel,
+     messages: [
+       { role: "system", content: "Answer the question accurately and concisely from your knowledge. If unsure, say so." },
+       { role: "user", content: query },
+     ],
+     temperature: 0, maxTokens: 512,
+   }),
+   getEmbedding(query),
+ ]);
+ const llmOnlyLatency = Date.now() - llmOnlyStart;
+
+ // ── Retrieve chunks from TigerGraph (needs embedding) ────────────────
  const chunks = embedding ? await searchChunks(embedding, topK) : [];
  const hasRetrieval = chunks.length > 0;

@@ -45,51 +60,36 @@ export async function POST(req: NextRequest) {
(Removed here: the sequential version of this block, including the two-line GraphRAG entity-context comment, a chunks.map((c, i) => `[${i + 1}] ${chunkToEntityContext(c.text)}`).join("\n") chain split over three lines, the sequential Pipeline 1 LLM-Only call timed with llmStart / llmOnlyLatency, the "Pipeline 2: Basic RAG (full retrieved chunks as context)" comment, and the sequential basicRagResp and GraphRAG calls, whose lines are not fully recoverable from this view.)

    ? chunks.map((c, i) => `[Passage ${i + 1}]\n${c.text}`).join("\n\n")
    : `No documents retrieved. Answering from general knowledge.`;

+ // Compact entity context (GraphRAG: first-sentence descriptions, pre-indexed at ingest time)
  const graphContext = hasRetrieval
+   ? chunks.map((c, i) => `[${i + 1}] ${chunkToEntityContext(c.text)}`).join("\n")
    : `No graph context available.`;

+ // ── Parallel phase 2: Basic RAG + GraphRAG run simultaneously ────────
  const ragStart = Date.now();
+ const [basicRagResp, graphragResp] = await Promise.all([
+   callLLM({
+     provider, model: selectedModel,
+     messages: [
+       { role: "system", content: "Answer the question using ONLY the provided context passages. Be accurate and concise." },
+       { role: "user", content: `Context:\n${ragContext}\n\nQuestion: ${query}\n\nAnswer:` },
+     ],
+     temperature: 0, maxTokens: 512,
+   }),
+   callLLM({
+     provider, model: selectedModel,
+     messages: [
+       { role: "system", content: "You have access to a knowledge graph. The entity descriptions below were pre-indexed from the document corpus. Use them to answer precisely and concisely; follow any relationship chains implied." },
+       { role: "user", content: `Knowledge Graph Entities:\n${graphContext}\n\nQuestion: ${query}\n\nAnswer:` },
+     ],
+     temperature: 0, maxTokens: 512,
+   }),
+ ]);
+ // Both share the same wall-clock window; report individual latencies from their response objects.
+ const parallelLat = Date.now() - ragStart;
+ const ragLatency = basicRagResp.latencyMs;
+ const graphragLatency = graphragResp.latencyMs;
+ void parallelLat; // measured for tracing, total captured in totalTimeMs

  // ── Adaptive routing (complexity scoring) ────────────────────────────
  let complexity = 0.5, queryType = "factoid", recommended = "graphrag";
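
Because basicRagResp and graphragResp now share one wall-clock window, per-pipeline latency is taken from each response's latencyMs field rather than from Date.now() deltas around sequential calls; the overall request duration is still reported separately via startTime / totalTimeMs, as the comment in the diff notes. This keeps the per-pipeline numbers roughly comparable to the old sequential measurements.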
web/src/lib/llm-providers.ts
CHANGED
@@ -316,6 +316,19 @@ export const PROVIDERS: Record<ProviderId, ProviderConfig> = {

  // ── Universal LLM Client ─────────────────────────────────

+ // Cache OpenAI client instances per (baseURL, apiKey) pair: avoids re-instantiation on every call.
+ type OpenAIInstance = InstanceType<Awaited<typeof import("openai")>["default"]>;
+ const openAIClientCache = new Map<string, OpenAIInstance>();
+
+ async function getOpenAIClient(baseURL: string, apiKey: string): Promise<OpenAIInstance> {
+   const cacheKey = `${baseURL}|${apiKey}`;
+   if (!openAIClientCache.has(cacheKey)) {
+     const OpenAI = (await import("openai")).default;
+     openAIClientCache.set(cacheKey, new OpenAI({ baseURL, apiKey }));
+   }
+   return openAIClientCache.get(cacheKey)!;
+ }
+
  export async function callLLM(request: LLMRequest): Promise<LLMResponse> {
    const provider = PROVIDERS[request.provider];
    if (!provider) throw new Error(`Unknown provider: ${request.provider}`);

@@ -330,8 +343,6 @@ export async function callLLM(request: LLMRequest): Promise<LLMResponse> {
  }

  // ── All other providers use OpenAI SDK ───────────────
- const OpenAI = (await import("openai")).default;
-
  const apiKey = provider.isLocal
    ? "ollama"
    : process.env[provider.apiKeyEnv] || "";

@@ -340,10 +351,7 @@ export async function callLLM(request: LLMRequest): Promise<LLMResponse> {
(Removed here: the per-call client construction that passed baseURL: provider.baseURL and apiKey directly; its opening line is not recoverable from this view.)

    throw new Error(`Missing API key: set ${provider.apiKeyEnv} environment variable`);
  }

+ const client = await getOpenAIClient(provider.baseURL, apiKey);

  const params: Record<string, unknown> = {
    model,
web/src/lib/retrieval.ts
CHANGED
@@ -8,8 +8,17 @@ export interface TGChunk {
    score: number;
  }

+ // In-process embedding cache: avoids re-hitting HF API for the same query text.
+ // Capped at 256 entries to prevent unbounded memory growth in long-running servers.
+ const embeddingCache = new Map<string, number[]>();
+ const EMBED_CACHE_MAX = 256;
+
  /** Generate 384-dim embedding via HF Inference API (all-MiniLM-L6-v2) */
  export async function getEmbedding(text: string): Promise<number[] | null> {
+   const normalized = text.trim().toLowerCase();
+   const cached = embeddingCache.get(normalized);
+   if (cached) return cached;
+
    const token = process.env.HUGGING_FACE_HUB_TOKEN || process.env.HF_TOKEN;
    if (!token) return null;
    try {

@@ -27,7 +36,13 @@ export async function getEmbedding(text: string): Promise<number[] | null> {
      if (!Array.isArray(data)) return null;
      // Handle both [0.1, 0.2, ...] and [[0.1, 0.2, ...]]
      const flat: number[] = Array.isArray(data[0]) ? (data[0] as number[]) : (data as number[]);
+     if (!flat.every((x) => typeof x === "number")) return null;
+
+     if (embeddingCache.size >= EMBED_CACHE_MAX) {
+       embeddingCache.delete(embeddingCache.keys().next().value!);
+     }
+     embeddingCache.set(normalized, flat);
+     return flat;
    } catch {
      return null;
    }
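
Eviction note: a Map iterates keys in insertion order, so embeddingCache.keys().next().value is the oldest inserted entry and the cap behaves as FIFO rather than LRU, which is sufficient for eliminating repeated lookups of the same queries. Illustration of the cache hit path (hypothetical queries, not part of the commit):

const a = await getEmbedding("What is photosynthesis?");    // miss: one HF API call, result cached
const b = await getEmbedding("  WHAT IS PHOTOSYNTHESIS? "); // trims and lowercases to the same key: served from cache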