// Improve latency: parallel LLM calls, embedding cache, client reuse (muthuk1, commit 90b36cb)
import { NextRequest, NextResponse } from "next/server";
import { callLLM, PROVIDERS, type ProviderId } from "@/lib/llm-providers";
import { getEmbedding, searchChunks, chunkToEntityContext } from "@/lib/retrieval";
// Next.js route segment config: run on the Node.js runtime (not Edge) and
// disable static caching — every request makes live LLM/retrieval calls.
export const runtime = "nodejs";
export const dynamic = "force-dynamic";
// JSON body accepted by the POST handler below.
interface CompareRequest {
query: string; // user question; required, must be non-empty after trim
provider?: ProviderId; // LLM provider key; defaults to "openai" in POST
model?: string; // overrides the provider's default model when set
adaptiveRouting?: boolean; // compute complexity/routing hints (default true)
topK?: number; // number of chunks to retrieve (default 5)
}
/**
 * Compare three answering strategies for one query — LLM-only, Basic RAG, and
 * GraphRAG — issuing the LLM calls in two parallel phases to cut latency.
 *
 * Responds 400 on a missing query or unknown provider; falls back to a canned
 * demo payload (HTTP 200 by design) when no API key is configured or when any
 * downstream call throws.
 */
export async function POST(req: NextRequest) {
try {
const body: CompareRequest = await req.json();
const { query, provider = "openai", model, adaptiveRouting = true, topK = 5 } = body;
if (!query?.trim()) {
return NextResponse.json({ error: "Query required" }, { status: 400 });
}
const providerConfig = PROVIDERS[provider];
if (!providerConfig) {
return NextResponse.json({ error: `Unknown provider: ${provider}` }, { status: 400 });
}
// Local and keyless providers are always usable; otherwise the env key must exist.
const hasKey = providerConfig.isLocal || !providerConfig.requiresApiKey || !!process.env[providerConfig.apiKeyEnv];
if (!hasKey) {
return NextResponse.json(getDemoResponse(query, provider));
}
const selectedModel = model || providerConfig.defaultModel;
const startTime = Date.now();
// ── Parallel phase 1: LLM-Only + embedding fetch run simultaneously ──
// LLM-only needs no retrieval; start it immediately alongside the embed call.
const [llmOnlyResp, embedding] = await Promise.all([
callLLM({
provider, model: selectedModel,
messages: [
{ role: "system", content: "Answer the question accurately and concisely from your knowledge. If unsure, say so." },
{ role: "user", content: query },
],
temperature: 0, maxTokens: 512,
}),
getEmbedding(query),
]);
// FIX: was measured as the Promise.all wall clock, which also counted the
// (possibly slower) embedding fetch. Use the call's own latency so it is
// reported the same way as the RAG/GraphRAG latencies below.
const llmOnlyLatency = llmOnlyResp.latencyMs;
// ── Retrieve chunks from TigerGraph (needs embedding) ─────────────────
const chunks = embedding ? await searchChunks(embedding, topK) : [];
const hasRetrieval = chunks.length > 0;
// Full text context (Basic RAG: raw chunks concatenated)
const ragContext = hasRetrieval
? chunks.map((c, i) => `[Passage ${i + 1}]\n${c.text}`).join("\n\n")
: `No documents retrieved. Answering from general knowledge.`;
// Compact entity context (GraphRAG: first-sentence descriptions, pre-indexed at ingest time)
const graphContext = hasRetrieval
? chunks.map((c, i) => `[${i + 1}] ${chunkToEntityContext(c.text)}`).join("\n")
: `No graph context available.`;
// ── Parallel phase 2: Basic RAG + GraphRAG run simultaneously ────────
// Both share one wall-clock window; each response object carries its own
// latencyMs, which is what we report per strategy.
const [basicRagResp, graphragResp] = await Promise.all([
callLLM({
provider, model: selectedModel,
messages: [
{ role: "system", content: "Answer the question using ONLY the provided context passages. Be accurate and concise." },
{ role: "user", content: `Context:\n${ragContext}\n\nQuestion: ${query}\n\nAnswer:` },
],
temperature: 0, maxTokens: 512,
}),
callLLM({
provider, model: selectedModel,
messages: [
{ role: "system", content: "You have access to a knowledge graph. The entity descriptions below were pre-indexed from the document corpus. Use them to answer precisely and concisely β€” follow any relationship chains implied." },
{ role: "user", content: `Knowledge Graph Entities:\n${graphContext}\n\nQuestion: ${query}\n\nAnswer:` },
],
temperature: 0, maxTokens: 512,
}),
]);
const ragLatency = basicRagResp.latencyMs;
const graphragLatency = graphragResp.latencyMs;
// ── Adaptive routing (complexity scoring) ────────────────────────────
// Cheap keyword heuristic — no extra LLM call. Defaults apply when disabled.
let complexity = 0.5, queryType = "factoid", recommended = "graphrag";
if (adaptiveRouting) {
const words = query.toLowerCase();
const isMultiHop = /same|both|compare|which.*first|who.*born|difference|related|between/i.test(words);
const isSimple = /what is|define|spell|capital of/i.test(words);
complexity = isSimple ? 0.2 : isMultiHop ? 0.8 : 0.55;
queryType = isMultiHop ? "multi_hop" : isSimple ? "factoid" : "comparison";
recommended = complexity >= 0.5 ? "graphrag" : "baseline";
}
// ── Entity list from compact context (for UI display) ────────────────
const entities = chunks.map((c) => chunkToEntityContext(c.text, 80)).filter(Boolean);
const relations: string[] = [];
return NextResponse.json({
llmOnly: {
answer: llmOnlyResp.content,
tokens: llmOnlyResp.totalTokens,
latencyMs: llmOnlyLatency,
costUsd: llmOnlyResp.costUsd,
},
baseline: {
answer: basicRagResp.content,
tokens: basicRagResp.totalTokens,
latencyMs: ragLatency,
costUsd: basicRagResp.costUsd,
entities: [],
relations: [],
retrievedChunks: chunks.length,
contextTokens: basicRagResp.inputTokens,
},
graphrag: {
answer: graphragResp.content,
tokens: graphragResp.totalTokens,
latencyMs: graphragLatency,
costUsd: graphragResp.costUsd,
entities,
relations,
retrievedChunks: chunks.length,
contextTokens: graphragResp.inputTokens,
},
complexity,
queryType,
recommended,
provider,
model: selectedModel,
totalTimeMs: Date.now() - startTime,
retrievalEnabled: hasRetrieval,
chunksRetrieved: chunks.length,
});
} catch (error) {
// NOTE(review): any failure — including a malformed request body — is masked
// by the demo payload and returned as HTTP 200; confirm this is intended.
console.error("Compare API error:", error);
const errMsg = error instanceof Error ? error.message : "Unknown error";
return NextResponse.json(getDemoResponse("", "openai", errMsg));
}
}
/**
 * Canned comparison payload served when no API key is configured or an
 * upstream call failed. Mirrors the shape of the live POST response so the
 * UI renders identically in demo mode.
 *
 * @param query    original user query (unused; kept for call-site symmetry)
 * @param provider provider id echoed back in the payload
 * @param error    optional failure message; becomes `demoReason` when present
 */
function getDemoResponse(query: string, provider: string, error?: string) {
  const demoReason = error ?? "No API key configured";
  const llmOnly = {
    answer: "Albert Einstein developed general relativity, and Niels Bohr contributed to quantum mechanics β€” they worked in different areas of physics.",
    tokens: 124, latencyMs: 820, costUsd: 0.000019,
  };
  const baseline = {
    answer: "Based on the retrieved documents: General relativity was developed by Albert Einstein. Quantum mechanics was pioneered by several physicists including Niels Bohr, Werner Heisenberg, and Erwin SchrΓΆdinger. These are distinct theories β€” general relativity describes gravity at large scales while quantum mechanics describes subatomic behavior.",
    tokens: 1847, latencyMs: 1480, costUsd: 0.000277,
    entities: [], relations: [], retrievedChunks: 5, contextTokens: 1620,
  };
  const graphrag = {
    answer: "General relativity (Einstein, 1915) describes gravity as spacetime curvature. Quantum mechanics (Bohr, Heisenberg, SchrΓΆdinger, 1920s) governs subatomic particles. They are complementary theories covering different scales.",
    tokens: 387, latencyMs: 980, costUsd: 0.000058,
    entities: ["Albert Einstein (physicist, general relativity)", "Niels Bohr (physicist, quantum model)", "Werner Heisenberg (physicist, uncertainty principle)"],
    relations: ["Einstein -[DEVELOPED]-> General Relativity", "Bohr -[DEVELOPED]-> Quantum Model of Atom"],
    retrievedChunks: 5, contextTokens: 312,
  };
  return {
    llmOnly,
    baseline,
    graphrag,
    complexity: 0.65,
    queryType: "comparison",
    recommended: "graphrag",
    provider,
    model: "demo-mode",
    totalTimeMs: 3300,
    retrievalEnabled: false,
    chunksRetrieved: 0,
    demoMode: true,
    demoReason,
  };
}