// Provenance: commit 90b36cb (muthuk1) — "Improve latency: parallel LLM calls, embedding cache, client reuse"
/**
* Retrieval utilities: HuggingFace embeddings + TigerGraph vector search
*/
/** One scored chunk row returned by TigerGraph's vectorSearchChunks query. */
export interface TGChunk {
/** Unique identifier of the chunk vertex in the graph. */
chunk_id: string;
/** Raw chunk text stored alongside the vector. */
text: string;
/** Vector-similarity score — NOTE(review): presumably higher = more similar; confirm against the installed query. */
score: number;
}
// In-process embedding cache — avoids re-hitting HF API for the same query text.
// Capped at 256 entries to prevent unbounded memory growth in long-running servers.
// Keys are the trimmed+lowercased query text (see getEmbedding); values are the
// embedding vectors. Eviction is FIFO, relying on Map's insertion-order iteration.
const embeddingCache = new Map<string, number[]>();
const EMBED_CACHE_MAX = 256;
/**
 * Generate a 384-dim embedding via the HF Inference API
 * (sentence-transformers/all-MiniLM-L6-v2).
 *
 * Results are memoized in `embeddingCache`, keyed by the trimmed, lowercased
 * text. NOTE(review): the cache key is normalized but the raw `text` is sent
 * to the API, so inputs differing only in case/whitespace share one cached
 * vector — confirm this aliasing is intended.
 *
 * @param text - Query text to embed.
 * @returns The embedding vector, or null on empty input, missing token,
 *          HTTP failure, timeout, or a malformed response.
 */
export async function getEmbedding(text: string): Promise<number[] | null> {
  const normalized = text.trim().toLowerCase();
  // Empty input can never produce a useful embedding — skip the API call.
  if (!normalized) return null;

  const cached = embeddingCache.get(normalized);
  if (cached) return cached;

  const token = process.env.HUGGING_FACE_HUB_TOKEN || process.env.HF_TOKEN;
  if (!token) return null;

  try {
    const res = await fetch(
      "https://api-inference.huggingface.co/pipeline/feature-extraction/sentence-transformers/all-MiniLM-L6-v2",
      {
        method: "POST",
        headers: { Authorization: `Bearer ${token}`, "Content-Type": "application/json" },
        body: JSON.stringify({ inputs: text, options: { wait_for_model: true } }),
        signal: AbortSignal.timeout(15000),
      }
    );
    if (!res.ok) return null;

    // Treat the payload as unknown and validate at runtime — the API shape
    // is not guaranteed by any type declaration.
    const data: unknown = await res.json();
    if (!Array.isArray(data) || data.length === 0) return null;
    // Handle both [0.1, 0.2, ...] and [[0.1, 0.2, ...]]
    const flat: unknown[] = Array.isArray(data[0]) ? (data[0] as unknown[]) : data;
    const vec: number[] = [];
    for (const x of flat) {
      if (typeof x !== "number") return null;
      vec.push(x);
    }

    // FIFO eviction: Map iterates in insertion order, so the first key is the oldest.
    if (embeddingCache.size >= EMBED_CACHE_MAX) {
      const oldest = embeddingCache.keys().next();
      if (!oldest.done) embeddingCache.delete(oldest.value);
    }
    embeddingCache.set(normalized, vec);
    return vec;
  } catch {
    // Network failure, abort/timeout, or invalid JSON — all map to "no embedding".
    return null;
  }
}
/**
 * Call the TigerGraph `vectorSearchChunks` installed query via REST++.
 *
 * @param embedding - Query vector; must be non-empty.
 * @param topK - Maximum number of chunks to return (default 5).
 * @returns Matching chunks in the order TigerGraph returned them, or []
 *          when configuration is missing, the request fails, or the
 *          response is malformed.
 */
export async function searchChunks(embedding: number[], topK = 5): Promise<TGChunk[]> {
  const host = (process.env.TG_HOST || "").replace(/\/$/, "");
  const token = process.env.TG_TOKEN;
  const graph = process.env.TG_GRAPH || "GraphRAG";
  if (!host || !token || !embedding.length) return [];
  try {
    const res = await fetch(`${host}/restpp/query/${graph}/vectorSearchChunks`, {
      method: "POST",
      headers: { Authorization: `Bearer ${token}`, "Content-Type": "application/json" },
      body: JSON.stringify({ queryVec: embedding, topK }),
      signal: AbortSignal.timeout(20000),
    });
    if (!res.ok) return [];

    // Narrow the untyped REST++ payload step by step instead of trusting `any`.
    const data: unknown = await res.json();
    if (typeof data !== "object" || data === null) return [];
    const results = (data as { results?: unknown }).results;
    if (!Array.isArray(results)) return [];
    const first: unknown = results[0];
    if (typeof first !== "object" || first === null) return [];
    const chunks = (first as Record<string, unknown>)["@@topChunks"];
    if (!Array.isArray(chunks)) return [];
    // Drop malformed rows rather than passing them downstream as TGChunk.
    return chunks.filter(
      (c): c is TGChunk =>
        typeof c === "object" &&
        c !== null &&
        typeof (c as TGChunk).chunk_id === "string" &&
        typeof (c as TGChunk).text === "string" &&
        typeof (c as TGChunk).score === "number"
    );
  } catch {
    return [];
  }
}
/**
 * Extract a compact entity description from chunk text (simulates
 * pre-indexed graph data). Entity extraction runs at INGEST TIME so the
 * cost is amortized; at query time we only pay for the compact entity
 * context, not the full chunk text.
 *
 * @param text - Source chunk text.
 * @param maxChars - Hard cap on the returned string length (default 220).
 * @returns The chunk's first sentence, truncated to maxChars.
 */
export function chunkToEntityContext(text: string, maxChars = 220): string {
  // Wikipedia science articles open with the key entity definition, so the
  // leading sentence is the densest summary available.
  const sentences = text.split(/(?<=[.!?])\s+/);
  const lead = sentences[0].trim();
  return lead.length > maxChars ? lead.slice(0, maxChars) : lead;
}
/** Rough token count estimate (1 token ≈ 0.75 words). */
export function estimateTokens(text: string): number {
  // Count non-whitespace runs, then scale: words / 0.75 ≈ words * 1.33.
  const words = text.match(/\S+/g)?.length ?? 0;
  return Math.ceil(words * 1.33);
}