// Provenance: commit 90b36cb (muthuk1) — "Improve latency: parallel LLM calls, embedding cache, client reuse"
/**
* Retrieval utilities: HuggingFace embeddings + TigerGraph vector search
*/
/** One scored chunk row returned by TigerGraph's vectorSearchChunks query. */
export interface TGChunk {
/** Unique identifier of the chunk vertex in the graph. */
chunk_id: string;
/** Raw chunk text stored alongside the vector. */
text: string;
/** Vector-similarity score — NOTE(review): presumably higher = more similar; confirm against the installed query. */
score: number;
}
// In-process embedding cache — avoids re-hitting HF API for the same query text.
// Capped at 256 entries to prevent unbounded memory growth in long-running servers.
// Keys are the trimmed+lowercased query text (see getEmbedding); values are the
// embedding vectors. Eviction is FIFO, relying on Map's insertion-order iteration.
const embeddingCache = new Map<string, number[]>();
const EMBED_CACHE_MAX = 256;
/**
 * Generate a 384-dim embedding via the HF Inference API
 * (sentence-transformers/all-MiniLM-L6-v2).
 *
 * Results are memoized in `embeddingCache`, keyed by the trimmed, lowercased
 * text. NOTE(review): the cache key is normalized but the raw `text` is sent
 * to the API, so inputs differing only in case/whitespace share one cached
 * vector — confirm this aliasing is intended.
 *
 * @param text - Query text to embed.
 * @returns The embedding vector, or null on empty input, missing token,
 *          HTTP failure, timeout, or a malformed response.
 */
export async function getEmbedding(text: string): Promise<number[] | null> {
  const normalized = text.trim().toLowerCase();
  // Empty input can never produce a useful embedding — skip the API call.
  if (!normalized) return null;

  const cached = embeddingCache.get(normalized);
  if (cached) return cached;

  const token = process.env.HUGGING_FACE_HUB_TOKEN || process.env.HF_TOKEN;
  if (!token) return null;

  try {
    const res = await fetch(
      "https://api-inference.huggingface.co/pipeline/feature-extraction/sentence-transformers/all-MiniLM-L6-v2",
      {
        method: "POST",
        headers: { Authorization: `Bearer ${token}`, "Content-Type": "application/json" },
        body: JSON.stringify({ inputs: text, options: { wait_for_model: true } }),
        signal: AbortSignal.timeout(15000),
      }
    );
    if (!res.ok) return null;

    // Treat the payload as unknown and validate at runtime — the API shape
    // is not guaranteed by any type declaration.
    const data: unknown = await res.json();
    if (!Array.isArray(data) || data.length === 0) return null;
    // Handle both [0.1, 0.2, ...] and [[0.1, 0.2, ...]]
    const flat: unknown[] = Array.isArray(data[0]) ? (data[0] as unknown[]) : data;
    const vec: number[] = [];
    for (const x of flat) {
      if (typeof x !== "number") return null;
      vec.push(x);
    }

    // FIFO eviction: Map iterates in insertion order, so the first key is the oldest.
    if (embeddingCache.size >= EMBED_CACHE_MAX) {
      const oldest = embeddingCache.keys().next();
      if (!oldest.done) embeddingCache.delete(oldest.value);
    }
    embeddingCache.set(normalized, vec);
    return vec;
  } catch {
    // Network failure, abort/timeout, or invalid JSON — all map to "no embedding".
    return null;
  }
}
/**
 * Call the TigerGraph `vectorSearchChunks` installed query via REST++.
 *
 * @param embedding - Query vector; must be non-empty.
 * @param topK - Maximum number of chunks to return (default 5).
 * @returns Matching chunks in the order TigerGraph returned them, or []
 *          when configuration is missing, the request fails, or the
 *          response is malformed.
 */
export async function searchChunks(embedding: number[], topK = 5): Promise<TGChunk[]> {
  const host = (process.env.TG_HOST || "").replace(/\/$/, "");
  const token = process.env.TG_TOKEN;
  const graph = process.env.TG_GRAPH || "GraphRAG";
  if (!host || !token || !embedding.length) return [];
  try {
    const res = await fetch(`${host}/restpp/query/${graph}/vectorSearchChunks`, {
      method: "POST",
      headers: { Authorization: `Bearer ${token}`, "Content-Type": "application/json" },
      body: JSON.stringify({ queryVec: embedding, topK }),
      signal: AbortSignal.timeout(20000),
    });
    if (!res.ok) return [];

    // Narrow the untyped REST++ payload step by step instead of trusting `any`.
    const data: unknown = await res.json();
    if (typeof data !== "object" || data === null) return [];
    const results = (data as { results?: unknown }).results;
    if (!Array.isArray(results)) return [];
    const first: unknown = results[0];
    if (typeof first !== "object" || first === null) return [];
    const chunks = (first as Record<string, unknown>)["@@topChunks"];
    if (!Array.isArray(chunks)) return [];
    // Drop malformed rows rather than passing them downstream as TGChunk.
    return chunks.filter(
      (c): c is TGChunk =>
        typeof c === "object" &&
        c !== null &&
        typeof (c as TGChunk).chunk_id === "string" &&
        typeof (c as TGChunk).text === "string" &&
        typeof (c as TGChunk).score === "number"
    );
  } catch {
    return [];
  }
}
/**
 * Extract a compact entity description from chunk text (simulates
 * pre-indexed graph data). Entity extraction runs at INGEST TIME so the
 * cost is amortized; at query time we only pay for the compact entity
 * context, not the full chunk text.
 *
 * @param text - Source chunk text.
 * @param maxChars - Hard cap on the returned string length (default 220).
 * @returns The chunk's first sentence, truncated to maxChars.
 */
export function chunkToEntityContext(text: string, maxChars = 220): string {
  // Wikipedia science articles open with the key entity definition, so the
  // leading sentence is the densest summary available.
  const sentences = text.split(/(?<=[.!?])\s+/);
  const lead = sentences[0].trim();
  return lead.length > maxChars ? lead.slice(0, maxChars) : lead;
}
/** Rough token count estimate (1 token ≈ 0.75 words). */
export function estimateTokens(text: string): number {
  // Count non-whitespace runs, then scale: words / 0.75 ≈ words * 1.33.
  const words = text.match(/\S+/g)?.length ?? 0;
  return Math.ceil(words * 1.33);
}