/** Drives two parallel `useLlmEpisodeRunner` instances against the same
 *  episode seed. The whole point of the demo is to put two models on
 *  identical input and compare their behaviour, scored by the same
 *  verifier with no LLM-as-judge.
 *
 *  Implementation note: each side gets its own session because the env
 *  builds a turn-by-turn history that the next prompt depends on. We
 *  *don't* fork a single session — that would corrupt history. Instead
 *  we start two sessions with the same `system_id` + `seed`, which the
 *  server already supports via its existing reset path. */

import { useCallback, useMemo, useRef, useState } from "react";

import {
  type LlmEpisodeRunnerControls,
  type LlmEpisodeRunnerState,
  useLlmEpisodeRunner,
} from "@/hooks/useLlmEpisodeRunner";
import type { LlmConnection } from "@/lib/llmPresets";

/** One side ("a" or "b") of the comparison: its runner state and controls. */
export interface CompareSlot {
  id: "a" | "b";
  state: LlmEpisodeRunnerState;
  controls: LlmEpisodeRunnerControls;
}

/** Options accepted by `startBoth`. Everything except the two connections
 *  is shared between the sides; the seed is generated internally. */
export interface CompareStartOptions {
  systemId?: string | undefined;
  maxTurns?: number | undefined;
  connectionA: LlmConnection;
  connectionB: LlmConnection;
  temperature?: number | undefined;
}

export interface CompareRunnerControls {
  /** Start both sides on the same seed + system. Each side uses its
   *  own connection. Resolves once both `start` calls have resolved and
   *  rejects if either one rejects. */
  startBoth: (options: CompareStartOptions) => Promise<void>;
  /** End both sessions and reset state. */
  endBoth: () => Promise<void>;
}

export interface CompareRunnerState {
  a: CompareSlot;
  b: CompareSlot;
  /** Seed the last `startBoth` call locked in. Surfaces in the UI so
   *  users know both sides really saw the same episode. */
  lastSeed: number | null;
  /** Resolved system_id (same for both slots). */
  systemId: string | null;
}

/**
 * Runs two `useLlmEpisodeRunner` instances side by side and exposes them
 * as slots `a` and `b`, plus `startBoth` / `endBoth` controls that act on
 * both at once.
 *
 * @returns Both slots, the seed and system id recorded by the most recent
 *   `startBoth` call (`null` before any start and after `endBoth`), and
 *   the two combined controls. `startBoth` and `endBoth` have stable
 *   identities across renders.
 */
export function useLlmCompareRunner(): CompareRunnerState & CompareRunnerControls {
  const a = useLlmEpisodeRunner();
  const b = useLlmEpisodeRunner();

  // Explicit type arguments: a bare `useState(null)` would infer the state
  // type as `null` and reject the setter calls below.
  const [lastSeed, setLastSeed] = useState<number | null>(null);
  const [systemId, setSystemId] = useState<string | null>(null);

  // Keep the latest controls on a ref so `startBoth` / `endBoth` don't have
  // to depend on them — useLlmEpisodeRunner recreates them on every render
  // and pulling them through the dep array would churn the callbacks.
  const controlsRef = useRef({ a, b });
  controlsRef.current = { a, b };

  const startBoth = useCallback(
    async (options: CompareStartOptions): Promise<void> => {
      // Generate a single seed so both sides see identical observations.
      // 31 bits keeps us inside JS-safe int range and Numpy-acceptable.
      const seed = Math.floor(Math.random() * 2_147_483_647);
      setLastSeed(seed);
      setSystemId(options.systemId ?? null);

      // Everything except the connection is shared by both sides.
      const common = {
        systemId: options.systemId,
        seed,
        maxTurns: options.maxTurns,
        temperature: options.temperature,
      };

      // Kick off both in parallel — the server makes independent
      // sessions so they can't deadlock on each other.
      await Promise.all([
        controlsRef.current.a.start({ ...common, connection: options.connectionA }),
        controlsRef.current.b.start({ ...common, connection: options.connectionB }),
      ]);
    },
    [],
  );

  const endBoth = useCallback(async (): Promise<void> => {
    await Promise.all([
      controlsRef.current.a.end(),
      controlsRef.current.b.end(),
    ]);
    // Only reached when both `end` calls resolved.
    setLastSeed(null);
    setSystemId(null);
  }, []);

  // `useMemo<CompareSlot>` keeps `id` typed as the literal "a" / "b" rather
  // than widening it to `string`. The runner's return value is spread into
  // both `state` and `controls`, so each carries the full set of fields.
  const slotA = useMemo<CompareSlot>(
    () => ({
      id: "a",
      state: { ...a },
      controls: { ...a },
    }),
    [a],
  );

  const slotB = useMemo<CompareSlot>(
    () => ({
      id: "b",
      state: { ...b },
      controls: { ...b },
    }),
    [b],
  );

  return {
    a: slotA,
    b: slotB,
    lastSeed,
    // Prefer the id recorded by `startBoth`; otherwise fall back to whatever
    // either runner reports.
    systemId: systemId ?? a.systemId ?? b.systemId,
    startBoth,
    endBoth,
  };
}