// physix/frontend/src/hooks/useLlmCompareRunner.ts
// Uploaded by Pratyush-01 via huggingface_hub (commit 0e24aff, verified).
/** Drives two parallel `useLlmEpisodeRunner` instances against the same
* episode seed. The whole point of the demo is to put two models on
* identical input and compare their behaviour, scored by the same
* verifier with no LLM-as-judge.
*
* Implementation note: each side gets its own session because the env
* builds a turn-by-turn history that the next prompt depends on. We
* *don't* fork a single session — that would corrupt history. Instead
* we start two sessions with the same `system_id` + `seed`, which the
* server already supports via its existing reset path. */
import { useCallback, useMemo, useRef, useState } from "react";
import {
type LlmEpisodeRunnerControls,
type LlmEpisodeRunnerState,
useLlmEpisodeRunner,
} from "@/hooks/useLlmEpisodeRunner";
import type { LlmConnection } from "@/lib/llmPresets";
/** One side of the comparison: a fixed identifier plus the state and
 * controls of the `useLlmEpisodeRunner` instance driving that side. */
export interface CompareSlot {
  /** Which side this slot is — "a" or "b". */
  id: "a" | "b";
  /** Live episode state from this side's runner. */
  state: LlmEpisodeRunnerState;
  /** Start/end controls for this side's runner. */
  controls: LlmEpisodeRunnerControls;
}
/** Imperative controls the compare hook exposes alongside its state. */
export interface CompareRunnerControls {
  /** Start both sides on the same seed + system. Each side uses its
   * own connection; `systemId`, `maxTurns`, and `temperature` are
   * shared by both sides. */
  startBoth: (options: {
    systemId?: string | undefined;
    maxTurns?: number | undefined;
    connectionA: LlmConnection;
    connectionB: LlmConnection;
    temperature?: number | undefined;
  }) => Promise<void>;
  /** End both sessions and reset state. */
  endBoth: () => Promise<void>;
}
/** Read-only view of the comparison the UI renders from. */
export interface CompareRunnerState {
  /** Left-hand slot. */
  a: CompareSlot;
  /** Right-hand slot. */
  b: CompareSlot;
  /** Seed the last `startBoth` call locked in. Surfaces in the UI so
   * users know both sides really saw the same episode. */
  lastSeed: number | null;
  /** Resolved system_id (same for both slots). */
  systemId: string | null;
}
/** Drives two `useLlmEpisodeRunner` instances against the same seed and
 * system so the UI can compare two models on identical episodes.
 *
 * Returns the merged state + controls; state updates flow from the two
 * underlying runners, while `startBoth`/`endBoth` orchestrate them as a
 * pair. */
export function useLlmCompareRunner(): CompareRunnerState & CompareRunnerControls {
  const a = useLlmEpisodeRunner();
  const b = useLlmEpisodeRunner();
  const [lastSeed, setLastSeed] = useState<number | null>(null);
  const [systemId, setSystemId] = useState<string | null>(null);

  // Keep the latest runner objects on a ref so `startBoth`/`endBoth`
  // don't have to depend on them — useLlmEpisodeRunner recreates them
  // on every render, and pulling them through the dep arrays would
  // churn both callbacks.
  const controlsRef = useRef({ a, b });
  controlsRef.current = { a, b };

  const startBoth = useCallback(
    async (options: {
      systemId?: string | undefined;
      maxTurns?: number | undefined;
      connectionA: LlmConnection;
      connectionB: LlmConnection;
      temperature?: number | undefined;
    }) => {
      // Generate a single seed so both sides see identical observations.
      // 31 bits keeps us inside JS-safe int range and Numpy-acceptable.
      const seed = Math.floor(Math.random() * 2_147_483_647);
      setLastSeed(seed);
      setSystemId(options.systemId ?? null);
      const common = {
        systemId: options.systemId,
        seed,
        maxTurns: options.maxTurns,
        temperature: options.temperature,
      };
      // Kick off both in parallel — the server makes independent
      // sessions so they can't deadlock on each other. If either start
      // rejects, Promise.all surfaces the first failure to the caller.
      await Promise.all([
        controlsRef.current.a.start({ ...common, connection: options.connectionA }),
        controlsRef.current.b.start({ ...common, connection: options.connectionB }),
      ]);
    },
    [],
  );

  const endBoth = useCallback(async () => {
    // Reset the compare-level state even when one of the end() calls
    // rejects — the documented contract is "end AND reset". The
    // rejection still propagates to the caller out of the finally.
    try {
      await Promise.all([
        controlsRef.current.a.end(),
        controlsRef.current.b.end(),
      ]);
    } finally {
      setLastSeed(null);
      setSystemId(null);
    }
  }, []);

  // Each runner object already satisfies both the state and controls
  // shapes (the previous code spread `a` into both fields), so hand it
  // to the slot directly instead of allocating two throwaway copies on
  // every recompute — the spread also leaked controls into `state`.
  const slotA = useMemo<CompareSlot>(
    () => ({ id: "a", state: a, controls: a }),
    [a],
  );
  const slotB = useMemo<CompareSlot>(
    () => ({ id: "b", state: b, controls: b }),
    [b],
  );

  return {
    a: slotA,
    b: slotB,
    lastSeed,
    // Prefer the id this hook recorded at startBoth time; fall back to
    // whichever underlying runner has resolved one.
    systemId: systemId ?? a.systemId ?? b.systemId,
    startBoth,
    endBoth,
  };
}