Spaces:
Sleeping
Sleeping
/** Drives two parallel `useLlmEpisodeRunner` instances against the same
 * episode seed. The whole point of the demo is to put two models on
 * identical input and compare their behaviour, scored by the same
 * verifier with no LLM-as-judge.
 *
 * Implementation note: each side gets its own session because the env
 * builds a turn-by-turn history that the next prompt depends on. We
 * *don't* fork a single session — that would corrupt history. Instead
 * we start two sessions with the same `system_id` + `seed`, which the
 * server already supports via its existing reset path. */
| import { useCallback, useMemo, useRef, useState } from "react"; | |
| import { | |
| type LlmEpisodeRunnerControls, | |
| type LlmEpisodeRunnerState, | |
| useLlmEpisodeRunner, | |
| } from "@/hooks/useLlmEpisodeRunner"; | |
| import type { LlmConnection } from "@/lib/llmPresets"; | |
/** One side of a two-way comparison: the runner's state and controls,
 * tagged with the side it belongs to. */
export interface CompareSlot {
  /** Which side of the comparison this slot is. */
  id: "a" | "b";
  /** State exposed by this side's episode runner. */
  state: LlmEpisodeRunnerState;
  /** Controls exposed by this side's episode runner. */
  controls: LlmEpisodeRunnerControls;
}
/** Actions that operate on both sides of the comparison at once. */
export interface CompareRunnerControls {
  /**
   * Start both sides on the same seed + system. The shared settings
   * (`systemId`, `maxTurns`, `temperature`) are applied to both sides;
   * `connectionA` / `connectionB` select the connection used by side
   * "a" and side "b" respectively.
   */
  startBoth: (options: {
    systemId?: string | undefined;
    maxTurns?: number | undefined;
    connectionA: LlmConnection;
    connectionB: LlmConnection;
    temperature?: number | undefined;
  }) => Promise<void>;

  /** End both sessions and reset the compare-level state. */
  endBoth: () => Promise<void>;
}
/** Combined state of the comparison: one slot per side plus the
 * parameters both sides were started with. */
export interface CompareRunnerState {
  /** Side "a" of the comparison. */
  a: CompareSlot;
  /** Side "b" of the comparison. */
  b: CompareSlot;
  /**
   * Seed locked in by the most recent `startBoth` call, or `null` when
   * nothing has been started (or after `endBoth`). Surfaced in the UI so
   * users can see that both sides really received the same episode.
   */
  lastSeed: number | null;
  /** Resolved system_id (same for both slots), or `null` when unknown. */
  systemId: string | null;
}
/**
 * Runs two `useLlmEpisodeRunner` instances side by side on one shared seed.
 *
 * @returns Both slots (`a`, `b`), the seed and system id of the current
 *   comparison, and the `startBoth` / `endBoth` actions. The two actions
 *   keep a stable identity across renders (empty dependency arrays).
 */
export function useLlmCompareRunner(): CompareRunnerState & CompareRunnerControls {
  const a = useLlmEpisodeRunner();
  const b = useLlmEpisodeRunner();
  const [lastSeed, setLastSeed] = useState<number | null>(null);
  const [systemId, setSystemId] = useState<string | null>(null);

  // Keep the latest runner objects on a ref so `startBoth` / `endBoth`
  // don't have to list them as dependencies. The ref is refreshed on every
  // render, so the callbacks always reach the current `start` / `end` even
  // if `useLlmEpisodeRunner` hands back new objects each time.
  const controlsRef = useRef({ a, b });
  controlsRef.current = { a, b };

  // Typed from the public interface so the options shape cannot drift from
  // what `CompareRunnerControls` promises to callers.
  const startBoth = useCallback<CompareRunnerControls["startBoth"]>(
    async (options) => {
      // Generate a single seed so both sides see identical observations.
      // The result is an integer in [0, 2_147_483_646]: it fits in 31 bits,
      // which keeps it a safe JS integer and (per the original note)
      // acceptable to NumPy on the server side.
      const seed = Math.floor(Math.random() * 2_147_483_647);
      setLastSeed(seed);
      setSystemId(options.systemId ?? null);

      const common = {
        systemId: options.systemId,
        seed,
        maxTurns: options.maxTurns,
        temperature: options.temperature,
      };

      // Kick off both in parallel. Each side starts its own session, so
      // neither waits on the other.
      const runners = controlsRef.current;
      await Promise.all([
        runners.a.start({ ...common, connection: options.connectionA }),
        runners.b.start({ ...common, connection: options.connectionB }),
      ]);
    },
    [],
  );

  const endBoth = useCallback<CompareRunnerControls["endBoth"]>(async () => {
    const runners = controlsRef.current;
    try {
      // `allSettled` rather than `all`: wait for BOTH teardowns to finish
      // even when one of them fails, so we never reset state while the
      // other side is still ending.
      const outcomes = await Promise.allSettled([runners.a.end(), runners.b.end()]);
      const failure = outcomes.find(
        (outcome): outcome is PromiseRejectedResult => outcome.status === "rejected",
      );
      if (failure !== undefined) {
        // Surface the first failure to the caller, as before.
        throw failure.reason;
      }
    } finally {
      // Always clear the compare-level state, even when ending failed;
      // otherwise a stale seed / system id would stay on screen.
      setLastSeed(null);
      setSystemId(null);
    }
  }, []);

  // Each slot spreads the whole runner object into both `state` and
  // `controls`, so both views carry every field the runner hook returned.
  const slotA = useMemo<CompareSlot>(
    () => ({
      id: "a",
      state: { ...a },
      controls: { ...a },
    }),
    [a],
  );
  const slotB = useMemo<CompareSlot>(
    () => ({
      id: "b",
      state: { ...b },
      controls: { ...b },
    }),
    [b],
  );

  return {
    a: slotA,
    b: slotB,
    lastSeed,
    // Prefer the id requested via `startBoth`; if none was given, fall back
    // to whatever either runner reports.
    systemId: systemId ?? a.systemId ?? b.systemId,
    startBoth,
    endBoth,
  };
}