// physix/frontend/src/hooks/useLlmCompareRunner.ts
// Uploaded by Pratyush-01 via huggingface_hub (commit 0e24aff, verified).
/** Drives two parallel `useLlmEpisodeRunner` instances against the same
* episode seed. The whole point of the demo is to put two models on
* identical input and compare their behaviour, scored by the same
* verifier with no LLM-as-judge.
*
* Implementation note: each side gets its own session because the env
* builds a turn-by-turn history that the next prompt depends on. We
* *don't* fork a single session — that would corrupt history. Instead
* we start two sessions with the same `system_id` + `seed`, which the
* server already supports via its existing reset path. */
import { useCallback, useMemo, useRef, useState } from "react";
import {
type LlmEpisodeRunnerControls,
type LlmEpisodeRunnerState,
useLlmEpisodeRunner,
} from "@/hooks/useLlmEpisodeRunner";
import type { LlmConnection } from "@/lib/llmPresets";
/** One side of the comparison: a fixed identifier plus the state and
 * controls of the `useLlmEpisodeRunner` instance driving that side. */
export interface CompareSlot {
  /** Which side this slot is — "a" or "b". */
  id: "a" | "b";
  /** Live episode state from this side's runner. */
  state: LlmEpisodeRunnerState;
  /** Start/end controls for this side's runner. */
  controls: LlmEpisodeRunnerControls;
}
/** Imperative controls the compare hook exposes alongside its state. */
export interface CompareRunnerControls {
  /** Start both sides on the same seed + system. Each side uses its
   * own connection; `systemId`, `maxTurns`, and `temperature` are
   * shared by both sides. */
  startBoth: (options: {
    systemId?: string | undefined;
    maxTurns?: number | undefined;
    connectionA: LlmConnection;
    connectionB: LlmConnection;
    temperature?: number | undefined;
  }) => Promise<void>;
  /** End both sessions and reset state. */
  endBoth: () => Promise<void>;
}
/** Read-only view of the comparison the UI renders from. */
export interface CompareRunnerState {
  /** Left-hand slot. */
  a: CompareSlot;
  /** Right-hand slot. */
  b: CompareSlot;
  /** Seed the last `startBoth` call locked in. Surfaces in the UI so
   * users know both sides really saw the same episode. */
  lastSeed: number | null;
  /** Resolved system_id (same for both slots). */
  systemId: string | null;
}
/** Drives two `useLlmEpisodeRunner` instances against the same seed and
 * system so the UI can compare two models on identical episodes.
 *
 * Returns the merged state + controls; state updates flow from the two
 * underlying runners, while `startBoth`/`endBoth` orchestrate them as a
 * pair. */
export function useLlmCompareRunner(): CompareRunnerState & CompareRunnerControls {
  const a = useLlmEpisodeRunner();
  const b = useLlmEpisodeRunner();
  const [lastSeed, setLastSeed] = useState<number | null>(null);
  const [systemId, setSystemId] = useState<string | null>(null);

  // Keep the latest runner objects on a ref so `startBoth`/`endBoth`
  // don't have to depend on them — useLlmEpisodeRunner recreates them
  // on every render, and pulling them through the dep arrays would
  // churn both callbacks.
  const controlsRef = useRef({ a, b });
  controlsRef.current = { a, b };

  const startBoth = useCallback(
    async (options: {
      systemId?: string | undefined;
      maxTurns?: number | undefined;
      connectionA: LlmConnection;
      connectionB: LlmConnection;
      temperature?: number | undefined;
    }) => {
      // Generate a single seed so both sides see identical observations.
      // 31 bits keeps us inside JS-safe int range and Numpy-acceptable.
      const seed = Math.floor(Math.random() * 2_147_483_647);
      setLastSeed(seed);
      setSystemId(options.systemId ?? null);
      const common = {
        systemId: options.systemId,
        seed,
        maxTurns: options.maxTurns,
        temperature: options.temperature,
      };
      // Kick off both in parallel — the server makes independent
      // sessions so they can't deadlock on each other. If either start
      // rejects, Promise.all surfaces the first failure to the caller.
      await Promise.all([
        controlsRef.current.a.start({ ...common, connection: options.connectionA }),
        controlsRef.current.b.start({ ...common, connection: options.connectionB }),
      ]);
    },
    [],
  );

  const endBoth = useCallback(async () => {
    // Reset the compare-level state even when one of the end() calls
    // rejects — the documented contract is "end AND reset". The
    // rejection still propagates to the caller out of the finally.
    try {
      await Promise.all([
        controlsRef.current.a.end(),
        controlsRef.current.b.end(),
      ]);
    } finally {
      setLastSeed(null);
      setSystemId(null);
    }
  }, []);

  // Each runner object already satisfies both the state and controls
  // shapes (the previous code spread `a` into both fields), so hand it
  // to the slot directly instead of allocating two throwaway copies on
  // every recompute — the spread also leaked controls into `state`.
  const slotA = useMemo<CompareSlot>(
    () => ({ id: "a", state: a, controls: a }),
    [a],
  );
  const slotB = useMemo<CompareSlot>(
    () => ({ id: "b", state: b, controls: b }),
    [b],
  );

  return {
    a: slotA,
    b: slotB,
    lastSeed,
    // Prefer the id this hook recorded at startBoth time; fall back to
    // whichever underlying runner has resolved one.
    systemId: systemId ?? a.systemId ?? b.systemId,
    startBoth,
    endBoth,
  };
}