Spaces:

Pratyush-01
/

physix-live

Sleeping

App Files Files Community

Pratyush-01 committed 16 days ago

Commit

8225d8a

verified ·

1 Parent(s): 2f81a49

frontend: sync clean source — drop ComparePane + Compare LLMs tab + Which-endpoint copy; ship 3-preset picker

Browse files

Files changed (5) hide show

frontend/src/App.tsx +2 -10
frontend/src/components/ComparePane.tsx +0 -624
frontend/src/components/RunWithLlmPane.tsx +189 -65
frontend/src/hooks/useLlmCompareRunner.ts +0 -128
frontend/src/lib/llmPresets.ts +89 -35

frontend/src/App.tsx CHANGED Viewed

@@ -1,12 +1,11 @@
 import { useState } from "react";
-import { ComparePane } from "@/components/ComparePane";
 import { HeroIntro } from "@/components/HeroIntro";
 import { OpenEnvExplorerPane } from "@/components/OpenEnvExplorerPane";
 import { RunWithLlmPane } from "@/components/RunWithLlmPane";
 import { cn } from "@/lib/cn";
-type TabId = "run" | "compare" | "openenv";
 interface TabDef {
   id: TabId;
@@ -15,7 +14,6 @@ interface TabDef {
 const TABS: TabDef[] = [
   { id: "run", label: "Run with LLM" },
-  { id: "compare", label: "Compare LLMs" },
   { id: "openenv", label: "OpenEnv API" },
 ];
@@ -75,13 +73,7 @@ export function App(): JSX.Element {
         id={`panel-${activeTab}`}
         aria-labelledby={`tab-${activeTab}`}
       >
-        {activeTab === "run" ? (
-          <RunWithLlmPane />
-        ) : activeTab === "compare" ? (
-          <ComparePane />
-        ) : (
-          <OpenEnvExplorerPane />
-        )}
       </div>
     </main>
   );

 import { useState } from "react";
 import { HeroIntro } from "@/components/HeroIntro";
 import { OpenEnvExplorerPane } from "@/components/OpenEnvExplorerPane";
 import { RunWithLlmPane } from "@/components/RunWithLlmPane";
 import { cn } from "@/lib/cn";
+type TabId = "run" | "openenv";
 interface TabDef {
   id: TabId;
 const TABS: TabDef[] = [
   { id: "run", label: "Run with LLM" },
   { id: "openenv", label: "OpenEnv API" },
 ];
         id={`panel-${activeTab}`}
         aria-labelledby={`tab-${activeTab}`}
       >
+        {activeTab === "run" ? <RunWithLlmPane /> : <OpenEnvExplorerPane />}
       </div>
     </main>
   );

frontend/src/components/ComparePane.tsx DELETED Viewed

@@ -1,624 +0,0 @@
-/** A/B comparison pane: trained PhysiX vs. baseline, scored by the
- *  same verifier on the same episode seed.
- *
- *  Two `LlmConnectionPanel`s feed two parallel `useLlmEpisodeRunner`
- *  sessions, started via `useLlmCompareRunner`. The panes render the
- *  same trajectory canvas (each side overlays its own predicted
- *  trajectory) plus a per-side reward strip and turn transcript. */
-import { useEffect, useMemo, useState } from "react";
-import { EquationDisplay } from "@/components/EquationDisplay";
-import { LlmConnectionPanel } from "@/components/LlmConnectionPanel";
-import { Skeleton } from "@/components/Skeleton";
-import { TrajectoryCanvas } from "@/components/TrajectoryCanvas";
-import { useLlmCompareRunner, type CompareSlot } from "@/hooks/useLlmCompareRunner";
-import { type LlmTurn, type RunnerStatus } from "@/hooks/useLlmEpisodeRunner";
-import { cn } from "@/lib/cn";
-import { formatPercent } from "@/lib/format";
-import {
-  DEFAULT_CONNECTION_A,
-  DEFAULT_CONNECTION_B,
-  type LlmConnection,
-} from "@/lib/llmPresets";
-import { pickPrimaryVariable } from "@/lib/trajectory";
-import type { RewardBreakdown } from "@/types/physix";
-const ZERO_REWARD: RewardBreakdown = {
-  match: 0,
-  progress: 0,
-  simplicity: 0,
-  format: 0,
-  total: 0,
-  shape: 0,
-  freq: 0,
-  amplitude: 0,
-};
-const CONVERGENCE_THRESHOLD = 0.93;
-export function ComparePane(): JSX.Element {
-  const runner = useLlmCompareRunner();
-  // Connection state lives here so navigating the system / button bar
-  // doesn't clear the API key or model id.
-  const [connectionA, setConnectionA] = useState<LlmConnection>(
-    () => DEFAULT_CONNECTION_A,
-  );
-  const [connectionB, setConnectionB] = useState<LlmConnection>(
-    () => DEFAULT_CONNECTION_B,
-  );
-  const [systemId, setSystemId] = useState<string>("");
-  const [maxTurns, setMaxTurns] = useState<number>(8);
-  const [temperature, setTemperature] = useState<number>(0.4);
-  // Default to damped_spring; fall back to first in list.
-  useEffect(() => {
-    if (!systemId && runner.a.state.systems && runner.a.state.systems.length > 0) {
-      const preferred = runner.a.state.systems.find((s) => s.system_id === "damped_spring");
-      setSystemId(preferred?.system_id ?? runner.a.state.systems[0]?.system_id ?? "");
-    }
-  }, [runner.a.state.systems, systemId]);
-  const eitherRunning =
-    isActive(runner.a.state.status) || isActive(runner.b.state.status);
-  const eitherStarting =
-    runner.a.state.status === "starting" || runner.b.state.status === "starting";
-  const canStart =
-    !eitherRunning &&
-    !eitherStarting &&
-    !!connectionA.model.trim() &&
-    !!connectionB.model.trim() &&
-    !!connectionA.baseUrl.trim() &&
-    !!connectionB.baseUrl.trim();
-  function handleStart(): void {
-    void runner.startBoth({
-      systemId: systemId,
-      maxTurns,
-      connectionA,
-      connectionB,
-      temperature,
-    });
-  }
-  return (
-    <section className="flex flex-col gap-6">
-      <div className="grid grid-cols-1 gap-4 lg:grid-cols-2">
-        <LlmConnectionPanel
-          title="A"
-          subtitle="Left side. Suggested: the trained model."
-          accent="primary"
-          value={connectionA}
-          onChange={setConnectionA}
-          disabled={eitherRunning || eitherStarting}
-          installedOllamaModels={runner.a.state.models ?? []}
-          installedOllamaLoading={runner.a.state.models === null}
-          installedOllamaError={runner.a.state.modelsError}
-          onRefreshOllama={() => void runner.a.controls.refreshModels()}
-        />
-        <LlmConnectionPanel
-          title="B"
-          subtitle="Right side. Suggested: a baseline you'd expect A to beat."
-          accent="blue"
-          value={connectionB}
-          onChange={setConnectionB}
-          disabled={eitherRunning || eitherStarting}
-          installedOllamaModels={runner.b.state.models ?? []}
-          installedOllamaLoading={runner.b.state.models === null}
-          installedOllamaError={runner.b.state.modelsError}
-          onRefreshOllama={() => void runner.b.controls.refreshModels()}
-        />
-      </div>
-      <CompareControlBar
-        systems={runner.a.state.systems}
-        systemId={systemId}
-        onSelectSystem={setSystemId}
-        temperature={temperature}
-        onChangeTemperature={setTemperature}
-        maxTurns={maxTurns}
-        onChangeMaxTurns={setMaxTurns}
-        canStart={canStart}
-        eitherRunning={eitherRunning}
-        eitherStarting={eitherStarting}
-        onStart={handleStart}
-        onEnd={() => void runner.endBoth()}
-        seed={runner.lastSeed}
-      />
-      <div className="grid grid-cols-1 gap-4 lg:grid-cols-2">
-        <SlotColumn
-          slot={runner.a}
-          connection={connectionA}
-          accent="primary"
-          title="A"
-        />
-        <SlotColumn
-          slot={runner.b}
-          connection={connectionB}
-          accent="blue"
-          title="B"
-        />
-      </div>
-      <ScoreboardBanner a={runner.a} b={runner.b} />
-    </section>
-  );
-}
-// ---------------------------------------------------------------------------
-function CompareControlBar({
-  systems,
-  systemId,
-  onSelectSystem,
-  temperature,
-  onChangeTemperature,
-  maxTurns,
-  onChangeMaxTurns,
-  canStart,
-  eitherRunning,
-  eitherStarting,
-  onStart,
-  onEnd,
-  seed,
-}: {
-  systems: import("@/lib/interactiveClient").SystemDescriptor[] | null;
-  systemId: string;
-  onSelectSystem: (id: string) => void;
-  temperature: number;
-  onChangeTemperature: (n: number) => void;
-  maxTurns: number;
-  onChangeMaxTurns: (n: number) => void;
-  canStart: boolean;
-  eitherRunning: boolean;
-  eitherStarting: boolean;
-  onStart: () => void;
-  onEnd: () => void;
-  seed: number | null;
-}): JSX.Element {
-  return (
-    <header className="panel flex flex-col gap-3">
-      <div>
-        <p className="heading-eyebrow text-primary">Run side-by-side</p>
-        <h2 className="mt-1 text-xl font-semibold leading-tight">
-          Same episode, two models, one verifier
-        </h2>
-        <p className="mt-1 max-w-3xl text-xs text-textMuted">
-          Both sides see the same trajectory, hint, and seed. Reward
-          comes only from <code className="font-mono">scipy.odeint</code>{" "}
-          and per-step R² — there is no LLM-as-judge. Differences in the
-          score are differences in the physics the model proposes.
-        </p>
-      </div>
-      <div className="flex flex-wrap items-end gap-3">
-        <Field label="System">
-          <select
-            className="rounded-lg border border-border bg-surfaceMuted px-3 py-2 text-sm text-textPrimary outline-none transition focus:border-textMuted disabled:opacity-50"
-            value={systemId}
-            onChange={(event) => onSelectSystem(event.target.value)}
-            disabled={systems === null || eitherRunning || eitherStarting}
-          >
-            {systems === null ? (
-              <option value="">Loading…</option>
-            ) : (
-              systems.map((descriptor) => (
-                <option key={descriptor.system_id} value={descriptor.system_id}>
-                  {prettySystemId(descriptor.system_id)}
-                </option>
-              ))
-            )}
-          </select>
-        </Field>
-        <Field label="Temp">
-          <input
-            type="number"
-            min={0}
-            max={2}
-            step={0.1}
-            className="w-20 rounded-lg border border-border bg-surfaceMuted px-3 py-2 text-sm text-textPrimary outline-none focus:border-textMuted"
-            value={temperature}
-            onChange={(event) =>
-              onChangeTemperature(
-                Math.max(0, Math.min(2, Number(event.target.value))),
-              )
-            }
-            disabled={eitherRunning || eitherStarting}
-          />
-        </Field>
-        <Field label="Turn budget">
-          <input
-            type="number"
-            min={1}
-            max={32}
-            className="w-24 rounded-lg border border-border bg-surfaceMuted px-3 py-2 text-sm text-textPrimary outline-none focus:border-textMuted"
-            value={maxTurns}
-            onChange={(event) =>
-              onChangeMaxTurns(Math.max(1, Number(event.target.value)))
-            }
-            disabled={eitherRunning || eitherStarting}
-          />
-        </Field>
-        <div className="ml-auto flex flex-wrap items-center gap-2">
-          {seed !== null ? (
-            <span className="rounded-full border border-border bg-surface px-2 py-1 font-mono text-[10px] text-textMuted">
-              seed {seed}
-            </span>
-          ) : null}
-          {!eitherRunning && !eitherStarting ? (
-            <button
-              type="button"
-              className="btn-primary"
-              onClick={onStart}
-              disabled={!canStart}
-            >
-              ▶ Run side-by-side
-            </button>
-          ) : (
-            <button type="button" className="btn-secondary" onClick={onEnd}>
-              End both
-            </button>
-          )}
-        </div>
-      </div>
-    </header>
-  );
-}
-// ---------------------------------------------------------------------------
-function SlotColumn({
-  slot,
-  connection,
-  accent,
-  title,
-}: {
-  slot: CompareSlot;
-  connection: LlmConnection;
-  accent: "primary" | "blue";
-  title: string;
-}): JSX.Element {
-  const turns = slot.state.turns;
-  const latestTurn: LlmTurn | undefined = turns[turns.length - 1];
-  const observation = latestTurn?.observation ?? slot.state.initialObservation;
-  const lastReward = latestTurn?.observation.reward_breakdown ?? ZERO_REWARD;
-  const observed = slot.state.initialObservation?.trajectory ?? observation?.trajectory ?? [];
-  const stateVariables = observation?.state_variables ?? [];
-  const primaryVariable = useMemo(
-    () =>
-      stateVariables.length > 0 ? pickPrimaryVariable(stateVariables) : "y",
-    [stateVariables],
-  );
-  const finalMatch = lastReward.match ?? 0;
-  const converged = finalMatch >= CONVERGENCE_THRESHOLD;
-  const accentBorder = accent === "primary" ? "border-primary/40" : "border-accentBlue/40";
-  const accentText = accent === "primary" ? "text-primary" : "text-accentBlue";
-  return (
-    <section className={cn("panel flex flex-col gap-3 border-l-4", accentBorder)}>
-      <header className="flex items-baseline justify-between gap-2">
-        <div className="flex items-baseline gap-2">
-          <span className={cn("heading-eyebrow", accentText)}>Side {title}</span>
-          <span className="font-mono text-xs text-textPrimary">
-            {connection.model || "(no model selected)"}
-          </span>
-        </div>
-        <SlotStatusBadge status={slot.state.status} converged={converged} />
-      </header>
-      {slot.state.errorMessage ? (
-        <ErrorRow
-          message={slot.state.errorMessage}
-          onDismiss={() => slot.controls.resetError()}
-        />
-      ) : null}
-      {observation ? (
-        <>
-          <div className="rounded-lg border border-border bg-surfaceMuted p-3">
-            <TrajectoryCanvas
-              observed={observed}
-              predicted={latestTurn?.predictedTrajectory ?? []}
-              variable={primaryVariable}
-              variableLabel={primaryVariable}
-              predictedProgress={1}
-            />
-          </div>
-          <DenseRewardRow reward={lastReward} />
-          <p className="text-[11px] text-textMuted">
-            <span className="font-semibold uppercase tracking-wide text-textPrimary">
-              Hint:
-            </span>{" "}
-            {observation.hint || "(none)"}
-          </p>
-          <SlotTurns turns={turns} />
-        </>
-      ) : (
-        <SlotPlaceholder status={slot.state.status} />
-      )}
-    </section>
-  );
-}
-function SlotStatusBadge({
-  status,
-  converged,
-}: {
-  status: RunnerStatus;
-  converged: boolean;
-}): JSX.Element {
-  if (converged) {
-    return (
-      <span className="rounded-full border border-accentGreen/40 bg-accentGreen/10 px-2 py-0.5 text-[10px] uppercase tracking-wider text-accentGreen">
-        converged
-      </span>
-    );
-  }
-  const tone =
-    status === "running"
-      ? "border-accentBlue/40 text-accentBlue"
-      : status === "ended"
-        ? "border-textMuted/40 text-textMuted"
-        : status === "error"
-          ? "border-primary/40 text-primary"
-          : "border-textMuted/40 text-textMuted";
-  return (
-    <span className={cn("rounded-full border bg-surface px-2 py-0.5 text-[10px] uppercase tracking-wider", tone)}>
-      {labelForStatus(status)}
-    </span>
-  );
-}
-function labelForStatus(status: RunnerStatus): string {
-  switch (status) {
-    case "starting":
-      return "starting";
-    case "running":
-      return "running";
-    case "paused":
-      return "paused";
-    case "ended":
-      return "done";
-    case "error":
-      return "error";
-    case "idle":
-    default:
-      return "idle";
-  }
-}
-function SlotPlaceholder({ status }: { status: RunnerStatus }): JSX.Element {
-  if (status === "starting") {
-    return (
-      <div className="flex flex-col gap-2" aria-busy>
-        <Skeleton className="h-[240px] w-full" />
-        <Skeleton className="h-3 w-48" />
-      </div>
-    );
-  }
-  return (
-    <p className="text-xs text-textMuted">
-      Configure both connections and press Run to start.
-    </p>
-  );
-}
-function SlotTurns({ turns }: { turns: LlmTurn[] }): JSX.Element {
-  if (turns.length === 0) {
-    return <p className="text-[11px] text-textMuted">No turns yet.</p>;
-  }
-  // Render a short transcript: just the latest two turns to keep the
-  // column readable in the side-by-side layout. Power users can read
-  // the raw turn-by-turn dump from the original single-model view.
-  const visible = turns.slice(-2);
-  return (
-    <details className="rounded-md border border-border bg-surface px-3 py-2 text-[11px] text-textMuted">
-      <summary className="cursor-pointer text-textPrimary">
-        Latest turns ({turns.length})
-      </summary>
-      <ol className="mt-2 flex flex-col gap-2">
-        {visible.map((turn) => (
-          <li
-            key={turn.turn}
-            className="rounded border border-border bg-surfaceMuted p-2"
-          >
-            <div className="mb-1 flex items-center justify-between text-[10px] text-textMuted">
-              <span>turn {turn.turn}</span>
-              <span className="font-mono">
-                R² {(turn.observation.reward_breakdown.match * 100).toFixed(0)}%
-              </span>
-            </div>
-            {turn.action.equation ? (
-              <EquationDisplay
-                equation={turn.action.equation}
-                rationale={turn.action.rationale}
-              />
-            ) : (
-              <span className="text-accentAmber">unparseable</span>
-            )}
-          </li>
-        ))}
-      </ol>
-    </details>
-  );
-}
-function DenseRewardRow({ reward }: { reward: RewardBreakdown }): JSX.Element {
-  // Reward components (top) feed the trainer's weighted total.
-  // Diagnostic sub-scores (bottom) are visual-closeness signals only —
-  // see RewardBreakdown class docstring on the backend.
-  const rewardComponents: { name: string; value: number }[] = [
-    { name: "match", value: reward.match ?? 0 },
-    { name: "progress", value: reward.progress ?? 0 },
-    { name: "simplicity", value: reward.simplicity ?? 0 },
-    { name: "format", value: reward.format ?? 0 },
-  ];
-  const diagComponents: { name: string; value: number }[] = [
-    { name: "shape", value: reward.shape ?? 0 },
-    { name: "freq", value: reward.freq ?? 0 },
-    { name: "amplitude", value: reward.amplitude ?? 0 },
-  ];
-  return (
-    <div className="flex flex-col gap-2 rounded-md border border-border bg-surface px-3 py-2 font-mono text-[11px]">
-      <div className="grid grid-cols-4 gap-2">
-        {rewardComponents.map(({ name, value }) => (
-          <RewardCell key={name} name={name} value={value} />
-        ))}
-      </div>
-      <div className="flex items-center gap-2 border-t border-border/60 pt-2">
-        <span
-          className="text-[10px] uppercase tracking-wider text-textMuted"
-          title="Diagnostic-only — not part of the reward total. Captures visual closeness (shape / freq / amplitude) where R² collapses to 0."
-        >
-          diag
-        </span>
-        <div className="grid flex-1 grid-cols-3 gap-2">
-          {diagComponents.map(({ name, value }) => (
-            <RewardCell key={name} name={name} value={value} muted />
-          ))}
-        </div>
-      </div>
-    </div>
-  );
-}
-function RewardCell({
-  name,
-  value,
-  muted = false,
-}: {
-  name: string;
-  value: number;
-  muted?: boolean;
-}): JSX.Element {
-  return (
-    <div className="flex flex-col gap-1">
-      <div className="flex items-baseline justify-between">
-        <span className="text-textMuted">{name}</span>
-        <span className={muted ? "text-textMuted" : "text-textPrimary"}>
-          {value.toFixed(2)}
-        </span>
-      </div>
-      <div
-        className="h-1 w-full overflow-hidden rounded-full bg-border"
-        aria-hidden
-      >
-        <div
-          className={cn(
-            "h-full rounded-full",
-            value >= 0.7
-              ? muted
-                ? "bg-accentBlue/60"
-                : "bg-accentGreen/70"
-              : value >= 0.3
-                ? "bg-accentAmber/70"
-                : "bg-textMuted/40",
-          )}
-          style={{ width: `${Math.max(0, Math.min(1, value)) * 100}%` }}
-        />
-      </div>
-    </div>
-  );
-}
-function ScoreboardBanner({
-  a,
-  b,
-}: {
-  a: CompareSlot;
-  b: CompareSlot;
-}): JSX.Element | null {
-  const aDone = a.state.status === "ended";
-  const bDone = b.state.status === "ended";
-  if (!aDone || !bDone) return null;
-  const aMatch = lastMatch(a);
-  const bMatch = lastMatch(b);
-  const winner = aMatch === bMatch ? null : aMatch > bMatch ? "A" : "B";
-  return (
-    <div className="panel border border-accentGreen/30 bg-accentGreen/5 text-sm">
-      <p className="heading-eyebrow text-accentGreen">Scoreboard</p>
-      <div className="mt-2 flex flex-wrap items-baseline gap-6 text-textPrimary">
-        <span>
-          A: <span className="font-mono">{formatPercent(aMatch)}</span> R²
-        </span>
-        <span>
-          B: <span className="font-mono">{formatPercent(bMatch)}</span> R²
-        </span>
-        {winner ? (
-          <span className="text-accentGreen">
-            Winner:{" "}
-            <strong className="font-semibold text-textPrimary">{winner}</strong>
-          </span>
-        ) : (
-          <span className="text-textMuted">Tied.</span>
-        )}
-      </div>
-    </div>
-  );
-}
-function ErrorRow({
-  message,
-  onDismiss,
-}: {
-  message: string;
-  onDismiss: () => void;
-}): JSX.Element {
-  return (
-    <div
-      role="alert"
-      className="flex items-start justify-between gap-2 rounded-md border border-accentAmber/40 bg-accentAmber/5 px-3 py-2 text-xs text-accentAmber"
-    >
-      <span className="whitespace-pre-line">{message}</span>
-      <button
-        type="button"
-        onClick={onDismiss}
-        className="text-[10px] text-textMuted underline hover:text-textPrimary"
-      >
-        dismiss
-      </button>
-    </div>
-  );
-}
-function Field({
-  label,
-  children,
-}: {
-  label: string;
-  children: React.ReactNode;
-}): JSX.Element {
-  return (
-    <label className="flex flex-col gap-1 text-xs text-textMuted">
-      <span className="heading-eyebrow">{label}</span>
-      {children}
-    </label>
-  );
-}
-function isActive(status: RunnerStatus): boolean {
-  return status === "running" || status === "paused";
-}
-function lastMatch(slot: CompareSlot): number {
-  const turns = slot.state.turns;
-  const last = turns[turns.length - 1];
-  return last?.observation.reward_breakdown.match ?? 0;
-}
-function prettySystemId(systemId: string): string {
-  if (!systemId) return "(none)";
-  return systemId
-    .split("_")
-    .map((part) => part.charAt(0).toUpperCase() + part.slice(1))
-    .join(" ");
-}

frontend/src/components/RunWithLlmPane.tsx CHANGED Viewed

@@ -1,17 +1,19 @@
-/** Single-LLM run pane: pick a connection, hit run, watch the model
- *  step through the episode turn by turn.
  *
  *  This is the "all the steps" view — every turn's hypothesis, reward
  *  breakdown, latency, and raw completion are surfaced in a scrollable
  *  transcript so judges can audit exactly what the model proposed.
  *
- *  Compare with `ComparePane`, which runs two of these side by side
- *  with a condensed transcript per side. */
 import { useEffect, useMemo, useState } from "react";
 import { EquationDisplay } from "@/components/EquationDisplay";
-import { LlmConnectionPanel } from "@/components/LlmConnectionPanel";
 import { Skeleton } from "@/components/Skeleton";
 import { TrajectoryCanvas } from "@/components/TrajectoryCanvas";
 import {
@@ -22,7 +24,13 @@ import {
 import { cn } from "@/lib/cn";
 import {
   DEFAULT_SINGLE_LLM_CONNECTION,
   type LlmConnection,
 } from "@/lib/llmPresets";
 import { pickPrimaryVariable } from "@/lib/trajectory";
 import type { RewardBreakdown } from "@/types/physix";
@@ -60,7 +68,13 @@ export function RunWithLlmPane(): JSX.Element {
   const status = runner.status;
   const busy = status === "starting" || status === "running";
   const hasSession = runner.sessionId !== null;
-  const canStart = !busy && !!connection.model.trim() && !!connection.baseUrl.trim();
   function handleStart(): void {
     void runner.start({
@@ -105,19 +119,10 @@ export function RunWithLlmPane(): JSX.Element {
         </p>
       </header>
-      <EndpointGuide />
-      <LlmConnectionPanel
-        title="LLM"
-        subtitle="One model drives the episode."
-        accent="primary"
-        value={connection}
         onChange={setConnection}
         disabled={busy}
-        installedOllamaModels={runner.models ?? []}
-        installedOllamaLoading={runner.models === null}
-        installedOllamaError={runner.modelsError}
-        onRefreshOllama={() => void runner.refreshModels()}
       />
       <ControlBar
@@ -639,56 +644,175 @@ function RewardCell({
   );
 }
-/** "Which endpoint should I pick?" callout shown above the connection
- *  panel. Three rows of one-liner guidance; no images, no links to
- *  external docs — keeps the page fast and the answer visible without
- *  scrolling.
- *
- *  Why this exists:
- *    The endpoint dropdown has 5 options and the optimal pick depends
- *    on what the user has on hand. Without this callout most visitors
- *    default to whatever's first and either (a) hit a token error
- *    (HF Router with no token) or (b) sit through a 90 s GPU cold-boot
- *    (PhysiX-Infer) without knowing it's coming. */
-function EndpointGuide(): JSX.Element {
   return (
-    <div className="rounded-lg border border-border bg-surfaceMuted px-4 py-3 text-xs leading-relaxed text-textMuted">
-      <p className="heading-eyebrow text-textPrimary">
-        Which endpoint should you pick?
       </p>
-      <ul className="mt-2 flex flex-col gap-1.5">
-        <li>
-          <span className="text-textPrimary">Hugging Face Router</span>{" "}
-          <span className="rounded bg-surface px-1.5 py-0.5 text-[10px] uppercase tracking-wider text-textMuted">
-            default
-          </span>{" "}
-          — easiest path. Paste a token from{" "}
-          <code className="font-mono text-textPrimary">
-            huggingface.co/settings/tokens
-          </code>{" "}
-          (with the &quot;Make calls to Inference Providers&quot; permission),
-          pick a suggested model, hit Run. Responds in ~2 s, no warm-up.
-        </li>
-        <li>
-          <span className="text-textPrimary">PhysiX-Infer GPU ✦</span> — only
-          way to compare the GRPO-trained{" "}
-          <code className="font-mono text-textPrimary">physix-3b-rl</code>{" "}
-          against its{" "}
-          <code className="font-mono text-textPrimary">Qwen 2.5 3B</code> base
-          on identical hardware. No token. Sleeps after 5 min idle so first
-          request after sleep takes ~90-120 s while two 3B models load on the
-          L4 — the status banner below shows live state, with a Prewarm
-          button to wake it before you hit Run.
-        </li>
-        <li>
-          <span className="text-textPrimary">Ollama / OpenAI / Custom</span>{" "}
-          — bring your own endpoint. Useful for local dev (Ollama on
-          <code className="font-mono text-textPrimary">localhost:11434</code>),
-          frontier-model baselines (OpenAI), or pointing at a private vLLM /
-          inference endpoint URL.
-        </li>
-      </ul>
-    </div>
   );
 }

+/** Single-LLM run pane: pick one of three preset models, hit run,
+ *  watch the model step through the episode turn by turn.
  *
  *  This is the "all the steps" view — every turn's hypothesis, reward
  *  breakdown, latency, and raw completion are surfaced in a scrollable
  *  transcript so judges can audit exactly what the model proposed.
  *
+ *  The model picker is intentionally a hard 3-option choice (trained
+ *  PhysiX-3B, Qwen 3B base, Qwen 7B baseline) — typing model ids was
+ *  confusing for first-time users and most picks ended up being one of
+ *  these three anyway. */
 import { useEffect, useMemo, useState } from "react";
 import { EquationDisplay } from "@/components/EquationDisplay";
+import { PhysixInferStatus } from "@/components/PhysixInferStatus";
 import { Skeleton } from "@/components/Skeleton";
 import { TrajectoryCanvas } from "@/components/TrajectoryCanvas";
 import {
 import { cn } from "@/lib/cn";
 import {
   DEFAULT_SINGLE_LLM_CONNECTION,
+  MODEL_PRESETS,
   type LlmConnection,
+  type ModelPreset,
+  findEndpoint,
+  loadApiKey,
+  presetForConnection,
+  saveApiKey,
 } from "@/lib/llmPresets";
 import { pickPrimaryVariable } from "@/lib/trajectory";
 import type { RewardBreakdown } from "@/types/physix";
   const status = runner.status;
   const busy = status === "starting" || status === "running";
   const hasSession = runner.sessionId !== null;
+  const endpoint = findEndpoint(connection.endpointId);
+  const hasRequiredKey = !endpoint.needsKey || !!connection.apiKey.trim();
+  const canStart =
+    !busy &&
+    !!connection.model.trim() &&
+    !!connection.baseUrl.trim() &&
+    hasRequiredKey;
   function handleStart(): void {
     void runner.start({
         </p>
       </header>
+      <ModelPresetPicker
+        connection={connection}
         onChange={setConnection}
         disabled={busy}
       />
       <ControlBar
   );
 }
+// ---------------------------------------------------------------------
+// Model preset picker — three buttons + one (optional) API-key field.
+// ---------------------------------------------------------------------
+//
+// The picker replaces the old "Endpoint dropdown + freeform model id +
+// hint paragraph" UI. Users always pick one of three known-good models;
+// the API-key field only appears when the picked endpoint needs one
+// (just the HF Router 7B preset today). Only the API key is persisted —
+// in localStorage, keyed by base URL via `loadApiKey` / `saveApiKey` —
+// so a token typed for the 7B preset survives a page reload and isn't
+// shown when the trained PhysiX preset is selected (it doesn't need
+// one).
+interface ModelPresetPickerProps {
+  connection: LlmConnection;
+  onChange: (next: LlmConnection) => void;
+  disabled?: boolean;
+}
+/**
+ * Model picker for the single-LLM pane: one radio card per entry in
+ * `MODEL_PRESETS`, plus an API-key field that is rendered only when the
+ * active connection's endpoint reports `needsKey`.
+ *
+ * @param connection - Current connection; drives the highlighted card,
+ *   the key field's value, and the per-base-URL key lookup.
+ * @param onChange - Called with the next connection when a preset is
+ *   picked, the key is edited, or a stored key is hydrated.
+ * @param disabled - Disables the preset cards and the key input.
+ */
+function ModelPresetPicker({
+  connection,
+  onChange,
+  disabled,
+}: ModelPresetPickerProps): JSX.Element {
+  // Fall back to the first preset when the connection matches none.
+  const selected = presetForConnection(connection) ?? MODEL_PRESETS[0]!;
+  // Derive the key requirement from the live connection — the same
+  // source RunWithLlmPane's `canStart` gate, the hydrate effect and
+  // `setApiKey` use — so the field is always visible whenever Run is
+  // blocked on a missing key.
+  const needsKey = findEndpoint(connection.endpointId).needsKey;
+  const [revealKey, setRevealKey] = useState(false);
+  // Static ids: the pane renders a single picker. Switch to generated
+  // ids if this component is ever mounted more than once per page.
+  const apiKeyInputId = "model-preset-api-key";
+  const apiKeyHelpId = `${apiKeyInputId}-help`;
+  // Hydrate the API key from per-URL storage whenever the preset (and
+  // therefore base URL) changes.
+  useEffect(() => {
+    if (!connection.baseUrl) return;
+    const stored = loadApiKey(connection.baseUrl);
+    if (stored && stored !== connection.apiKey) {
+      onChange({ ...connection, apiKey: stored });
+    }
+    // Intentionally keyed on baseUrl only: re-running on every
+    // `connection` / `onChange` identity change would re-hydrate on
+    // each keystroke.
+    // eslint-disable-next-line react-hooks/exhaustive-deps
+  }, [connection.baseUrl]);
+  /** Switch to `preset`, restoring whatever key is stored for its URL. */
+  function selectPreset(preset: ModelPreset): void {
+    onChange({
+      ...preset.connection,
+      apiKey: loadApiKey(preset.connection.baseUrl),
+    });
+  }
+  /** Persist the key under the current base URL and push it upstream. */
+  function setApiKey(key: string): void {
+    saveApiKey(connection.baseUrl, key);
+    onChange({ ...connection, apiKey: key });
+  }
   return (
+    <section className="panel flex flex-col gap-4">
+      <header>
+        <p className="heading-eyebrow text-primary">Pick a model</p>
+        <p className="mt-1 text-xs text-textMuted">
+          Three known-good options — the trained PhysiX-3B, its Qwen 3B
+          base, and a Qwen 7B baseline. No URLs to type, no model ids to
+          paste.
+        </p>
+      </header>
+      <div
+        role="radiogroup"
+        aria-label="Model"
+        className="grid grid-cols-1 gap-3 md:grid-cols-3"
+      >
+        {MODEL_PRESETS.map((preset) => (
+          <PresetCard
+            key={preset.id}
+            preset={preset}
+            selected={selected.id === preset.id}
+            disabled={disabled ?? false}
+            onSelect={() => selectPreset(preset)}
+          />
+        ))}
+      </div>
+      {needsKey ? (
+        <div className="flex flex-col gap-1 text-xs text-textMuted">
+          {/* The toggle button must live OUTSIDE the <label>: a label
+              binds to its first labelable descendant, so wrapping the
+              button made the label text toggle "show/hide" and left the
+              token input without an accessible name. */}
+          <div className="heading-eyebrow flex items-baseline justify-between gap-2">
+            <label htmlFor={apiKeyInputId}>HF token (required)</label>
+            <button
+              type="button"
+              aria-controls={apiKeyInputId}
+              onClick={() => setRevealKey((v) => !v)}
+              className="text-[10px] uppercase tracking-wider text-textMuted underline hover:text-textPrimary"
+            >
+              {revealKey ? "hide" : "show"}
+            </button>
+          </div>
+          <input
+            id={apiKeyInputId}
+            aria-describedby={apiKeyHelpId}
+            type={revealKey ? "text" : "password"}
+            value={connection.apiKey}
+            onChange={(e) => setApiKey(e.target.value)}
+            disabled={disabled}
+            placeholder="hf_..."
+            className="w-full rounded-lg border border-border bg-surfaceMuted px-3 py-2 font-mono text-xs text-textPrimary outline-none transition focus:border-textMuted disabled:opacity-50"
+          />
+          <span
+            id={apiKeyHelpId}
+            className="text-[11px] leading-relaxed text-textMuted"
+          >
+            Get one at{" "}
+            <code className="font-mono text-textPrimary">
+              huggingface.co/settings/tokens
+            </code>{" "}
+            with the &quot;Make calls to Inference Providers&quot;
+            permission. Saved per endpoint in your browser.
+          </span>
+        </div>
+      ) : null}
+      {/* Live banner only when the picked preset hits the GPU Space. */}
+      {selected.connection.endpointId === "physix" ? (
+        <PhysixInferStatus />
+      ) : null}
+    </section>
+  );
+}
+interface PresetCardProps {
+  preset: ModelPreset;
+  selected: boolean;
+  disabled: boolean;
+  onSelect: () => void;
+}
+/**
+ * One selectable card inside the model radiogroup.
+ *
+ * Renders a `<button role="radio">` showing the preset's label, badge,
+ * description and model id, with `aria-checked` mirroring `selected`.
+ * Clicking the card calls `onSelect`; the card holds no state of its own.
+ *
+ * NOTE(review): the card is a plain button — this component implements
+ * no arrow-key / roving-focus handling between radios.
+ *
+ * @param preset - Preset whose label, badge, description and model id
+ *   are displayed.
+ * @param selected - Whether this card is the active choice; switches
+ *   the border and badge to the primary accent.
+ * @param disabled - Disables the button (dimmed, not-allowed cursor).
+ * @param onSelect - Invoked when the card is clicked.
+ */
+function PresetCard({
+  preset,
+  selected,
+  disabled,
+  onSelect,
+}: PresetCardProps): JSX.Element {
+  return (
+    <button
+      type="button"
+      role="radio"
+      aria-checked={selected}
+      onClick={onSelect}
+      disabled={disabled}
+      className={cn(
+        "flex flex-col gap-2 rounded-xl border bg-surfaceMuted p-3 text-left transition",
+        "disabled:cursor-not-allowed disabled:opacity-50",
+        selected
+          ? "border-primary bg-primary/5 shadow-sm"
+          : "border-border hover:border-textMuted",
+      )}
+    >
+      <div className="flex items-center justify-between gap-2">
+        <span className="text-sm font-semibold text-textPrimary">
+          {preset.label}
+        </span>
+        <span
+          className={cn(
+            "rounded-full border bg-surface px-2 py-0.5 text-[10px] uppercase tracking-wider",
+            selected
+              ? "border-primary/60 text-primary"
+              : "border-border text-textMuted",
+          )}
+        >
+          {preset.badge}
+        </span>
+      </div>
+      <p className="text-[11px] leading-relaxed text-textMuted">
+        {preset.description}
       </p>
+      <code className="font-mono text-[10px] text-textMuted">
+        {preset.connection.model}
+      </code>
+    </button>
   );
 }

frontend/src/hooks/useLlmCompareRunner.ts DELETED Viewed

@@ -1,128 +0,0 @@
-/** Drives two parallel `useLlmEpisodeRunner` instances against the same
- *  episode seed. The whole point of the demo is to put two models on
- *  identical input and compare their behaviour, scored by the same
- *  verifier with no LLM-as-judge.
- *
- *  Implementation note: each side gets its own session because the env
- *  builds a turn-by-turn history that the next prompt depends on. We
- *  *don't* fork a single session — that would corrupt history. Instead
- *  we start two sessions with the same `system_id` + `seed`, which the
- *  server already supports via its existing reset path. */
-import { useCallback, useMemo, useRef, useState } from "react";
-import {
-  type LlmEpisodeRunnerControls,
-  type LlmEpisodeRunnerState,
-  useLlmEpisodeRunner,
-} from "@/hooks/useLlmEpisodeRunner";
-import type { LlmConnection } from "@/lib/llmPresets";
-export interface CompareSlot {
-  id: "a" | "b";
-  state: LlmEpisodeRunnerState;
-  controls: LlmEpisodeRunnerControls;
-}
-export interface CompareRunnerControls {
-  /** Start both sides on the same seed + system. Each side uses its
-   *  own connection. */
-  startBoth: (options: {
-    systemId?: string | undefined;
-    maxTurns?: number | undefined;
-    connectionA: LlmConnection;
-    connectionB: LlmConnection;
-    temperature?: number | undefined;
-  }) => Promise<void>;
-  /** End both sessions and reset state. */
-  endBoth: () => Promise<void>;
-}
-export interface CompareRunnerState {
-  a: CompareSlot;
-  b: CompareSlot;
-  /** Seed the last `startBoth` call locked in. Surfaces in the UI so
-   *  users know both sides really saw the same episode. */
-  lastSeed: number | null;
-  /** Resolved system_id (same for both slots). */
-  systemId: string | null;
-}
-export function useLlmCompareRunner(): CompareRunnerState & CompareRunnerControls {
-  const a = useLlmEpisodeRunner();
-  const b = useLlmEpisodeRunner();
-  const [lastSeed, setLastSeed] = useState<number | null>(null);
-  const [systemId, setSystemId] = useState<string | null>(null);
-  // Keep the latest controls on a ref so `startBoth` doesn't have to
-  // depend on them — useEpisodeRunner reinstates them on every render
-  // and pulling them through the dep array would churn the callback.
-  const controlsRef = useRef({ a: a, b: b });
-  controlsRef.current = { a, b };
-  const startBoth = useCallback(
-    async (options: {
-      systemId?: string | undefined;
-      maxTurns?: number | undefined;
-      connectionA: LlmConnection;
-      connectionB: LlmConnection;
-      temperature?: number | undefined;
-    }) => {
-      // Generate a single seed so both sides see identical observations.
-      // 31 bits keeps us inside JS-safe int range and Numpy-acceptable.
-      const seed = Math.floor(Math.random() * 2_147_483_647);
-      setLastSeed(seed);
-      setSystemId(options.systemId ?? null);
-      const common = {
-        systemId: options.systemId,
-        seed,
-        maxTurns: options.maxTurns,
-        temperature: options.temperature,
-      };
-      // Kick off both in parallel — the server makes independent
-      // sessions so they can't deadlock on each other.
-      await Promise.all([
-        controlsRef.current.a.start({ ...common, connection: options.connectionA }),
-        controlsRef.current.b.start({ ...common, connection: options.connectionB }),
-      ]);
-    },
-    [],
-  );
-  const endBoth = useCallback(async () => {
-    await Promise.all([
-      controlsRef.current.a.end(),
-      controlsRef.current.b.end(),
-    ]);
-    setLastSeed(null);
-    setSystemId(null);
-  }, []);
-  const slotA = useMemo<CompareSlot>(
-    () => ({
-      id: "a",
-      state: { ...a },
-      controls: { ...a },
-    }),
-    [a],
-  );
-  const slotB = useMemo<CompareSlot>(
-    () => ({
-      id: "b",
-      state: { ...b },
-      controls: { ...b },
-    }),
-    [b],
-  );
-  return {
-    a: slotA,
-    b: slotB,
-    lastSeed,
-    systemId: systemId ?? a.systemId ?? b.systemId,
-    startBoth,
-    endBoth,
-  };
-}

frontend/src/lib/llmPresets.ts CHANGED Viewed

@@ -186,48 +186,102 @@ export interface LlmConnection {
   apiKey: string;
 }
-/** Default for the single-LLM "Run with LLM" pane.
- *
- *  HF Router is the lowest-friction option for a first-time visitor:
- *  paste a token, pick a suggested model (all live-probed and known to
- *  serve), get a response in ~2 s. No GPU cold-start, no localhost
- *  dependency.
- *
- *  We prefill the model so the Run button is enabled the moment the
- *  user pastes a token — keeping the model empty and forcing them to
- *  pick from the dropdown is friction we don't need. The api key
- *  field is hydrated from localStorage by the panel on first render. */
 export const DEFAULT_SINGLE_LLM_CONNECTION: LlmConnection = {
-  endpointId: "hf",
-  baseUrl: HF_ROUTER_BASE_URL,
-  // Matches the first entry of the "hf" endpoint's modelSuggestions —
-  // smallest router-served Qwen model, fastest response.
-  model: "Qwen/Qwen2.5-7B-Instruct",
-  apiKey: "",
-};
-/** Default A side of the Compare pane: trained PhysiX-3B on the sister
- *  GPU Space. The Compare pane's whole purpose is the trained-vs-base
- *  side-by-side, so it's worth the cold-start penalty here even though
- *  the single-LLM pane avoids it. No token needed. */
-export const DEFAULT_CONNECTION_A: LlmConnection = {
   endpointId: "physix",
   baseUrl: PHYSIX_INFER_BASE_URL,
   model: PHYSIX_MODEL_ID,
   apiKey: "",
 };
-/** Default B side of the Compare pane: same sister Space, same L4 GPU,
- *  just the Qwen 2.5 3B baseline. Apples-to-apples — identical
- *  architecture, identical hardware, identical generation params; only
- *  the weights differ. Both models share the same Space, so warming
- *  side A also warms B. */
-export const DEFAULT_CONNECTION_B: LlmConnection = {
-  endpointId: "physix",
-  baseUrl: PHYSIX_INFER_BASE_URL,
-  model: QWEN_BASE_MODEL_ID,
-  apiKey: "",
-};
 /** Build a fresh connection when the user changes endpoints. Keeps the
  *  api key for the new base URL out of localStorage in this helper —

   apiKey: string;
 }
+/** Default for the single-LLM "Run with LLM" pane: the trained
+ *  PhysiX-3B. The picker is now a three-button preset list, and the
+ *  first preset's connection repeats these exact values — the two are
+ *  separate literals, so update both together to keep them in sync. */
 export const DEFAULT_SINGLE_LLM_CONNECTION: LlmConnection = { // same four values as MODEL_PRESETS[0].connection, written out separately
   endpointId: "physix",
   baseUrl: PHYSIX_INFER_BASE_URL,
   model: PHYSIX_MODEL_ID,
   apiKey: "", // left empty; the preset copy describes this endpoint as needing no token
 };
+// ---------------------------------------------------------------------
+// Model presets — the 3 fixed options the Run pane exposes.
+// ---------------------------------------------------------------------
+/** A single preset = "click here to talk to model X via endpoint Y".
+ *  The whole point is to spare users from picking an endpoint, then a
+ *  model id, then realising the two don't match. Each preset bundles
+ *  exactly the (endpoint, model, baseUrl, needsKey) tuple that works. */
+export interface ModelPreset {
+  id: string; // lookup key matched by findPreset
+  label: string; // card title rendered by PresetCard
+  /** One-line "what is this" copy shown under the label. */
+  description: string;
+  /** Short tag rendered as a pill (e.g. "trained", "3B base", "7B"). */
+  badge: string;
+  /** Pre-built connection — drop straight into the runner. */
+  connection: LlmConnection; // its endpointId + model are what presetForConnection compares
+}
+/** The three options the Run-with-LLM picker exposes. Order matters:
+ *  the first entry is the default selection on a fresh page-load.
+ *
+ *  Two of the three live on the PhysiX-Infer GPU Space (no token, same
+ *  L4 hardware) so users can compare the trained PhysiX-3B against its
+ *  Qwen 3B base apples-to-apples with one click. The 7B baseline runs
+ *  through HF Router because no provider serves Qwen 3B today and HF
+ *  Router gives a "bigger model" reference point in <2 s once a token
+ *  is pasted. */
+export const MODEL_PRESETS: readonly ModelPreset[] = [ // index 0 doubles as the fallback returned by findPreset
+  {
+    id: "physix-3b-rl",
+    label: "PhysiX-3B (trained)",
+    description:
+      "Our GRPO-trained Qwen-3B on a sister L4 GPU Space. No token needed; first request after sleep is ~90-120 s while vLLM warms.",
+    badge: "trained ✦",
+    connection: { // same values as DEFAULT_SINGLE_LLM_CONNECTION
+      endpointId: "physix",
+      baseUrl: PHYSIX_INFER_BASE_URL,
+      model: PHYSIX_MODEL_ID,
+      apiKey: "",
+    },
+  },
+  {
+    id: "qwen-3b-base",
+    label: "Qwen 2.5 3B (base)",
+    description:
+      "Untrained base of PhysiX-3B on the same L4 Space. Apples-to-apples — identical hardware and generation params, only the weights differ.",
+    badge: "3B base",
+    connection: { // differs from the first preset only in the model id
+      endpointId: "physix",
+      baseUrl: PHYSIX_INFER_BASE_URL,
+      model: QWEN_BASE_MODEL_ID,
+      apiKey: "",
+    },
+  },
+  {
+    id: "qwen-7b-hf",
+    label: "Qwen 2.5 7B (HF Router)",
+    description:
+      "Bigger 7B baseline routed through Hugging Face. Needs an HF token with 'Make calls to Inference Providers' permission; responds in ~2 s.",
+    badge: "7B",
+    connection: { // the only preset on the "hf" endpoint
+      endpointId: "hf",
+      baseUrl: HF_ROUTER_BASE_URL,
+      model: "Qwen/Qwen2.5-7B-Instruct",
+      apiKey: "", // blank here; the description above says the user must supply an HF token
+    },
+  },
+];
+export function findPreset(id: string): ModelPreset { // look a preset up by id; never returns undefined
+  return MODEL_PRESETS.find((p) => p.id === id) ?? MODEL_PRESETS[0]!; // unknown id falls back to the first preset; `!` relies on the list being non-empty
+}
+/** Match a connection back to a preset (e.g. for selection state when
+ *  hydrating from storage). Returns the first preset whose endpoint+
+ *  model match; null if none match. */
+export function presetForConnection(c: LlmConnection): ModelPreset | null { // baseUrl and apiKey are not part of the comparison
+  return (
+    MODEL_PRESETS.find(
+      (p) =>
+        p.connection.endpointId === c.endpointId &&
+        p.connection.model === c.model,
+    ) ?? null // find() yields undefined on a miss; normalise that to null for callers
+  );
+}
 /** Build a fresh connection when the user changes endpoints. Keeps the
  *  api key for the new base URL out of localStorage in this helper —