| import { readFileSync, readdirSync, mkdirSync } from 'fs'; |
| import { join, dirname } from 'path'; |
| import { fileURLToPath } from 'url'; |
| import { parseArgs } from 'util'; |
| import type { EvalScenario, ScenarioRunResult, CheckpointResult, EvalReport } from './types'; |
| import type { Action } from '@/lib/types/action'; |
| import { runAgentLoop, type AgentLoopIterationResult } from '@/lib/chat/agent-loop'; |
| import { EvalStateManager } from './state-manager'; |
| import { initCapture, captureWhiteboard, closeCapture } from './capture'; |
| import { scoreScreenshot } from './scorer'; |
| import { generateReport } from './reporter'; |
| import { createRunDir } from '../shared/run-dir'; |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
// ---------------------------------------------------------------------------
// CLI arguments and environment configuration
// ---------------------------------------------------------------------------

/**
 * Command-line options. All options are declared as strings:
 *   --scenario    only run scenarios whose id or file name matches
 *   --repeat      default number of runs per scenario
 *   --base-url    origin that serves /api/chat and is passed to initCapture
 *   --output-dir  parent directory passed to createRunDir
 *   --rescore     path of an existing run directory to re-score
 */
const { values: args } = parseArgs({
  options: {
    scenario: { type: 'string' },
    repeat: { type: 'string', default: '1' },
    'base-url': { type: 'string', default: 'http://localhost:3000' },
    'output-dir': { type: 'string', default: 'eval/whiteboard-layout/results' },
    rescore: { type: 'string' },
  },
});

const BASE_URL = args['base-url']!;
const CHAT_MODEL_RAW = process.env.EVAL_CHAT_MODEL || process.env.DEFAULT_MODEL;
const SCORER_MODEL_RAW = process.env.EVAL_SCORER_MODEL;
// Thinking is opt-in: only the literal values '1' and 'true' enable it.
const ENABLE_THINKING =
  process.env.EVAL_ENABLE_THINKING === '1' || process.env.EVAL_ENABLE_THINKING === 'true';

// Both model identifiers are mandatory; fail fast with a usage hint.
if (!CHAT_MODEL_RAW) {
  console.error(
    'Error: EVAL_CHAT_MODEL (or DEFAULT_MODEL) must be set. Example: EVAL_CHAT_MODEL=openai:gpt-4.1',
  );
  process.exit(1);
}
if (!SCORER_MODEL_RAW) {
  console.error(
    'Error: EVAL_SCORER_MODEL must be set. Example: EVAL_SCORER_MODEL=google:gemini-2.5-flash',
  );
  process.exit(1);
}
// Re-bind with an explicit `string` type so closures below see a non-optional value.
const CHAT_MODEL: string = CHAT_MODEL_RAW;
const SCORER_MODEL: string = SCORER_MODEL_RAW;

const REPEAT = parseInt(args.repeat || '1', 10);
// parseInt yields NaN for non-numeric input; `r < NaN` is always false, so an
// unvalidated value would silently run nothing and still write a report.
if (!Number.isInteger(REPEAT) || REPEAT < 1) {
  console.error(`Error: --repeat must be a positive integer (got "${args.repeat}").`);
  process.exit(1);
}
const OUTPUT_DIR = args['output-dir']!;
const SCENARIO_FILTER = args.scenario;
// Passed to runAgentLoop as its final argument for every user turn.
const MAX_AGENT_TURNS = 10;
|
|
| |
|
|
/**
 * Reads every `*.json` file in the `scenarios` directory next to this module
 * and returns the parsed scenarios that pass the optional `--scenario` filter.
 *
 * A scenario is kept when no filter is set, when its `id` equals the filter,
 * or when its file name contains the filter. Every file is parsed before the
 * filter is applied, so a malformed JSON file throws even if it would have
 * been filtered out.
 */
function loadScenarios(): EvalScenario[] {
  // `__dirname` exists under CommonJS; fall back to import.meta.url under ESM.
  const moduleDir =
    typeof __dirname !== 'undefined' ? __dirname : dirname(fileURLToPath(import.meta.url));
  const scenarioDir = join(moduleDir, 'scenarios');

  const isSelected = (fileName: string, candidate: EvalScenario): boolean =>
    !SCENARIO_FILTER ||
    candidate.id === SCENARIO_FILTER ||
    fileName.includes(SCENARIO_FILTER);

  return readdirSync(scenarioDir)
    .filter((fileName) => fileName.endsWith('.json'))
    .flatMap((fileName) => {
      const parsed: EvalScenario = JSON.parse(readFileSync(join(scenarioDir, fileName), 'utf-8'));
      return isSelected(fileName, parsed) ? [parsed] : [];
    });
}
|
|
| |
|
|
/**
 * Executes one run of a scenario: replays each user turn through
 * `runAgentLoop`, applies the streamed actions to an `EvalStateManager`, and
 * captures and scores a whiteboard screenshot at every checkpoint turn (and
 * always after the last turn).
 *
 * @param scenario  Scenario definition (turns, config, initial store state).
 * @param runIndex  Zero-based repeat index; used in log lines and file names.
 * @param runDir    Run output directory; screenshots go in `<runDir>/<scenario.id>`.
 * @returns The checkpoints and per-turn durations collected so far. If anything
 *          inside the turn loop throws, the partial result is returned with
 *          `error` set to the message instead of rethrowing.
 */
async function runScenario(
  scenario: EvalScenario,
  runIndex: number,
  runDir: string,
): Promise<ScenarioRunResult> {
  const model = scenario.model || CHAT_MODEL;
  const checkpoints: CheckpointResult[] = [];

  console.log(` [run ${runIndex + 1}] Starting...`);

  // Per-scenario directory for this run's screenshots.
  const scenarioDir = join(runDir, scenario.id);
  mkdirSync(scenarioDir, { recursive: true });

  const stateManager = new EvalStateManager(scenario.initialStoreState);
  // Conversation history shared across turns; exposed to the loop via getMessages.
  const messages: Array<{
    role: string;
    content: string;
    parts?: unknown[];
    metadata?: unknown;
  }> = [];

  // Wall-clock time of each completed runAgentLoop call, one entry per turn.
  const turnDurationsMs: number[] = [];

  try {
    for (let turnIdx = 0; turnIdx < scenario.turns.length; turnIdx++) {
      const turn = scenario.turns[turnIdx];
      console.log(` Turn ${turnIdx + 1}: "${turn.userMessage.slice(0, 50)}..."`);

      messages.push({
        role: 'user',
        content: turn.userMessage,
        parts: [{ type: 'text', text: turn.userMessage }],
        metadata: { createdAt: Date.now() },
      });

      // Per-iteration accumulators. fetchChat resets them before each request;
      // onEvent fills them; onIterationEnd consumes them.
      let iterResult: AgentLoopIterationResult | null = null;
      let currentAgentId: string | null = null;
      let currentMessageId: string | null = null;
      const textParts: string[] = [];
      const actionParts: Array<{ type: string; actionName: string; params: unknown }> = [];
      let cueUserReceived = false;
      // Actions are chained so each executeAction starts only after the
      // previous one has fulfilled. A rejection short-circuits the remaining
      // links and surfaces when the chain is awaited in onIterationEnd.
      let actionChain: Promise<void> = Promise.resolve();

      const controller = new AbortController();
      const turnStartMs = Date.now();
      await runAgentLoop(
        {
          config: scenario.config,
          apiKey: '',
          model,
        },
        {
          getStoreState: () => stateManager.getStoreState(),
          getMessages: () => messages,

          fetchChat: async (body, signal) => {
            // Start every iteration from a clean slate.
            currentAgentId = null;
            currentMessageId = null;
            textParts.length = 0;
            actionParts.length = 0;
            cueUserReceived = false;
            iterResult = null;
            actionChain = Promise.resolve();

            // Add the thinking flag to the request body only when enabled.
            const bodyWithThinking = ENABLE_THINKING
              ? { ...body, thinking: { enabled: true } }
              : body;

            return fetch(`${BASE_URL}/api/chat`, {
              method: 'POST',
              headers: { 'Content-Type': 'application/json' },
              body: JSON.stringify(bodyWithThinking),
              signal,
            });
          },

          onEvent: (event) => {
            switch (event.type) {
              case 'agent_start':
                currentAgentId = event.data.agentId;
                currentMessageId = event.data.messageId;
                break;

              case 'text_delta':
                textParts.push(event.data.content);
                break;

              case 'action': {
                const action: Action = {
                  id: event.data.actionId,
                  type: event.data.actionName,
                  ...event.data.params,
                } as Action;
                actionChain = actionChain.then(() => stateManager.executeAction(action));
                // Mark the new tail as handled. Otherwise a rejection that
                // happens before onIterationEnd awaits the chain (or when the
                // loop aborts and never awaits it) is reported by Node as an
                // unhandled rejection, which is fatal by default. The `await`
                // in onIterationEnd still observes and logs the rejection.
                void actionChain.catch(() => undefined);
                actionParts.push({
                  type: `action-${event.data.actionName}`,
                  actionName: event.data.actionName,
                  params: event.data.params,
                });
                break;
              }

              case 'cue_user':
                cueUserReceived = true;
                break;

              case 'done':
                iterResult = {
                  directorState: event.data.directorState,
                  totalAgents: event.data.totalAgents,
                  agentHadContent: event.data.agentHadContent ?? true,
                  cueUserReceived,
                };
                break;

              case 'error':
                throw new Error(`API error: ${event.data.message}`);
            }
          },

          onIterationEnd: async () => {
            // Wait for all queued actions; a failure is logged, not rethrown.
            try {
              await actionChain;
            } catch (err) {
              const msg = err instanceof Error ? err.message : String(err);
              console.error(` Action execution error: ${msg.slice(0, 120)}`);
            }

            // Record the agent's output as an assistant message, but only if
            // agent_start supplied a message id and something was produced.
            if (currentMessageId && (textParts.length > 0 || actionParts.length > 0)) {
              const parts: unknown[] = [];
              if (textParts.length > 0) {
                parts.push({ type: 'text', text: textParts.join('') });
              }
              for (const ap of actionParts) {
                parts.push({ ...ap, state: 'result', output: { success: true } });
              }
              messages.push({
                role: 'assistant',
                content: textParts.join(''),
                parts,
                metadata: {
                  senderName: currentAgentId || 'agent',
                  originalRole: 'agent',
                  agentId: currentAgentId,
                  createdAt: Date.now(),
                },
              });
            }

            return iterResult;
          },
        },
        controller.signal,
        MAX_AGENT_TURNS,
      );
      const turnDurationMs = Date.now() - turnStartMs;
      turnDurationsMs.push(turnDurationMs);
      console.log(
        ` [timing] turn ${turnIdx + 1} ran in ${(turnDurationMs / 1000).toFixed(1)}s`,
      );

      // Score turns flagged as checkpoints, and always the final turn.
      const isLastTurn = turnIdx === scenario.turns.length - 1;
      const isCheckpoint = turn.checkpoint || isLastTurn;

      if (isCheckpoint) {
        const elements = stateManager.getWhiteboardElements();
        const screenshotFilename = `run${runIndex}_turn${turnIdx}.png`;
        const screenshotPath = await captureWhiteboard(elements, scenarioDir, screenshotFilename);
        console.log(` Captured: ${screenshotFilename} (${elements.length} elements)`);

        // A scoring failure keeps the screenshot and records `score: null`
        // so the rest of the scenario still runs.
        try {
          const score = await scoreScreenshot(screenshotPath, SCORER_MODEL);
          console.log(` Score: overall=${score.overall}, overlap=${score.overlap.score}`);
          checkpoints.push({ turnIndex: turnIdx, screenshotPath, score, elements });
        } catch (scoreErr) {
          const msg = scoreErr instanceof Error ? scoreErr.message : String(scoreErr);
          console.error(` Score error (continuing): ${msg.slice(0, 120)}`);
          checkpoints.push({ turnIndex: turnIdx, screenshotPath, score: null, elements });
        }
      }
    }
  } catch (error) {
    const msg = error instanceof Error ? error.message : String(error);
    console.error(` Error: ${msg}`);
    return { scenarioId: scenario.id, runIndex, model, checkpoints, turnDurationsMs, error: msg };
  } finally {
    stateManager.dispose();
  }

  return { scenarioId: scenario.id, runIndex, model, checkpoints, turnDurationsMs };
}
|
|
| |
|
|
/**
 * Re-scores the screenshots of an existing run with the current scorer model.
 *
 * Loads `<runDir>/report.json`, scores each checkpoint's `screenshotPath`
 * again (one at a time, in report order), and writes a fresh report into the
 * same directory via `generateReport`. A checkpoint whose scoring fails is
 * carried over unchanged from the old report.
 *
 * @param runDir Directory of a previous run containing `report.json`.
 */
async function rescoreRun(runDir: string) {
  console.log('=== Rescore Mode ===');
  console.log(`Scorer: ${SCORER_MODEL}`);
  console.log(`Run dir: ${runDir}`);

  const previousReport: EvalReport = JSON.parse(
    readFileSync(join(runDir, 'report.json'), 'utf-8'),
  );

  const rescoredRuns: ScenarioRunResult[] = [];

  for (const priorRun of previousReport.scenarios) {
    console.log(`\nScenario: ${priorRun.scenarioId} (run ${priorRun.runIndex + 1})`);
    const refreshed: CheckpointResult[] = [];

    for (const priorCheckpoint of priorRun.checkpoints) {
      const imagePath = priorCheckpoint.screenshotPath;
      console.log(` Rescoring: ${imagePath}`);

      // Start from the old checkpoint; replace it only if scoring succeeds.
      let outcome: CheckpointResult = priorCheckpoint;
      try {
        const score = await scoreScreenshot(imagePath, SCORER_MODEL);
        console.log(` Score: overall=${score.overall}, overlap=${score.overlap.score}`);
        outcome = { ...priorCheckpoint, score };
      } catch (failure) {
        const reason = failure instanceof Error ? failure.message : String(failure);
        console.error(` Score error: ${reason.slice(0, 120)}`);
      }
      refreshed.push(outcome);
    }

    rescoredRuns.push({ ...priorRun, checkpoints: refreshed });
  }

  // The chat model label is carried over from the old report; the timestamp is new.
  const report: EvalReport = {
    timestamp: new Date().toISOString(),
    model: previousReport.model,
    scenarios: rescoredRuns,
  };

  const written = generateReport(report, runDir);
  console.log(`\nReport saved:`);
  console.log(` JSON: ${written.json}`);
  console.log(` Markdown: ${written.md}`);
}
|
|
| |
|
|
/**
 * CLI entry point.
 *
 * With `--rescore <dir>` it only re-scores an existing run and returns.
 * Otherwise it loads the scenarios, creates a run directory, runs every
 * scenario `scenario.repeat ?? REPEAT` times sequentially, and writes the
 * JSON and Markdown reports.
 *
 * Exits the process with code 1 when no scenarios match.
 */
async function main(): Promise<void> {
  // Rescore mode short-circuits the normal run.
  if (args.rescore) {
    await rescoreRun(args.rescore);
    return;
  }

  console.log('=== Whiteboard Layout Eval ===');
  console.log(`Chat: ${CHAT_MODEL} | Scorer: ${SCORER_MODEL} | Repeats: ${REPEAT}`);
  console.log(`Thinking: ${ENABLE_THINKING ? 'ON' : 'OFF'}`);
  console.log('');

  const scenarios = loadScenarios();
  if (scenarios.length === 0) {
    console.error('No scenarios found. Check eval/whiteboard-layout/scenarios/');
    process.exit(1);
  }
  console.log(`Loaded ${scenarios.length} scenario(s)`);

  const runDir = createRunDir(OUTPUT_DIR, CHAT_MODEL);
  console.log(`Output: ${runDir}`);

  await initCapture(BASE_URL);

  const allResults: ScenarioRunResult[] = [];

  // runScenario can still throw (its directory creation, state-manager
  // construction and dispose() sit outside its own try block), so release the
  // capture session in `finally` rather than only on the success path.
  // NOTE(review): presumably initCapture holds a browser or similar resource —
  // confirm in ./capture.
  try {
    for (const scenario of scenarios) {
      console.log(`\nScenario: ${scenario.name} (${scenario.id})`);
      // A scenario-level repeat count overrides the --repeat default.
      const repeats = scenario.repeat ?? REPEAT;

      for (let r = 0; r < repeats; r++) {
        const result = await runScenario(scenario, r, runDir);
        allResults.push(result);
      }
    }
  } finally {
    await closeCapture();
  }

  const report: EvalReport = {
    timestamp: new Date().toISOString(),
    model: CHAT_MODEL,
    scenarios: allResults,
  };

  const { json, md } = generateReport(report, runDir);
  console.log(`\nReport saved:`);
  console.log(` JSON: ${json}`);
  console.log(` Markdown: ${md}`);
}
|
|
// Start the CLI. Any rejection that escapes main() is logged and turned into
// a non-zero exit code.
void main().catch((fatal: unknown) => {
  console.error('Fatal error:', fatal);
  process.exit(1);
});
|
|