import type { PPTElement } from '@/lib/types/slides';
import type { Stage, Scene } from '@/lib/types/stage';
// ==================== Scenario ====================
/**
 * A single turn of an evaluation scenario (see `EvalScenario.turns`).
 */
export interface EvalTurn {
  /** Message text supplied for this turn. */
  userMessage: string;
  /**
   * Optional flag.
   * NOTE(review): presumably marks a turn whose outcome is captured as a
   * `CheckpointResult` — confirm against the eval runner.
   */
  checkpoint?: boolean;
}
/**
 * Declarative description of one evaluation scenario: identifying metadata,
 * the store state to start from, the session configuration, and the ordered
 * list of turns.
 */
export interface EvalScenario {
  /** Scenario identifier (cf. `ScenarioRunResult.scenarioId`). */
  id: string;
  /** Scenario name. */
  name: string;
  /** Free-form description of the scenario. */
  description: string;
  /** Tags attached to the scenario. */
  tags: string[];
  /** Store state the scenario starts from. */
  initialStoreState: {
    /** Initial stage, or `null` when there is none. */
    stage: Stage | null;
    /** Initial list of scenes. */
    scenes: Scene[];
    /** Id of the current scene, or `null` when none is selected. */
    currentSceneId: string | null;
    /** Optional initial whiteboard elements. */
    whiteboardElements?: PPTElement[];
  };
  /** Session configuration for the scenario. */
  config: {
    /** Ids of the agents taking part in the session. */
    agentIds: string[];
    /** Session kind: either `'qa'` or `'discussion'`. */
    sessionType: 'qa' | 'discussion';
  };
  /** Turns of the scenario, in order. */
  turns: EvalTurn[];
  /**
   * Optional model identifier.
   * NOTE(review): presumably overrides a runner-level default — confirm.
   */
  model?: string;
  /**
   * Optional repeat count.
   * NOTE(review): presumably the number of runs per scenario
   * (cf. `ScenarioRunResult.runIndex`) — confirm.
   */
  repeat?: number;
}
// ==================== Scoring ====================
/**
 * Score for a single scoring dimension together with its justification.
 */
export interface DimensionScore {
  /**
   * Numeric score for the dimension.
   * NOTE(review): the scale/range is not defined in this file — confirm.
   */
  score: number;
  /** Textual reason accompanying the score. */
  reason: string;
}
/**
 * Result of VLM scoring: one `DimensionScore` per scored dimension, plus an
 * overall number and a list of issues.
 */
export interface VlmScore {
  /** Score for the readability dimension. */
  readability: DimensionScore;
  /** Score for the overlap dimension. */
  overlap: DimensionScore;
  /** Score for the rendering-correctness dimension. */
  rendering_correctness: DimensionScore;
  /** Score for the content-completeness dimension. */
  content_completeness: DimensionScore;
  /** Score for the layout-logic dimension. */
  layout_logic: DimensionScore;
  /**
   * Overall score.
   * NOTE(review): how this relates to the per-dimension scores (and its
   * scale) is not defined in this file — confirm.
   */
  overall: number;
  /** Issues reported alongside the scores. */
  issues: string[];
}
// ==================== Results ====================
/**
 * Data recorded for one checkpoint of a scenario run.
 */
export interface CheckpointResult {
  /**
   * Index of the turn this checkpoint belongs to.
   * NOTE(review): presumably an index into `EvalScenario.turns` — confirm.
   */
  turnIndex: number;
  /** Path of the screenshot taken for this checkpoint. */
  screenshotPath: string;
  /** null when VLM scoring failed — screenshot is still preserved. */
  score: VlmScore | null;
  /**
   * Elements recorded with the checkpoint.
   * NOTE(review): presumably the whiteboard elements at capture time — confirm.
   */
  elements: PPTElement[];
}
/**
 * Outcome of a single run of a scenario.
 */
export interface ScenarioRunResult {
  /** Id of the scenario that was run (cf. `EvalScenario.id`). */
  scenarioId: string;
  /**
   * Index of this run.
   * NOTE(review): presumably distinguishes repeated runs of the same
   * scenario (cf. `EvalScenario.repeat`) — confirm.
   */
  runIndex: number;
  /** Model identifier recorded for the run. */
  model: string;
  /** Checkpoint results collected during the run. */
  checkpoints: CheckpointResult[];
  /** Per-turn wall-clock latency (ms) from runAgentLoop start to end. */
  turnDurationsMs?: number[];
  /** Optional error message; absent when no error was recorded. */
  error?: string;
}
/**
 * Top-level evaluation report aggregating scenario run results.
 */
export interface EvalReport {
  /**
   * Timestamp of the report.
   * NOTE(review): the string format is not defined in this file — confirm.
   */
  timestamp: string;
  /** Model identifier recorded for the report. */
  model: string;
  /** Run results included in the report (each entry is a `ScenarioRunResult`). */
  scenarios: ScenarioRunResult[];
}