import type { PPTElement } from '@/lib/types/slides';
import type { Stage, Scene } from '@/lib/types/stage';
// ==================== Scenario ====================
/**
 * A single user turn within an evaluation scenario.
 */
export interface EvalTurn {
/** Text of the user message for this turn. */
userMessage: string;
/**
 * Optional checkpoint flag.
 * NOTE(review): presumably marks turns after which a checkpoint
 * (screenshot + score) is captured — confirm against the runner.
 */
checkpoint?: boolean;
}
/**
 * Declarative description of one evaluation scenario: identifying metadata,
 * the initial store state, session configuration, and the ordered user turns.
 */
export interface EvalScenario {
/** Identifier of the scenario. */
id: string;
/** Scenario name. */
name: string;
/** Free-text description of the scenario. */
description: string;
/** Tags attached to the scenario. */
tags: string[];
/** Store state the scenario starts from. */
initialStoreState: {
/** Initial stage; may be null. */
stage: Stage | null;
/** Initial list of scenes. */
scenes: Scene[];
/** Id of the current scene; may be null. */
currentSceneId: string | null;
/** Optional initial whiteboard elements. */
whiteboardElements?: PPTElement[];
};
/** Session configuration for the scenario. */
config: {
/** Ids of the agents involved in the session. */
agentIds: string[];
/** Session type: either 'qa' or 'discussion'. */
sessionType: 'qa' | 'discussion';
};
/** User turns of the scenario, as an ordered array. */
turns: EvalTurn[];
/**
 * Optional model name.
 * NOTE(review): presumably overrides a default model chosen elsewhere — confirm.
 */
model?: string;
/**
 * Optional repeat count.
 * NOTE(review): presumably the number of runs of this scenario — confirm
 * how an omitted value is treated by the runner.
 */
repeat?: number;
}
// ==================== Scoring ====================
/**
 * Score for one scoring dimension together with its justification.
 */
export interface DimensionScore {
/** Numeric score. NOTE(review): the valid range is not defined here — confirm. */
score: number;
/** Textual reason accompanying the score. */
reason: string;
}
/**
 * Full VLM scoring result: one DimensionScore per scored dimension,
 * plus an overall number and a list of issues.
 */
export interface VlmScore {
/** Score for the readability dimension. */
readability: DimensionScore;
/** Score for the overlap dimension. */
overlap: DimensionScore;
/** Score for the rendering-correctness dimension. */
rendering_correctness: DimensionScore;
/** Score for the content-completeness dimension. */
content_completeness: DimensionScore;
/** Score for the layout-logic dimension. */
layout_logic: DimensionScore;
/**
 * Overall numeric score.
 * NOTE(review): how this relates to the per-dimension scores is not defined here — confirm.
 */
overall: number;
/** Issues reported alongside the scores, as free-text strings. */
issues: string[];
}
// ==================== Results ====================
/**
 * Data recorded for one checkpoint: the turn it belongs to, the screenshot
 * path, the VLM score (if any), and a list of elements.
 */
export interface CheckpointResult {
/** Index of the turn this checkpoint belongs to. NOTE(review): presumably an index into EvalScenario.turns — confirm. */
turnIndex: number;
/** Path of the screenshot for this checkpoint. */
screenshotPath: string;
/** null when VLM scoring failed — screenshot is still preserved. */
score: VlmScore | null;
/** Elements recorded with this checkpoint. NOTE(review): presumably the whiteboard elements at capture time — confirm. */
elements: PPTElement[];
}
/**
 * Result of one run of a scenario: which scenario and run it was, the model
 * used, the checkpoints collected, and optional timing and error information.
 */
export interface ScenarioRunResult {
/** Id of the scenario this result belongs to. */
scenarioId: string;
/** Index of this run. NOTE(review): presumably distinguishes repeated runs of the same scenario — confirm. */
runIndex: number;
/** Model name for this run. */
model: string;
/** Checkpoint results collected during the run. */
checkpoints: CheckpointResult[];
/** Per-turn wall-clock latency (ms) from runAgentLoop start to end. */
turnDurationsMs?: number[];
/** Optional error message. NOTE(review): presumably set only when the run failed — confirm. */
error?: string;
}
/**
 * Top-level evaluation report: a timestamp, a model name, and the collected
 * scenario run results.
 */
export interface EvalReport {
/** Timestamp of the report, as a string. NOTE(review): format is not defined here — confirm. */
timestamp: string;
/** Model name associated with the report. */
model: string;
/** Scenario run results; each entry is one ScenarioRunResult (one run, not one scenario definition). */
scenarios: ScenarioRunResult[];
}
|