File size: 1,626 Bytes
a0ebf39
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import type { PPTElement } from '@/lib/types/slides';
import type { Stage, Scene } from '@/lib/types/stage';

// ==================== Scenario ====================

/**
 * One user turn in an evaluation scenario (see `EvalScenario.turns`).
 */
export interface EvalTurn {
  /** Text of the user message for this turn. */
  userMessage: string;
  /**
   * Optional checkpoint flag.
   * NOTE(review): presumably marks a turn after which a `CheckpointResult`
   * (screenshot + score) is captured — confirm against the eval runner.
   */
  checkpoint?: boolean;
}

/**
 * Declarative description of one evaluation scenario: identifying metadata,
 * the initial store state, the session configuration, and the list of user
 * turns.
 */
export interface EvalScenario {
  /**
   * Scenario identifier.
   * NOTE(review): presumably the value reported as
   * `ScenarioRunResult.scenarioId` — confirm.
   */
  id: string;
  /** Scenario name. */
  name: string;
  /** Free-text description of the scenario. */
  description: string;
  /** String tags attached to the scenario. */
  tags: string[];
  /** Initial store state for the scenario. */
  initialStoreState: {
    /** Stage to start from; may be `null`. */
    stage: Stage | null;
    /** Scenes present in the initial state. */
    scenes: Scene[];
    /** Id of the current scene; may be `null`. */
    currentSceneId: string | null;
    /** Optional whiteboard elements present in the initial state. */
    whiteboardElements?: PPTElement[];
  };
  /** Session configuration for the scenario. */
  config: {
    /** Ids of the agents involved in the session. */
    agentIds: string[];
    /** Session type; restricted to `'qa'` or `'discussion'`. */
    sessionType: 'qa' | 'discussion';
  };
  /** User turns of the scenario. */
  turns: EvalTurn[];
  /**
   * Optional model name.
   * NOTE(review): presumably overrides a runner-level default model — confirm.
   */
  model?: string;
  /**
   * Optional repeat count.
   * NOTE(review): presumably the number of runs for this scenario
   * (cf. `ScenarioRunResult.runIndex`); default when omitted is not defined
   * here — confirm.
   */
  repeat?: number;
}

// ==================== Scoring ====================

/**
 * Score for a single evaluation dimension together with its textual reason.
 */
export interface DimensionScore {
  /** Numeric score. The scale/range is not defined in this file. */
  score: number;
  /** Textual reason accompanying the score. */
  reason: string;
}

/**
 * VLM scoring result: five named per-dimension scores, an overall number,
 * and a list of issues.
 *
 * NOTE(review): "VLM" presumably stands for vision-language model — confirm.
 * Property names use snake_case, unlike the rest of this file.
 * NOTE(review): presumably mirrors the scorer's raw output keys — confirm
 * before renaming.
 */
export interface VlmScore {
  /** Score for the "readability" dimension. */
  readability: DimensionScore;
  /** Score for the "overlap" dimension. */
  overlap: DimensionScore;
  /** Score for the "rendering_correctness" dimension. */
  rendering_correctness: DimensionScore;
  /** Score for the "content_completeness" dimension. */
  content_completeness: DimensionScore;
  /** Score for the "layout_logic" dimension. */
  layout_logic: DimensionScore;
  /**
   * Overall score. How it relates to the per-dimension scores is not
   * specified in this file.
   */
  overall: number;
  /** Issues reported with the score, as strings. */
  issues: string[];
}

// ==================== Results ====================

/**
 * Result recorded for one checkpoint: the turn it belongs to, the screenshot
 * path, the VLM score (if any), and a list of elements.
 */
export interface CheckpointResult {
  /**
   * Index of the turn this checkpoint belongs to.
   * NOTE(review): presumably an index into `EvalScenario.turns`; whether it
   * is 0- or 1-based is not shown here — confirm.
   */
  turnIndex: number;
  /** Path of the screenshot taken for this checkpoint. */
  screenshotPath: string;
  /** null when VLM scoring failed — screenshot is still preserved. */
  score: VlmScore | null;
  /**
   * Elements recorded with this checkpoint.
   * NOTE(review): presumably the whiteboard elements at capture time — confirm.
   */
  elements: PPTElement[];
}

/**
 * Result of a single run of a scenario.
 */
export interface ScenarioRunResult {
  /**
   * Id of the scenario that was run.
   * NOTE(review): presumably matches `EvalScenario.id` — confirm.
   */
  scenarioId: string;
  /**
   * Index of this run.
   * NOTE(review): presumably distinguishes repeated runs of the same scenario
   * (cf. `EvalScenario.repeat`); base (0 or 1) is not shown here — confirm.
   */
  runIndex: number;
  /** Name of the model recorded for this run. */
  model: string;
  /** Checkpoint results recorded during the run. */
  checkpoints: CheckpointResult[];
  /** Per-turn wall-clock latency (ms) from runAgentLoop start to end. */
  turnDurationsMs?: number[];
  /**
   * Optional error message.
   * NOTE(review): presumably set only when the run failed — confirm.
   */
  error?: string;
}

/**
 * Top-level evaluation report: a timestamp, a model name, and the collected
 * scenario run results.
 */
export interface EvalReport {
  /** Report timestamp as a string. The exact format is not specified here. */
  timestamp: string;
  /** Name of the model recorded for the report. */
  model: string;
  /**
   * Run results included in the report. Each entry is a `ScenarioRunResult`,
   * i.e. one run rather than one scenario definition.
   */
  scenarios: ScenarioRunResult[];
}