| import { writeFileSync, mkdirSync } from 'fs'; |
| import { join } from 'path'; |
| import type { EvalReport, VlmScore } from './types'; |
|
|
/**
 * Arithmetic mean of `nums`.
 *
 * The input is only read, so it is accepted as `readonly number[]`; mutable
 * arrays remain assignable, frozen / readonly arrays are now accepted too.
 *
 * @param nums - Values to average.
 * @returns The sum divided by the count, or `0` for an empty list (instead of
 *   the `NaN` that `0 / 0` would produce).
 */
function mean(nums: readonly number[]): number {
  if (nums.length === 0) return 0;
  let total = 0;
  for (const value of nums) total += value;
  return total / nums.length;
}
|
|
/**
 * Renders a number for display in the report with exactly one digit after the
 * decimal point.
 *
 * @param n - Value to format.
 * @returns The fixed-point string produced by `Number.prototype.toFixed`.
 */
function formatNum(n: number): string {
  const fractionDigits = 1;
  return n.toFixed(fractionDigits);
}
|
|
| |
| |
| |
/** Mean / min / max of one metric, as shown in one row of the summary table. */
type SummaryStats = { mean: number; min: number; max: number };

/** Score fields that each get their own row in the summary table, in display order. */
const SCORE_DIMENSIONS = [
  'readability',
  'overlap',
  'rendering_correctness',
  'content_completeness',
  'layout_logic',
] as const;

/** Gathers the score of every checkpoint (across all scenarios) that has one. */
function collectScores(report: EvalReport): VlmScore[] {
  const scores: VlmScore[] = [];
  for (const scenario of report.scenarios) {
    for (const cp of scenario.checkpoints) {
      if (cp.score) scores.push(cp.score);
    }
  }
  return scores;
}

/**
 * Computes mean/min/max of `values`, or `undefined` when there are no values.
 *
 * Min and max are folded in a loop rather than via `Math.min(...values)`:
 * spreading an arbitrarily large array into call arguments can exceed the
 * engine's argument limit. Results (including NaN propagation) are the same.
 */
function computeStats(values: number[]): SummaryStats | undefined {
  if (values.length === 0) return undefined;
  let min = Infinity;
  let max = -Infinity;
  for (const v of values) {
    min = Math.min(min, v);
    max = Math.max(max, v);
  }
  return { mean: mean(values), min, max };
}

/**
 * Builds the summary rows: one per score dimension that has at least one
 * value, followed by an `overall` row. Missing (null/undefined) values are
 * skipped for `overall` exactly as they are for the individual dimensions.
 */
function buildSummary(scores: VlmScore[]): Record<string, SummaryStats> {
  const summary: Record<string, SummaryStats> = {};
  for (const dim of SCORE_DIMENSIONS) {
    const stats = computeStats(
      scores.map((s) => s[dim]?.score).filter((v): v is number => v != null),
    );
    if (stats) summary[dim] = stats;
  }
  const overall = computeStats(
    scores.map((s) => s.overall).filter((v): v is number => v != null),
  );
  if (overall) summary['overall'] = overall;
  return summary;
}

/** Renders the "Turn latency" section, or no lines when no turn durations were recorded. */
function renderLatencySection(scenarios: EvalReport['scenarios']): string[] {
  const durations: number[] = [];
  for (const scenario of scenarios) {
    if (!scenario.turnDurationsMs) continue;
    for (const ms of scenario.turnDurationsMs) durations.push(ms);
  }
  if (durations.length === 0) return [];

  // Sort a copy so the collected order is left untouched.
  const sorted = [...durations].sort((a, b) => a - b);
  // NOTE(review): index floor(n * p) picks the upper neighbour on small samples
  // (p50 of two values is the larger one). Kept as-is so reports stay comparable.
  const p50 = sorted[Math.floor(sorted.length * 0.5)];
  const p95 = sorted[Math.min(sorted.length - 1, Math.floor(sorted.length * 0.95))];
  const meanMs = mean(durations);
  const totalS = durations.reduce((a, b) => a + b, 0) / 1000;

  return [
    '## Turn latency',
    '| Metric | Value |',
    '|--------|-------|',
    `| Turns measured | ${durations.length} |`,
    `| Mean | ${(meanMs / 1000).toFixed(2)}s |`,
    `| p50 | ${(p50 / 1000).toFixed(2)}s |`,
    `| p95 | ${(p95 / 1000).toFixed(2)}s |`,
    `| Total across all turns | ${totalS.toFixed(1)}s |`,
    '',
  ];
}

/**
 * Renders one scenario entry: its heading, then either the error or the
 * details of its last checkpoint, then a blank separator line.
 */
function renderScenario(scenario: EvalReport['scenarios'][number]): string[] {
  const out: string[] = [`### ${scenario.scenarioId} (run ${scenario.runIndex + 1})`];
  const lastCp = scenario.checkpoints[scenario.checkpoints.length - 1];

  if (scenario.error) {
    out.push(`- Error: ${scenario.error}`);
  } else if (lastCp) {
    const score = lastCp.score;
    if (score) {
      out.push(`- Overall: ${score.overall}`);
      // The summary already treats dimensions as possibly absent; guard here
      // too so a partial score cannot throw and abort the Markdown report.
      const overlap = score.overlap;
      if (overlap) {
        out.push(`- Overlap: ${overlap.score} — ${overlap.reason}`);
      }
      const issues = score.issues ?? [];
      if (issues.length > 0) {
        out.push(`- Issues: ${issues.join('; ')}`);
      }
    } else {
      out.push(`- Score: (scoring failed)`);
    }
    out.push(`- Screenshot: ${lastCp.screenshotPath}`);
  }

  out.push('');
  return out;
}

/**
 * Writes an eval report to `outputDir` in two forms:
 * `report.json` (the `report` object serialized with 2-space indentation) and
 * `report.md` (a Markdown digest with a score summary table, an optional turn
 * latency table, and one entry per scenario).
 *
 * The directory is created recursively if it does not exist. The JSON file is
 * written before the Markdown is rendered.
 *
 * @param report - The evaluation results to persist.
 * @param outputDir - Directory that receives both files.
 * @returns The paths of the written JSON and Markdown files.
 * @throws Any error raised by `mkdirSync` / `writeFileSync` propagates to the caller.
 */
export function generateReport(
  report: EvalReport,
  outputDir: string,
): { json: string; md: string } {
  mkdirSync(outputDir, { recursive: true });

  const summary = buildSummary(collectScores(report));

  const jsonPath = join(outputDir, 'report.json');
  writeFileSync(jsonPath, JSON.stringify(report, null, 2));

  const lines: string[] = [
    '# Whiteboard Layout Eval Report',
    `Run: ${report.timestamp} | Model: ${report.model} | Scenarios: ${report.scenarios.length}`,
    '',
    '## Summary',
    '| Metric | Mean | Min | Max |',
    '|--------|------|-----|-----|',
  ];
  for (const [key, stats] of Object.entries(summary)) {
    lines.push(`| ${key} | ${formatNum(stats.mean)} | ${stats.min} | ${stats.max} |`);
  }
  lines.push('');

  lines.push(...renderLatencySection(report.scenarios));

  lines.push('## Scenarios');
  for (const scenario of report.scenarios) {
    lines.push(...renderScenario(scenario));
  }

  const mdPath = join(outputDir, 'report.md');
  writeFileSync(mdPath, lines.join('\n'));

  return { json: jsonPath, md: mdPath };
}
|
|