| import { describe, it, expect, beforeEach, afterEach } from 'vitest'; |
| import { mkdtempSync, rmSync, readFileSync, existsSync } from 'fs'; |
| import { tmpdir } from 'os'; |
| import { join } from 'path'; |
| import { writeReport } from '@/eval/outline-language/reporter'; |
| import type { EvalResult } from '@/eval/outline-language/types'; |
|
|
| describe('writeReport', () => { |
| let runDir: string; |
|
|
| beforeEach(() => { |
| runDir = mkdtempSync(join(tmpdir(), 'reporter-test-')); |
| }); |
|
|
| afterEach(() => { |
| rmSync(runDir, { recursive: true, force: true }); |
| }); |
|
|
| const sample: EvalResult[] = [ |
| { |
| case_id: 'en-001', |
| category: 'english-teaching', |
| requirement: 'teach English to beginners', |
| groundTruth: 'English as primary', |
| directive: 'Use English as the teaching language', |
| outlinesCount: 3, |
| judgePassed: true, |
| judgeReason: 'correct language', |
| }, |
| { |
| case_id: 'zh-002', |
| category: 'chinese-teaching', |
| requirement: '用中文讲授数学', |
| groundTruth: '以中文为主', |
| directive: 'Use Chinese throughout', |
| outlinesCount: 2, |
| judgePassed: false, |
| judgeReason: 'wrong phrasing', |
| }, |
| ]; |
|
|
| it('writes a report.md file with header, per-case detail, and summary table', () => { |
| const path = writeReport(runDir, sample, { |
| inferenceModel: 'openai:gpt-4.1', |
| judgeModel: 'anthropic:claude-haiku-4-5', |
| }); |
| expect(existsSync(path)).toBe(true); |
| expect(path).toBe(join(runDir, 'report.md')); |
|
|
| const content = readFileSync(path, 'utf-8'); |
| expect(content).toContain('# Outline Language Inference Eval Results'); |
| expect(content).toContain('**Model**: openai:gpt-4.1'); |
| expect(content).toContain('**Judge model**: anthropic:claude-haiku-4-5'); |
| expect(content).toContain('**Passed**: 1/2'); |
| expect(content).toContain('### PASS en-001'); |
| expect(content).toContain('### **FAIL** zh-002'); |
| expect(content).toContain('## Summary'); |
| expect(content).toContain('| # | Case | Category | Outlines | Result | Judge reason |'); |
| expect(content).toContain('| 1 | en-001 | english-teaching | 3 | PASS | correct language |'); |
| expect(content).toContain('| 2 | zh-002 | chinese-teaching | 2 | FAIL | wrong phrasing |'); |
| }); |
|
|
| it('returns a successfully written path even when all cases pass', () => { |
| const allPass: EvalResult[] = [{ ...sample[0] }]; |
| const path = writeReport(runDir, allPass, { |
| inferenceModel: 'openai:gpt-4.1', |
| judgeModel: 'openai:gpt-4.1', |
| }); |
| const content = readFileSync(path, 'utf-8'); |
| expect(content).toContain('**Passed**: 1/1 (100%)'); |
| }); |
| }); |
|
|