import { generateText, type LanguageModel } from 'ai';
import type { JudgeResult } from './types';

/**
 * System prompt for the judge model: a lenient rubric that ends by asking for
 * a bare `{"pass": ..., "reason": ...}` JSON object.
 *
 * The segments are joined with single spaces, so the model receives one
 * continuous line of text.
 * NOTE(review): the content reads like a list-formatted prompt; confirm
 * whether line breaks between the segments were intended.
 */
const JUDGE_SYSTEM_PROMPT = [
  'You are evaluating whether a language directive for an AI course generation system is reasonable given the expected behavior.',
  'You will be given:',
  '1. The original user requirement',
  '2. The generated language directive',
  '3. The ground truth description of expected behavior',
  'Evaluation criteria — the directive should:',
  '- Use the correct primary teaching language',
  '- Handle terminology in a reasonable way for the subject and audience',
  '- For cross-language scenarios (foreign language learning, cross-language PDF), acknowledge both languages',
  'Be LENIENT in your evaluation:',
  '- The directive does NOT need to match the ground truth word-for-word',
  '- Different but equally valid approaches should PASS',
  '- If the teaching language is correct and the overall approach is reasonable, it should PASS',
  '- Only FAIL if the directive is clearly WRONG (e.g., wrong teaching language, completely ignoring a cross-language situation)',
  'Respond with ONLY a JSON object:',
  '{"pass": true/false, "reason": "brief explanation (1-2 sentences)"}',
].join(' ');

/** Placeholder used when the judge returns a verdict without a usable reason. */
const MISSING_REASON = '(no reason provided)';

/** Narrows an arbitrary JSON value to a plain (non-array, non-null) object. */
function isRecord(value: unknown): value is Record<string, unknown> {
  return typeof value === 'object' && value !== null && !Array.isArray(value);
}

/**
 * Extracts a validated verdict from the judge's raw reply, or `undefined`
 * when no JSON object with a boolean `pass` field can be found.
 */
function parseJudgeResponse(raw: string): JudgeResult | undefined {
  // Strip Markdown code-fence markers (with or without a "json" tag).
  const unfenced = raw.replace(/```json\s*|\s*```/gi, '').trim();

  // Try the whole reply first; if the model wrapped the JSON in prose, fall
  // back to the outermost {...} span.
  const candidates = [unfenced];
  const start = unfenced.indexOf('{');
  const end = unfenced.lastIndexOf('}');
  if (start !== -1 && end > start) {
    const braced = unfenced.slice(start, end + 1);
    if (braced !== unfenced) {
      candidates.push(braced);
    }
  }

  for (const candidate of candidates) {
    let parsed: unknown;
    try {
      parsed = JSON.parse(candidate);
    } catch {
      continue;
    }
    if (!isRecord(parsed)) {
      continue;
    }
    const { pass, reason } = parsed;
    // A verdict is only trustworthy when `pass` is a real boolean; values
    // such as "true" or 1 are treated as a malformed reply.
    if (typeof pass === 'boolean') {
      return { pass, reason: typeof reason === 'string' ? reason : MISSING_REASON };
    }
  }
  return undefined;
}

/**
 * Ask an LLM-as-judge whether `directive` is a reasonable language directive
 * for `requirement` given `groundTruth`. Lenient rubric — see system prompt.
 *
 * @param judgeModel - Model used to produce the verdict.
 * @param requirement - The original user requirement.
 * @param directive - The generated language directive under evaluation.
 * @param groundTruth - Description of the expected behavior.
 * @returns The judge's verdict. If the reply does not contain a JSON object
 *   with a boolean `pass`, a failing result is returned whose `reason` embeds
 *   the raw reply.
 * @throws Whatever `generateText` rejects with; the model call is not caught.
 */
export async function judgeDirective(
  judgeModel: LanguageModel,
  requirement: string,
  directive: string,
  groundTruth: string,
): Promise<JudgeResult> {
  const result = await generateText({
    model: judgeModel,
    system: JUDGE_SYSTEM_PROMPT,
    prompt: `Requirement: "${requirement}"\n\nGenerated directive: "${directive}"\n\nGround truth: "${groundTruth}"`,
    temperature: 0,
  });

  return (
    parseJudgeResponse(result.text) ?? {
      pass: false,
      reason: `Failed to parse judge response: ${result.text}`,
    }
  );
}