| import { generateText, type LanguageModel } from 'ai'; |
| import type { JudgeResult } from './types'; |
|
|
/**
 * System prompt passed as `system` to `generateText` by `judgeDirective`.
 *
 * It tells the judge model which three inputs it will receive (requirement,
 * generated directive, ground truth), asks for a lenient evaluation, and
 * requires a JSON-only reply of the form `{"pass": boolean, "reason": string}`.
 */
| const JUDGE_SYSTEM_PROMPT = `You are evaluating whether a language directive for an AI course generation system is reasonable given the expected behavior. |
| |
| You will be given: |
| 1. The original user requirement |
| 2. The generated language directive |
| 3. The ground truth description of expected behavior |
| |
| Evaluation criteria — the directive should: |
| - Use the correct primary teaching language |
| - Handle terminology in a reasonable way for the subject and audience |
| - For cross-language scenarios (foreign language learning, cross-language PDF), acknowledge both languages |
| |
| Be LENIENT in your evaluation: |
| - The directive does NOT need to match the ground truth word-for-word |
| - Different but equally valid approaches should PASS |
| - If the teaching language is correct and the overall approach is reasonable, it should PASS |
| - Only FAIL if the directive is clearly WRONG (e.g., wrong teaching language, completely ignoring a cross-language situation) |
| |
| Respond with ONLY a JSON object: |
| {"pass": true/false, "reason": "brief explanation (1-2 sentences)"}`; |
|
|
| |
| |
| |
| |
/**
 * Turns the judge model's raw reply into a validated `JudgeResult`.
 *
 * Markdown code fences are stripped first. The remaining text is parsed as
 * JSON; if that does not yield a verdict, the outermost `{...}` span is tried
 * so that replies with surrounding prose are still understood.
 *
 * A verdict is accepted only when it is an object whose `pass` is a real
 * boolean. A missing or non-string `reason` becomes an empty string. Anything
 * else produces a failing result that embeds the raw reply for debugging.
 */
function parseJudgeResponse(raw: string): JudgeResult {
  const unfenced = raw.replace(/```(?:json)?\s*|\s*```/gi, '').trim();

  const candidates = [unfenced];
  const firstBrace = unfenced.indexOf('{');
  const lastBrace = unfenced.lastIndexOf('}');
  const spansWholeText = firstBrace === 0 && lastBrace === unfenced.length - 1;
  if (firstBrace !== -1 && lastBrace > firstBrace && !spansWholeText) {
    candidates.push(unfenced.slice(firstBrace, lastBrace + 1));
  }

  for (const candidate of candidates) {
    let parsed: unknown;
    try {
      parsed = JSON.parse(candidate);
    } catch {
      continue;
    }

    // JSON.parse can return null, arrays, or primitives; only a plain object
    // with a boolean `pass` counts as a verdict.
    if (typeof parsed === 'object' && parsed !== null) {
      const { pass, reason } = parsed as { pass?: unknown; reason?: unknown };
      if (typeof pass === 'boolean') {
        return { pass, reason: typeof reason === 'string' ? reason : '' };
      }
    }
  }

  return { pass: false, reason: `Failed to parse judge response: ${raw}` };
}

/**
 * Asks a judge model whether a generated language directive is acceptable.
 *
 * @param judgeModel - Model handed to `generateText` to act as the judge.
 * @param requirement - The original user requirement.
 * @param directive - The generated language directive under evaluation.
 * @param groundTruth - Description of the expected behavior.
 * @returns The judge's verdict. If the reply cannot be parsed into a valid
 *   verdict, the result has `pass: false` and a reason containing the raw
 *   reply.
 *
 * Rejections from `generateText` are not caught here and propagate to the
 * caller.
 */
export async function judgeDirective(
  judgeModel: LanguageModel,
  requirement: string,
  directive: string,
  groundTruth: string,
): Promise<JudgeResult> {
  const result = await generateText({
    model: judgeModel,
    system: JUDGE_SYSTEM_PROMPT,
    prompt: `Requirement: "${requirement}"\n\nGenerated directive: "${directive}"\n\nGround truth: "${groundTruth}"`,
    // Deterministic sampling keeps repeated evaluations comparable.
    temperature: 0,
  });

  return parseJudgeResponse(result.text);
}
|
|