/**
* VLM Scorer for whiteboard layout quality.
*
* Uses the project's LLM infrastructure (resolveModel + generateText from AI SDK)
* so model configuration follows the same `provider:model` convention as the rest
* of the codebase. Supports all providers (OpenAI, Google, Anthropic, etc.).
*
* The caller supplies the model string explicitly (typically from EVAL_SCORER_MODEL);
 * there is no built-in default.
*/
import { readFileSync } from 'fs';
import { generateText } from 'ai';
import { resolveModel } from '@/lib/server/resolve-model';
import type { VlmScore } from './types';
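// Sketch of the VlmScore shape (declared in ./types), inferred from the
// validation and construction in scoreScreenshot below; not the
// authoritative definition: five { score: number; reason: string }
// dimensions (readability, overlap, rendering_correctness,
// content_completeness, layout_logic), a numeric `overall`, and a
// string[] `issues`.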
const RUBRIC_PROMPT = `You are evaluating a classroom whiteboard screenshot from an AI teaching assistant. Score like a teacher reviewing their own board work for a student's benefit.
Context: This is a real-time teaching whiteboard, NOT a poster or infographic.
- Empty space is NORMAL and NOT a problem – teachers write in one area at a time.
- What matters: would a student be confused, misled, or unable to read the content?
- Ignore the small dark circle "N" in the corner – it is a page UI element, not whiteboard content.
Score each dimension from 1 to 10 (10 = perfect, 1 = broken):
1. readability – Can a student read every element easily?
- Font size CONSISTENCY is critical: penalize heavily if some text is 2x+ larger than other text on the same board (e.g., one giant title + tiny formulas).
- Are characters crisp? Any Chinese rendered as boxes or missing glyphs?
- Penalize text styled like UI components (gray boxes, card backgrounds) that don't match handwritten whiteboard feel.
2. overlap – Are elements clear of each other, AND does new content respect existing content?
- Penalize any occlusion (shapes over text, text stacked on text, arrows piercing labels).
- CRITICAL: penalize "writing over existing content" – if a new formula is placed directly on top of an existing table row when empty space was available nearby, that is a layout failure, not just overlap.
- 10 = everything distinct; 1 = multiple elements unreadable due to occlusion.
3. rendering_correctness – Are formulas, shapes, and symbols drawn correctly?
- LaTeX must render: raw source like "\\\\frac", "\\\\theta", or garbled chunks like "0ext", "Gsinheta", "heta" = major penalty.
- Subscripts/superscripts must render: "G_x" shown as a raw underscore (not Gₓ) = penalty.
- Chinese characters inside LaTeX math mode (e.g., "(当 a > 0 时)" embedded in a formula) = penalty.
- Diagram ACCURACY matters: a parabola drawn as straight V-shaped lines, a circle drawn as an ellipse, a mislabeled angle = penalty.
- 10 = all math/shapes render correctly and match the concept; 1 = multiple broken renders OR fundamentally wrong diagrams.
4. content_completeness – Is the content whole, bounded, and annotated?
- Edge clipping: any element cut off at canvas edge (formula missing its left character, table column cut, arrow head beyond edge) = major penalty.
- Unexpected clearing: if previous turns' content has vanished in a later turn with no reason, penalize.
- Bare diagrams with no labels (a circle with no annotation of what it represents) = penalty.
- 10 = all content fully visible and annotated; 1 = significant content lost, truncated, or unlabeled.
5. layout_logic – Does the arrangement support teaching flow?
- Related elements grouped (a diagram with its labels/formulas together)?
- Natural reading order for the concept (cause → effect, equation → graph → solution)?
- Spatial planning: does new content go to sensibly-chosen empty areas rather than crammed near or over existing elements?
overall: 1–10 holistic teaching-quality score. Weight overlap and rendering_correctness more heavily since they directly block comprehension.
issues: 1-5 short concrete problem descriptions a teacher would call out.
Output ONLY a JSON object with this exact structure (no markdown, no code fences):
{"readability":{"score":N,"reason":"..."},"overlap":{"score":N,"reason":"..."},"rendering_correctness":{"score":N,"reason":"..."},"content_completeness":{"score":N,"reason":"..."},"layout_logic":{"score":N,"reason":"..."},"overall":N,"issues":["..."]}`;
/**
* Score a whiteboard screenshot using a VLM.
*
 * The model string must be supplied by the caller (typically from
 * EVAL_SCORER_MODEL); there is no built-in default.
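 *
 * @example
 * // Illustrative usage; the screenshot path and model string here are
 * // hypothetical, but the model string follows the same `provider:model`
 * // convention used elsewhere in the codebase.
 * const score = await scoreScreenshot('./screenshots/turn-3.png', 'openai:gpt-4o');
 * console.log(score.overall, score.issues);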
*/
export async function scoreScreenshot(
screenshotPath: string,
modelString: string,
): Promise<VlmScore> {
const imageBuffer = readFileSync(screenshotPath);
const { model } = await resolveModel({ modelString });
const result = await generateText({
model,
messages: [
{
role: 'user',
content: [
{ type: 'text', text: RUBRIC_PROMPT },
{ type: 'image', image: imageBuffer },
],
},
],
temperature: 0,
maxOutputTokens: 3000,
});
const content = result.text;
// Greedily extract the JSON object (first '{' to last '}') so markdown code fences or surrounding prose are tolerated
const jsonMatch = content.match(/\{[\s\S]*\}/);
if (!jsonMatch) {
throw new Error(`VLM returned non-JSON response: ${content.slice(0, 200)}`);
}
// eslint-disable-next-line @typescript-eslint/no-explicit-any
let raw: any;
try {
raw = JSON.parse(jsonMatch[0]);
} catch {
// The VLM occasionally emits trailing commas inside the object – strip them and retry
const cleaned = jsonMatch[0]
.replace(/,\s*}/g, '}') // trailing commas
.replace(/,\s*]/g, ']');
try {
raw = JSON.parse(cleaned);
} catch (e2) {
throw new Error(
`VLM returned invalid JSON: ${(e2 as Error).message}\n${jsonMatch[0].slice(0, 300)}`,
);
}
}
const dimensions = [
'readability',
'overlap',
'rendering_correctness',
'content_completeness',
'layout_logic',
] as const;
for (const dim of dimensions) {
if (!raw[dim] || typeof raw[dim].score !== 'number') {
throw new Error(`VLM response missing or invalid dimension: ${dim}`);
}
}
if (typeof raw.overall !== 'number') {
throw new Error('VLM response missing overall score');
}
const score: VlmScore = {
readability: raw.readability,
overlap: raw.overlap,
rendering_correctness: raw.rendering_correctness,
content_completeness: raw.content_completeness,
layout_logic: raw.layout_logic,
overall: raw.overall,
issues: Array.isArray(raw.issues) ? raw.issues : [],
};
return score;
}