csa / test /e2e-prompt-ab2.mjs
ricebug's picture
Upload 76 files
ca8ab2d verified
/**
* test/e2e-prompt-ab2.mjs
*
* Round-two prompt A/B test:
* ⑤ tool-result continuation prompt (tail of extractToolResultNatural)
* ② thinkingSuffix (appended to the end of every user message)
* ③ fewShotResponse (the few-shot example text)
*
* ๆฏไธชๆ็คบ่ฏ็š„ๆต‹่ฏ•่ฎพ่ฎกไพง้‡ไบŽๅ…ถ็‰นๅฎšๅฝฑๅ“้ข๏ผš
* - โ‘ค ็ปญ่กŒๆ็คบ โ†’ ๅคš่ฝฎๅทฅๅ…ทๅพช็Žฏไธญๆจกๅž‹ๆ˜ฏๅฆๆŒ็ปญ่กŒๅŠจ
* - โ‘ก ๆ–นๅ‘ๅŽ็ผ€ โ†’ ๆจกๅž‹ๆ˜ฏๅฆๅœจๆฏๆกๆถˆๆฏๅŽ็ซ‹ๅณ่กŒๅŠจ
* - โ‘ข few-shot โ†’ ๆ ผๅผ้ตๅพชๅบฆๅ’Œๅ™่ฟฐ้ฃŽๆ ผ
*
* Usage:
* VARIANT=baseline node test/e2e-prompt-ab2.mjs
* VARIANT=candidate_x node test/e2e-prompt-ab2.mjs
* node test/e2e-prompt-ab2.mjs --compare
*/
// Base URL of the server under test; the port comes from the PORT environment
// variable and falls back to 3010 when PORT is unset or empty.
const BASE_URL = `http://localhost:${process.env.PORT || 3010}`;
// Model id placed in the body of every /v1/messages request.
const MODEL = 'claude-sonnet-4-5-20251120';
// Default upper bound on request/response rounds for a single scenario.
const MAX_TURNS = 10;
// Label for this run (VARIANT environment variable, 'current' when unset);
// it appears in the report and in the results file name.
const VARIANT = process.env.VARIANT || 'current';
// True when the script was started with the --compare flag.
const COMPARE_MODE = process.argv.includes('--compare');
// --- Colors -----------------------------------------------------------------
/** ANSI escape sequences used to style terminal output. */
const C = {
  reset: '\x1b[0m',
  bold: '\x1b[1m',
  dim: '\x1b[2m',
  green: '\x1b[32m',
  red: '\x1b[31m',
  yellow: '\x1b[33m',
  cyan: '\x1b[36m',
  blue: '\x1b[34m',
  magenta: '\x1b[35m',
  gray: '\x1b[90m',
};

/** Formats `s` as a green success line: marker, text, then a style reset. */
function ok(s) {
  return `${C.green}โœ… ${s}${C.reset}`;
}

/** Formats `s` as a red failure line: marker, text, then a style reset. */
function fail(s) {
  return `${C.red}โŒ ${s}${C.reset}`;
}

/** Formats `s` as a yellow warning line: marker, text, then a style reset. */
function warn(s) {
  return `${C.yellow}โš  ${s}${C.reset}`;
}

/** Formats `s` as a bold cyan section header preceded by a newline. */
function hdr(s) {
  return `\n${C.bold}${C.cyan}โ”โ”โ” ${s} โ”โ”โ”${C.reset}`;
}

/** Formats `s` as a gray, space-prefixed detail line. */
function info(s) {
  return ` ${C.gray}${s}${C.reset}`;
}
// --- Base tool set ----------------------------------------------------------
/**
 * Tool definitions sent with each request. Every entry has the shape
 * `{ name, description, input_schema: { type: 'object', properties, required } }`.
 * The compact tuples below are `[name, description, properties, required]`.
 */
const TOOLS = [
  ['Read', 'Reads a file.', { file_path: { type: 'string' } }, ['file_path']],
  [
    'Write',
    'Writes a file.',
    { file_path: { type: 'string' }, content: { type: 'string' } },
    ['file_path', 'content'],
  ],
  ['Bash', 'Executes a bash command.', { command: { type: 'string' } }, ['command']],
  [
    'Grep',
    'Search for patterns in files.',
    { pattern: { type: 'string' }, path: { type: 'string' } },
    ['pattern'],
  ],
  ['LS', 'Lists directory contents.', { path: { type: 'string' } }, ['path']],
  ['attempt_completion', 'Present the final result.', { result: { type: 'string' } }, ['result']],
].map(([name, description, properties, required]) => ({
  name,
  description,
  input_schema: { type: 'object', properties, required },
}));
// --- Virtual file system ----------------------------------------------------
// In-memory stand-in for the project on disk: maps an absolute path to the
// file's text. mockExec reads from it (Read, Grep, LS) and its Write case
// assigns into it, so contents can change while scenarios run.
const MOCK_FS = {
'/project/package.json': '{"name":"my-app","version":"2.0.0","dependencies":{"express":"^4.18.0","lodash":"^4.17.21"}}',
'/project/src/index.ts': 'import express from "express";\nimport { router } from "./router";\nconst app = express();\napp.use("/api", router);\napp.listen(3000);\n',
// Contains one TODO comment (POST /users).
'/project/src/router.ts': 'import { Router } from "express";\nexport const router = Router();\nrouter.get("/health", (_, res) => res.json({ ok: true }));\nrouter.get("/users", (_, res) => res.json([]));\n// TODO: add POST /users\n',
// Contains one TODO comment (debounce).
'/project/src/utils.ts': 'export function clamp(v: number, min: number, max: number) {\n return Math.min(Math.max(v, min), max);\n}\n// TODO: add debounce function\n',
'/project/tsconfig.json': '{"compilerOptions":{"target":"es2020","module":"commonjs","strict":true}}',
'/project/README.md': '# My App\nExpress API server.\n## API\n- GET /api/health\n- GET /api/users\n',
};
/**
 * Simulates one tool call against the in-memory MOCK_FS and returns the text
 * that is sent back to the model as the tool result.
 *
 * Supported tools:
 * - Read: file content, or an "Error: File not found" message. Only own keys
 *   of MOCK_FS count as files, so an empty file reads back as '' and names
 *   such as "constructor" are reported as missing.
 * - Write: stores the content (always as a string) and reports its length.
 * - Bash: canned output for commands containing "npm test" or "tsc",
 *   otherwise an echo of the command.
 * - Grep: case-insensitive substring search, one `path:line:text` entry per
 *   hit. When `path` is given, only files whose path starts with it are
 *   searched (the same prefix rule LS uses); relative paths are not resolved.
 * - LS: every known path starting with `path` (default '/project').
 * - attempt_completion: the result prefixed with the `__DONE__:` sentinel.
 *
 * @param {string} name - Tool name from the model's tool_use block.
 * @param {object} [input] - Tool arguments; a missing or null value is
 *   treated as an empty object.
 * @returns {string} Tool output text.
 */
function mockExec(name, input = {}) {
  const args = input ?? {};
  switch (name) {
    case 'Read': {
      const filePath = args.file_path;
      // Own-property check: `||` would turn an empty file into "not found"
      // and would expose Object.prototype members for keys like "constructor".
      return Object.hasOwn(MOCK_FS, filePath)
        ? MOCK_FS[filePath]
        : `Error: File not found: ${filePath}`;
    }
    case 'Write': {
      // Models occasionally omit `content` or send a non-string; keep the
      // store string-only so later Read/Grep calls cannot fail on it.
      const raw = args.content;
      let text = '';
      if (typeof raw === 'string') {
        text = raw;
      } else if (raw != null) {
        text = JSON.stringify(raw);
      }
      MOCK_FS[args.file_path] = text;
      return `Wrote ${text.length} chars`;
    }
    case 'Bash': {
      const command = args.command;
      const commandText = typeof command === 'string' ? command : '';
      if (commandText.includes('npm test')) return 'Tests passed: 3/3';
      if (commandText.includes('tsc')) return 'Compilation successful';
      return `$ ${command}\n(ok)`;
    }
    case 'Grep': {
      const needle = String(args.pattern ?? '').toLowerCase();
      const root = typeof args.path === 'string' ? args.path : '';
      const hits = [];
      for (const [filePath, content] of Object.entries(MOCK_FS)) {
        // Honour the optional `path` argument declared in the tool schema.
        if (root !== '' && !filePath.startsWith(root)) continue;
        String(content).split('\n').forEach((line, index) => {
          if (line.toLowerCase().includes(needle)) {
            hits.push(`${filePath}:${index + 1}:${line.trim()}`);
          }
        });
      }
      return hits.join('\n') || 'No matches.';
    }
    case 'LS':
      return Object.keys(MOCK_FS)
        .filter((p) => p.startsWith(args.path || '/project'))
        .join('\n');
    case 'attempt_completion':
      return `__DONE__:${args.result}`;
    default:
      return `Executed ${name}`;
  }
}
// --- Multi-turn engine ------------------------------------------------------
/**
 * Runs one conversation against `${BASE_URL}/v1/messages`, answering every
 * tool_use block with mockExec, until one of these happens:
 * - the model calls `attempt_completion` (completed = true);
 * - the model ends its turn or returns no tool calls (stopped = true);
 * - `maxTurns` requests have been made (neither flag is set).
 *
 * Completion is decided by the tool name rather than by inspecting tool
 * output, so file content that happens to start with the `__DONE__` sentinel
 * cannot end the run early, and a non-string tool result cannot throw.
 *
 * @param {string} userMessage - First user message of the conversation.
 * @param {object} [opts]
 * @param {Array} [opts.tools=TOOLS] - Tool definitions sent with each request.
 * @param {string} [opts.systemPrompt=''] - System prompt; a built-in default
 *   is used when empty.
 * @param {object} [opts.toolChoice] - Sent as `tool_choice` when provided.
 * @param {number} [opts.maxTurns=MAX_TURNS] - Maximum number of requests.
 * @returns {Promise<object>} Metrics: totalToolCalls, totalTextChars, turns,
 *   firstTurnToolCount, firstTurnTextLen, toolLog (`{ turn, tool }` entries),
 *   completed, stopped, narrationRatio and toolPath.
 * @throws {Error} `HTTP <status>` when the server answers with a non-2xx status.
 */
async function runMultiTurn(userMessage, opts = {}) {
  const { tools = TOOLS, systemPrompt = '', toolChoice, maxTurns = MAX_TURNS } = opts;
  const messages = [{ role: 'user', content: userMessage }];
  const system = systemPrompt || 'You are an AI coding assistant. Working directory: /project.';
  let totalToolCalls = 0;
  let totalTextChars = 0;
  let turns = 0;
  let firstTurnToolCount = 0;
  let firstTurnTextLen = 0;
  const toolLog = [];
  let completed = false;
  // True when the model stopped midway (ended its turn without completing).
  let stopped = false;

  while (turns < maxTurns) {
    turns++;
    const resp = await fetch(`${BASE_URL}/v1/messages`, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json', 'x-api-key': 'dummy' },
      body: JSON.stringify({
        model: MODEL, max_tokens: 4096, system, tools,
        ...(toolChoice ? { tool_choice: toolChoice } : {}),
        messages,
      }),
    });
    if (!resp.ok) throw new Error(`HTTP ${resp.status}`);
    const data = await resp.json();

    // Treat a missing or malformed `content` as an empty response.
    const blocks = Array.isArray(data?.content) ? data.content : [];
    const textBlocks = blocks.filter((b) => b.type === 'text');
    const toolUseBlocks = blocks.filter((b) => b.type === 'tool_use');
    const turnText = textBlocks.reduce((sum, b) => sum + (b.text?.length || 0), 0);

    totalTextChars += turnText;
    totalToolCalls += toolUseBlocks.length;
    if (turns === 1) {
      firstTurnToolCount = toolUseBlocks.length;
      firstTurnTextLen = turnText;
    }
    for (const tb of toolUseBlocks) {
      toolLog.push({ turn: turns, tool: tb.name });
    }

    // The model ended its turn (or produced no tool calls) before completing.
    if (data?.stop_reason === 'end_turn' || toolUseBlocks.length === 0) {
      stopped = true;
      break;
    }

    messages.push({ role: 'assistant', content: blocks });
    const results = toolUseBlocks.map((tb) => ({
      type: 'tool_result', tool_use_id: tb.id,
      content: mockExec(tb.name, tb.input),
    }));
    messages.push({ role: 'user', content: results });

    if (toolUseBlocks.some((tb) => tb.name === 'attempt_completion')) {
      completed = true;
      break;
    }
  }

  return {
    totalToolCalls, totalTextChars, turns,
    firstTurnToolCount, firstTurnTextLen,
    toolLog, completed, stopped,
    // Share of narration in the run, weighting each tool call as 100 characters.
    narrationRatio: totalTextChars / Math.max(totalTextChars + totalToolCalls * 100, 1),
    toolPath: toolLog.map((t) => `${t.turn}:${t.tool}`).join(' โ†’ '),
  };
}
// --- Test scenarios ---------------------------------------------------------
// Each scenario: `id` (key used in result files), `group` (report section),
// `name` / `description` (printed labels), `prompt` (first user message),
// optional `toolChoice` (forwarded to runMultiTurn) and `expect`.
// How the runner in this file treats the `expect` keys:
// - minTools, completed, firstTurnAction, firstTurnMinTools: fail the scenario.
// - maxFirstTurnText: only adds a warning note; the scenario still passes.
// - formatCorrect: fails only when no tool call was made at all.
// - noMetaText: declared here, but no check in the runner reads it.
const SCENARIOS = [
// ========= (5) continuation-prompt tests =========
{
// Three consecutive steps without stopping midway.
id: 'continuation_3step',
group: 'โ‘ค ็ปญ่กŒๆ็คบ',
name: '3 ๆญฅ่ฟž็ปญไปปๅŠก๏ผˆไธไธญๆ–ญ๏ผ‰',
description: 'ๆจกๅž‹ๅฟ…้กป่ฟž็ปญๆ‰ง่กŒ 3 ๆญฅ๏ผŒไธ่ƒฝไธญ้€”ๅœไธ‹ใ€‚ๆต‹่ฏ•็ปญ่กŒๆŒ‡ไปคๆ˜ฏๅฆๆœ‰ๆ•ˆใ€‚',
prompt: 'Step 1: Read /project/src/router.ts. Step 2: Read /project/src/utils.ts. Step 3: After reading both, use attempt_completion to summarize all TODO items found.',
expect: { minTools: 3, completed: true },
toolChoice: { type: 'any' },
},
{
// Keep going after a tool error (read a missing file, then fall back).
id: 'continuation_after_error',
group: 'โ‘ค ็ปญ่กŒๆ็คบ',
name: '้”™่ฏฏๅŽ็ปง็ปญ',
description: '่ฏปๅ–ไธๅญ˜ๅœจ็š„ๆ–‡ไปถโ†’ๆ”ถๅˆฐ้”™่ฏฏโ†’ๅบ”็ปง็ปญๅฐ่ฏ•ๅ…ถไป–ๆ–‡ไปถ่€Œไธๆ˜ฏๅœไธ‹ใ€‚',
prompt: 'Read /project/src/nonexistent.ts. If it fails, read /project/src/index.ts instead.',
expect: { minTools: 2 },
},
{
// Long tool chain (4+ steps) ending in attempt_completion.
id: 'continuation_long_chain',
group: 'โ‘ค ็ปญ่กŒๆ็คบ',
name: '้•ฟ้“พไปปๅŠก๏ผˆโ‰ฅ4 ๆญฅ๏ผ‰',
description: 'ๆต‹่ฏ•ๅœจ 4+ ๆญฅๅทฅๅ…ท้“พไธญๆจกๅž‹ๆ˜ฏๅฆๆŒ็ปญๆŽจ่ฟ›ใ€‚',
prompt: 'Please do these steps in order: 1) LS /project/src 2) Read /project/src/index.ts 3) Read /project/src/router.ts 4) Grep for "TODO" in /project/src 5) attempt_completion with a summary of all findings.',
expect: { minTools: 4, completed: true },
toolChoice: { type: 'any' },
},
// ========= (2) direction-suffix tests =========
{
// Simple request: act immediately instead of describing a plan first.
id: 'suffix_immediate_action',
group: 'โ‘ก ๆ–นๅ‘ๅŽ็ผ€',
name: '็ซ‹ๅณ่กŒๅŠจ๏ผˆๆ— ๅ™่ฟฐ๏ผ‰',
description: '็ฎ€ๅ•่ฏทๆฑ‚ๅบ”็ดง้šๅŽ็ผ€ๆŒ‡็คบ็›ดๆŽฅ่กŒๅŠจ๏ผŒ่€Œไธๆ˜ฏๅ…ˆๆ่ฟฐ่ฎกๅˆ’ใ€‚',
prompt: 'Show me the project structure.',
expect: { firstTurnAction: true, maxFirstTurnText: 100 },
},
{
// Ambiguous request: still act first (read files), discuss afterwards.
id: 'suffix_ambiguous_task',
group: 'โ‘ก ๆ–นๅ‘ๅŽ็ผ€',
name: 'ๆจก็ณŠไปปๅŠกไนŸ่กŒๅŠจ',
description: 'ๅณไฝฟไปปๅŠก็จๆœ‰ๆจก็ณŠ๏ผŒๆจกๅž‹ไนŸๅบ”ๅ…ˆ่กŒๅŠจ๏ผˆ่ฏปๆ–‡ไปถ๏ผ‰ๅ†่ฎจ่ฎบใ€‚',
prompt: 'Help me understand this project.',
expect: { firstTurnAction: true },
},
{
// Several files: expect multiple tool calls within the first turn.
id: 'suffix_multi_file',
group: 'โ‘ก ๆ–นๅ‘ๅŽ็ผ€',
name: 'ๅคšๆ–‡ไปถๅนถ่กŒ',
description: 'ๆ–นๅ‘ๅŽ็ผ€ๅบ”่ฎฉๆจกๅž‹ๅœจไธ€่ฝฎๅ†…ๅนถ่กŒ่ฐƒ็”จๅคšไธชๅทฅๅ…ทใ€‚',
prompt: 'Read /project/src/index.ts and /project/src/router.ts and /project/tsconfig.json.',
expect: { firstTurnMinTools: 2 },
},
// ========= (3) few-shot tests =========
{
// Output-format adherence.
id: 'fewshot_format',
group: 'โ‘ข few-shot',
name: '่พ“ๅ‡บๆ ผๅผ้ตๅพชๅบฆ',
description: 'ๆจกๅž‹ๆ˜ฏๅฆไธฅๆ ผ้ตๅพช ```json action ๆ ผๅผ๏ผˆ่€Œไธๆ˜ฏๅ…ถไป–ๅ˜ไฝ“๏ผ‰ใ€‚',
prompt: 'Read /project/package.json and tell me the project name.',
expect: { formatCorrect: true, minTools: 1 },
},
{
// Style imitation: narration should be as terse as the few-shot sample.
id: 'fewshot_style_match',
group: 'โ‘ข few-shot',
name: '้ฃŽๆ ผๆจกไปฟ โ€”โ€” ๅ™่ฟฐ็ฎ€ๆดๅบฆ',
description: 'few-shot ๆ ทๆœฌ่ถŠ็ฎ€ๆด๏ผŒๆจกๅž‹็š„ๅ›žๅคไนŸๅบ”่ถŠ็ฎ€ๆดใ€‚',
prompt: 'List all TypeScript files in the project.',
expect: { maxFirstTurnText: 80 },
},
{
// No meta narration about the output format itself.
id: 'fewshot_no_meta',
group: 'โ‘ข few-shot',
name: 'ๆ— ๅ…ƒๅ™่ฟฐ',
description: 'ๆจกๅž‹ไธๅบ”่พ“ๅ‡บ็ฑปไผผ "I will use the structured format" ็š„่‡ชๆˆ‘ๆ่ฟฐใ€‚',
prompt: 'Check if there are any TODO comments in /project/src/utils.ts.',
expect: { noMetaText: true, minTools: 1 },
},
];
// --- Compare mode -----------------------------------------------------------
// With --compare: load every saved results file under test/, print one table
// per scenario group plus a per-variant summary, then exit without running
// any scenario.
if (COMPARE_MODE) {
  const fs = await import('fs');
  const isResultsFile = (fileName) =>
    fileName.startsWith('prompt-ab2-results-') && fileName.endsWith('.json');
  const files = fs.readdirSync('test').filter((fileName) => isResultsFile(fileName)).sort();
  if (files.length < 2) {
    console.log(`\n${fail('้œ€่ฆ่‡ณๅฐ‘ 2 ไธช็ป“ๆžœๆ–‡ไปถใ€‚')}`);
    process.exit(1);
  }

  const results = [];
  for (const file of files) {
    const parsed = JSON.parse(fs.readFileSync(`test/${file}`, 'utf-8'));
    results.push({ file, ...parsed });
  }

  console.log(`\n${C.bold}${C.magenta}โ•โ• ็ฌฌไบŒ่ฝฎๆ็คบ่ฏ A/B ๅฏนๆฏ” โ•โ•${C.reset}\n`);
  for (const run of results) {
    console.log(` ${C.cyan}${run.variant}${C.reset} โ€” ${run.timestamp}`);
  }

  // One fixed-width table cell summarising a scenario within one results file.
  const cellFor = (run, scenarioId) => {
    const entry = run.scenarios.find((x) => x.id === scenarioId);
    if (!entry) return 'N/A'.padEnd(25);
    const metrics = entry.metrics;
    if (!metrics) return 'โŒ ERR'.padEnd(25);
    const mark = entry.passed ? 'โœ…' : 'โŒ';
    const narration = Math.round((metrics.narrationRatio || 0) * 100);
    return `${mark} T:${metrics.totalToolCalls} N:${narration}% ${metrics.turns}่ฝฎ`.padEnd(25);
  };

  // Output grouped by scenario group, in first-appearance order.
  const rule = 'โ”€'.repeat(120);
  const groupNames = new Set(SCENARIOS.map((s) => s.group));
  for (const group of groupNames) {
    console.log(hdr(group));
    console.log(rule);
    const heading = ['ๅœบๆ™ฏ'.padEnd(28), ...results.map((run) => run.variant.padEnd(25))];
    console.log(`${C.bold}${heading.join('')}${C.reset}`);
    console.log(rule);
    for (const sc of SCENARIOS) {
      if (sc.group !== group) continue;
      const cells = results.map((run) => cellFor(run, sc.id));
      console.log([sc.id.padEnd(28), ...cells].join(''));
    }
  }

  // Summary: one line of totals per results file.
  console.log(`\n${C.bold}ๆฑ‡ๆ€ป:${C.reset}`);
  for (const run of results) {
    const { scenarios } = run;
    let pass = 0;
    let narrationSum = 0;
    let totalTools = 0;
    let completions = 0;
    for (const entry of scenarios) {
      if (entry.passed) pass++;
      narrationSum += entry.metrics?.narrationRatio || 0;
      totalTools += entry.metrics?.totalToolCalls || 0;
      if (entry.metrics?.completed) completions++;
    }
    const avgNarr = narrationSum / scenarios.length;
    console.log(` ${C.cyan}${run.variant}${C.reset}: ${pass}/${scenarios.length}้€š่ฟ‡ ๅทฅๅ…ท:${totalTools} ๅ™่ฟฐ:${Math.round(avgNarr * 100)}% ๅฎŒๆˆ:${completions}`);
  }
  process.exit(0);
}
// --- Main test flow ---------------------------------------------------------
console.log(`\n${C.bold}${C.magenta} ็ฌฌไบŒ่ฝฎๆ็คบ่ฏ A/B ๆต‹่ฏ•${C.reset}`);
console.log(info(`VARIANT=${VARIANT} MODEL=${MODEL}`));
// Health check: the scenarios are only run when GET /v1/models answers with a
// 2xx status. On failure the cause is printed as well, so a refused
// connection can be told apart from an HTTP error returned by a live server.
try {
  const resp = await fetch(`${BASE_URL}/v1/models`, { headers: { 'x-api-key': 'dummy' } });
  if (!resp.ok) throw new Error(`HTTP ${resp.status}`);
  console.log(`\n${ok('ๆœๅŠกๅ™จๅœจ็บฟ')}`);
} catch (err) {
  console.log(`\n${fail('ๆœๅŠกๅ™จๆœช่ฟ่กŒ')}`);
  const reason = err instanceof Error ? err.message : String(err);
  // Node's fetch reports network failures as "fetch failed" and puts the
  // system error code (e.g. ECONNREFUSED) on err.cause.
  const code = err?.cause?.code;
  console.log(info(`GET ${BASE_URL}/v1/models: ${reason}${code ? ` (${code})` : ''}`));
  process.exit(1);
}
// Runs every scenario in order, prints its metrics and verdict, and collects
// one record per scenario for the results file.
const scenarioResults = [];
let passed = 0;
let failedCount = 0;
let currentGroup = '';
for (const sc of SCENARIOS) {
  // Print a section header whenever the group changes.
  if (sc.group !== currentGroup) {
    currentGroup = sc.group;
    console.log(hdr(currentGroup));
  }
  process.stdout.write(` ${C.blue}โ–ถ${C.reset} ${C.bold}${sc.name}${C.reset}\n`);
  console.log(info(sc.description));
  const startedAt = Date.now();
  try {
    const run = await runMultiTurn(sc.prompt, { toolChoice: sc.toolChoice });
    const expect = sc.expect;

    // Expectation checks, in report order. `fatal: false` entries are
    // warnings: they are listed but do not fail the scenario.
    const checks = [
      {
        hit: expect.minTools && run.totalToolCalls < expect.minTools,
        fatal: true,
        text: `ๅทฅๅ…ท่ฐƒ็”จ ${run.totalToolCalls} < ${expect.minTools}`,
      },
      {
        hit: expect.completed && !run.completed,
        fatal: true,
        text: 'ไปปๅŠกๆœชๅฎŒๆˆ๏ผˆๆœช่ฐƒ็”จ attempt_completion๏ผ‰',
      },
      {
        hit: expect.firstTurnAction && run.firstTurnToolCount === 0,
        fatal: true,
        text: '็ฌฌไธ€่ฝฎๆ— ๅทฅๅ…ท่ฐƒ็”จ',
      },
      {
        hit: expect.maxFirstTurnText && run.firstTurnTextLen > expect.maxFirstTurnText,
        fatal: false,
        text: `้ฆ–่ฝฎๆ–‡ๆœฌ ${run.firstTurnTextLen} > ${expect.maxFirstTurnText} (่ญฆๅ‘Š)`,
      },
      {
        hit: expect.firstTurnMinTools && run.firstTurnToolCount < expect.firstTurnMinTools,
        fatal: true,
        text: `้ฆ–่ฝฎๅทฅๅ…ท ${run.firstTurnToolCount} < ${expect.firstTurnMinTools}`,
      },
      {
        hit: expect.formatCorrect && run.totalToolCalls === 0,
        fatal: true,
        text: 'ๆ— ๅทฅๅ…ท่ฐƒ็”จ๏ผˆๆ— ๆณ•้ชŒ่ฏๆ ผๅผ๏ผ‰',
      },
    ];
    const triggered = checks.filter((check) => check.hit);
    const failReasons = triggered.map((check) => check.text);
    const testPassed = !triggered.some((check) => check.fatal);

    console.log(info(` ๅทฅๅ…ท: ${run.totalToolCalls} ่ฝฎๆ•ฐ: ${run.turns} ๆ–‡ๆœฌ: ${run.totalTextChars}chars ๅ™่ฟฐ: ${Math.round(run.narrationRatio * 100)}% ๅฎŒๆˆ: ${run.completed ? 'โœ…' : 'โŒ'}`));
    console.log(info(` ้“พ: ${run.toolPath}`));
    const ms = ((Date.now() - startedAt) / 1000).toFixed(1);
    if (testPassed) {
      // A pass may still carry warning notes; append them when present.
      const notes = failReasons.length === 0 ? '' : ` โ€” ${failReasons.join(', ')}`;
      console.log(` ${ok('้€š่ฟ‡')} (${ms}s)${notes}`);
      passed++;
    } else {
      console.log(` ${fail('ๆœช้€š่ฟ‡')} (${ms}s)`);
      for (const reason of failReasons) {
        console.log(` ${C.yellow}โ†’ ${reason}${C.reset}`);
      }
      failedCount++;
    }
    scenarioResults.push({ id: sc.id, name: sc.name, group: sc.group, passed: testPassed, failReasons, metrics: run });
  } catch (err) {
    // Any error while running or evaluating counts as a failed scenario
    // recorded without metrics.
    console.log(` ${fail('้”™่ฏฏ')}: ${err.message}`);
    failedCount++;
    scenarioResults.push({ id: sc.id, name: sc.name, group: sc.group, passed: false, failReasons: [err.message], metrics: null });
  }
}
// Final tally, then persist this variant's results for a later --compare run.
const total = passed + failedCount;
const failColor = failedCount > 0 ? C.red : '';
console.log(`\n${'โ•'.repeat(62)}`);
console.log(
  `${C.bold} [${VARIANT}] ็ป“ๆžœ: ${C.green}${passed} ้€š่ฟ‡${C.reset}${C.bold} / ${failColor}${failedCount} ๆœช้€š่ฟ‡${C.reset}${C.bold} / ${total} ๅœบๆ™ฏ${C.reset}`,
);
console.log('โ•'.repeat(62));

const fs = await import('fs');
const summary = { passed, failed: failedCount, total };
const out = {
  variant: VARIANT,
  timestamp: new Date().toISOString(),
  model: MODEL,
  scenarios: scenarioResults,
  summary,
};
// One file per variant, written relative to the current working directory.
const outFile = `test/prompt-ab2-results-${VARIANT}.json`;
fs.writeFileSync(outFile, JSON.stringify(out, null, 2));
console.log(`\n${info(`ๅทฒไฟๅญ˜: ${outFile}`)}`);
console.log(info('ๅฏนๆฏ”: node test/e2e-prompt-ab2.mjs --compare'));
console.log();