Spaces:

ricebug
/

csa

Sleeping

App Files Files Community

csa / test /e2e-prompt-ab2.mjs

ricebug's picture

Upload 76 files

ca8ab2d verified about 1 month ago

history blame contribute delete

18.8 kB

	/**
	* test/e2e-prompt-ab2.mjs
	*
	* 第二轮提示词 A/B 测试：
	* ⑤ 工具结果续行提示 (extractToolResultNatural 尾部)
	* ② thinkingSuffix (每条用户消息末尾)
	* ③ fewShotResponse (few-shot 示例文字)
	*
	* 每个提示词的测试设计侧重于其特定影响面：
	* - ⑤ 续行提示 → 多轮工具循环中模型是否持续行动
	* - ② 方向后缀 → 模型是否在每条消息后立即行动
	* - ③ few-shot → 格式遵循度和叙述风格
	*
	* 用法：
	* VARIANT=baseline node test/e2e-prompt-ab2.mjs
	* VARIANT=candidate_x node test/e2e-prompt-ab2.mjs
	* node test/e2e-prompt-ab2.mjs --compare
	*/

	// ─── Runtime configuration ───────────────────────────────────────────
	/** Base URL of the server under test; the port is taken from PORT, falling back to 3010. */
	const BASE_URL = `http://localhost:${process.env.PORT || 3010}`;
	/** Model id sent with every /v1/messages request. */
	const MODEL = 'claude-sonnet-4-5-20251120';
	/** Default cap on request/response rounds for one scenario. */
	const MAX_TURNS = 10;
	/** Variant label (from the VARIANT env var); `||` is intentional so an empty value also falls back. */
	const VARIANT = process.env.VARIANT || 'current';
	/** True when the script was started with --compare. */
	const COMPARE_MODE = process.argv.includes('--compare');

	// ─── ANSI colors ─────────────────────────────────────────────────────
	// Escape sequences keyed by style name, built from their SGR numeric codes.
	const C = Object.fromEntries(
	  [
	    ['reset', 0], ['bold', 1], ['dim', 2],
	    ['green', 32], ['red', 31], ['yellow', 33],
	    ['cyan', 36], ['blue', 34], ['magenta', 35], ['gray', 90],
	  ].map(([style, code]) => [style, `\x1b[${code}m`]),
	);

	/** Wraps `text` in the given escape prefix and appends the reset sequence. */
	function paint(prefix, text) {
	  return `${prefix}${text}${C.reset}`;
	}

	/** Green success line. */
	function ok(s) {
	  return paint(C.green, `✅ ${s}`);
	}

	/** Red failure line. */
	function fail(s) {
	  return paint(C.red, `❌ ${s}`);
	}

	/** Yellow warning line. */
	function warn(s) {
	  return paint(C.yellow, `⚠ ${s}`);
	}

	/** Bold cyan section header, preceded by a newline. */
	function hdr(s) {
	  return `\n${paint(`${C.bold}${C.cyan}`, `━━━ ${s} ━━━`)}`;
	}

	/** Gray detail line, indented by one space. */
	function info(s) {
	  return ` ${paint(C.gray, s)}`;
	}

	// ─── Base tool set ───────────────────────────────────────────────────
	// Each row is [name, description, string-typed parameter names, required parameter names];
	// rows are expanded into tool definitions with an object-typed input_schema.
	const TOOLS = [
	  ['Read', 'Reads a file.', ['file_path'], ['file_path']],
	  ['Write', 'Writes a file.', ['file_path', 'content'], ['file_path', 'content']],
	  ['Bash', 'Executes a bash command.', ['command'], ['command']],
	  ['Grep', 'Search for patterns in files.', ['pattern', 'path'], ['pattern']],
	  ['LS', 'Lists directory contents.', ['path'], ['path']],
	  ['attempt_completion', 'Present the final result.', ['result'], ['result']],
	].map(([name, description, params, required]) => ({
	  name,
	  description,
	  input_schema: {
	    type: 'object',
	    properties: Object.fromEntries(params.map(param => [param, { type: 'string' }])),
	    required,
	  },
	}));

	// ─── Virtual file system ─────────────────────────────────────────────
	/** In-memory files keyed by absolute path. The mock Write tool adds to / overwrites this map. */
	const MOCK_FS = {
	  '/project/package.json': '{"name":"my-app","version":"2.0.0","dependencies":{"express":"^4.18.0","lodash":"^4.17.21"}}',
	  '/project/src/index.ts': 'import express from "express";\nimport { router } from "./router";\nconst app = express();\napp.use("/api", router);\napp.listen(3000);\n',
	  '/project/src/router.ts': 'import { Router } from "express";\nexport const router = Router();\nrouter.get("/health", (_, res) => res.json({ ok: true }));\nrouter.get("/users", (_, res) => res.json([]));\n// TODO: add POST /users\n',
	  '/project/src/utils.ts': 'export function clamp(v: number, min: number, max: number) {\n return Math.min(Math.max(v, min), max);\n}\n// TODO: add debounce function\n',
	  '/project/tsconfig.json': '{"compilerOptions":{"target":"es2020","module":"commonjs","strict":true}}',
	  '/project/README.md': '# My App\nExpress API server.\n## API\n- GET /api/health\n- GET /api/users\n',
	};

	/**
	 * Simulates one tool call against MOCK_FS and returns the textual tool result.
	 *
	 * @param {string} name - Tool name (Read, Write, Bash, Grep, LS, attempt_completion).
	 * @param {object} [input] - Tool arguments; a missing value is treated as `{}`.
	 * @returns {string} The tool output. attempt_completion yields `__DONE__:<result>`;
	 *   unknown tools yield `Executed <name>`.
	 */
	function mockExec(name, input) {
	  const args = input ?? {};
	  switch (name) {
	    case 'Read':
	      // Own-property lookup: an empty file is still a file, and inherited keys
	      // such as "constructor" must not be served as file content.
	      return Object.hasOwn(MOCK_FS, args.file_path)
	        ? MOCK_FS[args.file_path]
	        : `Error: File not found: ${args.file_path}`;
	    case 'Write': {
	      // Only strings may enter MOCK_FS: Grep splits every stored value on newlines.
	      if (typeof args.file_path !== 'string' || typeof args.content !== 'string') {
	        return 'Error: Write requires string file_path and content';
	      }
	      MOCK_FS[args.file_path] = args.content;
	      return `Wrote ${args.content.length} chars`;
	    }
	    case 'Bash': {
	      if (args.command?.includes('npm test')) return 'Tests passed: 3/3';
	      if (args.command?.includes('tsc')) return 'Compilation successful';
	      return `$ ${args.command}\n(ok)`;
	    }
	    case 'Grep': {
	      // Case-insensitive substring search over every file; the `path` argument is not consulted.
	      const needle = String(args.pattern || '').toLowerCase();
	      const hits = Object.entries(MOCK_FS).flatMap(([filePath, text]) =>
	        text.split('\n').flatMap((line, index) =>
	          (line.toLowerCase().includes(needle) ? [`${filePath}:${index + 1}:${line.trim()}`] : [])));
	      return hits.join('\n') || 'No matches.';
	    }
	    case 'LS': {
	      const prefix = args.path || '/project';
	      return Object.keys(MOCK_FS).filter(p => p.startsWith(prefix)).join('\n');
	    }
	    case 'attempt_completion':
	      return `__DONE__:${args.result}`;
	    default:
	      return `Executed ${name}`;
	  }
	}

	// ─── Multi-turn engine ───────────────────────────────────────────────
	/**
	 * Runs one conversation against `${BASE_URL}/v1/messages`. Every tool_use block in a
	 * reply is executed with mockExec and the results are sent back as the next user
	 * message. The loop ends when the model stops without tool calls (or reports
	 * end_turn), when it calls attempt_completion, or when maxTurns is reached.
	 *
	 * @param {string} userMessage - First user message.
	 * @param {object} [opts]
	 * @param {object[]} [opts.tools=TOOLS] - Tool definitions sent with each request.
	 * @param {string} [opts.systemPrompt] - System prompt; empty falls back to a default.
	 * @param {object} [opts.toolChoice] - Sent as `tool_choice` only when provided.
	 * @param {number} [opts.maxTurns=MAX_TURNS] - Maximum number of requests.
	 * @returns {Promise<object>} Metrics: totalToolCalls, totalTextChars, turns,
	 *   firstTurnToolCount, firstTurnTextLen, toolLog, completed, stopped,
	 *   narrationRatio (text chars vs. text chars + 100 per tool call) and toolPath.
	 * @throws {Error} `HTTP <status>` when the server answers with a non-2xx status.
	 */
	async function runMultiTurn(userMessage, opts = {}) {
	  const { tools = TOOLS, systemPrompt = '', toolChoice, maxTurns = MAX_TURNS } = opts;
	  const messages = [{ role: 'user', content: userMessage }];
	  const system = systemPrompt || 'You are an AI coding assistant. Working directory: /project.';

	  let totalToolCalls = 0;
	  let totalTextChars = 0;
	  let turns = 0;
	  let firstTurnToolCount = 0;
	  let firstTurnTextLen = 0;
	  const toolLog = [];
	  let completed = false;
	  let stopped = false; // the model ended its turn without calling attempt_completion

	  while (turns < maxTurns) {
	    turns++;
	    const resp = await fetch(`${BASE_URL}/v1/messages`, {
	      method: 'POST',
	      headers: { 'Content-Type': 'application/json', 'x-api-key': 'dummy' },
	      body: JSON.stringify({
	        model: MODEL, max_tokens: 4096, system, tools,
	        ...(toolChoice ? { tool_choice: toolChoice } : {}),
	        messages,
	      }),
	    });
	    if (!resp.ok) throw new Error(`HTTP ${resp.status}`);
	    const data = await resp.json();

	    // Treat a missing or non-array content field as an empty reply.
	    const blocks = Array.isArray(data?.content) ? data.content : [];
	    const toolUseBlocks = blocks.filter(b => b.type === 'tool_use');
	    const turnText = blocks
	      .filter(b => b.type === 'text')
	      .reduce((sum, b) => sum + (b.text?.length || 0), 0);

	    totalTextChars += turnText;
	    totalToolCalls += toolUseBlocks.length;

	    if (turns === 1) {
	      firstTurnToolCount = toolUseBlocks.length;
	      firstTurnTextLen = turnText;
	    }

	    for (const tb of toolUseBlocks) {
	      toolLog.push({ turn: turns, tool: tb.name });
	    }

	    if (data?.stop_reason === 'end_turn' || toolUseBlocks.length === 0) {
	      // Completion always leaves the loop immediately, so reaching this point
	      // means the model stopped on its own.
	      stopped = true;
	      break;
	    }

	    messages.push({ role: 'assistant', content: blocks });
	    const results = toolUseBlocks.map(tb => ({
	      type: 'tool_result', tool_use_id: tb.id,
	      content: mockExec(tb.name, tb.input ?? {}),
	    }));
	    messages.push({ role: 'user', content: results });

	    // Key completion on the tool that was called, not on the result text: any
	    // other tool may legitimately return content that starts with "__DONE__".
	    if (toolUseBlocks.some(tb => tb.name === 'attempt_completion')) {
	      completed = true;
	      break;
	    }
	  }

	  return {
	    totalToolCalls, totalTextChars, turns,
	    firstTurnToolCount, firstTurnTextLen,
	    toolLog, completed, stopped,
	    narrationRatio: totalTextChars / Math.max(totalTextChars + totalToolCalls * 100, 1),
	    toolPath: toolLog.map(t => `${t.turn}:${t.tool}`).join(' → '),
	  };
	}

	// ─── Test scenarios ──────────────────────────────────────────────────
	// Scenarios are declared per prompt group as [group label, entries]; the label is
	// stamped onto every entry as `group` when the table is flattened at the end.
	const SCENARIOS = [
	  // ========= ⑤ continuation-prompt tests =========
	  ['⑤ 续行提示', [
	    {
	      id: 'continuation_3step',
	      name: '3 步连续任务（不中断）',
	      description: '模型必须连续执行 3 步，不能中途停下。测试续行指令是否有效。',
	      prompt: 'Step 1: Read /project/src/router.ts. Step 2: Read /project/src/utils.ts. Step 3: After reading both, use attempt_completion to summarize all TODO items found.',
	      expect: { minTools: 3, completed: true },
	      toolChoice: { type: 'any' },
	    },
	    {
	      id: 'continuation_after_error',
	      name: '错误后继续',
	      description: '读取不存在的文件→收到错误→应继续尝试其他文件而不是停下。',
	      prompt: 'Read /project/src/nonexistent.ts. If it fails, read /project/src/index.ts instead.',
	      expect: { minTools: 2 },
	    },
	    {
	      id: 'continuation_long_chain',
	      name: '长链任务（≥4 步）',
	      description: '测试在 4+ 步工具链中模型是否持续推进。',
	      prompt: 'Please do these steps in order: 1) LS /project/src 2) Read /project/src/index.ts 3) Read /project/src/router.ts 4) Grep for "TODO" in /project/src 5) attempt_completion with a summary of all findings.',
	      expect: { minTools: 4, completed: true },
	      toolChoice: { type: 'any' },
	    },
	  ]],

	  // ========= ② direction-suffix tests =========
	  ['② 方向后缀', [
	    {
	      id: 'suffix_immediate_action',
	      name: '立即行动（无叙述）',
	      description: '简单请求应紧随后缀指示直接行动，而不是先描述计划。',
	      prompt: 'Show me the project structure.',
	      expect: { firstTurnAction: true, maxFirstTurnText: 100 },
	    },
	    {
	      id: 'suffix_ambiguous_task',
	      name: '模糊任务也行动',
	      description: '即使任务稍有模糊，模型也应先行动（读文件）再讨论。',
	      prompt: 'Help me understand this project.',
	      expect: { firstTurnAction: true },
	    },
	    {
	      id: 'suffix_multi_file',
	      name: '多文件并行',
	      description: '方向后缀应让模型在一轮内并行调用多个工具。',
	      prompt: 'Read /project/src/index.ts and /project/src/router.ts and /project/tsconfig.json.',
	      expect: { firstTurnMinTools: 2 },
	    },
	  ]],

	  // ========= ③ few-shot tests =========
	  ['③ few-shot', [
	    {
	      id: 'fewshot_format',
	      name: '输出格式遵循度',
	      description: '模型是否严格遵循 ```json action 格式（而不是其他变体）。',
	      prompt: 'Read /project/package.json and tell me the project name.',
	      expect: { formatCorrect: true, minTools: 1 },
	    },
	    {
	      id: 'fewshot_style_match',
	      name: '风格模仿 —— 叙述简洁度',
	      description: 'few-shot 样本越简洁，模型的回复也应越简洁。',
	      prompt: 'List all TypeScript files in the project.',
	      expect: { maxFirstTurnText: 80 },
	    },
	    {
	      id: 'fewshot_no_meta',
	      name: '无元叙述',
	      description: '模型不应输出类似 "I will use the structured format" 的自我描述。',
	      prompt: 'Check if there are any TODO comments in /project/src/utils.ts.',
	      expect: { noMetaText: true, minTools: 1 },
	    },
	  ]],
	].flatMap(([group, entries]) => entries.map(({ id, ...rest }) => ({ id, group, ...rest })));

	// ─── Compare mode ────────────────────────────────────────────────────
	// With --compare: load every test/prompt-ab2-results-*.json file, print one table
	// per scenario group with a column per variant, print per-variant totals, then exit.
	if (COMPARE_MODE) {
	  const fs = await import('fs');
	  const files = (fs.existsSync('test') ? fs.readdirSync('test') : [])
	    .filter(f => f.startsWith('prompt-ab2-results-') && f.endsWith('.json'))
	    .sort();

	  // A single unreadable or malformed file should not abort the whole comparison.
	  const results = [];
	  for (const file of files) {
	    try {
	      const parsed = JSON.parse(fs.readFileSync(`test/${file}`, 'utf-8'));
	      if (!Array.isArray(parsed?.scenarios)) throw new Error('missing "scenarios" array');
	      results.push({ file, ...parsed });
	    } catch (err) {
	      console.log(warn(`跳过无效结果文件 ${file}: ${err.message}`));
	    }
	  }

	  if (results.length < 2) {
	    console.log(`\n${fail('需要至少 2 个结果文件。')}`);
	    process.exit(1);
	  }

	  console.log(`\n${C.bold}${C.magenta}══ 第二轮提示词 A/B 对比 ══${C.reset}\n`);
	  results.forEach(r => console.log(` ${C.cyan}${r.variant}${C.reset} — ${r.timestamp}`));

	  // One table per scenario group, in the order groups first appear in SCENARIOS.
	  const groups = [...new Set(SCENARIOS.map(s => s.group))];
	  for (const group of groups) {
	    console.log(hdr(group));
	    const groupScenarios = SCENARIOS.filter(s => s.group === group);

	    console.log(`${'─'.repeat(120)}`);
	    const headerParts = [`${'场景'.padEnd(28)}`];
	    for (const r of results) headerParts.push(String(r.variant).padEnd(25));
	    console.log(`${C.bold}${headerParts.join('')}${C.reset}`);
	    console.log(`${'─'.repeat(120)}`);

	    for (const sc of groupScenarios) {
	      const row = [sc.id.padEnd(28)];
	      for (const r of results) {
	        const s = r.scenarios.find(x => x.id === sc.id);
	        if (!s) { row.push('N/A'.padEnd(25)); continue; }
	        const m = s.metrics;
	        const emoji = s.passed ? '✅' : '❌';
	        // metrics is null when the scenario errored before producing any.
	        const brief = m
	          ? `${emoji} T:${m.totalToolCalls} N:${Math.round((m.narrationRatio || 0) * 100)}% ${m.turns}轮`
	          : '❌ ERR';
	        row.push(brief.padEnd(25));
	      }
	      console.log(row.join(''));
	    }
	  }

	  // Per-variant totals.
	  console.log(`\n${C.bold}汇总:${C.reset}`);
	  for (const r of results) {
	    const count = r.scenarios.length;
	    const pass = r.scenarios.filter(s => s.passed).length;
	    const narrSum = r.scenarios.reduce((s, x) => s + (x.metrics?.narrationRatio || 0), 0);
	    const avgNarr = count > 0 ? narrSum / count : 0; // avoid NaN% for an empty result set
	    const totalTools = r.scenarios.reduce((s, x) => s + (x.metrics?.totalToolCalls || 0), 0);
	    const completions = r.scenarios.filter(s => s.metrics?.completed).length;
	    console.log(` ${C.cyan}${r.variant}${C.reset}: ${pass}/${count} 通过 工具:${totalTools} 叙述:${Math.round(avgNarr * 100)}% 完成:${completions}`);
	  }
	  process.exit(0);
	}

	// ─── 主测试流程 ────────────────────────────────────────────────────
	console.log(`\n${C.bold}${C.magenta} 第二轮提示词 A/B 测试${C.reset}`);
	console.log(info(`VARIANT=${VARIANT} MODEL=${MODEL}`));

	try {
	const r = await fetch(`${BASE_URL}/v1/models`, { headers: { 'x-api-key': 'dummy' } });
	if (!r.ok) throw new Error();
	console.log(`\n${ok('服务器在线')}`);
	} catch { console.log(`\n${fail('服务器未运行')}`); process.exit(1); }

	const scenarioResults = [];
	let passed = 0, failedCount = 0;
	let currentGroup = '';

	for (const sc of SCENARIOS) {
	if (sc.group !== currentGroup) {
	currentGroup = sc.group;
	console.log(hdr(currentGroup));
	}
	process.stdout.write(` ${C.blue}▶${C.reset} ${C.bold}${sc.name}${C.reset}\n`);
	console.log(info(sc.description));

	const t0 = Date.now();
	try {
	const r = await runMultiTurn(sc.prompt, { toolChoice: sc.toolChoice });

	let testPassed = true;
	const failReasons = [];

	// 检查期望
	if (sc.expect.minTools && r.totalToolCalls < sc.expect.minTools) {
	testPassed = false; failReasons.push(`工具调用 ${r.totalToolCalls} < ${sc.expect.minTools}`);
	}
	if (sc.expect.completed && !r.completed) {
	testPassed = false; failReasons.push('任务未完成（未调用 attempt_completion）');
	}
	if (sc.expect.firstTurnAction && r.firstTurnToolCount === 0) {
	testPassed = false; failReasons.push('第一轮无工具调用');
	}
	if (sc.expect.maxFirstTurnText && r.firstTurnTextLen > sc.expect.maxFirstTurnText) {
	failReasons.push(`首轮文本 ${r.firstTurnTextLen} > ${sc.expect.maxFirstTurnText} (警告)`);
	}
	if (sc.expect.firstTurnMinTools && r.firstTurnToolCount < sc.expect.firstTurnMinTools) {
	testPassed = false; failReasons.push(`首轮工具 ${r.firstTurnToolCount} < ${sc.expect.firstTurnMinTools}`);
	}
	if (sc.expect.formatCorrect !== undefined && sc.expect.formatCorrect && r.totalToolCalls === 0) {
	testPassed = false; failReasons.push('无工具调用（无法验证格式）');
	}

	console.log(info(` 工具: ${r.totalToolCalls} 轮数: ${r.turns} 文本: ${r.totalTextChars}chars 叙述: ${Math.round(r.narrationRatio * 100)}% 完成: ${r.completed ? '✅' : '❌'}`));
	console.log(info(` 链: ${r.toolPath}`));

	const ms = ((Date.now() - t0) / 1000).toFixed(1);
	if (testPassed && failReasons.length === 0) {
	console.log(` ${ok('通过')} (${ms}s)`);
	passed++;
	} else if (testPassed) {
	console.log(` ${ok('通过')} (${ms}s) — ${failReasons.join(', ')}`);
	passed++;
	} else {
	console.log(` ${fail('未通过')} (${ms}s)`);
	failReasons.forEach(r2 => console.log(` ${C.yellow}→ ${r2}${C.reset}`));
	failedCount++;
	}

	scenarioResults.push({ id: sc.id, name: sc.name, group: sc.group, passed: testPassed, failReasons, metrics: r });
	} catch (err) {
	console.log(` ${fail('错误')}: ${err.message}`);
	failedCount++;
	scenarioResults.push({ id: sc.id, name: sc.name, group: sc.group, passed: false, failReasons: [err.message], metrics: null });
	}
	}

	const total = passed + failedCount;
	console.log(`\n${'═'.repeat(62)}`);
	console.log(`${C.bold} [${VARIANT}] 结果: ${C.green}${passed} 通过${C.reset}${C.bold} / ${failedCount > 0 ? C.red : ''}${failedCount} 未通过${C.reset}${C.bold} / ${total} 场景${C.reset}`);
	console.log('═'.repeat(62));

	const fs = await import('fs');
	const out = { variant: VARIANT, timestamp: new Date().toISOString(), model: MODEL, scenarios: scenarioResults, summary: { passed, failed: failedCount, total } };
	const outFile = `test/prompt-ab2-results-${VARIANT}.json`;
	fs.writeFileSync(outFile, JSON.stringify(out, null, 2));
	console.log(`\n${info(`已保存: ${outFile}`)}`);
	console.log(info('对比: node test/e2e-prompt-ab2.mjs --compare'));
	console.log();