| |
| from __future__ import annotations |
|
|
| import argparse |
| import json |
| import re |
| import subprocess |
| from pathlib import Path |
| from typing import Any |
|
|
# Repository root: this script lives in <root>/scripts/, so go up one level.
ROOT = Path(__file__).resolve().parents[1]
# Default directory holding fast-agent tool cards.
DEFAULT_CARDS_DIR = ROOT / '.fast-agent' / 'tool-cards'
# Agent name used when --agent is not given on the command line.
DEFAULT_AGENT = 'hf_hub_community'
# JSON list of coverage cases (id / prompt / expected endpoint & method).
PROMPTS_FILE = ROOT / 'scripts' / 'hf_hub_community_coverage_prompts.json'
# Default output paths for the Markdown and JSON coverage reports.
REPORT_MD = ROOT / 'docs' / 'hf_hub_community_coverage_report.md'
REPORT_JSON = ROOT / 'docs' / 'hf_hub_community_coverage_report.json'

# Matches ANSI CSI escape sequences (colors, cursor movement) so the CLI's
# stdout/stderr can be reduced to plain text before inspection.
ANSI_RE = re.compile(r"\x1B\[[0-?]*[ -/]*[@-~]")
|
|
|
|
def strip_ansi(text: str) -> str:
    """Return *text* with every ANSI CSI escape sequence removed."""
    return re.sub(r"\x1B\[[0-?]*[ -/]*[@-~]", '', text)
|
|
|
|
def load_cases(path: Path) -> list[dict[str, Any]]:
    """Load and normalize coverage cases from the JSON file at *path*.

    Non-dict entries are silently skipped. Each kept case is coerced to a
    uniform shape: integer ``id``, string ``prompt``, a list of endpoint
    regexes, and upper-cased expected methods (defaulting to ``['GET']``).

    Raises:
        ValueError: if the file's top-level JSON value is not a list.
    """
    raw = json.loads(path.read_text(encoding='utf-8'))
    if not isinstance(raw, list):
        raise ValueError('coverage prompts file must be a JSON list')

    cases: list[dict[str, Any]] = []
    for entry in raw:
        if not isinstance(entry, dict):
            continue
        methods = entry.get('expected_method_any', ['GET'])
        cases.append({
            'id': int(entry['id']),
            'prompt': str(entry['prompt']),
            'expected_endpoint_any': list(entry.get('expected_endpoint_any', [])),
            'expected_method_any': [str(m).upper() for m in methods],
        })
    return cases
|
|
|
|
def _session_extract(result_path: Path) -> dict[str, Any]:
    """Parse a ``fast-agent --results`` session file into a flat summary.

    Walks every message in the session and collects:
    - endpoints / HTTP methods / tool names used in assistant tool calls,
    - a merged text transcript (assistant text, reasoning, tool-call args,
      tool-result text) joined with newlines,
    - cumulative token usage from the 'fast-agent-usage' channel,
    - whether any tool result reported an error.

    Args:
        result_path: path to the JSON session file written by the CLI.

    Returns:
        dict with keys ``endpoints``, ``methods``, ``tool_names``,
        ``tool_calls_count``, ``tool_error``, ``merged`` and ``usage``.
    """
    data = json.loads(result_path.read_text(encoding='utf-8'))
    messages = data.get('messages', []) if isinstance(data, dict) else []

    # Shared accumulator mutated by the per-message helpers below.
    acc: dict[str, Any] = {
        'endpoints': [],
        'methods': [],
        'tool_names': [],
        'tool_calls_count': 0,
        'tool_error': False,
        'merged_parts': [],
        'usage': {'input_tokens': 0, 'output_tokens': 0, 'total_tokens': 0},
    }

    for msg in messages:
        if not isinstance(msg, dict):
            continue
        if msg.get('role') == 'assistant':
            _harvest_assistant(msg, acc)
        if msg.get('role') == 'user':
            _harvest_tool_results(msg, acc)

    return {
        'endpoints': acc['endpoints'],
        'methods': acc['methods'],
        'tool_names': acc['tool_names'],
        'tool_calls_count': acc['tool_calls_count'],
        'tool_error': acc['tool_error'],
        'merged': '\n'.join(acc['merged_parts']).strip(),
        'usage': acc['usage'],
    }


def _harvest_assistant(msg: dict[str, Any], acc: dict[str, Any]) -> None:
    """Collect text, reasoning, usage and tool-call details from one assistant message."""
    # Plain assistant text parts.
    for item in msg.get('content', []) or []:
        if isinstance(item, dict) and item.get('type') == 'text' and item.get('text'):
            acc['merged_parts'].append(str(item['text']))

    channels = msg.get('channels') or {}
    # Reasoning text is merged into the transcript as well.
    for item in channels.get('reasoning', []) or []:
        if isinstance(item, dict) and item.get('text'):
            acc['merged_parts'].append(str(item['text']))

    _harvest_usage(channels, acc['usage'])

    tool_calls = msg.get('tool_calls') or {}
    if isinstance(tool_calls, dict):
        acc['tool_calls_count'] += len(tool_calls)
        for tc in tool_calls.values():
            params = tc.get('params', {}) if isinstance(tc, dict) else {}
            name = params.get('name') if isinstance(params, dict) else None
            args = params.get('arguments', {}) if isinstance(params, dict) else {}
            if isinstance(name, str):
                acc['tool_names'].append(name)
                acc['merged_parts'].append(f'tool call - {name}')
            if isinstance(args, dict):
                ep = args.get('endpoint')
                if isinstance(ep, str):
                    acc['endpoints'].append(ep)
                # A missing/empty method is treated as the HTTP default, GET.
                method = args.get('method')
                acc['methods'].append(str(method).upper() if method else 'GET')
                acc['merged_parts'].append(json.dumps(args, ensure_ascii=False))


def _harvest_usage(channels: dict[str, Any], usage: dict[str, int]) -> None:
    """Accumulate per-turn token counts from the 'fast-agent-usage' channel into *usage*."""
    for item in channels.get('fast-agent-usage', []) or []:
        if not isinstance(item, dict):
            continue
        txt = item.get('text')
        if not isinstance(txt, str):
            continue
        try:
            payload = json.loads(txt)
        except ValueError:
            # Malformed usage payloads are skipped rather than failing the run.
            continue
        turn = payload.get('turn', {}) if isinstance(payload, dict) else {}
        if isinstance(turn, dict):
            usage['input_tokens'] += int(turn.get('input_tokens') or 0)
            usage['output_tokens'] += int(turn.get('output_tokens') or 0)
            usage['total_tokens'] += int(turn.get('total_tokens') or 0)


def _harvest_tool_results(msg: dict[str, Any], acc: dict[str, Any]) -> None:
    """Collect tool-result text and error flags from one user message."""
    tool_results = msg.get('tool_results') or {}
    if not isinstance(tool_results, dict):
        return
    for tr in tool_results.values():
        if bool((tr or {}).get('isError')):
            acc['tool_error'] = True
        for item in (tr or {}).get('content', []) or []:
            if isinstance(item, dict) and item.get('type') == 'text' and item.get('text'):
                text = str(item['text'])
                acc['merged_parts'].append(text)
                # Some backends report failures only in the result body text.
                if 'Error executing tool' in text:
                    acc['tool_error'] = True
|
|
|
|
def run_case(
    case: dict[str, Any],
    timeout_sec: int,
    model: str,
    agent_cards: Path,
    agent: str,
    result_path: Path,
) -> dict[str, Any]:
    """Execute one coverage case through the fast-agent CLI and score it.

    Scoring (out of 10): endpoint match x4, method match x2, tool use x2,
    clean process exit x1, non-trivial answer text x1.

    Raises:
        RuntimeError: if the CLI did not write the expected --results file.
    """
    prompt = case['prompt']
    result_path.parent.mkdir(parents=True, exist_ok=True)

    proc = subprocess.run(
        [
            'fast-agent', 'go',
            '--no-env',
            '--model', model,
            '--agent-cards', str(agent_cards),
            '--agent', agent,
            '--results', str(result_path),
            '-m', prompt,
        ],
        capture_output=True,
        text=True,
        timeout=timeout_sec,
    )
    stdout_clean = strip_ansi(proc.stdout or '')
    stderr_clean = strip_ansi(proc.stderr or '')

    if not result_path.exists():
        raise RuntimeError(f'Expected --results file not written: {result_path}')

    parsed = _session_extract(result_path)

    endpoint_patterns = [re.compile(p) for p in case.get('expected_endpoint_any', [])]
    allowed_methods = [m.upper() for m in case.get('expected_method_any', ['GET'])]

    # No expected-endpoint patterns means the endpoint criterion can never pass.
    if endpoint_patterns:
        endpoint_ok = any(
            pat.search(ep) for pat in endpoint_patterns for ep in parsed['endpoints']
        )
    else:
        endpoint_ok = False

    # With no observed methods, fall back to "was a plain GET acceptable?".
    if parsed['methods']:
        method_ok = any(m in allowed_methods for m in parsed['methods'])
    else:
        method_ok = 'GET' in allowed_methods

    tool_ok = 'hf_api_request' in parsed['tool_names']
    combined_output = stdout_clean + '\n' + stderr_clean
    success = proc.returncode == 0 and 'Traceback' not in combined_output
    clarity = len(parsed['merged']) > 20

    score = (
        4 * int(endpoint_ok)
        + 2 * int(method_ok)
        + 2 * int(tool_ok)
        + int(success)
        + int(clarity)
    )

    return {
        'id': case['id'],
        'prompt': prompt,
        'returncode': proc.returncode,
        'result_file': str(result_path),
        'observed': {
            'endpoints': parsed['endpoints'],
            'methods': parsed['methods'],
            'tool_names': parsed['tool_names'],
            'tool_calls_count': parsed['tool_calls_count'],
            'tool_error': parsed['tool_error'],
            'usage': parsed['usage'],
        },
        'expected': {
            'endpoint_any': case.get('expected_endpoint_any', []),
            'method_any': allowed_methods,
        },
        'eval': {
            'endpoint_ok': endpoint_ok,
            'method_ok': method_ok,
            'tool_ok': tool_ok,
            'success': success,
            'clarity': clarity,
            'score_total': score,
        },
        'merged': parsed['merged'],
    }
|
|
|
|
def summarize(rows: list[dict[str, Any]]) -> dict[str, Any]:
    """Aggregate per-case result rows into overall coverage metrics.

    Returns ``{'n_cases': 0}`` for an empty input; otherwise rates are
    fractions in [0, 1] rounded to 4 places and averages are per-case means.
    """
    count = len(rows)
    if not rows:
        return {'n_cases': 0}

    evals = [r['eval'] for r in rows]
    observed = [r['observed'] for r in rows]

    return {
        'n_cases': count,
        'endpoint_match_rate': round(sum(bool(e['endpoint_ok']) for e in evals) / count, 4),
        'method_match_rate': round(sum(bool(e['method_ok']) for e in evals) / count, 4),
        'tool_use_rate': round(sum(bool(e['tool_ok']) for e in evals) / count, 4),
        'success_rate': round(sum(bool(e['success']) for e in evals) / count, 4),
        'tool_error_rate': round(sum(bool(o['tool_error']) for o in observed) / count, 4),
        'avg_score_total': round(sum(e['score_total'] for e in evals) / count, 3),
        'avg_tool_calls': round(sum(o['tool_calls_count'] for o in observed) / count, 3),
        'avg_total_tokens': round(sum(int(o['usage'].get('total_tokens') or 0) for o in observed) / count, 1),
    }
|
|
|
|
def render_markdown(rows: list[dict[str, Any]], summary: dict[str, Any], model: str, agent: str) -> str:
    """Render the coverage report as Markdown: a summary section plus a per-case table."""
    lines: list[str] = [
        '# HF Hub Community Coverage Report',
        '',
        f'- Model: `{model}`',
        f'- Agent: `{agent}`',
        '',
        '## Summary',
        '',
    ]
    lines.append(f"- Cases: **{summary.get('n_cases', 0)}**")
    lines.append(f"- Endpoint match rate: **{summary.get('endpoint_match_rate')}**")
    lines.append(f"- Method match rate: **{summary.get('method_match_rate')}**")
    lines.append(f"- Tool use rate: **{summary.get('tool_use_rate')}**")
    lines.append(f"- Success rate: **{summary.get('success_rate')}**")
    lines.append(f"- Tool error rate: **{summary.get('tool_error_rate')}**")
    lines.append(f"- Avg score (/10): **{summary.get('avg_score_total')}**")
    lines.append(f"- Avg tool calls: **{summary.get('avg_tool_calls')}**")
    lines.append(f"- Avg total tokens: **{summary.get('avg_total_tokens')}**")
    lines.append('')
    lines.append('| # | Score | Endpoint OK | Method OK | Calls | Tokens | Prompt |')
    lines.append('|---|------:|------------:|----------:|------:|-------:|--------|')

    for row in rows:
        evaluation = row['eval']
        observed = row['observed']
        tokens = int(observed['usage'].get('total_tokens') or 0)
        # '|' would break the Markdown table, so it is replaced in the prompt cell.
        prompt_cell = row['prompt'][:72].replace('|', '/')
        lines.append(
            f"| {row['id']} | {evaluation['score_total']} | {int(evaluation['endpoint_ok'])} "
            f"| {int(evaluation['method_ok'])} | {observed['tool_calls_count']} | {tokens} | {prompt_cell} |"
        )

    return '\n'.join(lines) + '\n'
|
|
|
|
def main() -> None:
    """CLI entry point: run every coverage case, then write JSON and Markdown reports."""
    parser = argparse.ArgumentParser(description='Run endpoint-coverage pack for hf_hub_community')
    parser.add_argument('--model', default='gpt-oss')
    parser.add_argument('--agent', default=DEFAULT_AGENT)
    parser.add_argument('--agent-cards', type=Path, default=DEFAULT_CARDS_DIR)
    parser.add_argument('--cases', type=Path, default=PROMPTS_FILE)
    parser.add_argument('--timeout', type=int, default=240)
    parser.add_argument('--raw-results-dir', type=Path, default=ROOT / 'docs' / 'hf_hub_community_coverage_results')
    parser.add_argument('--json-out', type=Path, default=REPORT_JSON)
    parser.add_argument('--md-out', type=Path, default=REPORT_MD)
    opts = parser.parse_args()

    rows: list[dict[str, Any]] = []
    for case in load_cases(opts.cases):
        # One raw-results file per (agent, model, case); '/' in model names
        # would create subdirectories, so it is flattened to '_'.
        model_slug = opts.model.replace('/', '_')
        result_file = opts.raw_results_dir / f"coverage_{opts.agent}_{model_slug}_case_{case['id']:02d}.json"
        row = run_case(
            case=case,
            timeout_sec=opts.timeout,
            model=opts.model,
            agent_cards=opts.agent_cards,
            agent=opts.agent,
            result_path=result_file,
        )
        rows.append(row)
        print(f"[{case['id']}] score={row['eval']['score_total']}/10 endpoint_ok={row['eval']['endpoint_ok']} method_ok={row['eval']['method_ok']}")

    summary = summarize(rows)

    for out_path in (opts.json_out, opts.md_out):
        out_path.parent.mkdir(parents=True, exist_ok=True)
    opts.json_out.write_text(json.dumps({'summary': summary, 'rows': rows}, indent=2), encoding='utf-8')
    opts.md_out.write_text(render_markdown(rows, summary, model=opts.model, agent=opts.agent), encoding='utf-8')

    print(f"\nWrote:\n- {opts.json_out}\n- {opts.md_out}")
|
|
|
|
# Script entry point: run the coverage pack only when invoked directly.
if __name__ == '__main__':
    main()
|
|