| |
| from __future__ import annotations |
|
|
| import argparse |
| import json |
| import re |
| import subprocess |
| from pathlib import Path |
| from typing import Any |
|
|
# Repository root: this script lives in <root>/scripts/, so go up one level.
ROOT = Path(__file__).resolve().parents[1]
# Default directory holding fast-agent tool cards.
DEFAULT_CARDS_DIR = ROOT / '.fast-agent' / 'tool-cards'
# Agent name used when --agent is not given on the command line.
DEFAULT_AGENT = 'hf_hub_community'
# JSON list of coverage cases (id / prompt / expected endpoint & method).
PROMPTS_FILE = ROOT / 'scripts' / 'hf_hub_community_coverage_prompts.json'
# Default output paths for the Markdown and JSON coverage reports.
REPORT_MD = ROOT / 'docs' / 'hf_hub_community_coverage_report.md'
REPORT_JSON = ROOT / 'docs' / 'hf_hub_community_coverage_report.json'

# Matches ANSI CSI escape sequences (colors, cursor movement) so the CLI's
# stdout/stderr can be reduced to plain text before inspection.
ANSI_RE = re.compile(r"\x1B\[[0-?]*[ -/]*[@-~]")
|
|
|
|
def strip_ansi(text: str) -> str:
    """Return *text* with every ANSI CSI escape sequence removed."""
    return re.sub(r"\x1B\[[0-?]*[ -/]*[@-~]", '', text)
|
|
|
|
def load_cases(path: Path) -> list[dict[str, Any]]:
    """Load and normalize coverage cases from the JSON file at *path*.

    Non-dict entries are silently skipped. Each kept case is coerced to a
    uniform shape: integer ``id``, string ``prompt``, a list of endpoint
    regexes, and upper-cased expected methods (defaulting to ``['GET']``).

    Raises:
        ValueError: if the file's top-level JSON value is not a list.
    """
    raw = json.loads(path.read_text(encoding='utf-8'))
    if not isinstance(raw, list):
        raise ValueError('coverage prompts file must be a JSON list')

    cases: list[dict[str, Any]] = []
    for entry in raw:
        if not isinstance(entry, dict):
            continue
        methods = entry.get('expected_method_any', ['GET'])
        cases.append({
            'id': int(entry['id']),
            'prompt': str(entry['prompt']),
            'expected_endpoint_any': list(entry.get('expected_endpoint_any', [])),
            'expected_method_any': [str(m).upper() for m in methods],
        })
    return cases
|
|
|
|
def _session_extract(result_path: Path) -> dict[str, Any]:
    """Parse a ``fast-agent --results`` session file into a flat summary.

    Walks every message in the session and collects:
    - endpoints / HTTP methods / tool names used in assistant tool calls,
    - a merged text transcript (assistant text, reasoning, tool-call args,
      tool-result text) joined with newlines,
    - cumulative token usage from the 'fast-agent-usage' channel,
    - whether any tool result reported an error.

    Args:
        result_path: path to the JSON session file written by the CLI.

    Returns:
        dict with keys ``endpoints``, ``methods``, ``tool_names``,
        ``tool_calls_count``, ``tool_error``, ``merged`` and ``usage``.
    """
    data = json.loads(result_path.read_text(encoding='utf-8'))
    messages = data.get('messages', []) if isinstance(data, dict) else []

    # Shared accumulator mutated by the per-message helpers below.
    acc: dict[str, Any] = {
        'endpoints': [],
        'methods': [],
        'tool_names': [],
        'tool_calls_count': 0,
        'tool_error': False,
        'merged_parts': [],
        'usage': {'input_tokens': 0, 'output_tokens': 0, 'total_tokens': 0},
    }

    for msg in messages:
        if not isinstance(msg, dict):
            continue
        if msg.get('role') == 'assistant':
            _harvest_assistant(msg, acc)
        if msg.get('role') == 'user':
            _harvest_tool_results(msg, acc)

    return {
        'endpoints': acc['endpoints'],
        'methods': acc['methods'],
        'tool_names': acc['tool_names'],
        'tool_calls_count': acc['tool_calls_count'],
        'tool_error': acc['tool_error'],
        'merged': '\n'.join(acc['merged_parts']).strip(),
        'usage': acc['usage'],
    }


def _harvest_assistant(msg: dict[str, Any], acc: dict[str, Any]) -> None:
    """Collect text, reasoning, usage and tool-call details from one assistant message."""
    # Plain assistant text parts.
    for item in msg.get('content', []) or []:
        if isinstance(item, dict) and item.get('type') == 'text' and item.get('text'):
            acc['merged_parts'].append(str(item['text']))

    channels = msg.get('channels') or {}
    # Reasoning text is merged into the transcript as well.
    for item in channels.get('reasoning', []) or []:
        if isinstance(item, dict) and item.get('text'):
            acc['merged_parts'].append(str(item['text']))

    _harvest_usage(channels, acc['usage'])

    tool_calls = msg.get('tool_calls') or {}
    if isinstance(tool_calls, dict):
        acc['tool_calls_count'] += len(tool_calls)
        for tc in tool_calls.values():
            params = tc.get('params', {}) if isinstance(tc, dict) else {}
            name = params.get('name') if isinstance(params, dict) else None
            args = params.get('arguments', {}) if isinstance(params, dict) else {}
            if isinstance(name, str):
                acc['tool_names'].append(name)
                acc['merged_parts'].append(f'tool call - {name}')
            if isinstance(args, dict):
                ep = args.get('endpoint')
                if isinstance(ep, str):
                    acc['endpoints'].append(ep)
                # A missing/empty method is treated as the HTTP default, GET.
                method = args.get('method')
                acc['methods'].append(str(method).upper() if method else 'GET')
                acc['merged_parts'].append(json.dumps(args, ensure_ascii=False))


def _harvest_usage(channels: dict[str, Any], usage: dict[str, int]) -> None:
    """Accumulate per-turn token counts from the 'fast-agent-usage' channel into *usage*."""
    for item in channels.get('fast-agent-usage', []) or []:
        if not isinstance(item, dict):
            continue
        txt = item.get('text')
        if not isinstance(txt, str):
            continue
        try:
            payload = json.loads(txt)
        except ValueError:
            # Malformed usage payloads are skipped rather than failing the run.
            continue
        turn = payload.get('turn', {}) if isinstance(payload, dict) else {}
        if isinstance(turn, dict):
            usage['input_tokens'] += int(turn.get('input_tokens') or 0)
            usage['output_tokens'] += int(turn.get('output_tokens') or 0)
            usage['total_tokens'] += int(turn.get('total_tokens') or 0)


def _harvest_tool_results(msg: dict[str, Any], acc: dict[str, Any]) -> None:
    """Collect tool-result text and error flags from one user message."""
    tool_results = msg.get('tool_results') or {}
    if not isinstance(tool_results, dict):
        return
    for tr in tool_results.values():
        if bool((tr or {}).get('isError')):
            acc['tool_error'] = True
        for item in (tr or {}).get('content', []) or []:
            if isinstance(item, dict) and item.get('type') == 'text' and item.get('text'):
                text = str(item['text'])
                acc['merged_parts'].append(text)
                # Some backends report failures only in the result body text.
                if 'Error executing tool' in text:
                    acc['tool_error'] = True
|
|
|
|
def run_case(
    case: dict[str, Any],
    timeout_sec: int,
    model: str,
    agent_cards: Path,
    agent: str,
    result_path: Path,
) -> dict[str, Any]:
    """Execute one coverage case through the fast-agent CLI and score it.

    Scoring (out of 10): endpoint match x4, method match x2, tool use x2,
    clean process exit x1, non-trivial answer text x1.

    Raises:
        RuntimeError: if the CLI did not write the expected --results file.
    """
    prompt = case['prompt']
    result_path.parent.mkdir(parents=True, exist_ok=True)

    proc = subprocess.run(
        [
            'fast-agent', 'go',
            '--no-env',
            '--model', model,
            '--agent-cards', str(agent_cards),
            '--agent', agent,
            '--results', str(result_path),
            '-m', prompt,
        ],
        capture_output=True,
        text=True,
        timeout=timeout_sec,
    )
    stdout_clean = strip_ansi(proc.stdout or '')
    stderr_clean = strip_ansi(proc.stderr or '')

    if not result_path.exists():
        raise RuntimeError(f'Expected --results file not written: {result_path}')

    parsed = _session_extract(result_path)

    endpoint_patterns = [re.compile(p) for p in case.get('expected_endpoint_any', [])]
    allowed_methods = [m.upper() for m in case.get('expected_method_any', ['GET'])]

    # No expected-endpoint patterns means the endpoint criterion can never pass.
    if endpoint_patterns:
        endpoint_ok = any(
            pat.search(ep) for pat in endpoint_patterns for ep in parsed['endpoints']
        )
    else:
        endpoint_ok = False

    # With no observed methods, fall back to "was a plain GET acceptable?".
    if parsed['methods']:
        method_ok = any(m in allowed_methods for m in parsed['methods'])
    else:
        method_ok = 'GET' in allowed_methods

    tool_ok = 'hf_api_request' in parsed['tool_names']
    combined_output = stdout_clean + '\n' + stderr_clean
    success = proc.returncode == 0 and 'Traceback' not in combined_output
    clarity = len(parsed['merged']) > 20

    score = (
        4 * int(endpoint_ok)
        + 2 * int(method_ok)
        + 2 * int(tool_ok)
        + int(success)
        + int(clarity)
    )

    return {
        'id': case['id'],
        'prompt': prompt,
        'returncode': proc.returncode,
        'result_file': str(result_path),
        'observed': {
            'endpoints': parsed['endpoints'],
            'methods': parsed['methods'],
            'tool_names': parsed['tool_names'],
            'tool_calls_count': parsed['tool_calls_count'],
            'tool_error': parsed['tool_error'],
            'usage': parsed['usage'],
        },
        'expected': {
            'endpoint_any': case.get('expected_endpoint_any', []),
            'method_any': allowed_methods,
        },
        'eval': {
            'endpoint_ok': endpoint_ok,
            'method_ok': method_ok,
            'tool_ok': tool_ok,
            'success': success,
            'clarity': clarity,
            'score_total': score,
        },
        'merged': parsed['merged'],
    }
|
|
|
|
def summarize(rows: list[dict[str, Any]]) -> dict[str, Any]:
    """Aggregate per-case result rows into overall coverage metrics.

    Returns ``{'n_cases': 0}`` for an empty input; otherwise rates are
    fractions in [0, 1] rounded to 4 places and averages are per-case means.
    """
    count = len(rows)
    if not rows:
        return {'n_cases': 0}

    evals = [r['eval'] for r in rows]
    observed = [r['observed'] for r in rows]

    return {
        'n_cases': count,
        'endpoint_match_rate': round(sum(bool(e['endpoint_ok']) for e in evals) / count, 4),
        'method_match_rate': round(sum(bool(e['method_ok']) for e in evals) / count, 4),
        'tool_use_rate': round(sum(bool(e['tool_ok']) for e in evals) / count, 4),
        'success_rate': round(sum(bool(e['success']) for e in evals) / count, 4),
        'tool_error_rate': round(sum(bool(o['tool_error']) for o in observed) / count, 4),
        'avg_score_total': round(sum(e['score_total'] for e in evals) / count, 3),
        'avg_tool_calls': round(sum(o['tool_calls_count'] for o in observed) / count, 3),
        'avg_total_tokens': round(sum(int(o['usage'].get('total_tokens') or 0) for o in observed) / count, 1),
    }
|
|
|
|
def render_markdown(rows: list[dict[str, Any]], summary: dict[str, Any], model: str, agent: str) -> str:
    """Render the coverage report as Markdown: a summary section plus a per-case table."""
    lines: list[str] = [
        '# HF Hub Community Coverage Report',
        '',
        f'- Model: `{model}`',
        f'- Agent: `{agent}`',
        '',
        '## Summary',
        '',
    ]
    lines.append(f"- Cases: **{summary.get('n_cases', 0)}**")
    lines.append(f"- Endpoint match rate: **{summary.get('endpoint_match_rate')}**")
    lines.append(f"- Method match rate: **{summary.get('method_match_rate')}**")
    lines.append(f"- Tool use rate: **{summary.get('tool_use_rate')}**")
    lines.append(f"- Success rate: **{summary.get('success_rate')}**")
    lines.append(f"- Tool error rate: **{summary.get('tool_error_rate')}**")
    lines.append(f"- Avg score (/10): **{summary.get('avg_score_total')}**")
    lines.append(f"- Avg tool calls: **{summary.get('avg_tool_calls')}**")
    lines.append(f"- Avg total tokens: **{summary.get('avg_total_tokens')}**")
    lines.append('')
    lines.append('| # | Score | Endpoint OK | Method OK | Calls | Tokens | Prompt |')
    lines.append('|---|------:|------------:|----------:|------:|-------:|--------|')

    for row in rows:
        evaluation = row['eval']
        observed = row['observed']
        tokens = int(observed['usage'].get('total_tokens') or 0)
        # '|' would break the Markdown table, so it is replaced in the prompt cell.
        prompt_cell = row['prompt'][:72].replace('|', '/')
        lines.append(
            f"| {row['id']} | {evaluation['score_total']} | {int(evaluation['endpoint_ok'])} "
            f"| {int(evaluation['method_ok'])} | {observed['tool_calls_count']} | {tokens} | {prompt_cell} |"
        )

    return '\n'.join(lines) + '\n'
|
|
|
|
def main() -> None:
    """CLI entry point: run every coverage case, then write JSON and Markdown reports."""
    parser = argparse.ArgumentParser(description='Run endpoint-coverage pack for hf_hub_community')
    parser.add_argument('--model', default='gpt-oss')
    parser.add_argument('--agent', default=DEFAULT_AGENT)
    parser.add_argument('--agent-cards', type=Path, default=DEFAULT_CARDS_DIR)
    parser.add_argument('--cases', type=Path, default=PROMPTS_FILE)
    parser.add_argument('--timeout', type=int, default=240)
    parser.add_argument('--raw-results-dir', type=Path, default=ROOT / 'docs' / 'hf_hub_community_coverage_results')
    parser.add_argument('--json-out', type=Path, default=REPORT_JSON)
    parser.add_argument('--md-out', type=Path, default=REPORT_MD)
    opts = parser.parse_args()

    rows: list[dict[str, Any]] = []
    for case in load_cases(opts.cases):
        # One raw-results file per (agent, model, case); '/' in model names
        # would create subdirectories, so it is flattened to '_'.
        model_slug = opts.model.replace('/', '_')
        result_file = opts.raw_results_dir / f"coverage_{opts.agent}_{model_slug}_case_{case['id']:02d}.json"
        row = run_case(
            case=case,
            timeout_sec=opts.timeout,
            model=opts.model,
            agent_cards=opts.agent_cards,
            agent=opts.agent,
            result_path=result_file,
        )
        rows.append(row)
        print(f"[{case['id']}] score={row['eval']['score_total']}/10 endpoint_ok={row['eval']['endpoint_ok']} method_ok={row['eval']['method_ok']}")

    summary = summarize(rows)

    for out_path in (opts.json_out, opts.md_out):
        out_path.parent.mkdir(parents=True, exist_ok=True)
    opts.json_out.write_text(json.dumps({'summary': summary, 'rows': rows}, indent=2), encoding='utf-8')
    opts.md_out.write_text(render_markdown(rows, summary, model=opts.model, agent=opts.agent), encoding='utf-8')

    print(f"\nWrote:\n- {opts.json_out}\n- {opts.md_out}")
|
|
|
|
# Script entry point: run the coverage pack only when invoked directly.
if __name__ == '__main__':
    main()
|
|