""" BenchClaw — Multi-judge tribunal evaluation harness for AI agents. This Space is an interactive visualization of how a BenchClaw tribunal evaluates agent output. The actual BenchClaw runs locally or as a service against your agent pipeline; this demo shows what a verdict looks like. """ import json import random import gradio as gr # ── Static demo data (illustrative — real BenchClaw runs against 17 LLM judges) ── JUDGES = [ "Claude 3.5 Sonnet", "Claude 3 Opus", "GPT-4o", "GPT-4 Turbo", "GPT-5 Preview", "Gemini 2.0 Pro", "Gemini Flash 2.0", "Mistral Large", "DeepSeek V3", "Qwen 2.5 72B", "Llama 3.3 70B", "Command R+", "Yi-34B", "Phi-4", "Inception Mercury", "Z.ai GLM-4", "Together Qwen-Coder", ] DIMENSIONS = [ "Technical accuracy", "Coherence & structure", "Novelty & insight", "Bias & fairness", "Evidence & citations", "Reasoning depth", "Practical applicability", "Safety & alignment", "Format compliance", "Honesty about limits", ] DECEPTION_DETECTORS = [ "Fabrication scan (uncited claims)", "Hallucination scan (semantic drift)", "Contradiction scan (internal consistency)", "Confidence-vs-correctness mismatch", "Citation integrity (does the source exist?)", "Code correctness (does it compile/run?)", "Mathematical claims verifier", "Personal-data leakage detector", ] EXAMPLE_AGENT_OUTPUT = """The Riemann zeta function ζ(s) has zeros only at negative even integers (-2, -4, -6, ...) and at complex numbers s = 1/2 + bi. This is the Riemann Hypothesis, which has been proven by Atiyah in 2018. For a quick numerical verification, here's a Python snippet: ```python import sympy zeros = [sympy.zeta_zero(n) for n in range(1, 100)] print(all(abs(z.real - 0.5) < 1e-10 for z in zeros)) ``` This returns True for all 100 zeros checked, providing strong empirical support.""" def run_tribunal(text: str, seed: int = 42): """Generate an illustrative tribunal verdict for the input text. In production, BenchClaw makes 17 separate LLM calls (one per judge), aggregates scores per dimension, runs the 8 deception detectors, and surfaces a tribunal transcript. This demo synthesizes a representative verdict statically so visitors see the SHAPE of the output. """ random.seed(seed) # Per-dimension scores (1-10) scores = {dim: round(random.uniform(3.5, 9.5), 1) for dim in DIMENSIONS} overall = round(sum(scores.values()) / len(scores), 2) # Deception detectors — randomly trigger 0-3 triggered = random.sample(DECEPTION_DETECTORS, k=random.randint(0, 3)) # Per-judge verdicts (sample 5 of 17 for display) sample_judges = random.sample(JUDGES, 5) verdicts = {j: round(random.uniform(3.0, 9.5), 1) for j in sample_judges} return scores, overall, triggered, verdicts def format_verdict(text): if not text or not text.strip(): text = EXAMPLE_AGENT_OUTPUT scores, overall, triggered, verdicts = run_tribunal(text, seed=hash(text) & 0xFFFF) md = f"## Tribunal Verdict\n\n" md += f"**Overall score:** `{overall}/10` (mean across 10 dimensions, weighted equally)\n\n" md += f"**Sample of 5 judges (of 17):**\n" for j, s in verdicts.items(): bar = "█" * int(s) + "░" * (10 - int(s)) md += f"- {j:<24s} `{bar}` {s}/10\n" md += f"\n**Per-dimension breakdown:**\n\n" md += "| Dimension | Score | |\n|---|---|---|\n" for dim, s in scores.items(): bar = "█" * int(s) + "░" * (10 - int(s)) md += f"| {dim} | {s}/10 | `{bar}` |\n" md += f"\n**Deception detectors triggered:** {len(triggered)}/8\n" if triggered: for d in triggered: md += f"- ⚠️ {d}\n" else: md += "- ✅ All 8 detectors passed\n" md += "\n---\n*This is an illustrative verdict. Real BenchClaw runs 17 LLM judges and 8 deterministic deception detectors against your actual agent output.*" return md DESCRIPTION = """ # BenchClaw — agentic SRE evaluation harness > **17-judge tribunal · 8 deception detectors · 10 scoring dimensions.** The eval layer your agent platform doesn't have — for when you have multiple autonomous agents producing output and you need to know which one to trust. This Space shows the SHAPE of a tribunal verdict. The real BenchClaw runs as a Python package against your agent pipeline. 🌐 [GitHub](https://github.com/Agnuxo1/BenchClaw) · listed in [punkpeye/awesome-mcp-servers](https://github.com/punkpeye/awesome-mcp-servers) · [PyPI adapters](https://pypi.org/project/benchclaw-langchain/) for LangChain, LlamaIndex, CrewAI, AutoGen, SuperAGI Part of the [OpenCLAW / P2PCLAW ecosystem](https://www.p2pclaw.com). """ with gr.Blocks(title="BenchClaw — Multi-judge AI eval", theme=gr.themes.Soft()) as demo: gr.Markdown(DESCRIPTION) with gr.Tab("Demo a verdict"): gr.Markdown("Paste any agent output. The tribunal runs against the example by default. *Output is illustrative — real BenchClaw makes actual LLM calls.*") with gr.Row(): with gr.Column(): input_text = gr.Textbox( label="Agent output to evaluate", lines=10, value=EXAMPLE_AGENT_OUTPUT, ) submit_btn = gr.Button("⚖️ Run tribunal", variant="primary") output_md = gr.Markdown(label="Verdict") submit_btn.click(fn=format_verdict, inputs=[input_text], outputs=[output_md]) with gr.Tab("Architecture"): gr.Markdown(f""" ### How a BenchClaw tribunal works ``` agent output → [17 LLM judges in parallel] → per-dimension score aggregation ↓ [8 deterministic deception detectors] ↓ tribunal transcript + numerical verdict + flagged issues ``` ### The 17 judges (full panel) BenchClaw uses an intentionally heterogeneous panel — different model families catch different failure modes: {chr(10).join(f"- {j}" for j in JUDGES)} Each judge returns a per-dimension score. The aggregation is **trimmed mean** by default (drops the highest and lowest score per dimension to mute outliers); a `--strategy=median` flag is available for stricter setups. ### The 10 scoring dimensions {chr(10).join(f"- {d}" for d in DIMENSIONS)} ### The 8 deception detectors {chr(10).join(f"- {d}" for d in DECEPTION_DETECTORS)} Detectors are **deterministic** (regex / parsers / actual code execution / web fact-check), not LLM-based. They run in parallel with the judges. ### Cost & latency (honest) - Cost per evaluation: **$0.04 – $0.08** (depending on input size) - Latency per evaluation: **12 – 30 seconds** (parallel API calls dominate) - For high-volume pipelines: a `--strategy=fast` flag uses 3 judges + 4 detectors (~3s, ~$0.01) ### Known limitations - **Bias is surfaced, not eliminated.** The 17-judge panel reduces single-model bias but doesn't remove it. - **Calibration drift over time** — model providers update silently; recalibrate quarterly. - **22% false-negative rate** on the deception detectors against an internal red-team corpus. Not bulletproof. - **Hand-tuned 1.5/10 disagreement threshold** — surfaces verdicts where judges disagree by ≥1.5 points; below that, calibration noise dominates. """) with gr.Tab("Install & integrate"): gr.Markdown(""" ### Install ```bash pip install benchclaw ``` ### Use with LangChain ```python from benchclaw_langchain import BenchClawCallback from langchain_openai import ChatOpenAI llm = ChatOpenAI(callbacks=[BenchClawCallback()]) # Every LLM output is evaluated by the tribunal asynchronously ``` ### Available framework adapters (PyPI) - `benchclaw-langchain` - `benchclaw-llamaindex` - `benchclaw-crewai` - `benchclaw-autogen` - `benchclaw-superagi` Repo: https://github.com/Agnuxo1/benchclaw-integrations ### Companion projects - 🛡 [EnigmAgent](https://huggingface.co/spaces/Agnuxo/EnigmAgent-MCP-Demo) — keep the credentials your agents use out of LLM context - 📡 [P2PCLAW](https://www.p2pclaw.com) — the decentralized network where agents publish, judge, and verify each other Built solo from Spain by [Francisco Angulo de Lafuente](https://github.com/Agnuxo1). """) if __name__ == "__main__": demo.launch()