# Agnuxo's picture
# feat: initial app.py
# 1b8f7a5 verified
"""
BenchClaw — Multi-judge tribunal evaluation harness for AI agents.
This Space is an interactive visualization of how a BenchClaw tribunal evaluates
agent output. The actual BenchClaw runs locally or as a service against your
agent pipeline; this demo shows what a verdict looks like.
"""
import json
import random
import zlib

import gradio as gr
# ── Static demo data (illustrative — real BenchClaw runs against 17 LLM judges) ──

# Display names of the judge panel; the demo samples a handful of these per verdict.
JUDGES = [
    "Claude 3.5 Sonnet",
    "Claude 3 Opus",
    "GPT-4o",
    "GPT-4 Turbo",
    "GPT-5 Preview",
    "Gemini 2.0 Pro",
    "Gemini Flash 2.0",
    "Mistral Large",
    "DeepSeek V3",
    "Qwen 2.5 72B",
    "Llama 3.3 70B",
    "Command R+",
    "Yi-34B",
    "Phi-4",
    "Inception Mercury",
    "Z.ai GLM-4",
    "Together Qwen-Coder",
]

# Scoring axes; each one receives its own score in a verdict.
DIMENSIONS = [
    "Technical accuracy",
    "Coherence & structure",
    "Novelty & insight",
    "Bias & fairness",
    "Evidence & citations",
    "Reasoning depth",
    "Practical applicability",
    "Safety & alignment",
    "Format compliance",
    "Honesty about limits",
]

# Labels of the deception checks that a verdict can report as triggered.
DECEPTION_DETECTORS = [
    "Fabrication scan (uncited claims)",
    "Hallucination scan (semantic drift)",
    "Contradiction scan (internal consistency)",
    "Confidence-vs-correctness mismatch",
    "Citation integrity (does the source exist?)",
    "Code correctness (does it compile/run?)",
    "Mathematical claims verifier",
    "Personal-data leakage detector",
]
# Sample agent output: pre-filled in the demo textbox and used as the fallback
# when the user submits empty input.
# NOTE(review): the sample looks intentionally flawed (it states the Riemann
# Hypothesis has been proven, and claims 100 zeros although range(1, 100)
# yields 99) — presumably so the demo has something to flag. Confirm before
# "correcting" the text.
EXAMPLE_AGENT_OUTPUT = """The Riemann zeta function ζ(s) has zeros only at negative even integers (-2, -4, -6, ...) and at complex numbers s = 1/2 + bi. This is the Riemann Hypothesis, which has been proven by Atiyah in 2018.
For a quick numerical verification, here's a Python snippet:
```python
import sympy
zeros = [sympy.zeta_zero(n) for n in range(1, 100)]
print(all(abs(z.real - 0.5) < 1e-10 for z in zeros))
```
This returns True for all 100 zeros checked, providing strong empirical support."""
def run_tribunal(text: str, seed: int = 42):
    """Generate an illustrative tribunal verdict for the input text.

    In production, BenchClaw makes 17 separate LLM calls (one per judge),
    aggregates scores per dimension, runs the 8 deception detectors, and
    surfaces a tribunal transcript. This demo synthesizes a representative
    verdict so visitors see the SHAPE of the output.

    Args:
        text: The agent output being "evaluated". It is not inspected here;
            all variation in the result comes from ``seed``.
        seed: Seed for the pseudo-random generator. The same seed always
            yields the same verdict.

    Returns:
        A 4-tuple ``(scores, overall, triggered, verdicts)``:
        ``scores`` maps each dimension to a score rounded to one decimal,
        ``overall`` is the mean of those scores rounded to two decimals,
        ``triggered`` is a list of 0-3 deception-detector labels, and
        ``verdicts`` maps up to five sampled judges to a score rounded to
        one decimal.
    """
    # Use a private generator instead of random.seed(): reseeding the module
    # RNG would clobber global state for the whole process and let concurrent
    # requests interleave their draws. Random(seed) yields the same sequence
    # the module-level functions would after random.seed(seed).
    rng = random.Random(seed)

    # Per-dimension scores (1-10)
    scores = {dim: round(rng.uniform(3.5, 9.5), 1) for dim in DIMENSIONS}
    overall = round(sum(scores.values()) / len(scores), 2) if scores else 0.0

    # Deception detectors — randomly trigger 0-3 (never more than exist)
    max_triggered = min(3, len(DECEPTION_DETECTORS))
    triggered = rng.sample(DECEPTION_DETECTORS, k=rng.randint(0, max_triggered))

    # Per-judge verdicts (sample 5 of the panel for display)
    sample_judges = rng.sample(JUDGES, min(5, len(JUDGES)))
    verdicts = {j: round(rng.uniform(3.0, 9.5), 1) for j in sample_judges}

    return scores, overall, triggered, verdicts
def _score_bar(score, width=10):
    """Render *score* as a *width*-cell bar; filled cells = integer part of the score."""
    filled = max(0, min(width, int(score)))
    return "█" * filled + "░" * (width - filled)


def format_verdict(text):
    """Return a Markdown report of the demo tribunal verdict for *text*.

    Empty or whitespace-only input (including ``None``) falls back to
    ``EXAMPLE_AGENT_OUTPUT``. The verdict is pseudo-random but reproducible:
    the seed is derived from the text itself, so the same input always
    produces the same report.

    Args:
        text: Agent output entered by the user.

    Returns:
        A Markdown string with the overall score, a sample of judge scores,
        the per-dimension table, and the list of triggered deception detectors.
    """
    if not text or not text.strip():
        text = EXAMPLE_AGENT_OUTPUT

    # Built-in hash() of a str is salted per process (PYTHONHASHSEED), so it
    # would hand the same text a different verdict after every restart.
    # CRC32 of the UTF-8 bytes is stable across runs and machines.
    seed = zlib.crc32(text.encode("utf-8", "surrogatepass")) & 0xFFFF
    scores, overall, triggered, verdicts = run_tribunal(text, seed=seed)

    parts = [
        "## Tribunal Verdict\n\n",
        f"**Overall score:** `{overall}/10` (mean across 10 dimensions, weighted equally)\n\n",
        "**Sample of 5 judges (of 17):**\n",
    ]
    parts.extend(
        f"- {judge:<24s} `{_score_bar(score)}` {score}/10\n"
        for judge, score in verdicts.items()
    )

    parts.append("\n**Per-dimension breakdown:**\n\n")
    parts.append("| Dimension | Score | |\n|---|---|---|\n")
    parts.extend(
        f"| {dim} | {score}/10 | `{_score_bar(score)}` |\n"
        for dim, score in scores.items()
    )

    parts.append(f"\n**Deception detectors triggered:** {len(triggered)}/8\n")
    if triggered:
        parts.extend(f"- ⚠️ {detector}\n" for detector in triggered)
    else:
        parts.append("- ✅ All 8 detectors passed\n")

    parts.append(
        "\n---\n*This is an illustrative verdict. Real BenchClaw runs 17 LLM judges and 8 deterministic deception detectors against your actual agent output.*"
    )
    return "".join(parts)
# Markdown header for the page; passed to gr.Markdown as the first component
# of the Blocks layout.
DESCRIPTION = """
# BenchClaw — agentic SRE evaluation harness
> **17-judge tribunal · 8 deception detectors · 10 scoring dimensions.** The eval layer your agent platform doesn't have — for when you have multiple autonomous agents producing output and you need to know which one to trust.
This Space shows the SHAPE of a tribunal verdict. The real BenchClaw runs as a Python package against your agent pipeline.
🌐 [GitHub](https://github.com/Agnuxo1/BenchClaw) · listed in [punkpeye/awesome-mcp-servers](https://github.com/punkpeye/awesome-mcp-servers) · [PyPI adapters](https://pypi.org/project/benchclaw-langchain/) for LangChain, LlamaIndex, CrewAI, AutoGen, SuperAGI
Part of the [OpenCLAW / P2PCLAW ecosystem](https://www.p2pclaw.com).
"""
# Gradio UI: the page header followed by three tabs (interactive demo,
# architecture explainer, install instructions).
# NOTE(review): the nesting below is reconstructed from statement order; in
# particular, whether `output_md` sits next to or inside the input column is
# an assumption — confirm against the deployed layout.
with gr.Blocks(title="BenchClaw — Multi-judge AI eval", theme=gr.themes.Soft()) as demo:
    gr.Markdown(DESCRIPTION)
    # Tab 1: text in, Markdown verdict out.
    with gr.Tab("Demo a verdict"):
        gr.Markdown("Paste any agent output. The tribunal runs against the example by default. *Output is illustrative — real BenchClaw makes actual LLM calls.*")
        with gr.Row():
            with gr.Column():
                input_text = gr.Textbox(
                    label="Agent output to evaluate",
                    lines=10,
                    value=EXAMPLE_AGENT_OUTPUT,
                )
                submit_btn = gr.Button("⚖️ Run tribunal", variant="primary")
            output_md = gr.Markdown(label="Verdict")
        # Clicking the button sends the textbox contents to format_verdict and
        # renders the returned Markdown in output_md.
        submit_btn.click(fn=format_verdict, inputs=[input_text], outputs=[output_md])
    # Tab 2: static explainer. It is an f-string so the judge, dimension and
    # detector bullet lists are generated from the module-level constants.
    with gr.Tab("Architecture"):
        # chr(10) is "\n"; presumably used because f-string expressions could
        # not contain a backslash before Python 3.12.
        gr.Markdown(f"""
### How a BenchClaw tribunal works
```
agent output → [17 LLM judges in parallel] → per-dimension score aggregation
[8 deterministic deception detectors]
tribunal transcript + numerical verdict + flagged issues
```
### The 17 judges (full panel)
BenchClaw uses an intentionally heterogeneous panel — different model families catch different failure modes:
{chr(10).join(f"- {j}" for j in JUDGES)}
Each judge returns a per-dimension score. The aggregation is **trimmed mean** by default (drops the highest and lowest score per dimension to mute outliers); a `--strategy=median` flag is available for stricter setups.
### The 10 scoring dimensions
{chr(10).join(f"- {d}" for d in DIMENSIONS)}
### The 8 deception detectors
{chr(10).join(f"- {d}" for d in DECEPTION_DETECTORS)}
Detectors are **deterministic** (regex / parsers / actual code execution / web fact-check), not LLM-based. They run in parallel with the judges.
### Cost & latency (honest)
- Cost per evaluation: **$0.04 – $0.08** (depending on input size)
- Latency per evaluation: **12 – 30 seconds** (parallel API calls dominate)
- For high-volume pipelines: a `--strategy=fast` flag uses 3 judges + 4 detectors (~3s, ~$0.01)
### Known limitations
- **Bias is surfaced, not eliminated.** The 17-judge panel reduces single-model bias but doesn't remove it.
- **Calibration drift over time** — model providers update silently; recalibrate quarterly.
- **22% false-negative rate** on the deception detectors against an internal red-team corpus. Not bulletproof.
- **Hand-tuned 1.5/10 disagreement threshold** — surfaces verdicts where judges disagree by ≥1.5 points; below that, calibration noise dominates.
""")
    # Tab 3: static install / integration notes (plain string, no interpolation).
    with gr.Tab("Install & integrate"):
        gr.Markdown("""
### Install
```bash
pip install benchclaw
```
### Use with LangChain
```python
from benchclaw_langchain import BenchClawCallback
from langchain_openai import ChatOpenAI
llm = ChatOpenAI(callbacks=[BenchClawCallback()])
# Every LLM output is evaluated by the tribunal asynchronously
```
### Available framework adapters (PyPI)
- `benchclaw-langchain`
- `benchclaw-llamaindex`
- `benchclaw-crewai`
- `benchclaw-autogen`
- `benchclaw-superagi`
Repo: https://github.com/Agnuxo1/benchclaw-integrations
### Companion projects
- 🛡 [EnigmAgent](https://huggingface.co/spaces/Agnuxo/EnigmAgent-MCP-Demo) — keep the credentials your agents use out of LLM context
- 📡 [P2PCLAW](https://www.p2pclaw.com) — the decentralized network where agents publish, judge, and verify each other
Built solo from Spain by [Francisco Angulo de Lafuente](https://github.com/Agnuxo1).
""")
# Launch the app only when this file is executed directly, not when imported.
if __name__ == "__main__":
    demo.launch()