Spaces:

Agnuxo
/

BenchClaw-Tribunal-Demo

Sleeping

App Files Files Community

BenchClaw-Tribunal-Demo / app.py

Agnuxo

feat: initial app.py

1b8f7a5 verified 30 days ago

raw

history blame contribute delete

8.76 kB

	"""
	BenchClaw — Multi-judge tribunal evaluation harness for AI agents.

	This Space is an interactive visualization of how a BenchClaw tribunal evaluates
	agent output. The actual BenchClaw runs locally or as a service against your
	agent pipeline; this demo shows what a verdict looks like.
	"""

	import json
	import random
	import zlib

	import gradio as gr

	# ── Static demo data (illustrative — real BenchClaw runs against 17 LLM judges) ──

	# Display names of the full judge panel; order is preserved because the
	# seeded sampling in run_tribunal depends on list positions.
	JUDGES = [
	    # Anthropic
	    "Claude 3.5 Sonnet",
	    "Claude 3 Opus",
	    # OpenAI
	    "GPT-4o",
	    "GPT-4 Turbo",
	    "GPT-5 Preview",
	    # Google
	    "Gemini 2.0 Pro",
	    "Gemini Flash 2.0",
	    # Other providers / open-weight models
	    "Mistral Large",
	    "DeepSeek V3",
	    "Qwen 2.5 72B",
	    "Llama 3.3 70B",
	    "Command R+",
	    "Yi-34B",
	    "Phi-4",
	    "Inception Mercury",
	    "Z.ai GLM-4",
	    "Together Qwen-Coder",
	]

	# Scoring dimensions; each one gets its own score in a verdict and its own
	# row in the rendered per-dimension table.
	DIMENSIONS = [
	    "Technical accuracy",
	    "Coherence & structure",
	    "Novelty & insight",
	    "Bias & fairness",
	    "Evidence & citations",
	    "Reasoning depth",
	    "Practical applicability",
	    "Safety & alignment",
	    "Format compliance",
	    "Honesty about limits",
	]

	# Labels of the deception detectors; run_tribunal samples from this list to
	# decide which ones appear as "triggered" in a demo verdict.
	DECEPTION_DETECTORS = [
	    "Fabrication scan (uncited claims)",
	    "Hallucination scan (semantic drift)",
	    "Contradiction scan (internal consistency)",
	    "Confidence-vs-correctness mismatch",
	    "Citation integrity (does the source exist?)",
	    "Code correctness (does it compile/run?)",
	    "Mathematical claims verifier",
	    "Personal-data leakage detector",
	]

	# Sample agent answer: used as the textbox default in the UI and as the
	# fallback input when format_verdict receives blank text.
	# NOTE(review): the sample makes claims that look deliberately dubious (e.g. a
	# 2018 proof of the Riemann Hypothesis) — presumably so the demo has something
	# to flag; confirm before "correcting" the text.
	EXAMPLE_AGENT_OUTPUT = """The Riemann zeta function ζ(s) has zeros only at negative even integers (-2, -4, -6, ...) and at complex numbers s = 1/2 + bi. This is the Riemann Hypothesis, which has been proven by Atiyah in 2018.

	For a quick numerical verification, here's a Python snippet:

	```python
	import sympy
	zeros = [sympy.zeta_zero(n) for n in range(1, 100)]
	print(all(abs(z.real - 0.5) < 1e-10 for z in zeros))
	```

	This returns True for all 100 zeros checked, providing strong empirical support."""


	def run_tribunal(text: str, seed: int = 42):
	    """Generate an illustrative tribunal verdict for the input text.

	    In production, BenchClaw makes 17 separate LLM calls (one per judge),
	    aggregates scores per dimension, runs the 8 deception detectors, and
	    surfaces a tribunal transcript. This demo synthesizes a representative
	    verdict from a seeded random generator so visitors see the SHAPE of the
	    output.

	    Args:
	        text: The agent output being "evaluated". It is not inspected here;
	            the verdict depends only on ``seed``.
	        seed: Seed for the random generator; equal seeds give equal verdicts.

	    Returns:
	        A 4-tuple ``(scores, overall, triggered, verdicts)``:
	        ``scores`` maps each entry of ``DIMENSIONS`` to a one-decimal score,
	        ``overall`` is their mean rounded to two decimals, ``triggered`` is a
	        list of 0-3 entries from ``DECEPTION_DETECTORS``, and ``verdicts``
	        maps 5 sampled ``JUDGES`` to a one-decimal score.
	    """
	    # A private generator yields the same sequence as random.seed(seed) would,
	    # but leaves the process-wide RNG untouched and cannot be interleaved with
	    # draws made by other request handlers running at the same time.
	    rng = random.Random(seed)

	    # Per-dimension scores, drawn from 3.5-9.5 on the 10-point scale.
	    scores = {dim: round(rng.uniform(3.5, 9.5), 1) for dim in DIMENSIONS}
	    overall = round(sum(scores.values()) / len(scores), 2)

	    # Deception detectors: between 0 and 3 are reported as triggered.
	    triggered = rng.sample(DECEPTION_DETECTORS, k=rng.randint(0, 3))

	    # Per-judge verdicts: only 5 judges of the panel are shown.
	    sample_judges = rng.sample(JUDGES, 5)
	    verdicts = {judge: round(rng.uniform(3.0, 9.5), 1) for judge in sample_judges}

	    return scores, overall, triggered, verdicts


	def _score_bar(score, width=10):
	    """Return a ``width``-character bar with ``int(score)`` filled cells.

	    The score is truncated (not rounded) and clamped to ``0..width`` so the
	    bar always has exactly ``width`` characters.
	    """
	    filled = max(0, min(width, int(score)))
	    return "█" * filled + "░" * (width - filled)


	def format_verdict(text):
	    """Render an illustrative tribunal verdict for *text* as Markdown.

	    Blank or missing input falls back to ``EXAMPLE_AGENT_OUTPUT``. The seed
	    handed to ``run_tribunal`` is derived from a CRC-32 of the text, so the
	    same input produces the same verdict in every process.

	    Args:
	        text: Agent output typed into the UI; may be ``None`` or empty.

	    Returns:
	        A Markdown string containing the overall score, the sampled judges'
	        scores, a per-dimension table and the triggered deception detectors.
	    """
	    if not text or not text.strip():
	        text = EXAMPLE_AGENT_OUTPUT

	    # The built-in hash() of a str is salted per interpreter process, so it
	    # would give the same text a different verdict after every restart.
	    # CRC-32 is stable; "replace" keeps odd input (lone surrogates) from raising.
	    seed = zlib.crc32(text.encode("utf-8", "replace")) & 0xFFFF
	    scores, overall, triggered, verdicts = run_tribunal(text, seed=seed)

	    n_judges = len(JUDGES)
	    n_detectors = len(DECEPTION_DETECTORS)

	    parts = [
	        "## Tribunal Verdict\n\n",
	        f"Overall score: `{overall}/10` "
	        f"(mean across {len(scores)} dimensions, weighted equally)\n\n",
	        f"Sample of {len(verdicts)} judges (of {n_judges}):\n",
	    ]
	    parts.extend(
	        f"- {judge:<24s} `{_score_bar(score)}` {score}/10\n"
	        for judge, score in verdicts.items()
	    )

	    parts.append("\nPer-dimension breakdown:\n\n")
	    # Plain pipes are required for a Markdown table; a backslash-escaped pipe
	    # is rendered as a literal character and is an invalid Python escape.
	    parts.append("| Dimension | Score | |\n|---|---|---|\n")
	    parts.extend(
	        f"| {dim} | {score}/10 | `{_score_bar(score)}` |\n"
	        for dim, score in scores.items()
	    )

	    parts.append(f"\nDeception detectors triggered: {len(triggered)}/{n_detectors}\n")
	    if triggered:
	        parts.extend(f"- ⚠️ {name}\n" for name in triggered)
	    else:
	        parts.append(f"- ✅ All {n_detectors} detectors passed\n")

	    parts.append(
	        "\n---\nThis is an illustrative verdict. "
	        f"Real BenchClaw runs {n_judges} LLM judges and {n_detectors} deterministic "
	        "deception detectors against your actual agent output."
	    )
	    return "".join(parts)


	# Markdown header for the page; rendered once via gr.Markdown(DESCRIPTION)
	# above the tabs.
	# NOTE(review): the heading calls this an "agentic SRE evaluation harness"
	# while the module docstring describes a multi-judge tribunal evaluation
	# harness for AI agents — confirm which wording is intended.
	DESCRIPTION = """
	# BenchClaw — agentic SRE evaluation harness

	> 17-judge tribunal · 8 deception detectors · 10 scoring dimensions. The eval layer your agent platform doesn't have — for when you have multiple autonomous agents producing output and you need to know which one to trust.

	This Space shows the SHAPE of a tribunal verdict. The real BenchClaw runs as a Python package against your agent pipeline.

	🌐 [GitHub](https://github.com/Agnuxo1/BenchClaw) · listed in [punkpeye/awesome-mcp-servers](https://github.com/punkpeye/awesome-mcp-servers) · [PyPI adapters](https://pypi.org/project/benchclaw-langchain/) for LangChain, LlamaIndex, CrewAI, AutoGen, SuperAGI

	Part of the [OpenCLAW / P2PCLAW ecosystem](https://www.p2pclaw.com).
	"""

	# Build the Gradio UI: the DESCRIPTION header followed by three tabs
	# ("Demo a verdict", "Architecture", "Install & integrate"). The Blocks
	# object is bound to `demo`, which the __main__ guard launches.
	with gr.Blocks(title="BenchClaw — Multi-judge AI eval", theme=gr.themes.Soft()) as demo:
	gr.Markdown(DESCRIPTION)

	# Tab 1: a textbox pre-filled with the example output, a button, and a
	# Markdown area that receives the rendered verdict.
	with gr.Tab("Demo a verdict"):
	gr.Markdown("Paste any agent output. The tribunal runs against the example by default. Output is illustrative — real BenchClaw makes actual LLM calls.")
	with gr.Row():
	with gr.Column():
	input_text = gr.Textbox(
	label="Agent output to evaluate",
	lines=10,
	value=EXAMPLE_AGENT_OUTPUT,
	)
	submit_btn = gr.Button("⚖️ Run tribunal", variant="primary")
	output_md = gr.Markdown(label="Verdict")
	# Clicking the button passes the textbox contents to format_verdict and
	# shows the returned Markdown in output_md.
	submit_btn.click(fn=format_verdict, inputs=[input_text], outputs=[output_md])

	# Tab 2: static explanation. The f-string expands JUDGES, DIMENSIONS and
	# DECEPTION_DETECTORS into bullet lists; chr(10) is a newline character
	# (f-string expressions could not contain a backslash before Python 3.12).
	with gr.Tab("Architecture"):
	gr.Markdown(f"""
	### How a BenchClaw tribunal works

	```
	agent output → [17 LLM judges in parallel] → per-dimension score aggregation
	↓
	[8 deterministic deception detectors]
	↓
	tribunal transcript + numerical verdict + flagged issues
	```

	### The 17 judges (full panel)

	BenchClaw uses an intentionally heterogeneous panel — different model families catch different failure modes:

	{chr(10).join(f"- {j}" for j in JUDGES)}

	Each judge returns a per-dimension score. The aggregation is trimmed mean by default (drops the highest and lowest score per dimension to mute outliers); a `--strategy=median` flag is available for stricter setups.

	### The 10 scoring dimensions

	{chr(10).join(f"- {d}" for d in DIMENSIONS)}

	### The 8 deception detectors

	{chr(10).join(f"- {d}" for d in DECEPTION_DETECTORS)}

	Detectors are deterministic (regex / parsers / actual code execution / web fact-check), not LLM-based. They run in parallel with the judges.

	### Cost & latency (honest)

	- Cost per evaluation: $0.04 – $0.08 (depending on input size)
	- Latency per evaluation: 12 – 30 seconds (parallel API calls dominate)
	- For high-volume pipelines: a `--strategy=fast` flag uses 3 judges + 4 detectors (~3s, ~$0.01)

	### Known limitations

	- Bias is surfaced, not eliminated. The 17-judge panel reduces single-model bias but doesn't remove it.
	- Calibration drift over time — model providers update silently; recalibrate quarterly.
	- 22% false-negative rate on the deception detectors against an internal red-team corpus. Not bulletproof.
	- Hand-tuned 1.5/10 disagreement threshold — surfaces verdicts where judges disagree by ≥1.5 points; below that, calibration noise dominates.
	""")

	# Tab 3: static install and integration instructions (plain string, no
	# interpolation).
	with gr.Tab("Install & integrate"):
	gr.Markdown("""
	### Install

	```bash
	pip install benchclaw
	```

	### Use with LangChain

	```python
	from benchclaw_langchain import BenchClawCallback
	from langchain_openai import ChatOpenAI

	llm = ChatOpenAI(callbacks=[BenchClawCallback()])
	# Every LLM output is evaluated by the tribunal asynchronously
	```

	### Available framework adapters (PyPI)

	- `benchclaw-langchain`
	- `benchclaw-llamaindex`
	- `benchclaw-crewai`
	- `benchclaw-autogen`
	- `benchclaw-superagi`

	Repo: https://github.com/Agnuxo1/benchclaw-integrations

	### Companion projects

	- 🛡 [EnigmAgent](https://huggingface.co/spaces/Agnuxo/EnigmAgent-MCP-Demo) — keep the credentials your agents use out of LLM context
	- 📡 [P2PCLAW](https://www.p2pclaw.com) — the decentralized network where agents publish, judge, and verify each other

	Built solo from Spain by [Francisco Angulo de Lafuente](https://github.com/Agnuxo1).
	""")


	# Launch the Gradio app only when this file is run as a script, so importing
	# the module does not start a server.
	if __name__ == "__main__":
	    demo.launch()