"""
BenchClaw — Multi-judge tribunal evaluation harness for AI agents.

This Space is an interactive visualization of how a BenchClaw tribunal evaluates
agent output. The actual BenchClaw runs locally or as a service against your
agent pipeline; this demo shows what a verdict looks like.
"""

import json
import random
import textwrap
import zlib

import gradio as gr

# ── Static demo data (illustrative — real BenchClaw runs against 17 LLM judges) ──

# Display names of the full judge panel. The demo verdict samples five of
# these per run, and the Architecture tab lists all of them.
JUDGES = [
    "Claude 3.5 Sonnet",
    "Claude 3 Opus",
    "GPT-4o",
    "GPT-4 Turbo",
    "GPT-5 Preview",
    "Gemini 2.0 Pro",
    "Gemini Flash 2.0",
    "Mistral Large",
    "DeepSeek V3",
    "Qwen 2.5 72B",
    "Llama 3.3 70B",
    "Command R+",
    "Yi-34B",
    "Phi-4",
    "Inception Mercury",
    "Z.ai GLM-4",
    "Together Qwen-Coder",
]

# Scoring dimensions. run_tribunal() produces one score per entry (in this
# order), and the Architecture tab lists them verbatim.
DIMENSIONS = [
    "Technical accuracy",
    "Coherence & structure",
    "Novelty & insight",
    "Bias & fairness",
    "Evidence & citations",
    "Reasoning depth",
    "Practical applicability",
    "Safety & alignment",
    "Format compliance",
    "Honesty about limits",
]

# Display names of the deception detectors. In this demo they are labels only:
# run_tribunal() randomly marks up to three of them as "triggered", and the
# Architecture tab lists them verbatim.
DECEPTION_DETECTORS = [
    "Fabrication scan (uncited claims)",
    "Hallucination scan (semantic drift)",
    "Contradiction scan (internal consistency)",
    "Confidence-vs-correctness mismatch",
    "Citation integrity (does the source exist?)",
    "Code correctness (does it compile/run?)",
    "Mathematical claims verifier",
    "Personal-data leakage detector",
]

# Sample agent answer: the input textbox's initial value, and the fallback
# format_verdict() uses when the submitted text is empty or whitespace-only.
# NOTE(review): the sample looks deliberately flawed — it states the Riemann
# Hypothesis has been proven, and `range(1, 100)` yields 99 indices while the
# prose says 100 — presumably so a tribunal has something to flag. Confirm
# this is intentional before "correcting" the text.
EXAMPLE_AGENT_OUTPUT = """The Riemann zeta function ζ(s) has zeros only at negative even integers (-2, -4, -6, ...) and at complex numbers s = 1/2 + bi. This is the Riemann Hypothesis, which has been proven by Atiyah in 2018.

For a quick numerical verification, here's a Python snippet:

```python
import sympy
zeros = [sympy.zeta_zero(n) for n in range(1, 100)]
print(all(abs(z.real - 0.5) < 1e-10 for z in zeros))
```

This returns True for all 100 zeros checked, providing strong empirical support."""


def run_tribunal(text: str, seed: int = 42):
    """Generate an illustrative tribunal verdict for the input text.

    In production, BenchClaw makes 17 separate LLM calls (one per judge),
    aggregates scores per dimension, runs the 8 deception detectors, and
    surfaces a tribunal transcript. This demo synthesizes a representative
    verdict statically so visitors see the SHAPE of the output.

    Args:
        text: Agent output under evaluation. Accepted for interface parity
            with a real tribunal; this demo does not inspect it, so the
            result depends on ``seed`` alone.
        seed: Seed for the pseudo-random verdict. The same seed always
            yields the same verdict.

    Returns:
        A 4-tuple ``(scores, overall, triggered, verdicts)``:

        * ``scores`` -- dict mapping each entry of ``DIMENSIONS`` to a score
          drawn uniformly from 3.5-9.5 and rounded to one decimal.
        * ``overall`` -- unweighted mean of ``scores``, rounded to two
          decimals.
        * ``triggered`` -- list of 0-3 names sampled from
          ``DECEPTION_DETECTORS``.
        * ``verdicts`` -- dict mapping up to five sampled ``JUDGES`` to a
          score drawn uniformly from 3.0-9.5 and rounded to one decimal.
    """
    # Use a private generator instead of random.seed(): reseeding the
    # module-level RNG would clobber state shared with every other user of
    # `random` in the process, and concurrent requests could interleave
    # their draws and break per-seed determinism. Random(seed) produces the
    # same sequence that random.seed(seed) would.
    rng = random.Random(seed)

    # One synthetic score per dimension, then their plain average.
    scores = {dim: round(rng.uniform(3.5, 9.5), 1) for dim in DIMENSIONS}
    overall = round(sum(scores.values()) / len(scores), 2)

    # Randomly mark 0-3 detectors as triggered (never more than exist).
    max_triggered = min(3, len(DECEPTION_DETECTORS))
    triggered = rng.sample(DECEPTION_DETECTORS, k=rng.randint(0, max_triggered))

    # Show only a handful of judges rather than the whole panel.
    shown_judges = rng.sample(JUDGES, min(5, len(JUDGES)))
    verdicts = {judge: round(rng.uniform(3.0, 9.5), 1) for judge in shown_judges}

    return scores, overall, triggered, verdicts


def _score_bar(score, width=10):
    """Render *score* as a ``width``-cell bar: filled cells, then empty ones."""
    filled = int(score)  # truncate: 7.9 still shows seven filled cells
    return "█" * filled + "░" * (width - filled)


def format_verdict(text):
    """Build the Markdown verdict shown in the "Demo a verdict" tab.

    Args:
        text: Agent output pasted by the visitor. ``None``, empty, or
            whitespace-only input falls back to ``EXAMPLE_AGENT_OUTPUT``.

    Returns:
        A Markdown string with the overall score, the sampled judges'
        scores, a per-dimension table, and the triggered deception
        detectors, followed by a disclaimer that the verdict is illustrative.
    """
    if not text or not text.strip():
        text = EXAMPLE_AGENT_OUTPUT

    # Derive the seed from a stable checksum. The builtin hash() of a str is
    # salted per interpreter process, so it would give the same text a
    # different verdict after every restart. "surrogatepass" keeps encoding
    # from failing on lone surrogates in pasted input.
    seed = zlib.crc32(text.encode("utf-8", "surrogatepass")) & 0xFFFF
    scores, overall, triggered, verdicts = run_tribunal(text, seed=seed)

    detector_total = len(DECEPTION_DETECTORS)

    # Collect fragments and join once; each fragment carries its own newlines.
    parts = [
        "## Tribunal Verdict\n\n",
        f"**Overall score:** `{overall}/10` (mean across {len(scores)} dimensions, weighted equally)\n\n",
        f"**Sample of {len(verdicts)} judges (of {len(JUDGES)}):**\n",
    ]
    parts.extend(
        f"- {judge:<24s} `{_score_bar(score)}` {score}/10\n"
        for judge, score in verdicts.items()
    )

    parts.append("\n**Per-dimension breakdown:**\n\n")
    parts.append("| Dimension | Score | |\n|---|---|---|\n")
    parts.extend(
        f"| {dim} | {score}/10 | `{_score_bar(score)}` |\n"
        for dim, score in scores.items()
    )

    parts.append(f"\n**Deception detectors triggered:** {len(triggered)}/{detector_total}\n")
    if triggered:
        parts.extend(f"- ⚠️ {name}\n" for name in triggered)
    else:
        parts.append(f"- ✅ All {detector_total} detectors passed\n")

    parts.append("\n---\n*This is an illustrative verdict. Real BenchClaw runs 17 LLM judges and 8 deterministic deception detectors against your actual agent output.*")
    return "".join(parts)


# Markdown header rendered at the top of the page, above the tabs.
DESCRIPTION = """
# BenchClaw — agentic SRE evaluation harness

> **17-judge tribunal · 8 deception detectors · 10 scoring dimensions.** The eval layer your agent platform doesn't have — for when you have multiple autonomous agents producing output and you need to know which one to trust.

This Space shows the SHAPE of a tribunal verdict. The real BenchClaw runs as a Python package against your agent pipeline.

🌐 [GitHub](https://github.com/Agnuxo1/BenchClaw) · listed in [punkpeye/awesome-mcp-servers](https://github.com/punkpeye/awesome-mcp-servers) · [PyPI adapters](https://pypi.org/project/benchclaw-langchain/) for LangChain, LlamaIndex, CrewAI, AutoGen, SuperAGI

Part of the [OpenCLAW / P2PCLAW ecosystem](https://www.p2pclaw.com).
"""

def _bullet_list(items):
    """Return *items* as a Markdown bullet list, one ``- item`` per line."""
    return "\n".join(f"- {item}" for item in items)


# Page layout: a header followed by three tabs (interactive demo, architecture
# notes, install instructions). `demo` is the Blocks app launched by the
# script entry point.
with gr.Blocks(title="BenchClaw — Multi-judge AI eval", theme=gr.themes.Soft()) as demo:
    gr.Markdown(DESCRIPTION)

    with gr.Tab("Demo a verdict"):
        gr.Markdown("Paste any agent output. The tribunal runs against the example by default. *Output is illustrative — real BenchClaw makes actual LLM calls.*")
        with gr.Row():
            with gr.Column():
                input_text = gr.Textbox(
                    label="Agent output to evaluate",
                    lines=10,
                    value=EXAMPLE_AGENT_OUTPUT,
                )
                submit_btn = gr.Button("⚖️ Run tribunal", variant="primary")
            output_md = gr.Markdown(label="Verdict")
        # Clicking the button renders the formatted verdict into the right-hand pane.
        submit_btn.click(fn=format_verdict, inputs=[input_text], outputs=[output_md])

    with gr.Tab("Architecture"):
        # Dedent the template BEFORE substituting the bullet lists. If the
        # multi-line lists are interpolated into the indented literal first,
        # list items 2..n sit at column 0, no common margin is left to strip,
        # and every other line keeps its 8-space indent — which Markdown
        # renders as code blocks instead of headings and paragraphs.
        architecture_md = textwrap.dedent("""
        ### How a BenchClaw tribunal works

        ```
        agent output  →  [17 LLM judges in parallel]  →  per-dimension score aggregation
                      ↓
                      [8 deterministic deception detectors]
                      ↓
                      tribunal transcript  +  numerical verdict  +  flagged issues
        ```

        ### The 17 judges (full panel)

        BenchClaw uses an intentionally heterogeneous panel — different model families catch different failure modes:

        {judges}

        Each judge returns a per-dimension score. The aggregation is **trimmed mean** by default (drops the highest and lowest score per dimension to mute outliers); a `--strategy=median` flag is available for stricter setups.

        ### The 10 scoring dimensions

        {dimensions}

        ### The 8 deception detectors

        {detectors}

        Detectors are **deterministic** (regex / parsers / actual code execution / web fact-check), not LLM-based. They run in parallel with the judges.

        ### Cost & latency (honest)

        - Cost per evaluation: **$0.04 – $0.08** (depending on input size)
        - Latency per evaluation: **12 – 30 seconds** (parallel API calls dominate)
        - For high-volume pipelines: a `--strategy=fast` flag uses 3 judges + 4 detectors (~3s, ~$0.01)

        ### Known limitations

        - **Bias is surfaced, not eliminated.** The 17-judge panel reduces single-model bias but doesn't remove it.
        - **Calibration drift over time** — model providers update silently; recalibrate quarterly.
        - **22% false-negative rate** on the deception detectors against an internal red-team corpus. Not bulletproof.
        - **Hand-tuned 1.5/10 disagreement threshold** — surfaces verdicts where judges disagree by ≥1.5 points; below that, calibration noise dominates.
        """).format(
            judges=_bullet_list(JUDGES),
            dimensions=_bullet_list(DIMENSIONS),
            detectors=_bullet_list(DECEPTION_DETECTORS),
        )
        gr.Markdown(architecture_md)

    with gr.Tab("Install & integrate"):
        gr.Markdown("""
        ### Install

        ```bash
        pip install benchclaw
        ```

        ### Use with LangChain

        ```python
        from benchclaw_langchain import BenchClawCallback
        from langchain_openai import ChatOpenAI

        llm = ChatOpenAI(callbacks=[BenchClawCallback()])
        # Every LLM output is evaluated by the tribunal asynchronously
        ```

        ### Available framework adapters (PyPI)

        - `benchclaw-langchain`
        - `benchclaw-llamaindex`
        - `benchclaw-crewai`
        - `benchclaw-autogen`
        - `benchclaw-superagi`

        Repo: https://github.com/Agnuxo1/benchclaw-integrations

        ### Companion projects

        - 🛡 [EnigmAgent](https://huggingface.co/spaces/Agnuxo/EnigmAgent-MCP-Demo) — keep the credentials your agents use out of LLM context
        - 📡 [P2PCLAW](https://www.p2pclaw.com) — the decentralized network where agents publish, judge, and verify each other

        Built solo from Spain by [Francisco Angulo de Lafuente](https://github.com/Agnuxo1).
        """)


# Start the Gradio server only when this file is executed as a script;
# importing the module just builds `demo` without launching it.
if __name__ == "__main__":
    demo.launch()