agent-eval-lab / app.py
mukunda1729's picture
Upload app.py
c5b383d verified
import gradio as gr
# Rubric emphasis for each supported focus area.
_FOCUS_LINES = {
    "grounding": "grounding quality, evidence use, and whether claims stay tied to available context",
    "tool-use": "tool choice, argument quality, and whether execution steps are verified",
    "safety": "prompt-injection resistance, action boundaries, and failure containment",
    "handoff": "clarity of outputs, operator confidence, and whether another engineer could act on the result",
}

# Operator guidance for each supported risk level.
_RISK_LINES = {
    "low": "Keep the rubric compact and prioritize correctness plus clarity.",
    "medium": "Balance correctness, verification, and failure handling.",
    "high": "Prioritize safety checks, proof requirements, and clear stop conditions.",
}

# Titles longer than this are cut and suffixed with "...".
_MAX_TITLE_LENGTH = 72


def _make_title(workflow: str) -> str:
    """Derive a short single-line title from the first sentence of *workflow*."""
    # Skip empty segments so text that starts with "." still yields a title;
    # fall back to the whole text when it contains nothing but periods.
    first_sentence = next(
        (part for part in (seg.strip() for seg in workflow.split(".")) if part),
        workflow,
    )
    # The workflow may span several lines; keep the title on a single line.
    first_sentence = " ".join(first_sentence.split())
    # Upper-case only the first character. str.capitalize() would also
    # lower-case the rest and mangle acronyms such as "PR" or "API".
    title = first_sentence[:1].upper() + first_sentence[1:]
    if len(title) > _MAX_TITLE_LENGTH:
        title = title[: _MAX_TITLE_LENGTH - 3].rstrip() + "..."
    return title


def _bullets(items) -> str:
    """Render *items* as a Markdown bullet list."""
    return "\n".join(f"- {item}" for item in items)


def build_eval(workflow: str, risk_level: str, focus_area: str) -> str:
    """Build a Markdown evaluation plan for an agent workflow description.

    Args:
        workflow: Free-text description of the agent workflow. Surrounding
            whitespace is ignored; ``None`` is treated as empty.
        risk_level: One of ``"low"``, ``"medium"`` or ``"high"``; selects the
            operator note.
        focus_area: One of ``"grounding"``, ``"tool-use"``, ``"safety"`` or
            ``"handoff"``; selects the emphasis named in the task setup.

    Returns:
        The plan as Markdown, with one ``##`` heading per section. When the
        workflow is empty, or the risk level or focus area is not recognised,
        a short message asking for the missing input is returned instead.
    """
    workflow = (workflow or "").strip()
    if not workflow:
        return "Please describe an agent workflow to evaluate."

    # Answer unknown selections with a message, like the empty-workflow case,
    # instead of surfacing a bare KeyError to the caller.
    focus_line = _FOCUS_LINES.get(focus_area)
    if focus_line is None:
        return f"Please choose a primary focus: {', '.join(_FOCUS_LINES)}."
    risk_line = _RISK_LINES.get(risk_level)
    if risk_line is None:
        return f"Please choose a risk level: {', '.join(_RISK_LINES)}."

    sections = (
        ("Scenario title", _make_title(workflow)),
        (
            "Task setup",
            f'Evaluate an agent workflow described as: "{workflow}".\n'
            f"Focus especially on {focus_line}.",
        ),
        (
            "Expected behavior",
            _bullets(
                (
                    "understands the task before acting",
                    "chooses tools deliberately and explains why",
                    "verifies critical outputs before concluding",
                    "communicates results in a way another builder can inspect",
                )
            ),
        ),
        (
            "Likely failure modes",
            _bullets(
                (
                    "vague planning that skips verification",
                    "weak grounding or unsupported claims",
                    "incorrect or unnecessary tool use",
                    "missing safety boundaries around risky actions",
                )
            ),
        ),
        (
            "Scoring dimensions",
            _bullets(
                (
                    "correctness",
                    "evidence and grounding",
                    "tool-use quality",
                    "safety and containment",
                    "actionability of the final output",
                )
            ),
        ),
        (
            "Follow-up tests",
            _bullets(
                (
                    "add one adversarial test for prompt injection or misleading context",
                    "add one regression test for repeated runs on the same workflow",
                    "add one test where a required tool fails or returns partial data",
                )
            ),
        ),
        ("Operator note", risk_line),
    )
    # The result is displayed by a Markdown component: headings and blank
    # lines keep each section separate instead of merging into one paragraph.
    return "\n\n".join(f"## {heading}\n\n{body}" for heading, body in sections) + "\n"
# Input widgets, declared in the same order as build_eval's parameters.
_workflow_input = gr.Textbox(
    placeholder="Example: A coding agent reads issues, edits files, runs tests, and writes a PR summary.",
    lines=6,
    label="Agent workflow",
)
_risk_input = gr.Radio(["low", "medium", "high"], label="Risk level", value="medium")
_focus_input = gr.Radio(
    ["grounding", "tool-use", "safety", "handoff"],
    label="Primary focus",
    value="tool-use",
)

# Sample rows, each given as [workflow, risk level, primary focus].
_example_rows = [
    [
        "A browser automation agent that books travel, fills forms, and captures proof of each step.",
        "high",
        "safety",
    ],
    [
        "A repo-maintainer agent that triages issues, patches code, runs tests, and drafts a release note.",
        "medium",
        "tool-use",
    ],
    [
        "A support triage assistant that classifies tickets, drafts replies, and escalates risky cases.",
        "medium",
        "grounding",
    ],
]

_description = (
    "Turn a rough agent workflow into a practical evaluation scenario with expected behavior, "
    "failure modes, scoring dimensions, and follow-up tests."
)

# The interface passes the three inputs to build_eval and shows its return
# value in a Markdown component; flagging is turned off.
demo = gr.Interface(
    fn=build_eval,
    inputs=[_workflow_input, _risk_input, _focus_input],
    outputs=gr.Markdown(label="Evaluation plan"),
    title="Agent Eval Lab",
    description=_description,
    examples=_example_rows,
    allow_flagging="never",
)
# Launch the interface only when this file is run as a script, so importing
# the module (e.g. to reuse build_eval) does not start the app.
if __name__ == "__main__":
    demo.launch()