"""Gradio app that turns a rough agent workflow into an evaluation plan."""

import gradio as gr

# Longest title emitted in the plan, including the trailing ellipsis.
_MAX_TITLE_LENGTH = 72
_ELLIPSIS = "..."

# What the evaluation should concentrate on, keyed by the "Primary focus" radio.
_FOCUS_LINES = {
    "grounding": "grounding quality, evidence use, and whether claims stay tied to available context",
    "tool-use": "tool choice, argument quality, and whether execution steps are verified",
    "safety": "prompt-injection resistance, action boundaries, and failure containment",
    "handoff": "clarity of outputs, operator confidence, and whether another engineer could act on the result",
}

# Closing advice for the operator, keyed by the "Risk level" radio.
_RISK_LINES = {
    "low": "Keep the rubric compact and prioritize correctness plus clarity.",
    "medium": "Balance correctness, verification, and failure handling.",
    "high": "Prioritize safety checks, proof requirements, and clear stop conditions.",
}

# Filled in with str.format; substituted values are inserted verbatim, so
# braces typed by the user inside the workflow text are safe.
# NOTE(review): section labels are plain text lines separated by blank lines;
# confirm this renders as intended in gr.Markdown.
_PLAN_TEMPLATE = """Scenario title
{title}

Task setup
Evaluate an agent workflow described as: "{workflow}".
Focus especially on {focus_line}.

Expected behavior
- understands the task before acting
- chooses tools deliberately and explains why
- verifies critical outputs before concluding
- communicates results in a way another builder can inspect

Likely failure modes
- vague planning that skips verification
- weak grounding or unsupported claims
- incorrect or unnecessary tool use
- missing safety boundaries around risky actions

Scoring dimensions
- correctness
- evidence and grounding
- tool-use quality
- safety and containment
- actionability of the final output

Follow-up tests
- add one adversarial test for prompt injection or misleading context
- add one regression test for repeated runs on the same workflow
- add one test where a required tool fails or returns partial data

Operator note
{risk_line}
"""


def _make_title(workflow: str) -> str:
    """Derive a short, single-line title from the first sentence of *workflow*.

    The first non-empty "."-delimited segment is used, so input that starts
    with a period still yields a title. Internal whitespace (including
    newlines) is collapsed to single spaces. Only the first character is
    upper-cased; the rest keeps its casing so acronyms and product names
    survive. Titles longer than ``_MAX_TITLE_LENGTH`` are cut and suffixed
    with an ellipsis so the result never exceeds that length.
    """
    segments = (" ".join(part.split()) for part in workflow.split("."))
    # Fall back to the whole text when it contains nothing but periods.
    title = next((seg for seg in segments if seg), "") or " ".join(workflow.split())

    # str.capitalize() would lower-case everything after the first character.
    title = title[:1].upper() + title[1:]

    if len(title) > _MAX_TITLE_LENGTH:
        keep = _MAX_TITLE_LENGTH - len(_ELLIPSIS)
        title = title[:keep].rstrip() + _ELLIPSIS
    return title


def build_eval(workflow: str, risk_level: str, focus_area: str) -> str:
    """Build a text evaluation plan for an agent workflow.

    Args:
        workflow: Free-text description of the agent workflow. Leading and
            trailing whitespace is ignored; ``None`` is treated as empty.
        risk_level: One of the keys of ``_RISK_LINES``
            ("low", "medium", "high").
        focus_area: One of the keys of ``_FOCUS_LINES``
            ("grounding", "tool-use", "safety", "handoff").

    Returns:
        The formatted plan, or a short prompt asking the user to correct the
        input when the workflow is empty or an option is not recognised.
    """
    workflow = (workflow or "").strip()
    if not workflow:
        return "Please describe an agent workflow to evaluate."

    # Unknown options get a readable message instead of an unhandled KeyError.
    focus_line = _FOCUS_LINES.get(focus_area)
    if focus_line is None:
        return f"Please choose a primary focus: {', '.join(_FOCUS_LINES)}."

    risk_line = _RISK_LINES.get(risk_level)
    if risk_line is None:
        return f"Please choose a risk level: {', '.join(_RISK_LINES)}."

    return _PLAN_TEMPLATE.format(
        title=_make_title(workflow),
        workflow=workflow,
        focus_line=focus_line,
        risk_line=risk_line,
    )


demo = gr.Interface(
    fn=build_eval,
    inputs=[
        gr.Textbox(
            label="Agent workflow",
            lines=6,
            placeholder="Example: A coding agent reads issues, edits files, runs tests, and writes a PR summary.",
        ),
        gr.Radio(
            ["low", "medium", "high"],
            value="medium",
            label="Risk level",
        ),
        gr.Radio(
            ["grounding", "tool-use", "safety", "handoff"],
            value="tool-use",
            label="Primary focus",
        ),
    ],
    outputs=gr.Markdown(label="Evaluation plan"),
    title="Agent Eval Lab",
    description=(
        "Turn a rough agent workflow into a practical evaluation scenario with expected behavior, "
        "failure modes, scoring dimensions, and follow-up tests."
    ),
    examples=[
        [
            "A browser automation agent that books travel, fills forms, and captures proof of each step.",
            "high",
            "safety",
        ],
        [
            "A repo-maintainer agent that triages issues, patches code, runs tests, and drafts a release note.",
            "medium",
            "tool-use",
        ],
        [
            "A support triage assistant that classifies tickets, drafts replies, and escalates risky cases.",
            "medium",
            "grounding",
        ],
    ],
    # NOTE(review): newer Gradio releases appear to rename this argument to
    # `flagging_mode`; confirm against the pinned Gradio version before changing.
    allow_flagging="never",
)

if __name__ == "__main__":
    demo.launch()