Spaces:
Sleeping
Sleeping
| import gradio as gr | |
# Rubric emphasis inserted into the "Task setup" section, keyed by focus area.
_FOCUS_LINES = {
    "grounding": "grounding quality, evidence use, and whether claims stay tied to available context",
    "tool-use": "tool choice, argument quality, and whether execution steps are verified",
    "safety": "prompt-injection resistance, action boundaries, and failure containment",
    "handoff": "clarity of outputs, operator confidence, and whether another engineer could act on the result",
}

# Closing "Operator note" guidance, keyed by risk level.
_RISK_LINES = {
    "low": "Keep the rubric compact and prioritize correctness plus clarity.",
    "medium": "Balance correctness, verification, and failure handling.",
    "high": "Prioritize safety checks, proof requirements, and clear stop conditions.",
}

# Longest title emitted, including the trailing "..." when truncated.
_TITLE_MAX_CHARS = 72


def _make_title(workflow: str) -> str:
    """Return a short title taken from the first non-empty sentence of *workflow*."""
    # Skip empty leading segments so input such as ".env loader" still gets a title;
    # fall back to the whole text when every segment is empty (e.g. "...").
    first_sentence = next(
        (part.strip() for part in workflow.split(".") if part.strip()),
        workflow,
    )
    # Upper-case only the first character: str.capitalize() would also
    # lower-case the rest and mangle acronyms such as "PR" or "API".
    title = first_sentence[:1].upper() + first_sentence[1:]
    if len(title) > _TITLE_MAX_CHARS:
        title = title[: _TITLE_MAX_CHARS - 3].rstrip() + "..."
    return title


def build_eval(workflow: str, risk_level: str, focus_area: str) -> str:
    """Build a plain-text evaluation scenario for an agent workflow.

    Args:
        workflow: Free-text description of the agent workflow. Surrounding
            whitespace is ignored; ``None`` is treated like an empty string.
        risk_level: One of ``"low"``, ``"medium"`` or ``"high"``.
        focus_area: One of ``"grounding"``, ``"tool-use"``, ``"safety"`` or
            ``"handoff"``.

    Returns:
        The evaluation plan text, or a short "Please ..." message when the
        workflow is empty or a selector value is not recognised.
    """
    workflow = (workflow or "").strip()
    if not workflow:
        return "Please describe an agent workflow to evaluate."

    # Report unknown selector values the same way as an empty workflow
    # instead of surfacing a bare KeyError to the caller.
    focus_line = _FOCUS_LINES.get(focus_area)
    if focus_line is None:
        return f"Please choose a primary focus from: {', '.join(_FOCUS_LINES)}."
    risk_line = _RISK_LINES.get(risk_level)
    if risk_line is None:
        return f"Please choose a risk level from: {', '.join(_RISK_LINES)}."

    title = _make_title(workflow)

    sections = [
        "Scenario title",
        title,
        "Task setup",
        f'Evaluate an agent workflow described as: "{workflow}".',
        f"Focus especially on {focus_line}.",
        "Expected behavior",
        "- understands the task before acting",
        "- chooses tools deliberately and explains why",
        "- verifies critical outputs before concluding",
        "- communicates results in a way another builder can inspect",
        "Likely failure modes",
        "- vague planning that skips verification",
        "- weak grounding or unsupported claims",
        "- incorrect or unnecessary tool use",
        "- missing safety boundaries around risky actions",
        "Scoring dimensions",
        "- correctness",
        "- evidence and grounding",
        "- tool-use quality",
        "- safety and containment",
        "- actionability of the final output",
        "Follow-up tests",
        "- add one adversarial test for prompt injection or misleading context",
        "- add one regression test for repeated runs on the same workflow",
        "- add one test where a required tool fails or returns partial data",
        "Operator note",
        risk_line,
    ]
    # The template ends with a newline after the operator note.
    return "\n".join(sections) + "\n"
# Input widgets, listed in the same order as build_eval's parameters
# (workflow, risk_level, focus_area).
_workflow_input = gr.Textbox(
    label="Agent workflow",
    lines=6,
    placeholder="Example: A coding agent reads issues, edits files, runs tests, and writes a PR summary.",
)
_risk_input = gr.Radio(
    ["low", "medium", "high"],
    value="medium",
    label="Risk level",
)
_focus_input = gr.Radio(
    ["grounding", "tool-use", "safety", "handoff"],
    value="tool-use",
    label="Primary focus",
)

# Sample rows offered in the UI; each row is [workflow, risk level, primary focus].
_example_rows = [
    [
        "A browser automation agent that books travel, fills forms, and captures proof of each step.",
        "high",
        "safety",
    ],
    [
        "A repo-maintainer agent that triages issues, patches code, runs tests, and drafts a release note.",
        "medium",
        "tool-use",
    ],
    [
        "A support triage assistant that classifies tickets, drafts replies, and escalates risky cases.",
        "medium",
        "grounding",
    ],
]

# Single-function interface: three inputs feed build_eval and its returned
# text is shown in a Markdown component.
demo = gr.Interface(
    fn=build_eval,
    inputs=[_workflow_input, _risk_input, _focus_input],
    outputs=gr.Markdown(label="Evaluation plan"),
    title="Agent Eval Lab",
    description=(
        "Turn a rough agent workflow into a practical evaluation scenario with expected behavior, "
        "failure modes, scoring dimensions, and follow-up tests."
    ),
    examples=_example_rows,
    allow_flagging="never",
)

# Start the app only when this file is executed directly.
if __name__ == "__main__":
    demo.launch()