File size: 3,442 Bytes
c5b383d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import gradio as gr


def build_eval(workflow: str, risk_level: str, focus_area: str) -> str:
    """Render a plain-text evaluation plan for an agent workflow.

    Args:
        workflow: Free-text description of the agent workflow. Surrounding
            whitespace is ignored; an empty (or missing) description yields
            a short prompt instead of a plan.
        risk_level: One of "low", "medium", or "high"; selects the operator
            note at the end of the plan.
        focus_area: One of "grounding", "tool-use", "safety", or "handoff";
            selects the sentence describing what to focus on.

    Returns:
        The evaluation plan text, or a request to describe a workflow when
        ``workflow`` is empty.

    Raises:
        KeyError: If ``risk_level`` or ``focus_area`` is not one of the
            values listed above (only checked for a non-empty workflow).
    """
    # Treat a missing value like an empty one so the caller gets the prompt
    # rather than an AttributeError.
    workflow = (workflow or "").strip()
    if not workflow:
        return "Please describe an agent workflow to evaluate."

    focus_line = {
        "grounding": "grounding quality, evidence use, and whether claims stay tied to available context",
        "tool-use": "tool choice, argument quality, and whether execution steps are verified",
        "safety": "prompt-injection resistance, action boundaries, and failure containment",
        "handoff": "clarity of outputs, operator confidence, and whether another engineer could act on the result",
    }[focus_area]

    risk_line = {
        "low": "Keep the rubric compact and prioritize correctness plus clarity.",
        "medium": "Balance correctness, verification, and failure handling.",
        "high": "Prioritize safety checks, proof requirements, and clear stop conditions.",
    }[risk_level]

    # The title is the text before the first period. Only the first character
    # is uppercased: str.capitalize() would also lowercase the rest and
    # mangle acronyms and proper nouns such as "PR" or "GitHub".
    first_sentence = workflow.split(".")[0].strip()
    title = first_sentence[:1].upper() + first_sentence[1:]
    if len(title) > 72:
        # Keep the title at 72 characters at most, ellipsis included.
        title = title[:69].rstrip() + "..."

    return f"""Scenario title
{title}

Task setup
Evaluate an agent workflow described as: "{workflow}".
Focus especially on {focus_line}.

Expected behavior
- understands the task before acting
- chooses tools deliberately and explains why
- verifies critical outputs before concluding
- communicates results in a way another builder can inspect

Likely failure modes
- vague planning that skips verification
- weak grounding or unsupported claims
- incorrect or unnecessary tool use
- missing safety boundaries around risky actions

Scoring dimensions
- correctness
- evidence and grounding
- tool-use quality
- safety and containment
- actionability of the final output

Follow-up tests
- add one adversarial test for prompt injection or misleading context
- add one regression test for repeated runs on the same workflow
- add one test where a required tool fails or returns partial data

Operator note
{risk_line}
"""


# Input widgets, created in the order build_eval receives their values:
# workflow text, risk level, primary focus.
_workflow_box = gr.Textbox(
    label="Agent workflow",
    lines=6,
    placeholder="Example: A coding agent reads issues, edits files, runs tests, and writes a PR summary.",
)
_risk_radio = gr.Radio(
    choices=["low", "medium", "high"],
    value="medium",
    label="Risk level",
)
_focus_radio = gr.Radio(
    choices=["grounding", "tool-use", "safety", "handoff"],
    value="tool-use",
    label="Primary focus",
)

# The generated plan is shown in a Markdown component.
_plan_view = gr.Markdown(label="Evaluation plan")

_DESCRIPTION = (
    "Turn a rough agent workflow into a practical evaluation scenario with expected behavior, "
    "failure modes, scoring dimensions, and follow-up tests."
)

# Each example row is (workflow, risk level, primary focus).
_EXAMPLES = [
    [
        "A browser automation agent that books travel, fills forms, and captures proof of each step.",
        "high",
        "safety",
    ],
    [
        "A repo-maintainer agent that triages issues, patches code, runs tests, and drafts a release note.",
        "medium",
        "tool-use",
    ],
    [
        "A support triage assistant that classifies tickets, drafts replies, and escalates risky cases.",
        "medium",
        "grounding",
    ],
]

# Wire build_eval to the widgets above; flagging is turned off.
demo = gr.Interface(
    fn=build_eval,
    inputs=[_workflow_box, _risk_radio, _focus_radio],
    outputs=_plan_view,
    title="Agent Eval Lab",
    description=_DESCRIPTION,
    examples=_EXAMPLES,
    allow_flagging="never",
)


# Start the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()