File size: 3,427 Bytes
aa99d41
 
 
e344406
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
aa99d41
e344406
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
aa99d41
e344406
 
 
 
 
 
 
 
 
 
 
 
 
aa99d41
 
e344406
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import gradio as gr


def build_scorecard(workflow: str, operating_surface: str, review_priority: str) -> str:
    """Render a plain-text operations scorecard for an agent workflow.

    Args:
        workflow: Free-form description of the agent workflow. Its first
            non-empty sentence fragment (text up to a ".") becomes the
            scorecard title. ``None`` or whitespace-only input is treated
            as empty.
        operating_surface: One of "internal-tools", "customer-facing",
            "engineering", or "research".
        review_priority: One of "routine", "important", or "critical".

    Returns:
        The formatted scorecard text, or a short prompt asking for a
        workflow description when ``workflow`` is empty.

    Raises:
        KeyError: If ``operating_surface`` or ``review_priority`` is not one
            of the supported values (only checked for non-empty workflows).
    """
    # Tolerate None (e.g. a cleared field or a direct API call) as empty input.
    workflow = (workflow or "").strip()
    if not workflow:
        return "Please describe an agent workflow to score."

    surface_line = {
        "internal-tools": "internal operator workflows where consistency and handoff quality matter most",
        "customer-facing": "customer-facing automation where safety, tone, and rollback clarity matter most",
        "engineering": "engineering workflows where tool correctness, verification, and repeatability matter most",
        "research": "research loops where source quality, reasoning traceability, and synthesis matter most",
    }[operating_surface]

    priority_line = {
        "routine": "Use a compact review loop and focus on repeatability plus clean handoff notes.",
        "important": "Balance delivery speed, verification, and rollback readiness.",
        "critical": "Demand explicit checks, approval gates, and clear stop conditions before rollout.",
    }[review_priority]

    # Take the first non-empty fragment so input that starts with a period
    # still yields a title; fall back to the whole text if every fragment
    # is empty (e.g. "...").
    fragments = (part.strip() for part in workflow.split("."))
    title = next((part for part in fragments if part), workflow)
    # Upper-case only the first character: str.capitalize() would also
    # lower-case the rest and mangle acronyms such as "CI" or "API".
    title = title[:1].upper() + title[1:]
    if len(title) > 72:
        # Keep the title at most 72 characters including the ellipsis.
        title = title[:69].rstrip() + "..."

    return f"""Scorecard title
{title}

Operating surface
This workflow sits in {surface_line}.

Review dimensions
- coverage: does the workflow handle the common path and one failure path?
- verification: are critical outputs checked before they are trusted?
- observability: would another operator know what happened from the logs or summary?
- rollback: is there a clear fallback or manual recovery path?

Suggested rating
- coverage: medium
- verification: medium
- observability: medium
- rollback: medium

Operator notes
- capture one successful example run
- capture one failure example with a recovery note
- record the tool step that deserves the strongest verification

Rollout guidance
{priority_line}
"""


# Input widgets, listed in the same order as build_scorecard's parameters.
_workflow_box = gr.Textbox(
    label="Agent workflow",
    lines=6,
    placeholder="Example: A support automation agent classifies tickets, drafts replies, and escalates refund-risk cases.",
)
_surface_radio = gr.Radio(
    ["internal-tools", "customer-facing", "engineering", "research"],
    value="engineering",
    label="Operating surface",
)
_priority_radio = gr.Radio(
    ["routine", "important", "critical"],
    value="important",
    label="Review priority",
)

# Each example row is [workflow, operating surface, review priority].
_EXAMPLE_ROWS = [
    [
        "A repo-maintainer agent that triages issues, patches code, runs tests, and drafts a release note.",
        "engineering",
        "important",
    ],
    [
        "A customer support copilot that drafts replies, suggests refunds, and escalates policy-sensitive cases.",
        "customer-facing",
        "critical",
    ],
    [
        "A browser research assistant that reads docs, compares sources, and prepares an implementation brief.",
        "research",
        "routine",
    ],
]

# The Interface wires the three inputs to build_scorecard and shows the result.
demo = gr.Interface(
    fn=build_scorecard,
    inputs=[_workflow_box, _surface_radio, _priority_radio],
    outputs=gr.Markdown(label="Ops scorecard"),
    title="Ops Scorecard Lab",
    description=(
        "Turn a rough agent workflow into a practical operations scorecard with review dimensions, "
        "operator notes, and rollout guidance."
    ),
    examples=_EXAMPLE_ROWS,
    allow_flagging="never",
)


# Launch the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()