# Page header captured with this file (kept as a comment so the module stays valid Python):
# mukunda1729 - "Update app.py" - commit e344406 (verified)
import gradio as gr
def build_scorecard(workflow: str, operating_surface: str, review_priority: str) -> str:
    """Build a text scorecard for an agent workflow description.

    The title is taken from the text before the first "." in ``workflow``,
    with its first character upper-cased and the rest left untouched so that
    acronyms and proper nouns keep their casing. Titles longer than 72
    characters are cut to 69 characters plus "...".

    Args:
        workflow: Free-form description of the workflow. ``None``, empty, or
            whitespace-only input yields a short prompt message instead of a
            scorecard.
        operating_surface: One of "internal-tools", "customer-facing",
            "engineering", or "research".
        review_priority: One of "routine", "important", or "critical".

    Returns:
        The newline-separated scorecard text (ending with a newline), or the
        prompt message when no workflow was given.

    Raises:
        KeyError: If ``operating_surface`` or ``review_priority`` is not one
            of the keys listed above.
    """
    # Treat a missing value like an empty one instead of failing on .strip().
    workflow = (workflow or "").strip()
    if not workflow:
        return "Please describe an agent workflow to score."

    surface_descriptions = {
        "internal-tools": "internal operator workflows where consistency and handoff quality matter most",
        "customer-facing": "customer-facing automation where safety, tone, and rollback clarity matter most",
        "engineering": "engineering workflows where tool correctness, verification, and repeatability matter most",
        "research": "research loops where source quality, reasoning traceability, and synthesis matter most",
    }
    priority_guidance = {
        "routine": "Use a compact review loop and focus on repeatability plus clean handoff notes.",
        "important": "Balance delivery speed, verification, and rollback readiness.",
        "critical": "Demand explicit checks, approval gates, and clear stop conditions before rollout.",
    }
    surface_line = surface_descriptions[operating_surface]
    priority_line = priority_guidance[review_priority]

    # Upper-case only the first character: str.capitalize() would also
    # lower-case the remainder and mangle names such as "GitHub" or "API".
    title = workflow.split(".")[0].strip()
    title = title[:1].upper() + title[1:]
    if len(title) > 72:
        title = title[:69].rstrip() + "..."

    lines = [
        "Scorecard title",
        title,
        "Operating surface",
        f"This workflow sits in {surface_line}.",
        "Review dimensions",
        "- coverage: does the workflow handle the common path and one failure path?",
        "- verification: are critical outputs checked before they are trusted?",
        "- observability: would another operator know what happened from the logs or summary?",
        "- rollback: is there a clear fallback or manual recovery path?",
        "Suggested rating",
        "- coverage: medium",
        "- verification: medium",
        "- observability: medium",
        "- rollback: medium",
        "Operator notes",
        "- capture one successful example run",
        "- capture one failure example with a recovery note",
        "- record the tool step that deserves the strongest verification",
        "Rollout guidance",
        priority_line,
    ]
    # Trailing newline matches the layout of the original triple-quoted text.
    return "\n".join(lines) + "\n"
# Gradio front end: a free-text workflow box and two radio selectors feed
# build_scorecard; the returned text is displayed in a Markdown component.
demo = gr.Interface(
    title="Ops Scorecard Lab",
    description="Turn a rough agent workflow into a practical operations scorecard with review dimensions, operator notes, and rollout guidance.",
    fn=build_scorecard,
    inputs=[
        # Input order must match build_scorecard's positional parameters.
        gr.Textbox(
            lines=6,
            label="Agent workflow",
            placeholder="Example: A support automation agent classifies tickets, drafts replies, and escalates refund-risk cases.",
        ),
        gr.Radio(
            choices=["internal-tools", "customer-facing", "engineering", "research"],
            label="Operating surface",
            value="engineering",
        ),
        gr.Radio(
            choices=["routine", "important", "critical"],
            label="Review priority",
            value="important",
        ),
    ],
    outputs=gr.Markdown(label="Ops scorecard"),
    # Each example row is [workflow, operating surface, review priority].
    examples=[
        [
            "A repo-maintainer agent that triages issues, patches code, runs tests, and drafts a release note.",
            "engineering",
            "important",
        ],
        [
            "A customer support copilot that drafts replies, suggests refunds, and escalates policy-sensitive cases.",
            "customer-facing",
            "critical",
        ],
        [
            "A browser research assistant that reads docs, compares sources, and prepares an implementation brief.",
            "research",
            "routine",
        ],
    ],
    allow_flagging="never",
)

# Start the app only when this file is executed directly, not when imported.
if __name__ == "__main__":
    demo.launch()