"""Gradio demo that turns an agent-workflow description into an ops scorecard."""

import gradio as gr

# Context sentence inserted into the "Operating surface" section.
# The keys double as the radio choices in the UI, so the two cannot drift apart.
_SURFACE_LINES = {
    "internal-tools": "internal operator workflows where consistency and handoff quality matter most",
    "customer-facing": "customer-facing automation where safety, tone, and rollback clarity matter most",
    "engineering": "engineering workflows where tool correctness, verification, and repeatability matter most",
    "research": "research loops where source quality, reasoning traceability, and synthesis matter most",
}

# Rollout guidance per review priority; keys double as the radio choices.
_PRIORITY_LINES = {
    "routine": "Use a compact review loop and focus on repeatability plus clean handoff notes.",
    "important": "Balance delivery speed, verification, and rollback readiness.",
    "critical": "Demand explicit checks, approval gates, and clear stop conditions before rollout.",
}

# Titles longer than this are cut and suffixed with an ellipsis so that the
# final title is at most this many characters long.
_MAX_TITLE_LENGTH = 72
_ELLIPSIS = "..."

# Used when the workflow text contains no sentence with visible characters
# (for example, when it consists only of periods).
_FALLBACK_TITLE = "Untitled workflow"


def _make_title(workflow: str) -> str:
    """Return a one-line title built from the first non-empty sentence.

    Args:
        workflow: Non-empty, already stripped workflow description.

    Returns:
        The first period-delimited segment that has visible characters, with
        runs of whitespace collapsed to single spaces, its first character
        upper-cased, and truncated to ``_MAX_TITLE_LENGTH`` characters.
    """
    # Collapse whitespace (including newlines from the multi-line textbox) so
    # the title always stays on a single line.
    segments = (" ".join(part.split()) for part in workflow.split("."))
    title = next((segment for segment in segments if segment), _FALLBACK_TITLE)

    # Upper-case only the first character: str.capitalize() would also
    # lower-case the rest and mangle acronyms and proper nouns.
    title = title[:1].upper() + title[1:]

    if len(title) > _MAX_TITLE_LENGTH:
        keep = _MAX_TITLE_LENGTH - len(_ELLIPSIS)
        title = title[:keep].rstrip() + _ELLIPSIS
    return title


def build_scorecard(workflow: str, operating_surface: str, review_priority: str) -> str:
    """Build a Markdown operations scorecard for an agent workflow.

    Args:
        workflow: Free-text description of the agent workflow. ``None`` is
            treated like an empty string.
        operating_surface: One of the keys of ``_SURFACE_LINES``.
        review_priority: One of the keys of ``_PRIORITY_LINES``.

    Returns:
        The scorecard as Markdown text, or a short prompt asking for a
        description when ``workflow`` is empty or whitespace-only.

    Raises:
        KeyError: If ``operating_surface`` or ``review_priority`` is not a
            known key (only checked when ``workflow`` is non-empty).
    """
    workflow = (workflow or "").strip()
    if not workflow:
        return "Please describe an agent workflow to score."

    surface_line = _SURFACE_LINES[operating_surface]
    priority_line = _PRIORITY_LINES[review_priority]
    title = _make_title(workflow)

    # Section labels are Markdown headings because the result is shown in a
    # gr.Markdown component, where a bare label would be folded into the
    # paragraph that follows it.
    return f"""### Scorecard title
{title}

### Operating surface
This workflow sits in {surface_line}.

### Review dimensions
- coverage: does the workflow handle the common path and one failure path?
- verification: are critical outputs checked before they are trusted?
- observability: would another operator know what happened from the logs or summary?
- rollback: is there a clear fallback or manual recovery path?

### Suggested rating
- coverage: medium
- verification: medium
- observability: medium
- rollback: medium

### Operator notes
- capture one successful example run
- capture one failure example with a recovery note
- record the tool step that deserves the strongest verification

### Rollout guidance
{priority_line}
"""


demo = gr.Interface(
    fn=build_scorecard,
    inputs=[
        gr.Textbox(
            label="Agent workflow",
            lines=6,
            placeholder="Example: A support automation agent classifies tickets, drafts replies, and escalates refund-risk cases.",
        ),
        gr.Radio(
            list(_SURFACE_LINES),
            value="engineering",
            label="Operating surface",
        ),
        gr.Radio(
            list(_PRIORITY_LINES),
            value="important",
            label="Review priority",
        ),
    ],
    outputs=gr.Markdown(label="Ops scorecard"),
    title="Ops Scorecard Lab",
    description=(
        "Turn a rough agent workflow into a practical operations scorecard with review dimensions, "
        "operator notes, and rollout guidance."
    ),
    examples=[
        [
            "A repo-maintainer agent that triages issues, patches code, runs tests, and drafts a release note.",
            "engineering",
            "important",
        ],
        [
            "A customer support copilot that drafts replies, suggests refunds, and escalates policy-sensitive cases.",
            "customer-facing",
            "critical",
        ],
        [
            "A browser research assistant that reads docs, compares sources, and prepares an implementation brief.",
            "research",
            "routine",
        ],
    ],
    # NOTE(review): newer Gradio releases appear to rename this parameter to
    # `flagging_mode` — confirm against the pinned Gradio version before changing.
    allow_flagging="never",
)


if __name__ == "__main__":
    demo.launch()