# Spaces: Running
| import gradio as gr | |
# Sentence fragments keyed by the "Operating surface" choice.
_SURFACE_LINES = {
    "internal-tools": "internal operator workflows where consistency and handoff quality matter most",
    "customer-facing": "customer-facing automation where safety, tone, and rollback clarity matter most",
    "engineering": "engineering workflows where tool correctness, verification, and repeatability matter most",
    "research": "research loops where source quality, reasoning traceability, and synthesis matter most",
}

# Rollout guidance keyed by the "Review priority" choice.
_PRIORITY_LINES = {
    "routine": "Use a compact review loop and focus on repeatability plus clean handoff notes.",
    "important": "Balance delivery speed, verification, and rollback readiness.",
    "critical": "Demand explicit checks, approval gates, and clear stop conditions before rollout.",
}

# Titles longer than this are cut and suffixed with "...".
_MAX_TITLE_LENGTH = 72


def _derive_title(workflow: str) -> str:
    """Return a short, single-line title taken from the first sentence of *workflow*."""
    # Collapse runs of whitespace (including newlines from the multi-line
    # textbox) so the title always stays on one line of the scorecard.
    fragments = (" ".join(part.split()) for part in workflow.split("."))
    # Use the first non-empty fragment; text made only of periods falls back
    # to the raw input instead of producing an empty title.
    title = next((fragment for fragment in fragments if fragment), workflow)
    # Upper-case only the first character. str.capitalize() would also
    # lower-case the rest and mangle acronyms ("GitHub API" -> "github api").
    title = title[:1].upper() + title[1:]
    if len(title) > _MAX_TITLE_LENGTH:
        title = title[: _MAX_TITLE_LENGTH - 3].rstrip() + "..."
    return title


def build_scorecard(workflow: str, operating_surface: str, review_priority: str) -> str:
    """Build a Markdown operations scorecard for an agent workflow.

    Args:
        workflow: Free-text description of the workflow. ``None`` and
            whitespace-only values are treated as empty.
        operating_surface: One of ``"internal-tools"``, ``"customer-facing"``,
            ``"engineering"`` or ``"research"``.
        review_priority: One of ``"routine"``, ``"important"`` or
            ``"critical"``.

    Returns:
        The scorecard text, or a short prompt asking the user to fix the
        input when the workflow is empty or a choice is not recognised.
    """
    workflow = (workflow or "").strip()
    if not workflow:
        return "Please describe an agent workflow to score."

    # Answer unknown or missing choices the same way as an empty workflow:
    # with a readable message rather than an unhandled KeyError.
    surface_line = _SURFACE_LINES.get(operating_surface)
    if surface_line is None:
        return "Please choose a valid operating surface."
    priority_line = _PRIORITY_LINES.get(review_priority)
    if priority_line is None:
        return "Please choose a valid review priority."

    title = _derive_title(workflow)

    # Sections are separated by blank lines: without them a Markdown renderer
    # treats the next section label as a continuation of the previous bullet.
    return f"""Scorecard title
{title}

Operating surface
This workflow sits in {surface_line}.

Review dimensions
- coverage: does the workflow handle the common path and one failure path?
- verification: are critical outputs checked before they are trusted?
- observability: would another operator know what happened from the logs or summary?
- rollback: is there a clear fallback or manual recovery path?

Suggested rating
- coverage: medium
- verification: medium
- observability: medium
- rollback: medium

Operator notes
- capture one successful example run
- capture one failure example with a recovery note
- record the tool step that deserves the strongest verification

Rollout guidance
{priority_line}
"""
# Input widgets, declared in the same order as build_scorecard's parameters.
_workflow_box = gr.Textbox(
    label="Agent workflow",
    lines=6,
    placeholder="Example: A support automation agent classifies tickets, drafts replies, and escalates refund-risk cases.",
)
_surface_radio = gr.Radio(
    choices=["internal-tools", "customer-facing", "engineering", "research"],
    value="engineering",
    label="Operating surface",
)
_priority_radio = gr.Radio(
    choices=["routine", "important", "critical"],
    value="important",
    label="Review priority",
)

# Sample rows offered in the UI; each row is (workflow, surface, priority).
_example_rows = [
    [
        "A repo-maintainer agent that triages issues, patches code, runs tests, and drafts a release note.",
        "engineering",
        "important",
    ],
    [
        "A customer support copilot that drafts replies, suggests refunds, and escalates policy-sensitive cases.",
        "customer-facing",
        "critical",
    ],
    [
        "A browser research assistant that reads docs, compares sources, and prepares an implementation brief.",
        "research",
        "routine",
    ],
]

# The app: three inputs feed build_scorecard and its return value is shown
# in a Markdown panel. Flagging is switched off.
demo = gr.Interface(
    fn=build_scorecard,
    inputs=[_workflow_box, _surface_radio, _priority_radio],
    outputs=gr.Markdown(label="Ops scorecard"),
    title="Ops Scorecard Lab",
    description=(
        "Turn a rough agent workflow into a practical operations scorecard with review dimensions, "
        "operator notes, and rollout guidance."
    ),
    examples=_example_rows,
    allow_flagging="never",
)

# Launch only when run as a script, so importing this module has no
# server side effect.
if __name__ == "__main__":
    demo.launch()