Spaces:

mukunda1729
/

ops-scorecard-lab

Running

App Files Files Community

ops-scorecard-lab / app.py

mukunda1729

Update app.py

e344406 verified 16 days ago

raw

history blame contribute delete

3.43 kB

	import gradio as gr


	def build_scorecard(workflow: str, operating_surface: str, review_priority: str) -> str:
	workflow = workflow.strip()
	if not workflow:
	return "Please describe an agent workflow to score."

	surface_line = {
	"internal-tools": "internal operator workflows where consistency and handoff quality matter most",
	"customer-facing": "customer-facing automation where safety, tone, and rollback clarity matter most",
	"engineering": "engineering workflows where tool correctness, verification, and repeatability matter most",
	"research": "research loops where source quality, reasoning traceability, and synthesis matter most",
	}[operating_surface]

	priority_line = {
	"routine": "Use a compact review loop and focus on repeatability plus clean handoff notes.",
	"important": "Balance delivery speed, verification, and rollback readiness.",
	"critical": "Demand explicit checks, approval gates, and clear stop conditions before rollout.",
	}[review_priority]

	title = workflow.split(".")[0].strip().capitalize()
	if len(title) > 72:
	title = title[:69].rstrip() + "..."

	return f"""Scorecard title
	{title}

	Operating surface
	This workflow sits in {surface_line}.

	Review dimensions
	- coverage: does the workflow handle the common path and one failure path?
	- verification: are critical outputs checked before they are trusted?
	- observability: would another operator know what happened from the logs or summary?
	- rollback: is there a clear fallback or manual recovery path?

	Suggested rating
	- coverage: medium
	- verification: medium
	- observability: medium
	- rollback: medium

	Operator notes
	- capture one successful example run
	- capture one failure example with a recovery note
	- record the tool step that deserves the strongest verification

	Rollout guidance
	{priority_line}
	"""


	demo = gr.Interface(
	fn=build_scorecard,
	inputs=[
	gr.Textbox(
	label="Agent workflow",
	lines=6,
	placeholder="Example: A support automation agent classifies tickets, drafts replies, and escalates refund-risk cases.",
	),
	gr.Radio(
	["internal-tools", "customer-facing", "engineering", "research"],
	value="engineering",
	label="Operating surface",
	),
	gr.Radio(
	["routine", "important", "critical"],
	value="important",
	label="Review priority",
	),
	],
	outputs=gr.Markdown(label="Ops scorecard"),
	title="Ops Scorecard Lab",
	description=(
	"Turn a rough agent workflow into a practical operations scorecard with review dimensions, "
	"operator notes, and rollout guidance."
	),
	examples=[
	[
	"A repo-maintainer agent that triages issues, patches code, runs tests, and drafts a release note.",
	"engineering",
	"important",
	],
	[
	"A customer support copilot that drafts replies, suggests refunds, and escalates policy-sensitive cases.",
	"customer-facing",
	"critical",
	],
	[
	"A browser research assistant that reads docs, compares sources, and prepares an implementation brief.",
	"research",
	"routine",
	],
	],
	allow_flagging="never",
	)


	if __name__ == "__main__":
	demo.launch()