Spaces:

mukunda1729
/

agent-eval-lab

Sleeping

App Files Files Community

agent-eval-lab / app.py

mukunda1729

Upload app.py

c5b383d verified 16 days ago

raw

history blame contribute delete

3.44 kB

	import gradio as gr


	def build_eval(workflow: str, risk_level: str, focus_area: str) -> str:
	    """Build a text evaluation plan for a described agent workflow.

	    Args:
	        workflow: Free-form description of the agent workflow. Leading and
	            trailing whitespace is ignored.
	        risk_level: One of "low", "medium", or "high"; selects the operator
	            note that closes the plan.
	        focus_area: One of "grounding", "tool-use", "safety", or "handoff";
	            selects the emphasis named in the task setup.

	    Returns:
	        The evaluation plan, or a short prompt asking for a description
	        when ``workflow`` is empty or whitespace-only.

	    Raises:
	        KeyError: If ``focus_area`` or ``risk_level`` is not one of the
	            values listed above (only checked for a non-empty workflow).
	    """
	    workflow = workflow.strip()
	    if not workflow:
	        return "Please describe an agent workflow to evaluate."

	    focus_line = {
	        "grounding": "grounding quality, evidence use, and whether claims stay tied to available context",
	        "tool-use": "tool choice, argument quality, and whether execution steps are verified",
	        "safety": "prompt-injection resistance, action boundaries, and failure containment",
	        "handoff": "clarity of outputs, operator confidence, and whether another engineer could act on the result",
	    }[focus_area]

	    risk_line = {
	        "low": "Keep the rubric compact and prioritize correctness plus clarity.",
	        "medium": "Balance correctness, verification, and failure handling.",
	        "high": "Prioritize safety checks, proof requirements, and clear stop conditions.",
	    }[risk_level]

	    # Title = first non-empty sentence, with runs of whitespace (including
	    # newlines from multi-line input) collapsed so it stays on one line.
	    sentences = (" ".join(part.split()) for part in workflow.split("."))
	    title = next((s for s in sentences if s), " ".join(workflow.split()))
	    # Upper-case only the first character: str.capitalize() would also
	    # lower-case the remainder and mangle acronyms such as "PR" or "API".
	    title = title[:1].upper() + title[1:]
	    if len(title) > 72:
	        # Keep the title at most 72 characters including the ellipsis.
	        title = title[:69].rstrip() + "..."

	    return f"""Scenario title
	{title}

	Task setup
	Evaluate an agent workflow described as: "{workflow}".
	Focus especially on {focus_line}.

	Expected behavior
	- understands the task before acting
	- chooses tools deliberately and explains why
	- verifies critical outputs before concluding
	- communicates results in a way another builder can inspect

	Likely failure modes
	- vague planning that skips verification
	- weak grounding or unsupported claims
	- incorrect or unnecessary tool use
	- missing safety boundaries around risky actions

	Scoring dimensions
	- correctness
	- evidence and grounding
	- tool-use quality
	- safety and containment
	- actionability of the final output

	Follow-up tests
	- add one adversarial test for prompt injection or misleading context
	- add one regression test for repeated runs on the same workflow
	- add one test where a required tool fails or returns partial data

	Operator note
	{risk_line}
	"""


	def _build_demo() -> gr.Interface:
	    """Assemble the Gradio interface that exposes ``build_eval``."""
	    # Input widgets, listed in the positional order of build_eval's
	    # parameters: workflow, risk_level, focus_area.
	    workflow_box = gr.Textbox(
	        label="Agent workflow",
	        lines=6,
	        placeholder="Example: A coding agent reads issues, edits files, runs tests, and writes a PR summary.",
	    )
	    risk_choice = gr.Radio(
	        ["low", "medium", "high"],
	        value="medium",
	        label="Risk level",
	    )
	    focus_choice = gr.Radio(
	        ["grounding", "tool-use", "safety", "handoff"],
	        value="tool-use",
	        label="Primary focus",
	    )
	    plan_view = gr.Markdown(label="Evaluation plan")

	    blurb = (
	        "Turn a rough agent workflow into a practical evaluation scenario with expected behavior, "
	        "failure modes, scoring dimensions, and follow-up tests."
	    )

	    # Each sample row is [workflow, risk level, primary focus].
	    sample_rows = [
	        [
	            "A browser automation agent that books travel, fills forms, and captures proof of each step.",
	            "high",
	            "safety",
	        ],
	        [
	            "A repo-maintainer agent that triages issues, patches code, runs tests, and drafts a release note.",
	            "medium",
	            "tool-use",
	        ],
	        [
	            "A support triage assistant that classifies tickets, drafts replies, and escalates risky cases.",
	            "medium",
	            "grounding",
	        ],
	    ]

	    return gr.Interface(
	        fn=build_eval,
	        inputs=[workflow_box, risk_choice, focus_choice],
	        outputs=plan_view,
	        title="Agent Eval Lab",
	        description=blurb,
	        examples=sample_rows,
	        allow_flagging="never",
	    )


	# Module-level interface object; launched only when run as a script.
	demo = _build_demo()


	if __name__ == "__main__":
	    demo.launch()