Spaces:

Clove25
/

ToolUseEnv

Sleeping

App Files Files Community

ToolUseEnv / tool_use_env /tests /test_tools.py

Clove25

Upload 53 files

18feac5 verified about 1 month ago

raw

history blame contribute delete

2.04 kB

	from tool_use_env.grader import grade_task
	from tool_use_env.models import ToolUseAction
	from tool_use_env.server.tool_use_env_environment import ToolUseEnvironment
	from tool_use_env.tasks import TASKS


	def test_easy_task_rewards_progress_before_submission():
	    """Check that one artifact inspection pays out reward without ending the episode."""
	    task_name = "damaged-mug-replacement"
	    environment = ToolUseEnvironment()

	    # Resetting onto the task should echo its id and pre-seed the ticket evidence.
	    initial = environment.reset(task_id=task_name)
	    assert initial.task_id == task_name
	    assert "ticket" in initial.collected_evidence

	    # A single inspection of the order artifact, with no resolution submitted yet.
	    inspect_order = ToolUseAction(action_type="inspect_artifact", artifact_id="order")
	    after_inspect = environment.step(inspect_order)

	    assert after_inspect.reward > 0
	    assert "artifact:order" in after_inspect.collected_evidence
	    assert not after_inspect.done


	def test_correct_resolution_finishes_with_high_score():
	    """Run the duplicate-charge task end to end and expect a score and reward of at least 0.9."""
	    environment = ToolUseEnvironment()
	    environment.reset(task_id="duplicate-charge-refund")

	    # Evidence-gathering steps, applied in order; each action is built right
	    # before it is stepped so the call sequence matches a hand-written script.
	    investigation_steps = (
	        {"action_type": "inspect_artifact", "artifact_id": "order"},
	        {"action_type": "inspect_artifact", "artifact_id": "payment"},
	        {"action_type": "search_policy", "query": "duplicate_charge"},
	    )
	    for step_kwargs in investigation_steps:
	        environment.step(ToolUseAction(**step_kwargs))

	    # Draft the customer-facing reply before submitting the resolution.
	    reply_text = (
	        "We confirmed the duplicate charge and issued a refund. "
	        "You should see it in 3-5 business days."
	    )
	    environment.step(ToolUseAction(action_type="draft_reply", message=reply_text))

	    submission = ToolUseAction(
	        action_type="submit_resolution",
	        resolution_code="refund_duplicate_charge",
	    )
	    outcome = environment.step(submission)

	    assert outcome.done is True
	    assert outcome.current_score >= 0.9
	    assert outcome.reward >= 0.9


	def test_grader_penalizes_wrong_resolution():
	    """Grade the account-takeover task with an ``issue_refund`` code and expect a score in [0.0, 0.6)."""
	    evidence = ["ticket", "artifact:account", "artifact:risk_log"]
	    reply = "We locked the account and the fraud team will contact you within 24 hours."

	    graded = grade_task(
	        task=TASKS["account-takeover-fraud"],
	        collected_evidence=evidence,
	        drafted_reply=reply,
	        resolution_code="issue_refund",
	        step_count=5,
	        repeat_action_count=0,
	    )

	    # The score must stay non-negative but fall short of the 0.6 cutoff.
	    final_score = graded["final_score"]
	    assert 0.0 <= final_score < 0.6