# Provenance: authored by Dwootton, commit 334e866 (verified) —
# "Add pipeline code: config, tool_utils, prompt_builder, llm_client,
#  react_loop, run_eval, virtual_api_server"
"""Configuration for the P2P StableToolBench evaluation pipeline."""
import os
# Paths
STABLETOOLBENCH_DIR = os.path.join(os.path.dirname(__file__), "..", "StableToolBench")
TOOL_ROOT_DIR = os.path.join(os.path.dirname(__file__), "..", "toolenv2404_filtered")
SOLVABLE_QUERIES_DIR = os.path.join(STABLETOOLBENCH_DIR, "solvable_queries", "test_instruction")
QUERY_IDS_DIR = os.path.join(STABLETOOLBENCH_DIR, "solvable_queries", "test_query_ids")
OUTPUT_DIR = os.path.join(os.path.dirname(__file__), "outputs")
P2P_EXAMPLES_DIR = os.path.join(os.path.dirname(__file__), "p2p_data", "examples")
P2P_DESCRIPTIONS_DIR = os.path.join(os.path.dirname(__file__), "p2p_data", "descriptions")
# Models
TASK_MODEL = "meta-llama/Llama-3.1-8B-Instruct"
EVAL_MODEL = "meta-llama/Llama-3.3-70B-Instruct" # For SoPR evaluation judge
# Virtual API server
API_SERVER_URL = "http://localhost"
API_SERVER_PORT = 8080
# Inference
TEMPERATURE = 0.001 # Near-deterministic as in paper
MAX_STEPS = 12 # Max ReAct steps per query
MAX_OBSERVATION_LENGTH = 1024 # Truncate tool outputs
# Test groups
ALL_GROUPS = [
"G1_instruction", "G1_category", "G1_tool",
"G2_category", "G2_instruction", "G3_instruction"
]
# Conditions
CONDITION_NAMES = ["baseline", "p2p_desc", "p2p_demo", "p2p_full"]