"""Configuration for the P2P StableToolBench evaluation pipeline.

This module only defines constants: filesystem locations, model identifiers,
virtual API server address, inference limits, and the names of the test
groups and experimental conditions.
"""

import os

# Directory containing this file; every path below is built relative to it.
_HERE = os.path.dirname(__file__)

# Paths
# The two roots below sit next to this file's directory (one level up); the
# ".." segment is kept as-is rather than normalised.
STABLETOOLBENCH_DIR = os.path.join(_HERE, "..", "StableToolBench")
TOOL_ROOT_DIR = os.path.join(_HERE, "..", "toolenv2404_filtered")
SOLVABLE_QUERIES_DIR = os.path.join(
    STABLETOOLBENCH_DIR, "solvable_queries", "test_instruction"
)
QUERY_IDS_DIR = os.path.join(
    STABLETOOLBENCH_DIR, "solvable_queries", "test_query_ids"
)
OUTPUT_DIR = os.path.join(_HERE, "outputs")
P2P_EXAMPLES_DIR = os.path.join(_HERE, "p2p_data", "examples")
P2P_DESCRIPTIONS_DIR = os.path.join(_HERE, "p2p_data", "descriptions")

# Models
TASK_MODEL = "meta-llama/Llama-3.1-8B-Instruct"
EVAL_MODEL = "meta-llama/Llama-3.3-70B-Instruct"  # For SoPR evaluation judge

# Virtual API server (scheme/host and port are kept as separate settings)
API_SERVER_URL = "http://localhost"
API_SERVER_PORT = 8080

# Inference
TEMPERATURE = 0.001  # Near-deterministic as in paper
MAX_STEPS = 12  # Max ReAct steps per query
MAX_OBSERVATION_LENGTH = 1024  # Truncate tool outputs

# Test groups
ALL_GROUPS = [
    "G1_instruction",
    "G1_category",
    "G1_tool",
    "G2_category",
    "G2_instruction",
    "G3_instruction",
]

# Conditions
CONDITION_NAMES = ["baseline", "p2p_desc", "p2p_demo", "p2p_full"]