"""Configuration for the P2P StableToolBench evaluation pipeline.

This module only defines constants; importing it has no side effects.
All filesystem paths are built relative to the directory containing this file.
"""
import os

# Directory containing this config file; every path below is anchored here.
_HERE = os.path.dirname(__file__)

# Paths
# The StableToolBench checkout and the tool environment are referenced one
# level above this file's directory (note the ".." component, which is left
# un-normalized in the resulting strings).
STABLETOOLBENCH_DIR = os.path.join(_HERE, "..", "StableToolBench")
TOOL_ROOT_DIR = os.path.join(_HERE, "..", "toolenv2404_filtered")
SOLVABLE_QUERIES_DIR = os.path.join(
    STABLETOOLBENCH_DIR, "solvable_queries", "test_instruction"
)
QUERY_IDS_DIR = os.path.join(
    STABLETOOLBENCH_DIR, "solvable_queries", "test_query_ids"
)
# Outputs and P2P data are referenced next to this file.
OUTPUT_DIR = os.path.join(_HERE, "outputs")
P2P_EXAMPLES_DIR = os.path.join(_HERE, "p2p_data", "examples")
P2P_DESCRIPTIONS_DIR = os.path.join(_HERE, "p2p_data", "descriptions")

# Models
TASK_MODEL = "meta-llama/Llama-3.1-8B-Instruct"
EVAL_MODEL = "meta-llama/Llama-3.3-70B-Instruct"  # For SoPR evaluation judge

# Virtual API server
API_SERVER_URL = "http://localhost"
API_SERVER_PORT = 8080

# Inference
TEMPERATURE = 0.001  # Near-deterministic as in paper
MAX_STEPS = 12  # Max ReAct steps per query
MAX_OBSERVATION_LENGTH = 1024  # Truncate tool outputs

# Test groups
ALL_GROUPS = [
    "G1_instruction",
    "G1_category",
    "G1_tool",
    "G2_category",
    "G2_instruction",
    "G3_instruction",
]

# Conditions
CONDITION_NAMES = ["baseline", "p2p_desc", "p2p_demo", "p2p_full"]