# Provenance: authored by Dwootton, commit 334e866 (verified) —
# "Add pipeline code: config, tool_utils, prompt_builder, llm_client,
#  react_loop, run_eval, virtual_api_server"
"""Configuration for the P2P StableToolBench evaluation pipeline."""
import os
# Paths
STABLETOOLBENCH_DIR = os.path.join(os.path.dirname(__file__), "..", "StableToolBench")
TOOL_ROOT_DIR = os.path.join(os.path.dirname(__file__), "..", "toolenv2404_filtered")
SOLVABLE_QUERIES_DIR = os.path.join(STABLETOOLBENCH_DIR, "solvable_queries", "test_instruction")
QUERY_IDS_DIR = os.path.join(STABLETOOLBENCH_DIR, "solvable_queries", "test_query_ids")
OUTPUT_DIR = os.path.join(os.path.dirname(__file__), "outputs")
P2P_EXAMPLES_DIR = os.path.join(os.path.dirname(__file__), "p2p_data", "examples")
P2P_DESCRIPTIONS_DIR = os.path.join(os.path.dirname(__file__), "p2p_data", "descriptions")
# Models
TASK_MODEL = "meta-llama/Llama-3.1-8B-Instruct"
EVAL_MODEL = "meta-llama/Llama-3.3-70B-Instruct" # For SoPR evaluation judge
# Virtual API server
API_SERVER_URL = "http://localhost"
API_SERVER_PORT = 8080
# Inference
TEMPERATURE = 0.001 # Near-deterministic as in paper
MAX_STEPS = 12 # Max ReAct steps per query
MAX_OBSERVATION_LENGTH = 1024 # Truncate tool outputs
# Test groups
ALL_GROUPS = [
"G1_instruction", "G1_category", "G1_tool",
"G2_category", "G2_instruction", "G3_instruction"
]
# Conditions
CONDITION_NAMES = ["baseline", "p2p_desc", "p2p_demo", "p2p_full"]