"""Configuration for the P2P StableToolBench evaluation pipeline."""
import os

# Paths
# Every location is built from the directory that contains this module.
_MODULE_DIR = os.path.dirname(__file__)
_PARENT_DIR = os.path.join(_MODULE_DIR, "..")

# Directories located one level above this module's directory.
STABLETOOLBENCH_DIR = os.path.join(_PARENT_DIR, "StableToolBench")
TOOL_ROOT_DIR = os.path.join(_PARENT_DIR, "toolenv2404_filtered")

# Sub-directories of StableToolBench's "solvable_queries" folder.
_SOLVABLE_QUERIES_ROOT = os.path.join(STABLETOOLBENCH_DIR, "solvable_queries")
SOLVABLE_QUERIES_DIR = os.path.join(_SOLVABLE_QUERIES_ROOT, "test_instruction")
QUERY_IDS_DIR = os.path.join(_SOLVABLE_QUERIES_ROOT, "test_query_ids")

# Directories located next to this module.
OUTPUT_DIR = os.path.join(_MODULE_DIR, "outputs")
_P2P_DATA_DIR = os.path.join(_MODULE_DIR, "p2p_data")
P2P_EXAMPLES_DIR = os.path.join(_P2P_DATA_DIR, "examples")
P2P_DESCRIPTIONS_DIR = os.path.join(_P2P_DATA_DIR, "descriptions")

# Models
# Model identifier used for the task itself.
TASK_MODEL: str = "meta-llama/Llama-3.1-8B-Instruct"
# Model identifier used as the judge for SoPR evaluation.
EVAL_MODEL: str = "meta-llama/Llama-3.3-70B-Instruct"

# Virtual API server
# Host URL and port are stored separately.
API_SERVER_URL: str = "http://localhost"
API_SERVER_PORT: int = 8080

# Inference
# Sampling temperature; near-deterministic, as in the paper.
TEMPERATURE: float = 0.001
# Maximum number of ReAct steps per query.
MAX_STEPS: int = 12
# Tool outputs are truncated to this length.
MAX_OBSERVATION_LENGTH: int = 1024

# Test groups
# Names of all test groups, kept in their original order.
ALL_GROUPS = [
    "G1_instruction",
    "G1_category",
    "G1_tool",
    "G2_category",
    "G2_instruction",
    "G3_instruction",
]

# Conditions
# Names of the experimental conditions, kept in their original order.
CONDITION_NAMES = [
    "baseline",
    "p2p_desc",
    "p2p_demo",
    "p2p_full",
]