Add pipeline code: config, tool_utils, prompt_builder, llm_client, react_loop, run_eval, virtual_api_server

Files changed (1) hide show

pipeline/config.py ADDED Viewed

+"""Configuration for the P2P StableToolBench evaluation pipeline."""
+import os
+# Paths: every entry is built with os.path.join from the directory containing this file; ".." segments are left un-normalized
+STABLETOOLBENCH_DIR = os.path.join(os.path.dirname(__file__), "..", "StableToolBench")  # one level above this file's directory
+TOOL_ROOT_DIR = os.path.join(os.path.dirname(__file__), "..", "toolenv2404_filtered")  # also one level above this file's directory
+SOLVABLE_QUERIES_DIR = os.path.join(STABLETOOLBENCH_DIR, "solvable_queries", "test_instruction")  # <STABLETOOLBENCH_DIR>/solvable_queries/test_instruction
+QUERY_IDS_DIR = os.path.join(STABLETOOLBENCH_DIR, "solvable_queries", "test_query_ids")  # <STABLETOOLBENCH_DIR>/solvable_queries/test_query_ids
+OUTPUT_DIR = os.path.join(os.path.dirname(__file__), "outputs")  # "outputs" next to this file; NOTE(review): not created here - confirm the writer creates it
+P2P_EXAMPLES_DIR = os.path.join(os.path.dirname(__file__), "p2p_data", "examples")  # p2p_data/examples next to this file
+P2P_DESCRIPTIONS_DIR = os.path.join(os.path.dirname(__file__), "p2p_data", "descriptions")  # p2p_data/descriptions next to this file
+# Models (plain string identifiers; NOTE(review): they look like Hugging Face repo ids - confirm how the client resolves them)
+TASK_MODEL = "meta-llama/Llama-3.1-8B-Instruct"
+EVAL_MODEL = "meta-llama/Llama-3.3-70B-Instruct"  # For SoPR evaluation judge
+# Virtual API server (URL and port are stored as two separate constants)
+API_SERVER_URL = "http://localhost"  # scheme and host only; contains no port
+API_SERVER_PORT = 8080  # NOTE(review): presumably appended to API_SERVER_URL by callers - confirm
+# Inference
+TEMPERATURE = 0.001  # Near-deterministic as in paper (small but non-zero)
+MAX_STEPS = 12  # Max ReAct steps per query
+MAX_OBSERVATION_LENGTH = 1024  # Truncate tool outputs; NOTE(review): unit (characters vs. tokens) is not stated here - confirm
+# Test groups (six names; NOTE(review): presumably match entries under SOLVABLE_QUERIES_DIR / QUERY_IDS_DIR - confirm)
+ALL_GROUPS = [
+    "G1_instruction", "G1_category", "G1_tool",
+    "G2_category", "G2_instruction", "G3_instruction"
+]
+# Conditions (four labels; NOTE(review): the p2p_* names presumably relate to the P2P_*_DIR data above - confirm)
+CONDITION_NAMES = ["baseline", "p2p_desc", "p2p_demo", "p2p_full"]