Dwootton committed on
Commit
334e866
·
verified ·
1 Parent(s): d51d0a8

Add pipeline code: config, tool_utils, prompt_builder, llm_client, react_loop, run_eval, virtual_api_server

Browse files
Files changed (1) hide show
  1. pipeline/config.py +33 -0
pipeline/config.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Configuration for the P2P StableToolBench evaluation pipeline.

This module holds constants only: filesystem locations, model identifiers,
the virtual API server address, inference limits, and the names of the test
groups and experimental conditions.
"""
import os

# Directory containing this file; every path below is derived from it.
_PIPELINE_DIR = os.path.dirname(__file__)

# Paths
# The first two live one level above this file's directory (note the "..").
STABLETOOLBENCH_DIR = os.path.join(_PIPELINE_DIR, "..", "StableToolBench")
TOOL_ROOT_DIR = os.path.join(_PIPELINE_DIR, "..", "toolenv2404_filtered")
SOLVABLE_QUERIES_DIR = os.path.join(
    STABLETOOLBENCH_DIR, "solvable_queries", "test_instruction"
)
QUERY_IDS_DIR = os.path.join(
    STABLETOOLBENCH_DIR, "solvable_queries", "test_query_ids"
)
# The remaining paths sit next to this file.
OUTPUT_DIR = os.path.join(_PIPELINE_DIR, "outputs")
P2P_EXAMPLES_DIR = os.path.join(_PIPELINE_DIR, "p2p_data", "examples")
P2P_DESCRIPTIONS_DIR = os.path.join(_PIPELINE_DIR, "p2p_data", "descriptions")

# Models
TASK_MODEL = "meta-llama/Llama-3.1-8B-Instruct"
EVAL_MODEL = "meta-llama/Llama-3.3-70B-Instruct"  # For SoPR evaluation judge

# Virtual API server
# The URL carries no port; the port is kept as a separate constant.
API_SERVER_URL = "http://localhost"
API_SERVER_PORT = 8080

# Inference
TEMPERATURE = 0.001  # Near-deterministic as in paper
MAX_STEPS = 12  # Max ReAct steps per query
MAX_OBSERVATION_LENGTH = 1024  # Truncate tool outputs

# Test groups
ALL_GROUPS = [
    "G1_instruction", "G1_category", "G1_tool",
    "G2_category", "G2_instruction", "G3_instruction",
]

# Conditions
CONDITION_NAMES = ["baseline", "p2p_desc", "p2p_demo", "p2p_full"]