anugrah55 committed on
Commit
63bb50c
·
verified ·
1 Parent(s): 7866ce6

Remove obsolete artifacts and trainer code from env Space

Browse files
.DS_Store DELETED
Binary file (8.2 kB)
 
__pycache__/server.cpython-313.pyc DELETED
Binary file (1.68 kB)
 
opensleuth_env/.DS_Store DELETED
Binary file (8.2 kB)
 
opensleuth_env/__pycache__/__init__.cpython-313.pyc DELETED
Binary file (159 Bytes)
 
opensleuth_env/__pycache__/black_box.cpython-313.pyc DELETED
Binary file (1.28 kB)
 
opensleuth_env/__pycache__/env.cpython-313.pyc DELETED
Binary file (5.11 kB)
 
opensleuth_env/__pycache__/models.cpython-313.pyc DELETED
Binary file (1.89 kB)
 
opensleuth_env/__pycache__/verifier.cpython-313.pyc DELETED
Binary file (4.43 kB)
 
test_client.py DELETED
@@ -1,29 +0,0 @@
1
- import requests
2
- import json
3
-
4
- # The exact code to be submitted, without shell escaping issues
5
- code_to_submit = """
6
- def fibonacci(n: int) -> int:
7
- if not isinstance(n, int) or n <= 0 or n > 90:
8
- raise ValueError("Input must be a positive integer less than or equal to 90.")
9
- if n == 1:
10
- return 1
11
- a, b = 0, 1
12
- for _ in range(n - 1):
13
- a, b = b, a + b
14
- return b
15
- """
16
-
17
- action = {
18
- "action_type": "submit",
19
- "code": code_to_submit
20
- }
21
-
22
- # Reset the environment first
23
- requests.post("http://127.0.0.1:8000/reset", json={"target_name": "fibonacci"})
24
-
25
- # Now send the step action
26
- response = requests.post("http://127.0.0.1:8000/step", json=action)
27
-
28
- print(response.status_code)
29
- print(response.json())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
train.py DELETED
@@ -1,157 +0,0 @@
1
- import torch
2
- import requests
3
- from transformers import AutoTokenizer
4
- from unsloth import FastLanguageModel
5
- from trl import GPPOTrainer, PPOConfig
6
- import json
7
- import re
8
-
9
- # == 1. Constants ==
10
- MAX_STEPS_PER_EPISODE = 15
11
- ENV_URL = "http://127.0.0.1:8000"
12
- MODEL_NAME = "unsloth/qwen2-0.5b-instruct-sft-bnb-4bit"
13
-
14
- # == 2. Prompt Engineering ==
15
- def build_prompt(probe_history):
16
- """
17
- Creates the prompt for the LLM based on the probe history.
18
- """
19
- prompt = "You are a reverse-engineering AI. Your goal is to understand a hidden black-box function by probing it and then writing a Python replica.\\n\\n"
20
- prompt += "== Probe History ==\\n"
21
- if not probe_history:
22
- prompt += "No probes yet. Your first action should be a probe.\\n"
23
- else:
24
- for i, (inp, out) in enumerate(probe_history):
25
- prompt += f"{i+1}. IN: {inp} -> OUT: {out}\\n"
26
-
27
- prompt += "\\n== Your Action ==\\n"
28
- prompt += "You can either PROBE or SUBMIT.\\n"
29
- prompt += "To probe, respond with: PROBE(input)\\n"
30
- prompt += "To submit your code, respond with: SUBMIT\\n```python\\n[your code here]\\n```\\n"
31
- prompt += "Your decision: "
32
- return prompt
33
-
34
- # == 3. Action Parsing ==
35
- def parse_action_from_response(response_text):
36
- """
37
- Parses the model's text response to determine the action.
38
- """
39
- probe_match = re.search(r"PROBE\\((.*)\\)", response_text)
40
- if probe_match:
41
- inp = probe_match.group(1).strip()
42
- return {"action_type": "probe", "input": inp}
43
-
44
- submit_match = re.search(r"SUBMIT\\s*```python\\n(.*)```", response_text, re.DOTALL)
45
- if submit_match:
46
- code = submit_match.group(1).strip()
47
- return {"action_type": "submit", "code": code}
48
-
49
- # Default to a probe if parsing fails
50
- return {"action_type": "probe", "input": "1"}
51
-
52
-
53
- # == 4. Main Training Script ==
54
- def main():
55
- # --- Initialize Model ---
56
- model, tokenizer = FastLanguageModel.from_pretrained(
57
- model_name = MODEL_NAME,
58
- max_seq_length = 2048,
59
- dtype = None,
60
- load_in_4bit = True,
61
- )
62
- # LoRA configuration
63
- model = FastLanguageModel.get_peft_model(
64
- model,
65
- r = 16,
66
- target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
67
- lora_alpha = 16,
68
- lora_dropout = 0,
69
- bias = "none",
70
- use_gradient_checkpointing = True,
71
- random_state = 3407,
72
- use_rslora = False,
73
- loftq_config = None,
74
- )
75
-
76
- # --- Initialize GPPO Trainer ---
77
- # Note: GPPO is a new trainer in TRL and might require specific config.
78
- # This is a placeholder configuration.
79
- ppo_config = PPOConfig(
80
- batch_size=4,
81
- mini_batch_size=1,
82
- learning_rate=1.41e-5,
83
- adap_kl_ctrl=False,
84
- log_with="tensorboard",
85
- project_kwargs={"logging_dir": "./logs"}
86
- )
87
-
88
- # We need a dataset for the trainer, even if it's just a dummy one for initialization
89
- # In a real RL loop, we provide the experiences directly to the `step` method.
90
- dummy_dataset = [{"query": "dummy"}]
91
- gppo_trainer = GPPOTrainer(
92
- config=ppo_config,
93
- model=model,
94
- tokenizer=tokenizer,
95
- dataset=dummy_dataset,
96
- )
97
-
98
- # --- Training Loop ---
99
- for episode in range(10): # Run for 10 episodes for demonstration
100
- print(f"--- Episode {episode+1} ---")
101
-
102
- # Reset environment
103
- try:
104
- resp = requests.post(f"{ENV_URL}/reset", json={"target_name": "fibonacci"})
105
- obs = resp.json()
106
- except requests.exceptions.ConnectionError as e:
107
- print(f"ERROR: Could not connect to environment at {ENV_URL}. Is it running?")
108
- print("Please run 'uvicorn server:app --host 0.0.0.0 --port 8000' in the 'opensleuth_env' directory.")
109
- return
110
-
111
- queries, responses, rewards = [], [], []
112
-
113
- for step in range(MAX_STEPS_PER_EPISODE):
114
- # Build prompt and generate action
115
- prompt = build_prompt(obs.get("probe_history", []))
116
- query_tensor = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
117
-
118
- # Generate a response from the model
119
- generation_kwargs = {"min_new_tokens": -1, "top_k": 0.0, "top_p": 1.0, "do_sample": True, "pad_token_id": tokenizer.eos_token_id, "max_new_tokens": 150}
120
- response_tensor = gppo_trainer.generate(query_tensor, **generation_kwargs)
121
- response_text = tokenizer.decode(response_tensor[0])
122
-
123
- # Parse action and execute in environment
124
- action = parse_action_from_response(response_text)
125
- step_resp = requests.post(f"{ENV_URL}/step", json=action)
126
- step_data = step_resp.json()
127
-
128
- reward = torch.tensor(step_data["reward"], dtype=torch.float32)
129
- obs = step_data["observation"]
130
- done = step_data["done"]
131
-
132
- # Store experience
133
- queries.append(query_tensor.squeeze())
134
- responses.append(response_tensor.squeeze())
135
- rewards.append(reward)
136
-
137
- print(f"Step {step+1}: Action: {action['action_type']}, Reward: {reward.item():.2f}")
138
-
139
- if done:
140
- break
141
-
142
- # --- Perform PPO Step ---
143
- # This is a simplified view. The actual step requires careful handling of tensors.
144
- # The `queries`, `responses`, `rewards` lists need to be formatted correctly.
145
- try:
146
- stats = gppo_trainer.step(queries, responses, rewards)
147
- gppo_trainer.log_stats(stats, {}, rewards)
148
- print(f" PPO Step done. Mean reward: {stats['ppo/returns/mean']:.2f}")
149
- except Exception as e:
150
- print(f"ERROR during trainer.step: {e}")
151
- print(" Skipping PPO step for this episode. This might happen if all trajectories are truncated.")
152
-
153
-
154
- if __name__ == "__main__":
155
- # Ensure the server is running before starting training.
156
- # We will run the server in the background from the CLI.
157
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
verifier_log.txt DELETED
@@ -1 +0,0 @@
1
- \n--- Verifier Fuzzing ---\nInput: 88, Target: 1100087778366101931, Submitted: 1100087778366101931\nInput: 24, Target: 46368, Submitted: 46368\nInput: 14, Target: 377, Submitted: 377\nInput: 67, Target: 44945570212853, Submitted: 44945570212853\nInput: 35, Target: 9227465, Submitted: 9227465\nInput: 82, Target: 61305790721611591, Submitted: 61305790721611591\nInput: 82, Target: 61305790721611591, Submitted: 61305790721611591\nInput: 25, Target: 75025, Submitted: 75025\nInput: 1, Target: 1, Submitted: 1\nInput: 86, Target: 420196140727489673, Submitted: 420196140727489673\n--- End Verifier Fuzzing ---\nExecution Reward: 100.0, Complexity Penalty: 1.6094379124341003\n