anugrah55 committed on
Commit
285599c
·
verified ·
1 Parent(s): ac19b81

Upload train.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. train.py +157 -0
train.py ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import requests
3
+ from transformers import AutoTokenizer
4
+ from unsloth import FastLanguageModel
5
+ from trl import GPPOTrainer, PPOConfig
6
+ import json
7
+ import re
8
+
9
+ # == 1. Constants ==
10
# Upper bound on the number of probe/submit actions taken in one episode.
MAX_STEPS_PER_EPISODE = 15

# Base URL of the environment server; the training loop POSTs to its
# /reset and /step routes.
ENV_URL = "https://anugrah55-opensleuth-env-gemini-cli.hf.space"

# Checkpoint identifier handed to FastLanguageModel.from_pretrained.
MODEL_NAME = "unsloth/qwen2-0.5b-instruct-sft-bnb-4bit"
13
+
14
+ # == 2. Prompt Engineering ==
15
def build_prompt(probe_history):
    """Create the LLM prompt from the probes made so far.

    Args:
        probe_history: Sequence of ``(input, output)`` pairs, oldest first.
            An empty (or ``None``) history yields a hint to start with a probe.

    Returns:
        The prompt text. Sections are separated by real newline characters
        and the text ends with ``"Your decision: "`` so that the model's
        completion is the action itself.
    """
    lines = [
        "You are a reverse-engineering AI. Your goal is to understand a hidden black-box function by probing it and then writing a Python replica.",
        "",
        "== Probe History ==",
    ]

    if not probe_history:
        lines.append("No probes yet. Your first action should be a probe.")
    else:
        lines.extend(
            f"{index}. IN: {inp} -> OUT: {out}"
            for index, (inp, out) in enumerate(probe_history, start=1)
        )

    lines.extend(
        [
            "",
            "== Your Action ==",
            "You can either PROBE or SUBMIT.",
            "To probe, respond with: PROBE(input)",
            "To submit your code, respond with: SUBMIT",
            "```python",
            "[your code here]",
            "```",
            "Your decision: ",
        ]
    )
    # Join with actual newlines: the previous "\\n" literals put a visible
    # backslash-n into the prompt instead of a line break.
    return "\n".join(lines)
33
+
34
+ # == 3. Action Parsing ==
35
def parse_action_from_response(response_text):
    """Turn the model's free-text reply into an environment action.

    A ``PROBE(...)`` call is looked for first, then a ``SUBMIT`` followed by
    a fenced ``python`` code block.

    Args:
        response_text: Text produced by the model. ``None`` or an empty
            string is treated as an unparseable reply.

    Returns:
        ``{"action_type": "probe", "input": <str>}`` for a probe,
        ``{"action_type": "submit", "code": <str>}`` for a submission, or the
        fallback probe ``{"action_type": "probe", "input": "1"}`` when
        neither form is found.
    """
    text = response_text or ""

    # Single backslashes inside the raw string: "\(" is a literal paren.
    # The greedy ".*" stays on one line ("." does not match a newline here),
    # so nested parentheses such as PROBE(f(2)) are captured whole.
    probe_match = re.search(r"PROBE\((.*)\)", text)
    if probe_match:
        inp = probe_match.group(1).strip()
        return {"action_type": "probe", "input": inp}

    # Non-greedy capture stops at the first closing fence, so any text or
    # extra fences after the submitted block are not swallowed into the code.
    submit_match = re.search(r"SUBMIT\s*```python\n(.*?)```", text, re.DOTALL)
    if submit_match:
        code = submit_match.group(1).strip()
        return {"action_type": "submit", "code": code}

    # Default to a probe if parsing fails
    return {"action_type": "probe", "input": "1"}
51
+
52
+
53
+ # == 4. Main Training Script ==
54
def _post_json(endpoint, payload, timeout=30):
    """POST *payload* as JSON to ``ENV_URL/<endpoint>`` and return the decoded reply.

    Raises:
        requests.exceptions.RequestException: on connection failure, timeout,
            or a non-2xx HTTP status.
        ValueError: if the response body is not valid JSON.
    """
    resp = requests.post(f"{ENV_URL}/{endpoint}", json=payload, timeout=timeout)
    # Surface 4xx/5xx here instead of failing later on a missing JSON key.
    resp.raise_for_status()
    return resp.json()


def main():
    """Run a short demonstration training loop against the remote environment.

    Loads the 4-bit base model with LoRA adapters, builds the trainer, then
    plays 10 episodes of at most ``MAX_STEPS_PER_EPISODE`` steps. Each step
    prompts the model, parses its reply into an action, sends the action to
    the environment's ``/step`` route and records the (query, response,
    reward) triple. After every episode the collected triples are passed to
    the trainer's ``step`` method.

    Training stops early (the function returns) if the environment cannot be
    reached or answers with an HTTP error or a non-JSON body.
    """
    # --- Initialize Model ---
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=MODEL_NAME,
        max_seq_length=2048,
        dtype=None,
        load_in_4bit=True,
    )
    # LoRA configuration
    model = FastLanguageModel.get_peft_model(
        model,
        r=16,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
        lora_alpha=16,
        lora_dropout=0,
        bias="none",
        use_gradient_checkpointing=True,
        random_state=3407,
        use_rslora=False,
        loftq_config=None,
    )

    # --- Initialize GPPO Trainer ---
    # NOTE(review): GPPOTrainer is imported from trl at the top of the file;
    # confirm the installed TRL release actually provides it and accepts
    # these PPOConfig fields. This is a placeholder configuration.
    ppo_config = PPOConfig(
        batch_size=4,
        mini_batch_size=1,
        learning_rate=1.41e-5,
        adap_kl_ctrl=False,
        log_with="tensorboard",
        project_kwargs={"logging_dir": "./logs"},
    )

    # The trainer is given a dataset at construction time even though the
    # experiences used for learning are passed directly to `step` below.
    dummy_dataset = [{"query": "dummy"}]
    gppo_trainer = GPPOTrainer(
        config=ppo_config,
        model=model,
        tokenizer=tokenizer,
        dataset=dummy_dataset,
    )

    # Loop-invariant sampling settings, built once.
    generation_kwargs = {
        "min_new_tokens": -1,
        "top_k": 0.0,
        "top_p": 1.0,
        "do_sample": True,
        "pad_token_id": tokenizer.eos_token_id,
        "max_new_tokens": 150,
    }

    # --- Training Loop ---
    for episode in range(10):  # Run for 10 episodes for demonstration
        print(f"--- Episode {episode+1} ---")

        # Reset environment
        try:
            obs = _post_json("reset", {"target_name": "fibonacci"})
        except (requests.exceptions.RequestException, ValueError) as e:
            print(f"ERROR: Could not connect to environment at {ENV_URL}. Is it running?")
            print("Please run 'uvicorn server:app --host 0.0.0.0 --port 8000' in the 'opensleuth_env' directory.")
            print(f"Details: {e}")
            return

        queries, responses, rewards = [], [], []

        for step in range(MAX_STEPS_PER_EPISODE):
            # Build prompt and generate action
            prompt = build_prompt(obs.get("probe_history", []))
            query_tensor = tokenizer.encode(prompt, return_tensors="pt").to(model.device)

            # Generate a response from the model
            response_tensor = gppo_trainer.generate(query_tensor, **generation_kwargs)
            response_text = tokenizer.decode(response_tensor[0])

            # If the decoded text echoes the prompt, drop it: the prompt
            # itself contains the example "PROBE(input)", which must not be
            # mistaken for the model's chosen action. No-op otherwise.
            if response_text.startswith(prompt):
                response_text = response_text[len(prompt):]

            # Parse action and execute in environment
            action = parse_action_from_response(response_text)
            try:
                step_data = _post_json("step", action)
            except (requests.exceptions.RequestException, ValueError) as e:
                # Same policy as a failed reset: report and stop training
                # rather than die with a traceback mid-episode.
                print(f"ERROR: Environment step request to {ENV_URL} failed: {e}")
                return

            reward = torch.tensor(step_data["reward"], dtype=torch.float32)
            obs = step_data["observation"]
            done = step_data["done"]

            # Store experience
            queries.append(query_tensor.squeeze())
            responses.append(response_tensor.squeeze())
            rewards.append(reward)

            print(f"Step {step+1}: Action: {action['action_type']}, Reward: {reward.item():.2f}")

            if done:
                break

        # --- Perform PPO Step ---
        # This is a simplified view. The actual step requires careful handling of tensors.
        # The `queries`, `responses`, `rewards` lists need to be formatted correctly.
        # Best-effort by design: a failed update is reported and the next
        # episode still runs.
        try:
            stats = gppo_trainer.step(queries, responses, rewards)
            gppo_trainer.log_stats(stats, {}, rewards)
            print(f" PPO Step done. Mean reward: {stats['ppo/returns/mean']:.2f}")
        except Exception as e:
            print(f"ERROR during trainer.step: {e}")
            print(" Skipping PPO step for this episode. This might happen if all trajectories are truncated.")
153
+
154
if __name__ == "__main__":
    # The environment server must already be up before training starts;
    # it is launched separately (in the background, from the CLI).
    main()