Upload train.py with huggingface_hub
train.py
ADDED
@@ -0,0 +1,157 @@
import re

import requests
import torch
from unsloth import FastLanguageModel
from trl import GPPOTrainer, PPOConfig

# == 1. Constants ==
MAX_STEPS_PER_EPISODE = 15
ENV_URL = "https://anugrah55-opensleuth-env-gemini-cli.hf.space"
MODEL_NAME = "unsloth/qwen2-0.5b-instruct-sft-bnb-4bit"

# == 2. Prompt Engineering ==
def build_prompt(probe_history):
    """
    Creates the prompt for the LLM based on the probe history.
    """
    prompt = (
        "You are a reverse-engineering AI. Your goal is to understand a hidden "
        "black-box function by probing it and then writing a Python replica.\n\n"
    )
    prompt += "== Probe History ==\n"
    if not probe_history:
        prompt += "No probes yet. Your first action should be a probe.\n"
    else:
        for i, (inp, out) in enumerate(probe_history):
            prompt += f"{i+1}. IN: {inp} -> OUT: {out}\n"

    prompt += "\n== Your Action ==\n"
    prompt += "You can either PROBE or SUBMIT.\n"
    prompt += "To probe, respond with: PROBE(input)\n"
    prompt += "To submit your code, respond with: SUBMIT\n```python\n[your code here]\n```\n"
    prompt += "Your decision: "
    return prompt
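
# For example, with probe_history == [("5", "120")] the rendered prompt lists
# the history as:
#   1. IN: 5 -> OUT: 120
# before presenting the PROBE/SUBMIT menu.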

# == 3. Action Parsing ==
def parse_action_from_response(response_text):
    """
    Parses the model's text response to determine the action.
    """
    probe_match = re.search(r"PROBE\((.*)\)", response_text)
    if probe_match:
        inp = probe_match.group(1).strip()
        return {"action_type": "probe", "input": inp}

    submit_match = re.search(r"SUBMIT\s*```python\n(.*)```", response_text, re.DOTALL)
    if submit_match:
        code = submit_match.group(1).strip()
        return {"action_type": "submit", "code": code}

    # Default to a probe if parsing fails
    return {"action_type": "probe", "input": "1"}
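
# Illustrative parses (not executed here); the completions are hypothetical:
#   parse_action_from_response("PROBE(7)")
#       -> {"action_type": "probe", "input": "7"}
#   parse_action_from_response("SUBMIT\n```python\ndef f(n): ...\n```")
#       -> {"action_type": "submit", "code": "def f(n): ..."}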


# == 4. Main Training Script ==
def main():
    # --- Initialize Model ---
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=MODEL_NAME,
        max_seq_length=2048,
        dtype=None,
        load_in_4bit=True,
    )
    # LoRA configuration
    model = FastLanguageModel.get_peft_model(
        model,
        r=16,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                        "gate_proj", "up_proj", "down_proj"],
        lora_alpha=16,
        lora_dropout=0,
        bias="none",
        use_gradient_checkpointing=True,
        random_state=3407,
        use_rslora=False,
        loftq_config=None,
    )
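    # With r == lora_alpha == 16 (and rsLoRA disabled), the LoRA scaling
    # factor alpha/r works out to 1, i.e. adapter updates apply at unit scale.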

    # --- Initialize GPPO Trainer ---
    # Note: GPPO is a new trainer in TRL and may require its own config; this
    # PPOConfig is a placeholder, and the script assumes GPPOTrainer exposes
    # the classic PPOTrainer-style generate/step/log_stats interface.
    ppo_config = PPOConfig(
        batch_size=4,
        mini_batch_size=1,
        learning_rate=1.41e-5,
        adap_kl_ctrl=False,
        log_with="tensorboard",
        project_kwargs={"logging_dir": "./logs"},
    )

    # The trainer needs a dataset at construction time, even if it is only a
    # dummy one; in this RL loop, experiences are fed directly to `step`.
    dummy_dataset = [{"query": "dummy"}]
    gppo_trainer = GPPOTrainer(
        config=ppo_config,
        model=model,
        tokenizer=tokenizer,
        dataset=dummy_dataset,
    )
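    # Assumption carried over from TRL's PPOTrainer: `step` expects exactly
    # `batch_size` samples per call, so episodes shorter than 4 steps will
    # land in the except branch of the PPO step below.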

    # --- Training Loop ---
    for episode in range(10):  # Run for 10 episodes for demonstration
        print(f"--- Episode {episode+1} ---")

        # Reset environment
        try:
            resp = requests.post(f"{ENV_URL}/reset", json={"target_name": "fibonacci"})
            obs = resp.json()
        except requests.exceptions.ConnectionError:
            print(f"ERROR: Could not connect to environment at {ENV_URL}. Is it running?")
            print("Please run 'uvicorn server:app --host 0.0.0.0 --port 8000' in the 'opensleuth_env' directory.")
            return

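        # Assumed environment contract (shapes inferred from the fields this
        # script reads; not a documented schema):
        #   POST /reset {"target_name": ...} -> {"probe_history": [[in, out], ...]}
        #   POST /step  {action dict}        -> {"reward": float, "observation": {...}, "done": bool}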
        queries, responses, rewards = [], [], []

        for step in range(MAX_STEPS_PER_EPISODE):
            # Build prompt and generate action
            prompt = build_prompt(obs.get("probe_history", []))
            query_tensor = tokenizer.encode(prompt, return_tensors="pt").to(model.device)

            # Generate a response from the model
            generation_kwargs = {
                "min_length": -1,  # disable the min-length constraint (TRL convention)
                "top_k": 0.0,
                "top_p": 1.0,
                "do_sample": True,
                "pad_token_id": tokenizer.eos_token_id,
                "max_new_tokens": 150,
            }
            response_tensor = gppo_trainer.generate(query_tensor, **generation_kwargs)
            # The trainer is assumed to return prompt + completion (as TRL's
            # PPOTrainer does), so slice off the prompt before decoding: the
            # prompt itself contains the literal text "PROBE(input)", which the
            # action parser would otherwise match.
            response_only = response_tensor[0][query_tensor.shape[1]:]
            response_text = tokenizer.decode(response_only, skip_special_tokens=True)

            # Parse action and execute in environment
            action = parse_action_from_response(response_text)
            step_resp = requests.post(f"{ENV_URL}/step", json=action)
            step_data = step_resp.json()

            reward = torch.tensor(step_data["reward"], dtype=torch.float32)
            obs = step_data["observation"]
            done = step_data["done"]

            # Store experience as per-sample 1-D tensors
            queries.append(query_tensor.squeeze(0))
            responses.append(response_only)
            rewards.append(reward)

            print(f"Step {step+1}: Action: {action['action_type']}, Reward: {reward.item():.2f}")

            if done:
                break

        # --- Perform PPO Step ---
        # This is a simplified view: `queries`, `responses`, and `rewards` must
        # be equal-length lists of per-sample tensors for the step to succeed.
        try:
            stats = gppo_trainer.step(queries, responses, rewards)
            gppo_trainer.log_stats(stats, {}, rewards)
            print(f"  PPO Step done. Mean reward: {stats['ppo/returns/mean']:.2f}")
        except Exception as e:
            print(f"ERROR during trainer.step: {e}")
            print("  Skipping PPO step for this episode. This might happen if all trajectories are truncated.")


if __name__ == "__main__":
    # Ensure the environment server is running before starting training; it is
    # launched separately (in the background) from the CLI.
    main()
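
# Example invocation (assuming the server code lives in 'opensleuth_env/', as
# the error message above suggests):
#   cd opensleuth_env && uvicorn server:app --host 0.0.0.0 --port 8000 &
#   python train.py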