muskan singh and Claude Opus 4.7 committed
Commit 2ab0fe0 · 1 Parent(s): 7a0b2ce

fix: pin trl<=0.24, multi-step reward, lower LR, reduce NUM_GEN

- requirements.txt: pin trl>=0.18.2,<=0.24.0 (trl 1.x breaks Unsloth patches → silent crash at step 21)
- train.py: multi-step reward fn (REWARD_STEPS=2): return cumulative episode score, not single-step reward
- train.py: NUM_GEN 4→2 to halve VRAM pressure from G×reward_steps inference calls (see the back-of-envelope sketch below)
- train.py: LR 5e-5→8e-6 (5e-5 was unstable, caused reward oscillation)
- train.py: switch to max_steps=150 training (more reliable than epoch-based)
- train.py: model.config.max_length=None to silence max_new_tokens warning
- train.py: reward_funcs=[orgos_reward_fn] as list (required by TRL)
- train.py: BATCH_SIZE 4→1 with GRAD_ACCUM=2 (matches memory budget with multi-step reward)
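
Back-of-envelope for the NUM_GEN cut (a minimal sketch; constant names mirror train.py, and the count assumes the worst case where every candidate parses and no episode ends early):

    # Each GRPO step generates NUM_GEN candidates per prompt; the reward fn then
    # runs up to REWARD_STEPS - 1 extra greedy generations per candidate, so
    # generate() calls per prompt scale as NUM_GEN * REWARD_STEPS.
    REWARD_STEPS = 2
    for num_gen in (4, 2):  # before -> after this commit
        print(f"NUM_GEN={num_gen}: {num_gen * REWARD_STEPS} generate() calls per prompt")
    # prints 8 then 4: inference pressure per prompt is halved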

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>

Files changed (2)
  1. requirements.txt +1 -1
  2. train.py +101 -51
requirements.txt CHANGED
@@ -11,7 +11,7 @@ aiofiles>=23.0.0
 torch
 transformers
 datasets
-trl
+trl>=0.18.2,<=0.24.0
 unsloth
 matplotlib
 numpy
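
A quick post-install check one can run (a sketch; the packaging module is an assumption here, it is not part of requirements.txt):

    import trl
    from packaging.version import Version
    # Fail fast if the resolver ignored the pin, before a long training run starts
    assert Version("0.18.2") <= Version(trl.__version__) <= Version("0.24.0"), trl.__version__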
train.py CHANGED
@@ -1,6 +1,6 @@
 """
 OrgOS GRPO Training Script
-Equivalent to training/grpo_orgos.ipynb but runs headlessly.
+Runs headlessly on HuggingFace Spaces (A100/T4 GPU).
 
 Outputs:
     training_log.txt — structured training log for submission
@@ -35,20 +35,22 @@ from unsloth import FastLanguageModel
 # Config
 # ------------------------------------------------------------------
 
-MODEL_NAME = os.environ.get("MODEL_NAME", "Qwen/Qwen2.5-3B-Instruct")
+MODEL_NAME = os.environ.get("MODEL_NAME", "unsloth/Qwen2.5-3B-Instruct-bnb-4bit")
 ENV_URL = "http://localhost:8000"
 LOG_FILE = "training_log.txt"
 N_PROMPTS_PER_WORKFLOW = 20
 N_EVAL = 10
-NUM_EPOCHS = 3
-BATCH_SIZE = 4
+MAX_TRAIN_STEPS = 150  # step-based training (more reliable than epoch-based on Spaces)
+BATCH_SIZE = 1
 GRAD_ACCUM = 2
-LR = 5e-5
-NUM_GEN = 4
-TEMPERATURE = 0.8
+LR = 8e-6  # stable LR — 5e-5 was too high
+NUM_GEN = 2  # candidates per prompt — keep low to save VRAM
+TEMPERATURE = 0.9
 BETA = 0.04
 LORA_R = 16
 MAX_SEQ_LEN = 4096
+MAX_COMPLETION_LENGTH = 256
+REWARD_STEPS = 2  # multi-step rollout depth in reward fn
 
 # ------------------------------------------------------------------
 # Logger
@@ -77,7 +79,6 @@ def start_env_server():
         stdout=None,
         stderr=None,
     )
-    # Wait until healthy
    for _ in range(20):
         time.sleep(2)
         try:
@@ -112,6 +113,9 @@ def load_model():
         use_gradient_checkpointing = "unsloth",
         random_state = 42,
     )
+    # Clear max_length to avoid max_new_tokens vs max_length warnings during generate()
+    model.config.max_length = None
+
     trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
     tlog(f"[TRAIN_CONFIG] model={MODEL_NAME} lora_r={LORA_R} "
          f"max_seq_len={MAX_SEQ_LEN} trainable_params={trainable:,} quantization=4bit")
@@ -160,6 +164,12 @@ CRITICAL RULES:
 6. Stop when pending_steps is empty or done=true.
 """
 
+WORKFLOW_APPS = {
+    "A": {"jira", "zendesk", "salesforce", "workday"},
+    "B": {"zendesk", "salesforce", "workday"},
+    "C": {"jira", "zendesk", "salesforce"},
+}
+
 
 def obs_to_text(obs: dict) -> str:
     hints = obs.get("schema_hints", {})
@@ -186,25 +196,15 @@ def obs_to_text(obs: dict) -> str:
         "",
         "=== APP STATES ===",
     ]
-    # workflow-relevant apps only — skip apps the workflow doesn't touch
-    WORKFLOW_APPS = {
-        "A": {"jira", "zendesk", "salesforce", "workday"},
-        "B": {"zendesk", "salesforce", "workday"},
-        "C": {"jira", "zendesk", "salesforce"},
-    }
-    relevant = WORKFLOW_APPS.get(
-        obs.get("workflow_id", "A"),
-        {"jira", "zendesk", "salesforce", "workday"},
-    )
+    relevant = WORKFLOW_APPS.get(obs.get("workflow_id", "A"),
+                                 {"jira", "zendesk", "salesforce", "workday"})
     for app_name, view in obs.get("app_states", {}).items():
         if app_name not in relevant:
             continue
-        lines.append(f"  [{app_name.upper()}]")
         view_str = str(view)
         if len(view_str) > 600:
             view_str = view_str[:600] + "...[truncated]"
-        lines.append(f"  {view_str}")
-        lines.append("")
+        lines += [f"  [{app_name.upper()}]", f"  {view_str}", ""]
     return "\n".join(lines)
 
 
@@ -244,36 +244,83 @@ def build_prompt_dataset(tokenizer) -> Dataset:
         rows.append({
             "prompt": build_prompt(obs_text, tokenizer),
             "workflow_id": wf,
-            "obs_text": obs_text,
         })
     tlog(f"[TRAIN_CONFIG] algorithm=GRPO prompts={len(rows)} "
          f"workflows=A,B,C prompts_per_workflow={N_PROMPTS_PER_WORKFLOW}")
+    sample_tokens = None  # set below after tokenizer is available
     return Dataset.from_list(rows)
 
 
 # ------------------------------------------------------------------
-# Reward function
+# Reward function — multi-step live environment rollout
 # ------------------------------------------------------------------
-
-def orgos_reward_fn(completions: List[str], prompts: List[str], **kwargs) -> List[float]:
+# The model reference is set in main() before training starts.
+_reward_model = None
+_reward_tokenizer = None
+
+
+def orgos_reward_fn(completions: List[str], prompts: List[str] = None, **kwargs) -> List[float]:
+    """
+    For each GRPO candidate:
+      1. Parse as JSON action.
+      2. Reset env and apply the action (step 1).
+      3. Continue REWARD_STEPS-1 more greedy steps with the current model.
+      4. Return cumulative episode score — not just single-step reward.
+
+    Multi-step signal prevents the model from collapsing to always outputting
+    list_tickets (which gives a small single-step reward but never advances the workflow).
+    """
     workflow_ids = kwargs.get("workflow_id", ["A"] * len(completions))
     rewards = []
+
     for completion, wf_id in zip(completions, workflow_ids):
         action = parse_action(completion)
         if action is None:
             rewards.append(-0.1)
             continue
         try:
-            httpx.post(f"{ENV_URL}/reset", json={"workflow_id": wf_id}, timeout=10)
+            # Reset env and apply the GRPO-generated action (step 1)
+            obs = httpx.post(f"{ENV_URL}/reset",
+                             json={"workflow_id": wf_id}, timeout=10).json()["observation"]
             result = httpx.post(f"{ENV_URL}/step", json=action, timeout=10).json()
-            rewards.append(float(result["reward"]))
+            obs = result["observation"]
+
+            # Continue REWARD_STEPS-1 more steps with current model (greedy)
+            if _reward_model is not None:
+                for _ in range(REWARD_STEPS - 1):
+                    if obs.get("done"):
+                        break
+                    prompt_text = build_prompt(obs_to_text(obs), _reward_tokenizer)
+                    inputs = _reward_tokenizer(
+                        prompt_text, return_tensors="pt"
+                    ).to(_reward_model.device)
+                    with torch.no_grad():
+                        out = _reward_model.generate(
+                            **inputs,
+                            max_new_tokens = 128,
+                            do_sample = False,
+                            pad_token_id = _reward_tokenizer.eos_token_id,
+                        )
+                    cont_str = _reward_tokenizer.decode(
+                        out[0][inputs["input_ids"].shape[1]:],
+                        skip_special_tokens=True,
+                    ).strip()
+                    cont_action = parse_action(cont_str)
+                    if cont_action is None:
+                        break
+                    result = httpx.post(f"{ENV_URL}/step",
+                                        json=cont_action, timeout=10).json()
+                    obs = result["observation"]
+
+            rewards.append(float(obs.get("current_score", 0.001)))
         except Exception:
             rewards.append(-0.1)
+
     return rewards
 
 
 # ------------------------------------------------------------------
-# Episode evaluation
+# Episode evaluation (stateless — each step is a fresh single-turn prompt)
 # ------------------------------------------------------------------
 
 def run_episode_with_model(model, tokenizer, workflow_id: str, max_steps: int = 15) -> float:
@@ -284,20 +331,14 @@ def run_episode_with_model(model, tokenizer, workflow_id: str, max_steps: int =
         if obs["done"]:
             break
 
-        # Stateless single-turn prompt — matches the GRPO training format.
-        # obs["message"] already carries last-action feedback, so no history needed.
         obs_text = obs_to_text(obs)
-        messages = [{"role": "user",
-                     "content": SYSTEM_PROMPT + "\n\n---\n\n" + obs_text}]
-
-        text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-        inputs = tokenizer(text, return_tensors="pt").to(model.device)
+        text = build_prompt(obs_text, tokenizer)
+        inputs = tokenizer(text, return_tensors="pt").to(model.device)
 
         with torch.no_grad():
             out = model.generate(
                 **inputs,
                 max_new_tokens = 256,
-                temperature = 0.0,
                 do_sample = False,
                 pad_token_id = tokenizer.eos_token_id,
             )
@@ -314,7 +355,7 @@ def run_episode_with_model(model, tokenizer, workflow_id: str, max_steps: int =
         if obs["done"]:
             break
 
-    return obs.get("current_score", 0.001)
+    return float(obs.get("current_score", 0.001))
 
 
 def evaluate(model, tokenizer, phase: str) -> dict:
@@ -325,10 +366,10 @@ def evaluate(model, tokenizer, phase: str) -> dict:
             score = run_episode_with_model(model, tokenizer, wf)
             scores[wf].append(score)
             tlog(f"[EVAL] phase={phase} workflow={wf} episode={ep+1} score={score:.4f}")
-        wf_mean = np.mean(scores[wf])
+        wf_mean = float(np.mean(scores[wf]))
         tlog(f"[EVAL_WORKFLOW] phase={phase} workflow={wf} "
             f"mean={wf_mean:.4f} min={min(scores[wf]):.4f} max={max(scores[wf]):.4f}")
-    overall = np.mean([s for v in scores.values() for s in v])
+    overall = float(np.mean([s for v in scores.values() for s in v]))
     tlog(f"[EVAL_END] phase={phase} overall_mean={overall:.4f}")
     return scores
 
@@ -426,46 +467,55 @@ class OrgOSLogCallback(TrainerCallback):
 # ------------------------------------------------------------------
 
 def main():
+    global _reward_model, _reward_tokenizer
+
     server_proc = start_env_server()
 
     try:
         model, tokenizer = load_model()
 
+        # Wire up the reward function's model reference (used for multi-step rollouts)
+        _reward_model = model
+        _reward_tokenizer = tokenizer
+
         prompt_dataset = build_prompt_dataset(tokenizer)
+        tok_len = len(tokenizer(prompt_dataset[0]["prompt"]).input_ids)
+        tlog(f"[PROMPT_DEBUG] first_prompt_tokens={tok_len}")
 
         # Sanity-check reward function
         test_r = orgos_reward_fn(
-            completions = ['{"app": "zendesk", "operation": "list_tickets", "args": {"state": "new"}}',
-                           "not json"],
-            prompts = ["", ""],
-            workflow_id = ["A", "A"],
+            completions = ['{"app": "zendesk", "operation": "list_tickets", "args": {}}',
+                           "not json"],
+            prompts = ["", ""],
+            workflow_id = ["A", "A"],
         )
         tlog(f"[REWARD_FN_CHECK] valid_action={test_r[0]:.4f} invalid_action={test_r[1]:.4f}")
 
         # Baseline evaluation
         FastLanguageModel.for_inference(model)
         baseline_scores = evaluate(model, tokenizer, phase="baseline")
-        baseline_mean = np.mean([s for v in baseline_scores.values() for s in v])
+        baseline_mean = float(np.mean([s for v in baseline_scores.values() for s in v]))
 
         # GRPO training
-        model.train()
-        tlog(f"[TRAIN_CONFIG] epochs={NUM_EPOCHS} batch_size={BATCH_SIZE} "
+        FastLanguageModel.for_training(model)
+        tlog(f"[TRAIN_CONFIG] max_steps={MAX_TRAIN_STEPS} batch_size={BATCH_SIZE} "
             f"grad_accum={GRAD_ACCUM} lr={LR} num_generations={NUM_GEN} "
-            f"temperature={TEMPERATURE} beta_kl={BETA}")
+            f"temperature={TEMPERATURE} beta_kl={BETA} reward_steps={REWARD_STEPS}")
 
         grpo_config = GRPOConfig(
             output_dir = "./orgos_grpo_ckpt",
-            num_train_epochs = NUM_EPOCHS,
+            num_train_epochs = 1,
+            max_steps = MAX_TRAIN_STEPS,
             per_device_train_batch_size = BATCH_SIZE,
             gradient_accumulation_steps = GRAD_ACCUM,
             learning_rate = LR,
             warmup_steps = 10,
             logging_steps = 5,
-            save_steps = 100,
             bf16 = torch.cuda.is_bf16_supported(),
             fp16 = not torch.cuda.is_bf16_supported(),
             max_grad_norm = 1.0,
             num_generations = NUM_GEN,
+            max_completion_length = MAX_COMPLETION_LENGTH,
             temperature = TEMPERATURE,
            beta = BETA,
             report_to = "none",
@@ -475,7 +525,7 @@ def main():
         trainer = GRPOTrainer(
             model = model,
             args = grpo_config,
-            reward_funcs = orgos_reward_fn,
+            reward_funcs = [orgos_reward_fn],
             train_dataset = prompt_dataset,
             processing_class = tokenizer,
             callbacks = [OrgOSLogCallback()],
@@ -490,7 +540,7 @@ def main():
         # Post-training evaluation
         FastLanguageModel.for_inference(model)
         post_scores = evaluate(model, tokenizer, phase="post_training")
-        post_mean = np.mean([s for v in post_scores.values() for s in v])
+        post_mean = float(np.mean([s for v in post_scores.values() for s in v]))
         improvement = post_mean - baseline_mean
 
         tlog(