add HF Jobs GRPO training script

training/train_grpo_hf_job.py  ADDED  (+464 -0)

@@ -0,0 +1,464 @@
"""GRPO fine-tune Qwen2.5-1.5B on the ClaimSense gym - designed for HF Jobs.

Drop-in replacement for the notebook's training loop, but configured to run
inside a `huggingface_hub.HfApi.run_uv_job` invocation on `a10g-largex4`
hardware. We:

1. ``git clone`` the ClaimSense Space so the gym runs in-process (deterministic
   per ``scenario_index``).
2. Load ``Qwen/Qwen2.5-1.5B-Instruct`` in bf16 on cuda:0 (no Unsloth -- the
   default ``uv`` image lacks the CUDA dev libs Unsloth's kernels need; on
   A10G we have enough memory to run without it).
3. Wrap with PEFT LoRA r=16, alpha=32, target_modules=q/k/v/o/gate/up/down.
4. Build the prompt dataset and reward functions (format + env-replay).
5. Run ``trl.GRPOTrainer.train()`` for ``NUM_GRPO_STEPS`` steps.
6. Plot reward / KL / completion-length curves to ``grpo_training.png``.
7. Upload artifacts to ``runs/grpo-<timestamp>/`` on the Space repo so they
   show up in the README plots.

Configuration (all env vars):
* ``HF_TOKEN``           - mandatory, used for hub access + artifact upload
* ``MODEL_ID``           - default ``Qwen/Qwen2.5-1.5B-Instruct``
* ``NUM_GRPO_STEPS``     - default ``80``
* ``NUM_GENERATIONS``    - default ``4``
* ``BATCH_SIZE``         - default ``2`` (per-device)
* ``GRAD_ACCUM``         - default ``2``
* ``LEARNING_RATE``      - default ``5e-6``
* ``CASE_REPEATS``       - default ``8`` (each of 8 cases x N -> dataset rows)
* ``MAX_PROMPT_LEN``     - default ``512``
* ``MAX_COMPLETION_LEN`` - default ``256``
* ``ARTIFACT_REPO``      - default ``akhiilll/claims-env``
* ``ARTIFACT_REPO_TYPE`` - default ``space``
* ``CLAIMS_ENV_REPO``    - default ``akhiilll/claims-env`` (gym source)
"""
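# Launch sketch (not executed by this file): the docstring above assumes an
# ``HfApi.run_uv_job`` submission on ``a10g-largex4``. Parameter names below
# follow recent ``huggingface_hub`` releases, and the dependency list is an
# assumption inferred from this script's imports -- verify both against your
# installed version:
#
#     from huggingface_hub import HfApi
#     HfApi().run_uv_job(
#         "training/train_grpo_hf_job.py",
#         dependencies=["torch", "trl", "peft", "datasets", "transformers", "matplotlib"],
#         flavor="a10g-largex4",
#         secrets={"HF_TOKEN": "..."},
#         env={"NUM_GRPO_STEPS": "80"},
#     )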
from __future__ import annotations

import datetime
import json
import os
import re
import statistics
import subprocess
import sys
import traceback
from pathlib import Path

# ---------------------------------------------------------------------------
# 1. Clone the gym repo so we can import AdjudicationGym in-process
# ---------------------------------------------------------------------------

CLAIMS_ENV_REPO = os.environ.get("CLAIMS_ENV_REPO", "akhiilll/claims-env")
CLONE_DIR = Path("/tmp/claims-env-repo")

if not CLONE_DIR.exists():
    print(f"[setup] cloning https://huggingface.co/spaces/{CLAIMS_ENV_REPO} -> {CLONE_DIR}")
    subprocess.check_call(
        [
            "git",
            "clone",
            "--depth",
            "1",
            f"https://huggingface.co/spaces/{CLAIMS_ENV_REPO}",
            str(CLONE_DIR),
        ]
    )

sys.path.insert(0, str(CLONE_DIR))
sys.path.insert(0, str(CLONE_DIR / "server"))

from server.claims_environment import ACTION_VOCABULARY, AdjudicationGym  # type: ignore  # noqa: E402
from server.mock_systems import CASE_LIBRARY  # type: ignore  # noqa: E402
from models import AdjudicatorAction  # type: ignore  # noqa: E402

print(f"[setup] gym imported. {len(ACTION_VOCABULARY)} verbs, {len(CASE_LIBRARY)} cases.")

# ---------------------------------------------------------------------------
# 2. Heavy ML imports (after gym imports so import errors above are visible)
# ---------------------------------------------------------------------------

import matplotlib  # noqa: E402

matplotlib.use("Agg")
import matplotlib.pyplot as plt  # noqa: E402
import torch  # noqa: E402
from datasets import Dataset  # noqa: E402
from peft import LoraConfig, get_peft_model  # noqa: E402
from transformers import AutoModelForCausalLM, AutoTokenizer  # noqa: E402
from trl import GRPOConfig, GRPOTrainer  # noqa: E402

# ---------------------------------------------------------------------------
# 3. Configuration
# ---------------------------------------------------------------------------

MODEL_ID = os.environ.get("MODEL_ID", "Qwen/Qwen2.5-1.5B-Instruct")
NUM_GRPO_STEPS = int(os.environ.get("NUM_GRPO_STEPS", "80"))
NUM_GENERATIONS = int(os.environ.get("NUM_GENERATIONS", "4"))
BATCH_SIZE = int(os.environ.get("BATCH_SIZE", "2"))
GRAD_ACCUM = int(os.environ.get("GRAD_ACCUM", "2"))
LEARNING_RATE = float(os.environ.get("LEARNING_RATE", "5e-6"))
CASE_REPEATS = int(os.environ.get("CASE_REPEATS", "8"))
MAX_PROMPT_LEN = int(os.environ.get("MAX_PROMPT_LEN", "512"))
MAX_COMPLETION_LEN = int(os.environ.get("MAX_COMPLETION_LEN", "256"))

ARTIFACT_REPO = os.environ.get("ARTIFACT_REPO", "akhiilll/claims-env")
ARTIFACT_REPO_TYPE = os.environ.get("ARTIFACT_REPO_TYPE", "space")
RUN_ID = datetime.datetime.utcnow().strftime("grpo-%Y%m%d-%H%M%S")

print(f"[config] model={MODEL_ID}")
print(f"[config] steps={NUM_GRPO_STEPS} gens={NUM_GENERATIONS} bsz={BATCH_SIZE} grad_accum={GRAD_ACCUM}")
print(f"[config] lr={LEARNING_RATE} run_id={RUN_ID}")
print(f"[config] cuda available: {torch.cuda.is_available()} | n_gpus: {torch.cuda.device_count()}")
if torch.cuda.is_available():
    for i in range(torch.cuda.device_count()):
        print(f"  gpu[{i}]: {torch.cuda.get_device_name(i)}")

OUT_DIR = Path("/tmp/grpo-claims")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# ---------------------------------------------------------------------------
# 4. Tokenizer + model + LoRA
# ---------------------------------------------------------------------------

token = os.environ.get("HF_TOKEN")
print(f"[setup] loading tokenizer {MODEL_ID}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=token)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print(f"[setup] loading model {MODEL_ID} in bfloat16")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    token=token,
    dtype=torch.bfloat16,
    device_map={"": 0},  # single device, GRPOTrainer handles rollouts
    attn_implementation="eager",  # safest across versions
)
model.config.pad_token_id = tokenizer.pad_token_id
model.gradient_checkpointing_enable()

print("[setup] applying LoRA r=16, alpha=32")
lora = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.0,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
)
model = get_peft_model(model, lora)
model.print_trainable_parameters()
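# Rough trainable-parameter arithmetic (an estimate assuming Qwen2.5-1.5B's
# published config: hidden 1536, 28 layers, GQA kv dim 256, MLP width 8960):
# LoRA adds r * (d_in + d_out) weights per targeted matrix, so per layer
# 16 * (3072 + 1792 + 1792 + 3072 + 10496 + 10496 + 10496) ~ 0.66M, and
# ~18M across 28 layers -- a bit over 1% of the base model, which is the
# ballpark print_trainable_parameters() above should report.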

# ---------------------------------------------------------------------------
# 5. Build prompt dataset
# ---------------------------------------------------------------------------

SYSTEM_PROMPT = (
    "You are an expert insurance claims adjuster.\n\n"
    "Available actions (one per line, lowercase, in this order of execution):\n"
    "  query_policy\n"
    "  query_claim_history\n"
    "  check_fraud\n"
    "  request_documents\n"
    "  verify_coverage\n"
    "  verify_purchase\n"
    "  calculate_payout\n"
    "  approve <amount>   (terminal)\n"
    "  deny <reason>      (terminal)\n"
    "  escalate <reason>  (terminal)\n\n"
    "Information actions cost a small fee; correct terminal verdicts pay big.\n"
    "Catching fraud via deny pays even more. Output up to 6 actions, one per\n"
    "line, ending with a terminal action. Do not write anything else."
)


def claim_to_user_msg(scenario_index: int) -> str:
    env = AdjudicationGym(scenario_index=scenario_index)
    obs = env.reset()
    return (
        f"New claim arrived:\n"
        f"  claim_id     : {obs.claim_id}\n"
        f"  type         : {obs.claim_type}\n"
        f"  amount       : ${obs.claim_amount_requested:,.2f}\n"
        f"  claimant     : {obs.claimant_name}\n"
        f"  incident_date: {obs.incident_date}\n"
        f"  description  : {obs.description}\n\n"
        f"What is your action plan?"
    )


def make_prompt(scenario_index: int) -> str:
    msgs = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": claim_to_user_msg(scenario_index)},
    ]
    return tokenizer.apply_chat_template(
        msgs, tokenize=False, add_generation_prompt=True
    )


print(f"[setup] building dataset (case_repeats={CASE_REPEATS})")
rows = []
for _ in range(CASE_REPEATS):
    for sidx in range(len(CASE_LIBRARY)):
        rows.append({"prompt": make_prompt(sidx), "scenario_index": sidx})
train_ds = Dataset.from_list(rows).shuffle(seed=42)
print(f"[setup] dataset rows: {len(train_ds)}")
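# Note on the extra column: GRPOTrainer forwards any non-"prompt" dataset
# columns to each reward function as keyword arguments, which is how
# env_reward_fn below receives ``scenario_index`` as a per-sample list.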

# ---------------------------------------------------------------------------
# 6. Reward functions
# ---------------------------------------------------------------------------

ACTIONS_SET = set(ACTION_VOCABULARY)
TERMINALS = {"approve", "deny", "escalate"}


def _coerce(c) -> str:
    # Completions may arrive as plain strings or as chat-style lists of
    # {"role", "content"} dicts depending on the trl version; normalize to str.
    if isinstance(c, list):
        if not c:
            return ""
        return c[0].get("content", "") if isinstance(c[0], dict) else str(c[0])
    return str(c)


def parse_actions(completion: str) -> list[AdjudicatorAction]:
    actions: list[AdjudicatorAction] = []
    for raw in completion.strip().splitlines():
        line = raw.strip().lstrip("-*0123456789. ").lower().strip()
        if not line:
            continue
        parts = line.split(maxsplit=1)
        verb = parts[0]
        if verb not in ACTIONS_SET:
            continue
        params: dict = {}
        rest = parts[1] if len(parts) > 1 else ""
        if verb == "approve":
            m = re.search(r"\d[\d,\.]*", rest)
            if m:
                try:
                    params["amount"] = float(m.group().replace(",", ""))
                except ValueError:
                    pass
        elif verb == "deny":
            params["reason"] = (rest or "policy_violation")[:80]
        elif verb == "escalate":
            params["reason"] = (rest or "manager_review")[:80]
        actions.append(AdjudicatorAction(action_type=verb, parameters=params))
        if verb in TERMINALS:
            break
    return actions
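# Worked example of the parser above: "1. query_policy\n- approve $3,500\nok"
# yields [query_policy, approve(amount=3500.0)] -- list markers are stripped,
# the amount regex pulls "3,500" -> 3500.0, and parsing stops at the first
# terminal verb, so trailing chatter ("ok") is dropped.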


def replay(actions, sidx, max_steps=8):
    env = AdjudicationGym(scenario_index=int(sidx))
    env.reset()
    total = 0.0
    for act in actions[:max_steps]:
        obs = env.step(act)
        total += float(obs.reward)
        if obs.done:
            break
    return total


def format_reward_fn(prompts, completions, **_):
    rewards = []
    for c in completions:
        actions = parse_actions(_coerce(c))
        if not actions:
            rewards.append(-1.0)
            continue
        rewards.append(0.5 if actions[-1].action_type in TERMINALS else -0.25)
    return rewards


def env_reward_fn(prompts, completions, scenario_index, **_):
    return [
        replay(parse_actions(_coerce(c)), s)
        for c, s in zip(completions, scenario_index)
    ]


# Sanity check (so a broken reward fn fails fast, before the trainer starts)
sane_text = "query_policy\nverify_coverage\napprove 3500"
sane_r = replay(parse_actions(sane_text), 0)
print(f"[sanity] optimal trace on case 0 -> env reward {sane_r:+.2f}")
assert sane_r > 0, f"reward fn broken (expected >0 on case 0, got {sane_r})"

# ---------------------------------------------------------------------------
# 7. GRPO training
# ---------------------------------------------------------------------------

training_args = GRPOConfig(
    output_dir=str(OUT_DIR),
    learning_rate=LEARNING_RATE,
    adam_beta1=0.9,
    adam_beta2=0.99,
    weight_decay=0.1,
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    optim="adamw_torch",
    logging_steps=1,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACCUM,
    num_generations=NUM_GENERATIONS,
    max_prompt_length=MAX_PROMPT_LEN,
    max_completion_length=MAX_COMPLETION_LEN,
    max_steps=NUM_GRPO_STEPS,
    save_steps=999_999,  # effectively: no intermediate checkpoints
    report_to="none",
    bf16=True,
    temperature=0.9,
    top_p=0.95,
    epsilon=0.2,  # PPO-style clipping range
    beta=0.04,  # KL penalty toward the frozen reference model
)
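# Group math behind these numbers (a sketch; exact batching rules differ
# across trl versions): GRPO draws NUM_GENERATIONS completions per prompt and
# normalizes advantages within each group, and trl requires the effective
# generation batch to be divisible by num_generations -- here
# per_device_train_batch_size * gradient_accumulation_steps = 2 * 2 = 4
# completions per optimizer step, i.e. exactly one group of 4.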

trainer = GRPOTrainer(
    model=model,
    processing_class=tokenizer,
    reward_funcs=[format_reward_fn, env_reward_fn],
    args=training_args,
    train_dataset=train_ds,
)

print("[train] launching GRPOTrainer.train()")
try:
    trainer.train()
    print("[train] done")
except Exception:
    traceback.print_exc()
    raise

# ---------------------------------------------------------------------------
# 8. Plot training curves
# ---------------------------------------------------------------------------

log = trainer.state.log_history


def series(key: str):
    xs, ys = [], []
    for entry in log:
        if key in entry and "step" in entry:
            xs.append(entry["step"])
            ys.append(entry[key])
    return xs, ys


fig, axes = plt.subplots(2, 2, figsize=(13, 8))

xs, ys = series("reward")
axes[0, 0].plot(xs, ys, color="#1f77b4")
axes[0, 0].set_title("mean group reward")
axes[0, 0].set_xlabel("training step")
axes[0, 0].set_ylabel("reward")
axes[0, 0].grid(alpha=0.3)

fmt_xs, fmt_ys = series("rewards/format_reward_fn")
env_xs, env_ys = series("rewards/env_reward_fn")
if not fmt_ys:
    # fall back to the ".../mean" keys newer trl versions log under
    fmt_xs, fmt_ys = series("rewards/format_reward_fn/mean")
    env_xs, env_ys = series("rewards/env_reward_fn/mean")
axes[0, 1].plot(fmt_xs, fmt_ys, label="format reward", color="#2ca02c")
axes[0, 1].plot(env_xs, env_ys, label="env reward", color="#d62728")
axes[0, 1].set_title("per-reward-function score")
axes[0, 1].set_xlabel("training step")
axes[0, 1].set_ylabel("reward")
axes[0, 1].legend()
axes[0, 1].grid(alpha=0.3)

xs, ys = series("kl")
axes[1, 0].plot(xs, ys, color="#9467bd")
axes[1, 0].set_title("KL(model || reference)")
axes[1, 0].set_xlabel("training step")
axes[1, 0].set_ylabel("kl")
axes[1, 0].grid(alpha=0.3)

xs, ys = series("completion_length")
if not ys:
    xs, ys = series("completions/mean_length")
axes[1, 1].plot(xs, ys, color="#ff7f0e")
axes[1, 1].set_title("mean completion length (tokens)")
axes[1, 1].set_xlabel("training step")
axes[1, 1].set_ylabel("tokens")
axes[1, 1].grid(alpha=0.3)

fig.tight_layout()
png_path = OUT_DIR / "grpo_training.png"
fig.savefig(png_path, dpi=120)
print(f"[plot] saved {png_path}")

log_path = OUT_DIR / "training_log.json"
with log_path.open("w") as fh:
    json.dump(log, fh, indent=2, default=str)
print(f"[plot] saved {log_path}")

summary = {
    "run_id": RUN_ID,
    "base_model": MODEL_ID,
    "trainer": "trl.GRPOTrainer",
    "num_steps": NUM_GRPO_STEPS,
    "num_generations": NUM_GENERATIONS,
    "batch_size": BATCH_SIZE,
    "grad_accum": GRAD_ACCUM,
    "learning_rate": LEARNING_RATE,
    "case_repeats": CASE_REPEATS,
    "dataset_rows": len(train_ds),
    "reward_functions": ["format_reward_fn", "env_reward_fn"],
    "env": "ClaimSense (https://huggingface.co/spaces/akhiilll/claims-env)",
}
xs2, ys2 = series("reward")
if ys2:
    summary["first_reward"] = ys2[0]
    summary["last_reward"] = ys2[-1]
    summary["best_reward"] = max(ys2)
    summary["worst_reward"] = min(ys2)
    summary["mean_reward"] = statistics.mean(ys2)

summary_path = OUT_DIR / "run_summary.json"
with summary_path.open("w") as fh:
    json.dump(summary, fh, indent=2)
print(json.dumps(summary, indent=2))

# ---------------------------------------------------------------------------
# 9. Save adapter + upload artifacts back to the Space
# ---------------------------------------------------------------------------

adapter_dir = OUT_DIR / "lora-adapter"
trainer.model.save_pretrained(str(adapter_dir))
tokenizer.save_pretrained(str(adapter_dir))
print(f"[save] LoRA adapter -> {adapter_dir}")

try:
    from huggingface_hub import HfApi

    api = HfApi(token=token)
    target_dir = f"runs/{RUN_ID}"
    uploads = [
        (png_path, f"{target_dir}/grpo_training.png"),
        (log_path, f"{target_dir}/training_log.json"),
        (summary_path, f"{target_dir}/run_summary.json"),
    ]
    for src, dst in uploads:
        api.upload_file(
            path_or_fileobj=str(src),
            path_in_repo=dst,
            repo_id=ARTIFACT_REPO,
            repo_type=ARTIFACT_REPO_TYPE,
            commit_message=f"GRPO run: {RUN_ID}",
        )
        print(f"[upload] {dst}")

    api.upload_folder(
        folder_path=str(adapter_dir),
        path_in_repo=f"{target_dir}/lora-adapter",
        repo_id=ARTIFACT_REPO,
        repo_type=ARTIFACT_REPO_TYPE,
        commit_message=f"GRPO LoRA adapter: {RUN_ID}",
    )
    print(f"[upload] {target_dir}/lora-adapter (folder)")
    print(f"[done] artifacts at https://huggingface.co/spaces/{ARTIFACT_REPO}/tree/main/{target_dir}")
except Exception as exc:
    print(f"[upload] skipped: {type(exc).__name__}: {exc}")
    traceback.print_exc()
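# Downstream loading sketch (illustrative, not run by this job): pull the
# adapter back from the Space and attach it to the base model. The path layout
# mirrors the upload above; check the exact run directory on the repo.
#
#     from huggingface_hub import snapshot_download
#     from peft import PeftModel
#
#     local = snapshot_download(ARTIFACT_REPO, repo_type="space",
#                               allow_patterns=[f"runs/{RUN_ID}/lora-adapter/*"])
#     base = AutoModelForCausalLM.from_pretrained(MODEL_ID, dtype=torch.bfloat16)
#     tuned = PeftModel.from_pretrained(base, f"{local}/runs/{RUN_ID}/lora-adapter")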