Spaces:

aamrinder
/

subtext-arena

Sleeping

App Files Files Community

aamrinder committed 13 days ago

Commit

7f60dea

verified ·

1 Parent(s): 225e725

Upload folder using huggingface_hub

Browse files

Files changed (8) hide show

openenv_subtext_arena.egg-info/PKG-INFO +1 -1
openenv_subtext_arena.egg-info/SOURCES.txt +7 -1
openenv_subtext_arena.egg-info/requires.txt +1 -1
pyproject.toml +2 -2
train/__init__.py +1 -0
train/eval_pivot_set.py +19 -11
train/hour1_smoke.py +167 -0
train/train_grpo.py +19 -13

openenv_subtext_arena.egg-info/PKG-INFO CHANGED Viewed

@@ -3,7 +3,7 @@ Name: openenv-subtext_arena
 Version: 0.1.0
 Summary: Subtext Arena environment for OpenEnv
 Requires-Python: >=3.10
-Requires-Dist: openenv-core[core]>=0.2.2
 Provides-Extra: dev
 Requires-Dist: pytest>=8.0.0; extra == "dev"
 Requires-Dist: pytest-cov>=4.0.0; extra == "dev"

 Version: 0.1.0
 Summary: Subtext Arena environment for OpenEnv
 Requires-Python: >=3.10
+Requires-Dist: openenv-core[core]>=0.2.3
 Provides-Extra: dev
 Requires-Dist: pytest>=8.0.0; extra == "dev"
 Requires-Dist: pytest-cov>=4.0.0; extra == "dev"

openenv_subtext_arena.egg-info/SOURCES.txt CHANGED Viewed

@@ -17,4 +17,10 @@ server/app.py
 server/audio_tools.py
 server/grader.py
 server/scenarios.py
-server/subtext_arena_environment.py

 server/audio_tools.py
 server/grader.py
 server/scenarios.py
+server/subtext_arena_environment.py
+train/__init__.py
+train/curate_pivot_set.py
+train/eval_pivot_set.py
+train/hour1_smoke.py
+train/plot_reward_decomp.py
+train/train_grpo.py

openenv_subtext_arena.egg-info/requires.txt CHANGED Viewed

@@ -1,4 +1,4 @@
-openenv-core[core]>=0.2.2
 [dev]
 pytest>=8.0.0

+openenv-core[core]>=0.2.3
 [dev]
 pytest>=8.0.0

pyproject.toml CHANGED Viewed

@@ -35,5 +35,5 @@ server = "subtext_arena.server.app:main"
 [tool.setuptools]
 include-package-data = true
-packages = ["subtext_arena", "subtext_arena.server"]
-package-dir = { "subtext_arena" = ".", "subtext_arena.server" = "server" }

 [tool.setuptools]
 include-package-data = true
+packages = ["subtext_arena", "subtext_arena.server", "subtext_arena.train"]
+package-dir = { "subtext_arena" = ".", "subtext_arena.server" = "server", "subtext_arena.train" = "train" }

train/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ """Subtext Arena training scripts (GRPO + eval + plotting + Pivot Set curation)."""

train/eval_pivot_set.py CHANGED Viewed

@@ -28,17 +28,25 @@ import sys
 from pathlib import Path
 from typing import Any, Dict, List
-ROOT = Path(__file__).resolve().parent.parent
-if str(ROOT) not in sys.path:
-    sys.path.insert(0, str(ROOT))
-from server.scenarios import load_scenarios
-from train.train_grpo import (
-    SYSTEM_PROMPT,
-    build_full_observation,
-    parse_final,
-    reward_decomposition,
-)
 def main():

 from pathlib import Path
 from typing import Any, Dict, List
+try:
+    from subtext_arena.server.scenarios import load_scenarios
+    from subtext_arena.train.train_grpo import (
+        SYSTEM_PROMPT,
+        build_full_observation,
+        parse_final,
+        reward_decomposition,
+    )
+except ImportError:
+    ROOT = Path(__file__).resolve().parent.parent
+    if str(ROOT) not in sys.path:
+        sys.path.insert(0, str(ROOT))
+    from server.scenarios import load_scenarios  # type: ignore[no-redef]
+    from train.train_grpo import (  # type: ignore[no-redef]
+        SYSTEM_PROMPT,
+        build_full_observation,
+        parse_final,
+        reward_decomposition,
+    )
 def main():

train/hour1_smoke.py ADDED Viewed

	@@ -0,0 +1,167 @@

+"""Hour-1 smoke test for Path A.
+Validates the entire training stack on a T4 in ~10-15 minutes:
+  1. Unsloth + Qwen2.5-3B loads with 4-bit + LoRA
+  2. Our env package installs from the HF Space and prompts build correctly
+  3. TRL GRPOTrainer runs 2 steps end-to-end
+  4. Reward function fires; rewards are non-zero
+  5. LoRA weights actually update
+If this passes -> commit to the full 200-step run.
+If this fails -> the error tells us exactly what to fix before spending more.
+Run on HF Jobs T4-medium ($0.60/hr, ~$0.15 for this script):
+    hf jobs uv run --flavor t4-medium -s HF_TOKEN \\
+        --with unsloth --with "trl>=0.14" --with datasets --with accelerate \\
+        --with "git+https://huggingface.co/spaces/aamrinder/subtext-arena" \\
+        -- python -m subtext_arena.train.hour1_smoke
+"""
+from __future__ import annotations
+import sys
+import time
+import traceback
+def main():
+    t_start = time.time()
+    print("=" * 60)
+    print("Subtext Arena hour-1 smoke test (Path A)")
+    print("=" * 60)
+    # 1. PyTorch + GPU
+    print("\n[1/6] checking PyTorch + GPU")
+    try:
+        import torch
+        assert torch.cuda.is_available(), "CUDA not available"
+        gpu_name = torch.cuda.get_device_name(0)
+        gpu_mem = torch.cuda.get_device_properties(0).total_memory / 1e9
+        print(f"   ✓ {gpu_name} ({gpu_mem:.1f} GB)")
+    except Exception as e:
+        print(f"   ✗ {e}")
+        traceback.print_exc()
+        sys.exit(1)
+    # 2. Unsloth + TRL imports
+    print("\n[2/6] importing Unsloth + TRL")
+    try:
+        from unsloth import FastLanguageModel
+        from trl import GRPOTrainer, GRPOConfig
+        from datasets import Dataset
+        print("   ✓ Unsloth + TRL + datasets imported")
+    except Exception as e:
+        print(f"   ✗ {e}")
+        traceback.print_exc()
+        sys.exit(1)
+    # 3. Subtext Arena env package
+    print("\n[3/6] importing subtext_arena package")
+    try:
+        from subtext_arena import SubtextArenaEnv, SubtextArenaAction
+        from subtext_arena.server.scenarios import load_scenarios
+        from subtext_arena.train.train_grpo import (
+            SYSTEM_PROMPT, build_dataset, make_reward_fn, reward_decomposition,
+        )
+        scenarios = load_scenarios()
+        print(f"   ✓ {len(scenarios)} MUStARD scenarios loaded")
+    except Exception as e:
+        print(f"   ✗ {e}")
+        traceback.print_exc()
+        sys.exit(1)
+    # 4. Build a TINY dataset (8 rows is enough for smoke)
+    print("\n[4/6] building tiny dataset (8 rows)")
+    try:
+        ds = build_dataset(scenarios, n_rows=8, seed=0)
+        print(f"   ✓ dataset cols={ds.column_names}, len={len(ds)}")
+        print(f"   first prompt user-msg first 200 chars: {ds[0]['prompt'][1]['content'][:200]!r}")
+    except Exception as e:
+        print(f"   ✗ {e}")
+        traceback.print_exc()
+        sys.exit(1)
+    # 5. Load Qwen2.5-3B-Instruct + LoRA
+    print("\n[5/6] loading Qwen2.5-3B-Instruct (4-bit + LoRA)")
+    try:
+        model, tokenizer = FastLanguageModel.from_pretrained(
+            model_name="unsloth/Qwen2.5-3B-Instruct",
+            max_seq_length=2048,  # smaller than full 4096 for speed
+            load_in_4bit=True,
+        )
+        model = FastLanguageModel.get_peft_model(
+            model,
+            r=8,                   # smaller r for the smoke test
+            lora_alpha=16,
+            target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
+            use_gradient_checkpointing="unsloth",
+        )
+        n_trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
+        print(f"   ✓ model loaded; {n_trainable / 1e6:.1f}M LoRA params trainable")
+    except Exception as e:
+        print(f"   ✗ {e}")
+        traceback.print_exc()
+        sys.exit(1)
+    # 6. Run 2 GRPO steps
+    print("\n[6/6] running 2 GRPO steps")
+    try:
+        reward_fn = make_reward_fn()
+        # Wrap reward_fn to print per-rollout decomposition for the smoke test
+        last_rewards = []
+        def smoke_reward_fn(prompts, completions, **kwargs):
+            rewards = reward_fn(prompts, completions, **kwargs)
+            last_rewards.append(list(rewards))
+            for i, (c, r, gold) in enumerate(zip(completions, rewards, kwargs.get("gold", []))):
+                text = c[0]["content"] if isinstance(c, list) else str(c)
+                d = reward_decomposition(text, gold)
+                print(f"      rollout {i}: reward={r:.3f}  pred={d['_predicted']!s:>10} gold={gold:>10}  "
+                      f"correct={d['_correct']}  well_formed={d['_well_formed']}")
+            return rewards
+        config = GRPOConfig(
+            output_dir="/tmp/smoke_out",
+            num_generations=2,                 # keep small for speed
+            max_completion_length=384,
+            per_device_train_batch_size=1,
+            learning_rate=5e-6,
+            max_steps=2,
+            logging_steps=1,
+            save_steps=10,                     # never saves in 2 steps
+            bf16=True,
+            report_to="none",
+            gradient_checkpointing=True,
+        )
+        trainer = GRPOTrainer(
+            model=model,
+            reward_funcs=smoke_reward_fn,
+            args=config,
+            train_dataset=ds,
+            processing_class=tokenizer,
+        )
+        trainer.train()
+        print(f"   ✓ 2 GRPO steps completed")
+        if last_rewards:
+            all_r = [r for batch in last_rewards for r in batch]
+            mean_r = sum(all_r) / len(all_r)
+            n_well_formed = sum(1 for r in all_r if r > 0.05)
+            print(f"   ✓ {len(all_r)} rollouts, mean reward = {mean_r:.3f}, {n_well_formed} well-formed")
+            if n_well_formed == 0:
+                print("   ⚠ WARNING: zero well-formed completions. The base model isn't following the format.")
+                print("     Consider an SFT warmup pass before GRPO.")
+    except Exception as e:
+        print(f"   ✗ {e}")
+        traceback.print_exc()
+        sys.exit(1)
+    elapsed = time.time() - t_start
+    print()
+    print("=" * 60)
+    print(f"✓ ALL CHECKS PASS in {elapsed:.1f}s — Path A stack is alive")
+    print("=" * 60)
+if __name__ == "__main__":
+    main()

train/train_grpo.py CHANGED Viewed

@@ -36,19 +36,25 @@ import sys
 from pathlib import Path
 from typing import Any, Dict, List, Optional
-# We import the env's prosody/transcript renderers and scenario loader so
-# the prompt format the model trains on is IDENTICAL to what an inference-
-# time agent would see if it called the tools.
-ROOT = Path(__file__).resolve().parent.parent
-if str(ROOT) not in sys.path:
-    sys.path.insert(0, str(ROOT))
-from server.scenarios import load_scenarios
-from server.audio_tools import (
-    render_transcript,
-    render_prosody_features,
-    render_pitch_contour,
-)
 # ---------------------------------------------------------------------------

 from pathlib import Path
 from typing import Any, Dict, List, Optional
+# Dual import path: works whether this script is run locally (with
+# subtext_arena/ on sys.path) or after `pip install` (subtext_arena.* package).
+try:
+    from subtext_arena.server.scenarios import load_scenarios
+    from subtext_arena.server.audio_tools import (
+        render_transcript,
+        render_prosody_features,
+        render_pitch_contour,
+    )
+except ImportError:
+    ROOT = Path(__file__).resolve().parent.parent
+    if str(ROOT) not in sys.path:
+        sys.path.insert(0, str(ROOT))
+    from server.scenarios import load_scenarios  # type: ignore[no-redef]
+    from server.audio_tools import (  # type: ignore[no-redef]
+        render_transcript,
+        render_prosody_features,
+        render_pitch_contour,
+    )
 # ---------------------------------------------------------------------------