jang1563 Claude Opus 4.6 committed on
Commit
7dbf475
·
1 Parent(s): bff2f94

Phase 3: Fix GRPO learning signal with continuous rewards and multi-reward

Browse files

- Smooth V1 scoring: continuous claim-strength within discrete bands [0.80-1.0]
- Smooth V4 scoring: continuous confidence-distance function peaking at 0.5
- Add make_individual_reward_functions() for TRL multi-reward support
- Add use_multi_reward config option with reward_weights in GRPOConfig
- Add eval_batch_size config to fix G=16 divisibility requirement
- Create grpo_full_v2.json: G=16, beta=0.02, num_iterations=2, 3 epochs
- Update run_grpo_full.sh to use v2 config
- Update .gitignore with BioGRPO model output dirs

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

.gitignore CHANGED
@@ -186,3 +186,10 @@ logs/
186
  # Uncomment below if you want to exclude datasets from git:
187
  # *.json
188
  # !kmp_test_set.json
 
 
 
 
 
 
 
 
186
  # Uncomment below if you want to exclude datasets from git:
187
  # *.json
188
  # !kmp_test_set.json
189
+
190
+ # BioGRPO / ecosystem model outputs
191
+ ecosystem_improved_model/
192
+ biogrpo_mve_model/
193
+ biogrpo_full_model/
194
+ biogrpo_full_v2_model/
195
+ data/*.json
configs/grpo_full_v2.json ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "mistralai/Mistral-7B-v0.3",
3
+ "sft_model_path": "./kmp_sft_model_final",
4
+ "output_dir": "./biogrpo_full_v2_model",
5
+
6
+ "num_generations": 16,
7
+ "beta": 0.02,
8
+ "num_iterations": 2,
9
+ "scale_rewards": "group",
10
+ "loss_type": "grpo",
11
+
12
+ "num_epochs": 3,
13
+ "batch_size": 1,
14
+ "eval_batch_size": 16,
15
+ "gradient_accumulation_steps": 8,
16
+ "learning_rate": 5e-7,
17
+ "max_completion_length": 1024,
18
+ "max_prompt_length": 512,
19
+ "warmup_ratio": 0.1,
20
+
21
+ "lora_r": 32,
22
+ "lora_alpha": 64,
23
+ "lora_dropout": 0.05,
24
+
25
+ "use_multi_reward": true,
26
+ "active_verifiers": ["V1", "V2", "V3", "V4"],
27
+ "verifier_weights": {"V1": 0.35, "V2": 0.30, "V3": 0.15, "V4": 0.20},
28
+
29
+ "pathway_db": "hallmark",
30
+ "hold_out_tissues": ["eye", "thymus"],
31
+ "seed": 42,
32
+
33
+ "use_4bit": true,
34
+
35
+ "wandb_project": "biogrpo",
36
+ "wandb_run_name": "grpo_full_v2_G16_multireward",
37
+ "use_wandb": true,
38
+ "logging_steps": 10,
39
+ "save_steps": 50,
40
+ "eval_steps": 50,
41
+ "save_total_limit": 3,
42
+ "log_completions": true,
43
+
44
+ "use_vllm": false,
45
+ "gradient_checkpointing": true,
46
+ "bf16": true
47
+ }
scripts/run_grpo_full.sh CHANGED
@@ -5,13 +5,13 @@
5
  #SBATCH --gres=gpu:1
6
  #SBATCH --mem=96G
7
  #SBATCH --cpus-per-task=8
8
- #SBATCH --time=24:00:00
9
  #SBATCH --output=logs/grpo_full_%j.log
10
  #SBATCH --error=logs/grpo_full_%j.err
11
 
12
  # ============================================================
13
- # BioGRPO Full Experiment
14
- # All V1-V4 verifiers, G=8, from SFT checkpoint
15
  # ============================================================
16
 
17
  SCRATCH="/athena/cayuga_0003/scratch/users/jak4013/otsuka"
@@ -59,14 +59,14 @@ if [ ! -e "${WORKDIR}/kmp_sft_model_final" ]; then
59
  echo "Symlinked kmp_sft_model_final"
60
  fi
61
 
62
- echo "Starting BioGRPO Full training..."
63
- biorlhf-grpo --config configs/grpo_full.json
64
 
65
  if [ $? -eq 0 ]; then
66
  echo ""
67
  echo "============================================================"
68
- echo "BioGRPO Full training completed!"
69
- echo "Model saved to: ./biogrpo_full_model"
70
  echo "End time: $(date)"
71
  echo "============================================================"
72
  else
 
5
  #SBATCH --gres=gpu:1
6
  #SBATCH --mem=96G
7
  #SBATCH --cpus-per-task=8
8
+ #SBATCH --time=48:00:00
9
  #SBATCH --output=logs/grpo_full_%j.log
10
  #SBATCH --error=logs/grpo_full_%j.err
11
 
12
  # ============================================================
13
+ # BioGRPO Full Experiment v2
14
+ # All V1-V4 verifiers, G=16, multi-reward, from SFT checkpoint
15
  # ============================================================
16
 
17
  SCRATCH="/athena/cayuga_0003/scratch/users/jak4013/otsuka"
 
59
  echo "Symlinked kmp_sft_model_final"
60
  fi
61
 
62
+ echo "Starting BioGRPO Full v2 training..."
63
+ biorlhf-grpo --config configs/grpo_full_v2.json
64
 
65
  if [ $? -eq 0 ]; then
66
  echo ""
67
  echo "============================================================"
68
+ echo "BioGRPO Full v2 training completed!"
69
+ echo "Model saved to: ./biogrpo_full_v2_model"
70
  echo "End time: $(date)"
71
  echo "============================================================"
72
  else
src/biorlhf/training/grpo.py CHANGED
@@ -14,7 +14,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
14
  from peft import LoraConfig, PeftModel
15
  from trl import GRPOTrainer, GRPOConfig
16
 
17
- from biorlhf.verifiers.composer import make_grpo_reward_function
18
  from biorlhf.data.grpo_dataset import build_grpo_dataset, get_dataset_stats
19
 
20
 
@@ -37,6 +37,7 @@ class BioGRPOConfig:
37
  # Training hyperparameters
38
  num_epochs: int = 1
39
  batch_size: int = 2
 
40
  gradient_accumulation_steps: int = 8
41
  learning_rate: float = 1e-6
42
  max_completion_length: int = 1024
@@ -57,6 +58,9 @@ class BioGRPOConfig:
57
  hold_out_tissues: Optional[List[str]] = None
58
  seed: int = 42
59
 
 
 
 
60
  # Quantization
61
  use_4bit: bool = True
62
 
@@ -137,12 +141,21 @@ def run_grpo_training(config: Optional[BioGRPOConfig] = None) -> str:
137
  print(f" By type: {train_stats['by_question_type']}")
138
  print(f" Eval: {eval_stats['total']} samples")
139
 
140
- # 2. Create reward function
141
  print("\n[2/5] Initializing verifier stack...")
142
- reward_func = make_grpo_reward_function(
143
- weights=config.verifier_weights,
144
- active_verifiers=config.active_verifiers,
145
- )
 
 
 
 
 
 
 
 
 
146
  print(f" Active: {config.active_verifiers or ['V1', 'V2', 'V3', 'V4']}")
147
 
148
  # 3. Load tokenizer (always from base model; adapter dirs lack config.json)
@@ -214,10 +227,11 @@ def run_grpo_training(config: Optional[BioGRPOConfig] = None) -> str:
214
  # 6. Configure GRPOTrainer
215
  print("\n[5/6] Configuring GRPOTrainer...")
216
 
217
- grpo_config = GRPOConfig(
218
  output_dir=config.output_dir,
219
  num_train_epochs=config.num_epochs,
220
  per_device_train_batch_size=config.batch_size,
 
221
  gradient_accumulation_steps=config.gradient_accumulation_steps,
222
  learning_rate=config.learning_rate,
223
  warmup_ratio=config.warmup_ratio,
@@ -250,10 +264,26 @@ def run_grpo_training(config: Optional[BioGRPOConfig] = None) -> str:
250
  log_completions=config.log_completions,
251
  )
252
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
253
  trainer = GRPOTrainer(
254
  model=model,
255
  args=grpo_config,
256
- reward_funcs=reward_func,
257
  train_dataset=train_dataset,
258
  eval_dataset=eval_dataset,
259
  peft_config=peft_config,
 
14
  from peft import LoraConfig, PeftModel
15
  from trl import GRPOTrainer, GRPOConfig
16
 
17
+ from biorlhf.verifiers.composer import make_grpo_reward_function, make_individual_reward_functions
18
  from biorlhf.data.grpo_dataset import build_grpo_dataset, get_dataset_stats
19
 
20
 
 
37
  # Training hyperparameters
38
  num_epochs: int = 1
39
  batch_size: int = 2
40
+ eval_batch_size: Optional[int] = None
41
  gradient_accumulation_steps: int = 8
42
  learning_rate: float = 1e-6
43
  max_completion_length: int = 1024
 
58
  hold_out_tissues: Optional[List[str]] = None
59
  seed: int = 42
60
 
61
+ # Multi-reward (per-verifier TRL reward functions)
62
+ use_multi_reward: bool = False
63
+
64
  # Quantization
65
  use_4bit: bool = True
66
 
 
141
  print(f" By type: {train_stats['by_question_type']}")
142
  print(f" Eval: {eval_stats['total']} samples")
143
 
144
+ # 2. Create reward function(s)
145
  print("\n[2/5] Initializing verifier stack...")
146
+ reward_weights = None
147
+ if config.use_multi_reward:
148
+ reward_funcs, reward_weights = make_individual_reward_functions(
149
+ active_verifiers=config.active_verifiers,
150
+ weights=config.verifier_weights,
151
+ )
152
+ print(f" Mode: multi-reward ({len(reward_funcs)} per-verifier functions)")
153
+ print(f" Weights: {reward_weights}")
154
+ else:
155
+ reward_funcs = make_grpo_reward_function(
156
+ weights=config.verifier_weights,
157
+ active_verifiers=config.active_verifiers,
158
+ )
159
  print(f" Active: {config.active_verifiers or ['V1', 'V2', 'V3', 'V4']}")
160
 
161
  # 3. Load tokenizer (always from base model; adapter dirs lack config.json)
 
227
  # 6. Configure GRPOTrainer
228
  print("\n[5/6] Configuring GRPOTrainer...")
229
 
230
+ grpo_kwargs = dict(
231
  output_dir=config.output_dir,
232
  num_train_epochs=config.num_epochs,
233
  per_device_train_batch_size=config.batch_size,
234
+ per_device_eval_batch_size=config.eval_batch_size or config.batch_size,
235
  gradient_accumulation_steps=config.gradient_accumulation_steps,
236
  learning_rate=config.learning_rate,
237
  warmup_ratio=config.warmup_ratio,
 
264
  log_completions=config.log_completions,
265
  )
266
 
267
+ # Add reward_weights for multi-reward mode if TRL supports it
268
+ if reward_weights is not None:
269
+ try:
270
+ # Check if GRPOConfig accepts reward_weights (TRL >= 0.27)
271
+ import inspect
272
+ if "reward_weights" in inspect.signature(GRPOConfig).parameters:
273
+ grpo_kwargs["reward_weights"] = reward_weights
274
+ print(f" reward_weights set in GRPOConfig: {reward_weights}")
275
+ else:
276
+ print(" Warning: TRL version does not support reward_weights in GRPOConfig")
277
+ print(" Per-verifier functions will still be used, but with equal weights")
278
+ except Exception:
279
+ pass
280
+
281
+ grpo_config = GRPOConfig(**grpo_kwargs)
282
+
283
  trainer = GRPOTrainer(
284
  model=model,
285
  args=grpo_config,
286
+ reward_funcs=reward_funcs,
287
  train_dataset=train_dataset,
288
  eval_dataset=eval_dataset,
289
  peft_config=peft_config,
src/biorlhf/verifiers/composer.py CHANGED
@@ -202,6 +202,93 @@ def make_grpo_reward_function(
202
  return reward_func
203
 
204
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
205
  def make_single_verifier_reward(verifier_name: str) -> Callable:
206
  """Create a reward function using only one verifier (for ablation)."""
207
  return make_grpo_reward_function(active_verifiers=[verifier_name])
 
202
  return reward_func
203
 
204
 
205
+ def make_individual_reward_functions(
206
+ active_verifiers: Optional[List[str]] = None,
207
+ weights: Optional[Dict[str, float]] = None,
208
+ ) -> tuple:
209
+ """Return (list_of_reward_funcs, list_of_weights) for TRL multi-reward.
210
+
211
+ Each reward function wraps a single verifier and returns List[float | None].
212
+ Non-applicable verifiers return None for samples where they don't apply;
213
+ TRL natively excludes None rewards from the GRPO calculation.
214
+
215
+ This enables per-verifier reward normalization in TRL, preventing a single
216
+ low-variance verifier from dominating the gradient signal.
217
+ """
218
+ all_verifiers = {
219
+ "V1": PathwayDirectionVerifier(),
220
+ "V2": BiologicalFactVerifier(),
221
+ "V3": CrossContextConsistencyVerifier(),
222
+ "V4": UncertaintyVerifier(),
223
+ }
224
+
225
+ if active_verifiers:
226
+ verifiers = {k: v for k, v in all_verifiers.items() if k in active_verifiers}
227
+ else:
228
+ verifiers = all_verifiers
229
+
230
+ w = dict(weights or DEFAULT_WEIGHTS)
231
+ weight_list = [w.get(k, 0) for k in verifiers]
232
+
233
+ def _make_single_reward_fn(verifier: BaseVerifier, vname: str) -> Callable:
234
+ """Create a closure-safe reward function for a single verifier."""
235
+
236
+ def reward_func(
237
+ completions: List,
238
+ ground_truth: Optional[List[str]] = None,
239
+ question_type: Optional[List[str]] = None,
240
+ applicable_verifiers: Optional[List[str]] = None,
241
+ **kwargs,
242
+ ) -> List:
243
+ n = len(completions)
244
+ if ground_truth is None:
245
+ ground_truth = ["{}"] * n
246
+ if question_type is None:
247
+ question_type = ["unknown"] * n
248
+ if applicable_verifiers is None:
249
+ applicable_verifiers = [json.dumps(list(all_verifiers.keys()))] * n
250
+
251
+ prompts = kwargs.get("prompts", kwargs.get("prompt", [""] * n))
252
+ if isinstance(prompts, str):
253
+ prompts = [prompts] * n
254
+
255
+ rewards = []
256
+ for i in range(n):
257
+ app = (
258
+ json.loads(applicable_verifiers[i])
259
+ if isinstance(applicable_verifiers[i], str)
260
+ else applicable_verifiers[i]
261
+ )
262
+ if not verifier.is_applicable(app):
263
+ rewards.append(None)
264
+ continue
265
+
266
+ completion_text = _extract_text(completions[i])
267
+ prompt_text = _extract_text(prompts[i]) if i < len(prompts) else ""
268
+ gt = (
269
+ json.loads(ground_truth[i])
270
+ if isinstance(ground_truth[i], str)
271
+ else ground_truth[i]
272
+ )
273
+
274
+ result = verifier.score(prompt_text, completion_text, gt, question_type[i])
275
+ if not result.applicable:
276
+ rewards.append(None)
277
+ else:
278
+ rewards.append(result.score)
279
+
280
+ return rewards
281
+
282
+ reward_func.__name__ = f"reward_{vname}"
283
+ return reward_func
284
+
285
+ reward_funcs = [
286
+ _make_single_reward_fn(v, name) for name, v in verifiers.items()
287
+ ]
288
+
289
+ return reward_funcs, weight_list
290
+
291
+
292
  def make_single_verifier_reward(verifier_name: str) -> Callable:
293
  """Create a reward function using only one verifier (for ablation)."""
294
  return make_grpo_reward_function(active_verifiers=[verifier_name])
src/biorlhf/verifiers/pathway.py CHANGED
@@ -4,11 +4,11 @@ V1: Pathway Direction Verifier.
4
  Extracts directional claims about biological pathways from model responses
5
  and compares them against fGSEA NES direction ground truth.
6
 
7
- Scoring:
8
- 1.0 β€” correct direction claimed
9
- 0.5 β€” mixed/contradictory claims
10
- 0.3 β€” no directional claim extracted
11
- 0.0 β€” wrong direction claimed
12
  """
13
 
14
  import re
@@ -229,15 +229,23 @@ class PathwayDirectionVerifier(BaseVerifier):
229
  contradicting = [
230
  c for c in claims if c[1] != expected_dir and c[1] != "AMBIGUOUS"
231
  ]
 
232
 
 
233
  if matching and not contradicting:
234
- score = 1.0
 
 
235
  elif matching and contradicting:
236
- score = 0.5
 
 
237
  elif contradicting:
238
- score = 0.0
 
 
239
  else:
240
- score = 0.3 # Only ambiguous claims
241
 
242
  return VerifierResult(
243
  score=score,
@@ -248,6 +256,7 @@ class PathwayDirectionVerifier(BaseVerifier):
248
  "claims": [(v, d) for v, d in claims],
249
  "n_matching": len(matching),
250
  "n_contradicting": len(contradicting),
 
251
  },
252
  )
253
 
 
4
  Extracts directional claims about biological pathways from model responses
5
  and compares them against fGSEA NES direction ground truth.
6
 
7
+ Scoring (continuous within bands to provide GRPO gradient signal):
8
+ [0.80, 1.00] β€” correct direction (strength = n_matching / (n_matching + 1))
9
+ [0.40, 0.60] β€” mixed/contradictory claims (ratio of matching to total)
10
+ 0.30 β€” no directional claim extracted (fixed)
11
+ [0.00, 0.10] β€” wrong direction (modulated by ambiguity ratio)
12
  """
13
 
14
  import re
 
229
  contradicting = [
230
  c for c in claims if c[1] != expected_dir and c[1] != "AMBIGUOUS"
231
  ]
232
+ ambiguous = [c for c in claims if c[1] == "AMBIGUOUS"]
233
 
234
+ # Continuous scoring within discrete bands for GRPO gradient signal
235
  if matching and not contradicting:
236
+ # Correct direction: [0.80, 1.00] based on claim strength
237
+ strength = len(matching) / (len(matching) + 1)
238
+ score = 0.8 + 0.2 * strength
239
  elif matching and contradicting:
240
+ # Mixed claims: [0.40, 0.60] based on matching ratio
241
+ total = len(matching) + len(contradicting)
242
+ score = 0.4 + 0.2 * (len(matching) / total)
243
  elif contradicting:
244
+ # Wrong direction: [0.00, 0.10] modulated by ambiguity
245
+ ambiguity_ratio = len(ambiguous) / len(claims) if claims else 0
246
+ score = 0.1 * ambiguity_ratio
247
  else:
248
+ score = 0.3 # Only ambiguous claims β€” no variance possible
249
 
250
  return VerifierResult(
251
  score=score,
 
256
  "claims": [(v, d) for v, d in claims],
257
  "n_matching": len(matching),
258
  "n_contradicting": len(contradicting),
259
+ "n_ambiguous": len(ambiguous),
260
  },
261
  )
262
 
src/biorlhf/verifiers/uncertainty.py CHANGED
@@ -250,13 +250,13 @@ class UncertaintyVerifier(BaseVerifier):
250
  conf_score: float,
251
  stated: str,
252
  ) -> VerifierResult:
253
- """Default scoring: penalize extreme overconfidence."""
254
- if conf_score > 0.90:
255
- score = 0.4 # Overconfidence penalty
256
- elif conf_score < 0.10:
257
- score = 0.3 # Extreme underconfidence penalty
258
- else:
259
- score = 0.7 # Moderate confidence is good default
260
 
261
  return VerifierResult(
262
  score=score,
 
250
  conf_score: float,
251
  stated: str,
252
  ) -> VerifierResult:
253
+ """Default scoring: continuous function rewarding moderate confidence.
254
+
255
+ Peaks at conf=0.5 (score=1.0), smoothly penalizes extremes.
256
+ Range: [0.25, 1.0] β€” provides GRPO gradient signal even when
257
+ generations extract slightly different confidence values.
258
+ """
259
+ score = max(0.2, 1.0 - abs(conf_score - 0.5) * 1.5)
260
 
261
  return VerifierResult(
262
  score=score,