Spaces:
Sleeping
Sleeping
feat: Implement Context-Pruning-Env with SQuAD dataset and GRPOTrainer support
Browse files- Dockerfile.openenv +27 -0
- context_pruning_env/env.py +124 -0
- context_pruning_env/models.py +43 -0
- context_pruning_env/server/__init__.py +1 -0
- context_pruning_env/server/app.py +18 -0
- context_pruning_env/utils.py +45 -0
- requirements.txt +4 -0
- test_env.py +64 -0
- train_grpo.py +67 -0
Dockerfile.openenv
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.11-slim

# Set environment variables:
# - PYTHONUNBUFFERED so logs stream immediately in container output
# - PYTHONPATH so `context_pruning_env` is importable from /app
ENV PYTHONUNBUFFERED=1
ENV PYTHONPATH=/app

WORKDIR /app

# Install system dependencies (build-essential for wheels that compile C extensions)
RUN apt-get update && apt-get install -y \
    build-essential \
    curl \
    && rm -rf /var/lib/apt/lists/*

# Copy and install Python dependencies first (keeps this layer cached
# while the application code below changes)
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy the environment code
COPY context_pruning_env ./context_pruning_env

# Expose the default OpenEnv port
EXPOSE 7860

# Command to run the environment server (standardized OpenEnv entrypoint)
# In a real environment, you'd use a server wrapper mapping Gymnasium resets/steps to API calls.
CMD ["uvicorn", "context_pruning_env.server.app:app", "--host", "0.0.0.0", "--port", "7860"]
|
context_pruning_env/env.py
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
from typing import Any, Optional, List
|
| 3 |
+
from uuid import uuid4
|
| 4 |
+
|
| 5 |
+
from openenv.core.env_server.interfaces import Environment
|
| 6 |
+
from context_pruning_env.models import (
|
| 7 |
+
PruningAction,
|
| 8 |
+
PruningObservation,
|
| 9 |
+
PruningState,
|
| 10 |
+
ChunkItem
|
| 11 |
+
)
|
| 12 |
+
from context_pruning_env.utils import SQuADLoader, count_tokens
|
| 13 |
+
|
| 14 |
+
class ContextPruningEnv(Environment[PruningAction, PruningObservation, PruningState]):
    """
    OpenEnv Reinforcement Learning Environment for RAG Context Pruning.

    Each episode serves one SQuAD question plus 5 context chunks
    (1 gold, 4 noise). The agent responds with a binary keep/prune mask:
    keeping the gold chunk earns +10 plus 0.01 per pruned token, while
    pruning the gold chunk costs -20. Episodes are single-step.
    """

    def __init__(self, squad_split: str = "train"):
        super().__init__(transform=None, rubric=None)
        self.loader = SQuADLoader(split=squad_split)
        # No episode in flight until reset() is called.
        self._state: Optional[PruningState] = None

    def reset(
        self,
        seed: Optional[int] = None,
        episode_id: Optional[str] = None,
        **kwargs: Any,
    ) -> PruningObservation:
        """
        Loads a new question and 5 context chunks from SQuAD.
        Returns the initial observation.

        Raises:
            ValueError: if the loaded episode contains no gold chunk.
        """
        question, chunks_data = self.loader.get_episode()

        # Prepare internal state chunks and locate the gold chunk.
        chunks: List[ChunkItem] = []
        total_tokens = 0
        # Track the gold position explicitly; the original code left
        # `gold_index` unbound (NameError) when no chunk was flagged gold.
        gold_index: Optional[int] = None
        for i, (text, is_gold) in enumerate(chunks_data):
            tokens = count_tokens(text)
            total_tokens += tokens
            chunks.append(ChunkItem(
                content=text,
                is_gold=is_gold,
                tokens=tokens
            ))
            if is_gold:
                gold_index = i

        if gold_index is None:
            raise ValueError("Episode data contains no gold chunk.")

        self._state = PruningState(
            episode_id=episode_id or str(uuid4()),
            question=question,
            gold_index=gold_index,
            chunks=chunks,
            initial_tokens=total_tokens,
            step_count=0,
            done=False
        )

        return self._observe(
            message="Environment reset. 5 chunks loaded (1 gold, 4 noise)."
        )

    def _observe(self, message: str = "") -> PruningObservation:
        """Helper to create observation from current state."""
        return PruningObservation(
            done=self._state.done,
            question=self._state.question,
            chunks=[c.content for c in self._state.chunks],
            token_count=sum(c.tokens for c in self._state.chunks),
            message=message
        )

    def step(
        self,
        action: PruningAction,
        **kwargs: Any,
    ) -> PruningObservation:
        """
        Evaluates the binary mask, calculates token reduction,
        checks gold chunk presence, and returns the observation.

        Raises:
            RuntimeError: if called before reset().
        """
        # Guard: the original crashed with AttributeError on `None.done`
        # when step() was called before the first reset().
        if self._state is None:
            raise RuntimeError("step() called before reset().")
        if self._state.done:
            return self._observe(message="Episode is already done.")

        mask = action.mask
        # Validate length AND values: Pydantic constrains the list length
        # but not that each entry is 0 or 1.
        if len(mask) != 5 or any(v not in (0, 1) for v in mask):
            # Should not happen if using Gymnasium space or Pydantic validation
            self._state.done = True
            return self._observe(message="Invalid action space size.")

        # 1. Identify Gold Chunk Status
        gold_kept = (mask[self._state.gold_index] == 1)

        # 2. Calculate Token reduction
        pruned_tokens = 0
        for i, keep in enumerate(mask):
            if keep == 0:
                pruned_tokens += self._state.chunks[i].tokens

        # 3. Reward Logic
        reward = 0.0

        if gold_kept:
            reward += 10.0  # Accuracy Bonus
            reward += 0.01 * pruned_tokens  # Efficiency Bonus
            msg = f"Task Success: Gold chunk kept. Pruned {pruned_tokens} tokens."
        else:
            reward -= 20.0  # Penalty: Lost the game
            msg = "Task Failure: Gold chunk was pruned. Mission failed."

        self._state.done = True
        self._state.step_count += 1

        # In OpenEnv, the reward is often part of the Observation or signaled via a rubric.
        # We manually update the reward field here.
        obs = self._observe(message=msg)
        obs.reward = reward

        return obs

    @property
    def state(self) -> PruningState:
        # NOTE: is None before the first reset().
        return self._state
|
context_pruning_env/models.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
from typing import List, Optional, Any
|
| 3 |
+
from pydantic import Field
|
| 4 |
+
from openenv.core.env_server.types import Action, Observation, State
|
| 5 |
+
|
| 6 |
+
class PruningAction(Action):
|
| 7 |
+
"""
|
| 8 |
+
Action space: A binary mask of 5 values (1 = keep, 0 = prune).
|
| 9 |
+
Example: [1, 0, 1, 1, 0]
|
| 10 |
+
"""
|
| 11 |
+
mask: List[int] = Field(
|
| 12 |
+
...,
|
| 13 |
+
min_items=5,
|
| 14 |
+
max_items=5,
|
| 15 |
+
description="Binary mask of 5 integers (0 or 1) indicating which chunks to keep."
|
| 16 |
+
)
|
| 17 |
+
|
| 18 |
+
class ChunkItem(BaseModel):
|
| 19 |
+
"""Represent a single context chunk."""
|
| 20 |
+
content: str
|
| 21 |
+
is_gold: bool = False
|
| 22 |
+
tokens: int = 0
|
| 23 |
+
|
| 24 |
+
class PruningObservation(Observation):
|
| 25 |
+
"""
|
| 26 |
+
Observation provided to the agent.
|
| 27 |
+
Contains the question and the 5 context chunks.
|
| 28 |
+
"""
|
| 29 |
+
question: str
|
| 30 |
+
chunks: List[str] = Field(default_factory=list, description="List of 5 context strings.")
|
| 31 |
+
token_count: int = 0
|
| 32 |
+
message: str = ""
|
| 33 |
+
|
| 34 |
+
class PruningState(State):
|
| 35 |
+
"""
|
| 36 |
+
Internal state of the environment.
|
| 37 |
+
"""
|
| 38 |
+
question: str
|
| 39 |
+
gold_index: int
|
| 40 |
+
chunks: List[ChunkItem]
|
| 41 |
+
initial_tokens: int
|
| 42 |
+
step_count: int = 0
|
| 43 |
+
done: bool = False
|
context_pruning_env/server/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# server package
|
context_pruning_env/server/app.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os

from openenv.core.env_server.http_server import create_fastapi_app

from context_pruning_env.env import ContextPruningEnv
from context_pruning_env.models import PruningAction, PruningObservation

# FastAPI application exposing ContextPruningEnv over the OpenEnv HTTP protocol.
app = create_fastapi_app(
    ContextPruningEnv,
    PruningAction,
    PruningObservation,
)


def main() -> None:
    """Serve the app locally, honoring the PORT env var (default 7860)."""
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=int(os.environ.get("PORT", "7860")))


if __name__ == "__main__":
    main()
|
context_pruning_env/utils.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import random
|
| 2 |
+
from typing import List, Tuple
|
| 3 |
+
from datasets import load_dataset
|
| 4 |
+
import logging
|
| 5 |
+
|
| 6 |
+
logger = logging.getLogger(__name__)
|
| 7 |
+
|
| 8 |
+
class SQuADLoader:
    """Streams SQuAD entries as pruning episodes (1 gold + 4 noise chunks).

    Serves the dataset in a shuffled order and reshuffles once exhausted,
    so no entry repeats within an epoch.
    """

    def __init__(self, split: str = "train"):
        self.dataset = load_dataset("squad", split=split)
        self.indices = list(range(len(self.dataset)))
        random.shuffle(self.indices)
        # Cursor into self.indices for the current epoch.
        self.current_ptr = 0

    def get_episode(self) -> Tuple[str, List[Tuple[str, bool]]]:
        """
        Returns (question, List[(chunk_text, is_gold)])

        Exactly one chunk is flagged gold; chunk order is randomized.
        """
        # Epoch exhausted: reshuffle and restart.
        if self.current_ptr >= len(self.indices):
            random.shuffle(self.indices)
            self.current_ptr = 0

        idx = self.indices[self.current_ptr]
        self.current_ptr += 1

        entry = self.dataset[idx]
        question = entry["question"]
        gold_context = entry["context"]

        # 1 Gold + 4 Noise
        chunks = [(gold_context, True)]

        # Sample 4 noise contexts. SQuAD reuses the same paragraph across
        # many questions, so comparing indices (i != idx) is not enough:
        # a "noise" chunk could be textually identical to the gold chunk.
        # Rejection-sample by context TEXT to keep the gold unique.
        n = len(self.dataset)
        noise_indices: List[int] = []
        attempts = 0
        while len(noise_indices) < 4 and attempts < 200:
            attempts += 1
            cand = random.randrange(n)
            if cand == idx or cand in noise_indices:
                continue
            if self.dataset[cand]["context"] == gold_context:
                continue
            noise_indices.append(cand)

        # Degenerate datasets (nearly all contexts equal the gold): fall
        # back to index-based sampling so we still return 5 chunks.
        if len(noise_indices) < 4:
            backfill = [i for i in range(n) if i != idx and i not in noise_indices]
            k = min(4 - len(noise_indices), len(backfill))
            noise_indices.extend(random.sample(backfill, k))

        for nid in noise_indices:
            chunks.append((self.dataset[nid]["context"], False))

        # Shuffle chunks to avoid gold being first
        random.shuffle(chunks)

        return question, chunks
| 42 |
+
|
| 43 |
+
def count_tokens(text: str) -> int:
    """Approximate a token count by splitting on runs of whitespace."""
    words = text.split()
    return len(words)
|
requirements.txt
CHANGED
|
@@ -3,3 +3,7 @@ pydantic>=2.0
|
|
| 3 |
fastapi>=0.104.0
|
| 4 |
uvicorn[standard]>=0.24.0
|
| 5 |
typing_extensions>=4.8.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
fastapi>=0.104.0
|
| 4 |
uvicorn[standard]>=0.24.0
|
| 5 |
typing_extensions>=4.8.0
|
| 6 |
+
datasets>=2.15.0
|
| 7 |
+
transformers>=4.35.0
|
| 8 |
+
trl>=0.7.4
|
| 9 |
+
torch>=2.1.0
|
test_env.py
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import unittest
|
| 2 |
+
from unittest.mock import MagicMock
|
| 3 |
+
from context_pruning_env.env import ContextPruningEnv
|
| 4 |
+
from context_pruning_env.models import PruningAction, ChunkItem
|
| 5 |
+
|
| 6 |
+
class TestContextPruningEnv(unittest.TestCase):
    """Unit tests for ContextPruningEnv with a fully mocked SQuAD loader."""

    def setUp(self):
        # BUG FIX: patch SQuADLoader *during* construction. The original
        # built the env first and mocked the loader afterwards, so
        # __init__ had already triggered the real HF dataset download.
        with unittest.mock.patch("context_pruning_env.env.SQuADLoader"):
            self.env = ContextPruningEnv(squad_split="train")
        self.env.loader = MagicMock()

        # Mock episode data: 1 Gold, 4 Noise
        self.mock_question = "What color is the sky?"
        self.mock_chunks = [
            ("The sky appears blue due to Rayleigh scattering.", True),
            ("Grass is usually green.", False),
            ("Pizza is delicious.", False),
            ("Computers process binary data.", False),
            ("Antarctica is cold.", False)
        ]
        self.env.loader.get_episode.return_value = (self.mock_question, self.mock_chunks)

    def test_reset(self):
        """reset() surfaces the mocked question and all 5 chunks."""
        obs = self.env.reset()
        self.assertEqual(obs.question, self.mock_question)
        self.assertEqual(len(obs.chunks), 5)
        self.assertFalse(obs.done)

    def test_step_keep_gold(self):
        """Keeping everything yields only the accuracy bonus (no pruning)."""
        self.env.reset()
        # Gold is at index 0; action: keep all
        action = PruningAction(mask=[1, 1, 1, 1, 1])
        obs = self.env.step(action)

        self.assertTrue(obs.done)
        # Accuracy bonus + tokens saved (0 in this case)
        self.assertEqual(obs.reward, 10.0)
        self.assertIn("Success", obs.message)

    def test_step_prune_gold(self):
        """Pruning the gold chunk triggers the flat failure penalty."""
        self.env.reset()
        # Gold is at index 0; action: prune gold, keep others
        action = PruningAction(mask=[0, 1, 1, 1, 1])
        obs = self.env.step(action)

        self.assertTrue(obs.done)
        self.assertEqual(obs.reward, -20.0)
        self.assertIn("Failure", obs.message)

    def test_step_efficiency(self):
        """Keeping only the gold chunk adds an efficiency bonus on top."""
        self.env.reset()
        # Gold is at index 0; action: keep gold, prune others
        action = PruningAction(mask=[1, 0, 0, 0, 0])
        obs = self.env.step(action)

        self.assertTrue(obs.done)
        self.assertGreater(obs.reward, 10.0)  # Accuracy (10) + Efficiency (>0)
        self.assertIn("Success", obs.message)


if __name__ == "__main__":
    unittest.main()
|
train_grpo.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
from trl import GRPOTrainer, GRPOConfig
|
| 3 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 4 |
+
from context_pruning_env.env import ContextPruningEnv
|
| 5 |
+
from context_pruning_env.models import PruningAction
|
| 6 |
+
|
| 7 |
+
# 1. Setup Environment
|
| 8 |
+
env = ContextPruningEnv(squad_split="train")
|
| 9 |
+
|
| 10 |
+
def reward_func(prompts, completions, **kwargs):
    """
    Reward function wrapper for GRPOTrainer.

    Parses a 5-element binary keep/prune mask from each completion, plays
    one (single-step) environment episode with it, and returns one scalar
    reward per completion.
    """
    rewards = []
    for prompt, completion in zip(prompts, completions):
        # 1. Extract action mask from completion (LLM output), e.g.
        #    "Action: [1, 0, 1, 1, 0]". Any malformed output falls back to
        #    keeping everything (the safe zero-pruning action).
        mask = [1, 1, 1, 1, 1]
        try:
            if "[" in completion and "]" in completion:
                mask_str = completion.split("[")[1].split("]")[0]
                parsed = [int(x.strip()) for x in mask_str.split(",")]
                # Only accept a well-formed binary mask of length 5;
                # otherwise the env would end the episode with no reward.
                if len(parsed) == 5 and all(v in (0, 1) for v in parsed):
                    mask = parsed
        except (ValueError, IndexError):
            # Narrowed from the original bare `except:` which also
            # swallowed KeyboardInterrupt/SystemExit.
            pass

        # 2. BUG FIX: episodes are single-step and terminal, so the env
        #    must be reset before every completion. Without this, every
        #    step after the first short-circuits with "already done" and
        #    its reward stays None, corrupting the rewards list.
        env.reset()
        action = PruningAction(mask=mask)
        obs = env.step(action)
        rewards.append(obs.reward if obs.reward is not None else 0.0)

    return rewards
|
| 39 |
+
|
| 40 |
+
def main():
    """Configure and launch GRPO training for the context-pruning policy."""
    model_id = "meta-llama/Llama-3-8B"  # Reference model

    # 2. Config for GRPO
    # BUG FIX: TRL's GRPOConfig has no `per_batch_size` or `group_size`
    # parameters (the original raised TypeError). The correct names are
    # `per_device_train_batch_size` and `num_generations` (the number of
    # completions sampled per prompt for group-relative advantages).
    training_args = GRPOConfig(
        output_dir="./llama-3-rag-pruning",
        learning_rate=5e-6,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=16,
        num_train_epochs=3,
        logging_steps=10,
        num_generations=8,
    )

    # 3. Initialize Trainer
    # Note: In a real implementation, you'd need the dataset formatted for the trainer
    trainer = GRPOTrainer(
        model=model_id,
        reward_funcs=[reward_func],
        args=training_args,
        # train_dataset=rag_pruning_dataset, # Pre-formatted dataset
    )

    print("Starting Training with GRPOTrainer...")
    # trainer.train()


if __name__ == "__main__":
    main()
|