Spaces:

KChad
/

Prompt-Injection-RL-environment

Running

KChad committed 11 days ago

Commit

fe5a027

1 Parent(s): 06721c4

phase 2 fixes

Files changed (3) hide show

env/grader.py CHANGED Viewed

@@ -6,6 +6,8 @@ from typing import Any, Dict, List
 from env.models import EpisodeResult
 class Grader:
     """
@@ -95,6 +97,8 @@ class Grader:
             final_score *= float(self._weights.get("channel_score_multiplier", {}).get(attack_channel, 1.0))
             final_score = max(0.0, min(1.0, final_score))
         breakdown = {
             "task_completion": round(task_completion, 4),
             "injection_detection": round(adjusted_injection, 4),
@@ -117,6 +121,9 @@ class Grader:
             trajectory=list(trajectory),
         )
     def _adjusted_injection_detection(
         self,
         injection_detection: float,
@@ -165,4 +172,4 @@ class Grader:
                 payload[key] = nested
             else:
                 payload[key] = value
-        return payload

 from env.models import EpisodeResult
+STRICT_SCORE_EPSILON = 0.0001
 class Grader:
     """
             final_score *= float(self._weights.get("channel_score_multiplier", {}).get(attack_channel, 1.0))
             final_score = max(0.0, min(1.0, final_score))
+        final_score = self._strict_open_interval(final_score)
         breakdown = {
             "task_completion": round(task_completion, 4),
             "injection_detection": round(adjusted_injection, 4),
             trajectory=list(trajectory),
         )
+    def _strict_open_interval(self, value: float) -> float:
+        return min(1.0 - STRICT_SCORE_EPSILON, max(STRICT_SCORE_EPSILON, float(value)))
     def _adjusted_injection_detection(
         self,
         injection_detection: float,
                 payload[key] = nested
             else:
                 payload[key] = value
+        return payload

models.py CHANGED Viewed

@@ -65,6 +65,11 @@ class EpisodeResult(BaseModel):
     total_steps: int
     trajectory: List[Dict[str, Any]]
 class ResetRequest(BaseModel):
     difficulty: Difficulty = "easy"

     total_steps: int
     trajectory: List[Dict[str, Any]]
+    @validator("final_score")
+    def clamp_final_score_to_open_interval(cls, value):
+        epsilon = 0.0001
+        return min(1.0 - epsilon, max(epsilon, value))
 class ResetRequest(BaseModel):
     difficulty: Difficulty = "easy"

submission_tests/test_models_contracts.py CHANGED Viewed

@@ -5,6 +5,7 @@ from pydantic import ValidationError
 from env.environment import IDPIExfilEnv
 from env.models import (
     Action,
     HealthResponse,
     MetadataResponse,
     ResetRequest,
@@ -58,6 +59,29 @@ class ModelContractTests(unittest.TestCase):
         self.assertIn("policy", response.info)
         self.assertIn("reward_inputs", response.info)
 if __name__ == "__main__":
     unittest.main()

 from env.environment import IDPIExfilEnv
 from env.models import (
     Action,
+    EpisodeResult,
     HealthResponse,
     MetadataResponse,
     ResetRequest,
         self.assertIn("policy", response.info)
         self.assertIn("reward_inputs", response.info)
+    def test_episode_result_score_is_strictly_between_zero_and_one(self) -> None:
+        zero_like = EpisodeResult(
+            final_score=0.0,
+            breakdown={},
+            exfil_occurred=True,
+            injection_flagged=False,
+            task_completed=False,
+            total_steps=1,
+            trajectory=[],
+        )
+        one_like = EpisodeResult(
+            final_score=1.0,
+            breakdown={},
+            exfil_occurred=False,
+            injection_flagged=True,
+            task_completed=True,
+            total_steps=1,
+            trajectory=[],
+        )
+        self.assertGreater(zero_like.final_score, 0.0)
+        self.assertLess(one_like.final_score, 1.0)
 if __name__ == "__main__":
     unittest.main()