Spaces:

vicky1428
/

supermail

Sleeping

App Files Community

vicky1428 committed 12 days ago

Commit

b9ad6f9

verified ·

1 Parent(s): dab441f

Upload folder using huggingface_hub

Browse files

Files changed (4) hide show

README.md +3 -3
inference.py +5 -3
models.py +2 -2
server/environment.py +20 -11

README.md CHANGED Viewed

@@ -307,9 +307,9 @@ When an OpenAI-compatible endpoint is available, the script uses the OpenAI clie
 Deterministic fallback baseline on bundled tasks:
-- `email_easy`: `1.00`
-- `email_medium`: `1.00`
-- `email_hard`: `1.00`
 ## Hugging Face Spaces

 Deterministic fallback baseline on bundled tasks:
+- `email_easy`: `0.99`
+- `email_medium`: `0.99`
+- `email_hard`: `0.99`
 ## Hugging Face Spaces

inference.py CHANGED Viewed

@@ -37,6 +37,8 @@ MAX_STEPS = 12
 TEMPERATURE = 0.4
 MAX_TOKENS = 25000
 SUCCESS_SCORE_THRESHOLD = 0.95
 @dataclass
 class LocalStepResult:
@@ -80,8 +82,8 @@ def sanitize(value: Any) -> str:
 def clamp_score(score: float) -> float:
-    """Clamp score into [0, 1]."""
-    return min(max(score, 0.0), 1.0)
 def compact_action(action: Optional[SupportAction]) -> str:
@@ -281,7 +283,7 @@ async def run_episode(task_id: str, client: Optional[OpenAI]) -> None:
     history: List[str] = []
     rewards: List[float] = []
     steps_taken = 0
-    score = 0.0
     success = False
     action_for_log: Optional[SupportAction] = None

 TEMPERATURE = 0.4
 MAX_TOKENS = 25000
 SUCCESS_SCORE_THRESHOLD = 0.95
+MIN_SCORE = 0.01
+MAX_SCORE = 0.99
 @dataclass
 class LocalStepResult:
 def clamp_score(score: float) -> float:
+    """Clamp score into the open interval (0, 1)."""
+    return min(max(score, MIN_SCORE), MAX_SCORE)
 def compact_action(action: Optional[SupportAction]) -> str:
     history: List[str] = []
     rewards: List[float] = []
     steps_taken = 0
+    score = MIN_SCORE
     success = False
     action_for_log: Optional[SupportAction] = None

models.py CHANGED Viewed

@@ -77,7 +77,7 @@ class SupportObservation(Observation):
         description="Compact summaries of prior attempts in the episode.",
     )
     feedback: str = Field(default="", description="Step-level grader feedback.")
-    score: float = Field(default=0.0, description="Current cumulative score.")
     attempts_remaining: int = Field(
         default=0,
         description="How many attempts remain before the episode ends.",
@@ -89,6 +89,6 @@ class SupportState(State):
     task_id: str | None = None
     difficulty: str | None = None
-    score: float = 0.0
     matched_fields: List[str] = Field(default_factory=list)
     attempts_remaining: int = 0

         description="Compact summaries of prior attempts in the episode.",
     )
     feedback: str = Field(default="", description="Step-level grader feedback.")
+    score: float = Field(default=0.01, description="Current cumulative score.")
     attempts_remaining: int = Field(
         default=0,
         description="How many attempts remain before the episode ends.",
     task_id: str | None = None
     difficulty: str | None = None
+    score: float = 0.01
     matched_fields: List[str] = Field(default_factory=list)
     attempts_remaining: int = 0

server/environment.py CHANGED Viewed

@@ -37,6 +37,8 @@ class SupermailEnvironment(Environment):
     """Deterministic customer support email triage environment."""
     SUPPORTS_CONCURRENT_SESSIONS: bool = True
     def __init__(self, task_id: str | None = None):
         self._requested_task_id = task_id
@@ -45,8 +47,12 @@ class SupermailEnvironment(Environment):
         self._task: TaskDefinition | None = None
         self._matched_fields: set[str] = set()
         self._history: list[str] = []
-        self._score = 0.0
-        self._state = SupportState(episode_id=str(uuid4()), step_count=0)
     @property
     def benchmark(self) -> str:
@@ -65,13 +71,13 @@ class SupermailEnvironment(Environment):
         self._task = self._select_task()
         self._matched_fields = set()
         self._history = []
-        self._score = 0.0
         self._state = SupportState(
             episode_id=str(uuid4()),
             step_count=0,
             task_id=self._task.task_id,
             difficulty=self._task.difficulty,
-            score=0.0,
             matched_fields=[],
             attempts_remaining=self._task.max_attempts,
         )
@@ -143,6 +149,14 @@ class SupermailEnvironment(Environment):
                 decision[field_name] = value
         return decision
     def _assess(self, decision: dict[str, str]) -> StepAssessment:
         if self._task is None:
             raise RuntimeError("Task not initialized.")
@@ -186,13 +200,8 @@ class SupermailEnvironment(Environment):
         if self._state.step_count > 3 and matched_fields != set(self._task.required_fields):
             reward -= 0.05
-        score = round(
-            min(
-                1.0,
-                sum(self._task.field_weights[field] for field in matched_fields),
-            ),
-            2,
-        )
         success = matched_fields == set(self._task.required_fields)
         done = success or self._state.step_count >= self._task.max_attempts

     """Deterministic customer support email triage environment."""
     SUPPORTS_CONCURRENT_SESSIONS: bool = True
+    MIN_SCORE: float = 0.01
+    MAX_SCORE: float = 0.99
     def __init__(self, task_id: str | None = None):
         self._requested_task_id = task_id
         self._task: TaskDefinition | None = None
         self._matched_fields: set[str] = set()
         self._history: list[str] = []
+        self._score = self._bounded_score(0.0)
+        self._state = SupportState(
+            episode_id=str(uuid4()),
+            step_count=0,
+            score=self._score,
+        )
     @property
     def benchmark(self) -> str:
         self._task = self._select_task()
         self._matched_fields = set()
         self._history = []
+        self._score = self._bounded_score(0.0)
         self._state = SupportState(
             episode_id=str(uuid4()),
             step_count=0,
             task_id=self._task.task_id,
             difficulty=self._task.difficulty,
+            score=self._score,
             matched_fields=[],
             attempts_remaining=self._task.max_attempts,
         )
                 decision[field_name] = value
         return decision
+    def _bounded_score(self, raw_score: float) -> float:
+        """Map raw progress into the open interval (0, 1)."""
+        clamped_raw_score = min(max(raw_score, 0.0), 1.0)
+        scaled_score = self.MIN_SCORE + (
+            clamped_raw_score * (self.MAX_SCORE - self.MIN_SCORE)
+        )
+        return round(scaled_score, 2)
     def _assess(self, decision: dict[str, str]) -> StepAssessment:
         if self._task is None:
             raise RuntimeError("Task not initialized.")
         if self._state.step_count > 3 and matched_fields != set(self._task.required_fields):
             reward -= 0.05
+        raw_score = sum(self._task.field_weights[field] for field in matched_fields)
+        score = self._bounded_score(raw_score)
         success = matched_fields == set(self._task.required_fields)
         done = success or self._state.step_count >= self._task.max_attempts