samrat-rm Claude Sonnet 4.6 committed on
Commit
d3b224f
·
1 Parent(s): bf98c78

fix: clamp all rewards and scores to [0.10, 0.90]

Browse files

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

client.py CHANGED
@@ -50,7 +50,7 @@ class WhyDidItFailEnv(EnvClient[WhyDidItFailAction, WhyDidItFailObservation, Why
50
  visible_data=obs_data.get("visible_data", {}),
51
  available_actions=obs_data.get("available_actions", []),
52
  steps_taken=obs_data.get("steps_taken", 0),
53
- reward=obs_data.get("reward", 0.01),
54
  done=obs_data.get("done", False),
55
  feedback=obs_data.get("feedback", ""),
56
  )
 
50
  visible_data=obs_data.get("visible_data", {}),
51
  available_actions=obs_data.get("available_actions", []),
52
  steps_taken=obs_data.get("steps_taken", 0),
53
+ reward=obs_data.get("reward", 0.10),
54
  done=obs_data.get("done", False),
55
  feedback=obs_data.get("feedback", ""),
56
  )
inference.py CHANGED
@@ -13,7 +13,7 @@ Stdout format:
13
  [END] success=<bool> steps=<n> reward=<float> (per episode)
14
  [END] score=<float> (final overall)
15
 
16
- All reward and score values are strictly in (0.01, 0.99).
17
  """
18
 
19
  import asyncio
@@ -197,7 +197,7 @@ async def run_episode(
197
  rewards: List[float] = []
198
  inspection_order: List[str] = []
199
  submit_action: WhyDidItFailAction | None = None
200
- score = 0.01
201
  success = False
202
 
203
  try:
@@ -209,10 +209,10 @@ async def run_episode(
209
  try:
210
  result = await env.step(action)
211
  except ConnectionClosedError as e:
212
- print(f"[STEP] step={step} action={action.action_type} reward=0.01 done=true error={e}", flush=True)
213
  break
214
  obs = result.observation
215
- reward = round(max(0.01, min(0.99, result.reward or 0.01)), 2)
216
  done = result.done
217
  if action.action_type in ("inspect_logs", "inspect_config", "inspect_gradients"):
218
  source = action.action_type.replace("inspect_", "")
@@ -231,7 +231,7 @@ async def run_episode(
231
  break
232
 
233
  # WebSocket is closed — safe to call the judge now
234
- keyword_score = max(0.01, min(0.99, rewards[-1])) if rewards else 0.01
235
  judge_score: float | None = None
236
  if submit_action is not None:
237
  judge_score = llm_judge(
@@ -244,17 +244,17 @@ async def run_episode(
244
  inspection_order=inspection_order,
245
  )
246
  if judge_score is None:
247
- score = round(max(0.01, min(0.99, keyword_score)), 2)
248
  # print(f" [JUDGE] scenario={scenario_key} keyword={keyword_score:.2f} reasoning=n/a total={score:.2f}", file=sys.stderr, flush=True)
249
  else:
250
- score = round(max(0.01, min(0.99, 0.85 * keyword_score + 0.15 * judge_score)), 2)
251
  # print(f" [JUDGE] scenario={scenario_key} keyword={keyword_score:.3f} reasoning={judge_score:.3f} total={score:.3f}", file=sys.stderr, flush=True)
252
 
253
  success = score >= SUCCESS_THRESHOLD
254
 
255
  finally:
256
  steps_taken = len(rewards)
257
- final_reward = f"{rewards[-1]:.2f}" if rewards else "0.01"
258
  print(f"[END] success={str(success).lower()} steps={steps_taken} reward={final_reward}", flush=True)
259
 
260
  return {"scenario_key": scenario_key, "score": score, "steps": steps_taken, "success": success}, env
@@ -280,7 +280,7 @@ async def run_task(task_name: str, scenario_keys: List[str], env: WhyDidItFailEn
280
  results.append(res)
281
 
282
  scores = [r["score"] for r in results]
283
- task_score = round(max(0.01, min(0.99, sum(scores) / len(scores))), 2) if scores else 0.01
284
  print(f"[END] score={task_score}", flush=True)
285
  return scores
286
 
 
13
  [END] success=<bool> steps=<n> reward=<float> (per episode)
14
  [END] score=<float> (final overall)
15
 
16
+ All reward and score values are in [0.10, 0.90].
17
  """
18
 
19
  import asyncio
 
197
  rewards: List[float] = []
198
  inspection_order: List[str] = []
199
  submit_action: WhyDidItFailAction | None = None
200
+ score = 0.10
201
  success = False
202
 
203
  try:
 
209
  try:
210
  result = await env.step(action)
211
  except ConnectionClosedError as e:
212
+ print(f"[STEP] step={step} action={action.action_type} reward=0.10 done=true error={e}", flush=True)
213
  break
214
  obs = result.observation
215
+ reward = round(max(0.10, min(0.90, result.reward or 0.10)), 2)
216
  done = result.done
217
  if action.action_type in ("inspect_logs", "inspect_config", "inspect_gradients"):
218
  source = action.action_type.replace("inspect_", "")
 
231
  break
232
 
233
  # WebSocket is closed — safe to call the judge now
234
+ keyword_score = max(0.10, min(0.90, rewards[-1])) if rewards else 0.10
235
  judge_score: float | None = None
236
  if submit_action is not None:
237
  judge_score = llm_judge(
 
244
  inspection_order=inspection_order,
245
  )
246
  if judge_score is None:
247
+ score = round(max(0.10, min(0.90, keyword_score)), 2)
248
  # print(f" [JUDGE] scenario={scenario_key} keyword={keyword_score:.2f} reasoning=n/a total={score:.2f}", file=sys.stderr, flush=True)
249
  else:
250
+ score = round(max(0.10, min(0.90, 0.85 * keyword_score + 0.15 * judge_score)), 2)
251
  # print(f" [JUDGE] scenario={scenario_key} keyword={keyword_score:.3f} reasoning={judge_score:.3f} total={score:.3f}", file=sys.stderr, flush=True)
252
 
253
  success = score >= SUCCESS_THRESHOLD
254
 
255
  finally:
256
  steps_taken = len(rewards)
257
+ final_reward = f"{rewards[-1]:.2f}" if rewards else "0.10"
258
  print(f"[END] success={str(success).lower()} steps={steps_taken} reward={final_reward}", flush=True)
259
 
260
  return {"scenario_key": scenario_key, "score": score, "steps": steps_taken, "success": success}, env
 
280
  results.append(res)
281
 
282
  scores = [r["score"] for r in results]
283
+ task_score = round(max(0.10, min(0.90, sum(scores) / len(scores))), 2) if scores else 0.10
284
  print(f"[END] score={task_score}", flush=True)
285
  return scores
286
 
models.py CHANGED
@@ -42,8 +42,8 @@ class WhyDidItFailObservation(Observation):
42
  "Which action_types are valid on this step.")
43
  steps_taken: int = Field(..., description=
44
  "Number of actions taken so far in this episode.")
45
- reward: float = Field(default=0.01, description= # type: ignore[override]
46
- "Score for the current step. 0.99 = max.")
47
  done: bool = Field(default=False, description=
48
  "True when the episode has ended.")
49
  feedback: str = Field(..., description=
 
42
  "Which action_types are valid on this step.")
43
  steps_taken: int = Field(..., description=
44
  "Number of actions taken so far in this episode.")
45
+ reward: float = Field(default=0.10, description= # type: ignore[override]
46
+ "Score for the current step. 0.90 = max.")
47
  done: bool = Field(default=False, description=
48
  "True when the episode has ended.")
49
  feedback: str = Field(..., description=
openenv.yaml CHANGED
@@ -25,17 +25,17 @@ tasks:
25
  {response}
26
 
27
  You MUST reply with exactly one of these four numbers and nothing else:
28
- 0.95
29
- 0.70
30
  0.30
31
- 0.05
32
 
33
  Rules:
34
- - 0.95: Correct failure mode with reasoning that cites specific numeric values from the logs
35
- - 0.70: Correct failure mode but reasoning is vague or missing specific numbers
36
  - 0.30: Wrong label but description matches a related concept
37
- - 0.05: Wrong failure mode, no diagnosis submitted, or empty response
38
- - If in doubt, return 0.05. NEVER return 0, 1, 0.0, 1.0, or any value not in the list above.
39
 
40
  - id: task_medium
41
  difficulty: medium
@@ -56,17 +56,17 @@ tasks:
56
  {response}
57
 
58
  You MUST reply with exactly one of these four numbers and nothing else:
59
- 0.95
60
- 0.70
61
  0.30
62
- 0.05
63
 
64
  Rules:
65
- - 0.95: Correct failure mode with reasoning citing both log values AND config parameters
66
- - 0.70: Correct failure mode but reasoning only references logs or config, not both
67
  - 0.30: Wrong label but description matches a related concept
68
- - 0.05: Wrong failure mode, no diagnosis submitted, or empty response
69
- - If in doubt, return 0.05. NEVER return 0, 1, 0.0, 1.0, or any value not in the list above.
70
 
71
  - id: task_hard
72
  difficulty: hard
@@ -88,16 +88,16 @@ tasks:
88
  {response}
89
 
90
  You MUST reply with exactly one of these five numbers and nothing else:
91
- 0.95
92
- 0.80
93
  0.50
94
  0.20
95
- 0.05
96
 
97
  Rules:
98
- - 0.95: Correct failure mode AND a specific actionable fix addressing the root cause
99
- - 0.80: Correct failure mode with a reasonable fix that lacks specifics
100
  - 0.50: Correct failure mode but fix is vague, wrong, or missing
101
  - 0.20: Wrong failure mode but fix is incidentally relevant
102
- - 0.05: Wrong failure mode, no useful fix, no diagnosis submitted, or empty response
103
- - If in doubt, return 0.05. NEVER return 0, 1, 0.0, 1.0, or any value not in the list above.
 
25
  {response}
26
 
27
  You MUST reply with exactly one of these four numbers and nothing else:
28
+ 0.85
29
+ 0.65
30
  0.30
31
+ 0.15
32
 
33
  Rules:
34
+ - 0.85: Correct failure mode with reasoning that cites specific numeric values from the logs
35
+ - 0.65: Correct failure mode but reasoning is vague or missing specific numbers
36
  - 0.30: Wrong label but description matches a related concept
37
+ - 0.15: Wrong failure mode, no diagnosis submitted, or empty response
38
+ - If in doubt, return 0.15. NEVER return 0, 1, 0.0, 1.0, or any value outside [0.10, 0.90].
39
 
40
  - id: task_medium
41
  difficulty: medium
 
56
  {response}
57
 
58
  You MUST reply with exactly one of these four numbers and nothing else:
59
+ 0.85
60
+ 0.65
61
  0.30
62
+ 0.15
63
 
64
  Rules:
65
+ - 0.85: Correct failure mode with reasoning citing both log values AND config parameters
66
+ - 0.65: Correct failure mode but reasoning only references logs or config, not both
67
  - 0.30: Wrong label but description matches a related concept
68
+ - 0.15: Wrong failure mode, no diagnosis submitted, or empty response
69
+ - If in doubt, return 0.15. NEVER return 0, 1, 0.0, 1.0, or any value outside [0.10, 0.90].
70
 
71
  - id: task_hard
72
  difficulty: hard
 
88
  {response}
89
 
90
  You MUST reply with exactly one of these five numbers and nothing else:
91
+ 0.85
92
+ 0.75
93
  0.50
94
  0.20
95
+ 0.15
96
 
97
  Rules:
98
+ - 0.85: Correct failure mode AND a specific actionable fix addressing the root cause
99
+ - 0.75: Correct failure mode with a reasonable fix that lacks specifics
100
  - 0.50: Correct failure mode but fix is vague, wrong, or missing
101
  - 0.20: Wrong failure mode but fix is incidentally relevant
102
+ - 0.15: Wrong failure mode, no useful fix, no diagnosis submitted, or empty response
103
+ - If in doubt, return 0.15. NEVER return 0, 1, 0.0, 1.0, or any value outside [0.10, 0.90].
server/WhyDidItFail_environment.py CHANGED
@@ -56,7 +56,7 @@ class WhyDidItFailEnvironment(Environment):
56
  visible_data={"hint": "Start by inspecting the training logs."},
57
  available_actions=["inspect_logs", "inspect_config", "inspect_gradients", "submit_diagnosis"],
58
  steps_taken=0,
59
- reward=0.01,
60
  done=False,
61
  feedback="Investigation started.",
62
  )
@@ -67,18 +67,18 @@ class WhyDidItFailEnvironment(Environment):
67
 
68
  self._state.step_count += 1
69
 
70
- # Hard step limit — terminate immediately, grade() will return 0.01.
71
  if self._state.step_count > self.max_steps and action.action_type != "submit_diagnosis":
72
  return WhyDidItFailObservation(
73
  task_description="Step limit reached. Episode terminated.",
74
  visible_data={},
75
  available_actions=[],
76
  steps_taken=self._state.step_count,
77
- reward=0.01,
78
  done=True,
79
  feedback=(
80
  f"Step limit ({self.max_steps}) reached without a diagnosis. "
81
- f"Score: 0.01. Actual failure: '{self.scenario['correct_diagnosis']}'."
82
  ),
83
  )
84
  required: list[str] = self.scenario.get("required_sources", ["logs"])
@@ -143,30 +143,32 @@ class WhyDidItFailEnvironment(Environment):
143
  visible_data={},
144
  available_actions=["inspect_logs", "inspect_config", "inspect_gradients", "submit_diagnosis"],
145
  steps_taken=self._state.step_count,
146
- reward=-0.05,
147
  done=False,
148
- feedback=f"Unknown action '{action.action_type}'. No reward.",
149
  )
150
 
151
  # Rewards decay as more required sources are discovered — first clue is worth most.
152
- _REQUIRED_STEP_REWARDS = [0.10, 0.07, 0.05]
 
153
 
154
  def _inspect_reward(self, source: str, required: list[str]) -> float:
155
  """Return step reward for an inspect action.
156
 
157
- Required sources: progressive — +0.10 / +0.07 / +0.05 for 1st/2nd/3rd discovery.
158
- Irrelevant sources: -0.03 (mild; some exploration is acceptable).
159
- Re-inspection: -0.05 (waste).
 
160
  """
161
  if source in self.inspection_order:
162
- return -0.05 # redundant inspection
163
 
164
  if source in required:
165
  n_found = sum(1 for s in self.inspection_order if s in required)
166
  idx = min(n_found, len(self._REQUIRED_STEP_REWARDS) - 1)
167
  return self._REQUIRED_STEP_REWARDS[idx]
168
 
169
- return -0.03 # irrelevant source
170
 
171
  def _inspect_feedback(self, source: str, required: list[str], reward: float) -> str:
172
  label = {"logs": "training logs", "config": "hyperparameter config", "gradients": "gradient statistics"}[source]
 
56
  visible_data={"hint": "Start by inspecting the training logs."},
57
  available_actions=["inspect_logs", "inspect_config", "inspect_gradients", "submit_diagnosis"],
58
  steps_taken=0,
59
+ reward=0.10,
60
  done=False,
61
  feedback="Investigation started.",
62
  )
 
67
 
68
  self._state.step_count += 1
69
 
70
+ # Hard step limit — terminate immediately, grade() will return 0.10.
71
  if self._state.step_count > self.max_steps and action.action_type != "submit_diagnosis":
72
  return WhyDidItFailObservation(
73
  task_description="Step limit reached. Episode terminated.",
74
  visible_data={},
75
  available_actions=[],
76
  steps_taken=self._state.step_count,
77
+ reward=0.10,
78
  done=True,
79
  feedback=(
80
  f"Step limit ({self.max_steps}) reached without a diagnosis. "
81
+ f"Score: 0.10. Actual failure: '{self.scenario['correct_diagnosis']}'."
82
  ),
83
  )
84
  required: list[str] = self.scenario.get("required_sources", ["logs"])
 
143
  visible_data={},
144
  available_actions=["inspect_logs", "inspect_config", "inspect_gradients", "submit_diagnosis"],
145
  steps_taken=self._state.step_count,
146
+ reward=0.10,
147
  done=False,
148
+ feedback=f"Unknown action '{action.action_type}'. Minimum reward.",
149
  )
150
 
151
  # Rewards decay as more required sources are discovered — first clue is worth most.
152
+ # All values are in [0.10, 0.90] — no negative rewards.
153
+ _REQUIRED_STEP_REWARDS = [0.50, 0.30, 0.15]
154
 
155
  def _inspect_reward(self, source: str, required: list[str]) -> float:
156
  """Return step reward for an inspect action.
157
 
158
+ Required sources: progressive — 0.50 / 0.30 / 0.15 for 1st/2nd/3rd discovery.
159
+ Irrelevant sources: 0.10 (minimum; mild penalty via contrast with required rewards).
160
+ Re-inspection: 0.10 (minimum; waste with no new information).
161
+ All values are strictly in [0.10, 0.90].
162
  """
163
  if source in self.inspection_order:
164
+ return 0.10 # redundant inspection — minimum reward
165
 
166
  if source in required:
167
  n_found = sum(1 for s in self.inspection_order if s in required)
168
  idx = min(n_found, len(self._REQUIRED_STEP_REWARDS) - 1)
169
  return self._REQUIRED_STEP_REWARDS[idx]
170
 
171
+ return 0.10 # irrelevant source — minimum reward
172
 
173
  def _inspect_feedback(self, source: str, required: list[str], reward: float) -> str:
174
  label = {"logs": "training logs", "config": "hyperparameter config", "gradients": "gradient statistics"}[source]
server/graders.py CHANGED
@@ -6,7 +6,7 @@ grade() is the single entry point. It scores the full episode trajectory:
6
  diagnosis_score (0.00 – 0.70) was the diagnosis correct?
7
  evidence_score (0.00 – 0.15) did the agent inspect the right sources?
8
  efficiency_score (0.00 – 0.15) did the agent act without waste?
9
- fix_bonus (0.00 – 0.15) did the agent suggest a valid fix? (bonus, capped at 0.99)
10
 
11
  Step-level partial rewards are returned by the environment's step() on every action,
12
  giving the agent a signal over the full trajectory before the episode ends.
@@ -197,7 +197,7 @@ def grade(
197
  Single unified grade function. Scores every scenario identically.
198
 
199
  Total score = diagnosis_score + evidence_score + efficiency_score + fix_bonus
200
- clamped to [0.01, 0.99].
201
 
202
  Max achievable without fix: 0.70 + 0.15 + 0.15 = 1.00
203
  Max achievable with fix: 0.70 + 0.15 + 0.15 + 0.15 = 1.00 (capped)
@@ -210,7 +210,7 @@ def grade(
210
  max_steps = len(required) * 3 + 2 # hard ceiling; exceeding it = total failure
211
 
212
  if steps_taken > max_steps:
213
- return 0.01
214
 
215
  d_score = _diagnosis_score(diagnosis, scenario)
216
  ed_penalty = _evidence_diagnosis_penalty(diagnosis, scenario, inspection_order)
@@ -221,4 +221,4 @@ def grade(
221
 
222
  total = d_score + ed_penalty + e_score + f_score + b_score + o_bonus
223
 
224
- return round(max(0.01, min(0.99, total)), 2)
 
6
  diagnosis_score (0.00 – 0.70) was the diagnosis correct?
7
  evidence_score (0.00 – 0.15) did the agent inspect the right sources?
8
  efficiency_score (0.00 – 0.15) did the agent act without waste?
9
+ fix_bonus (0.00 – 0.15) did the agent suggest a valid fix? (bonus, capped at 0.90)
10
 
11
  Step-level partial rewards are returned by the environment's step() on every action,
12
  giving the agent a signal over the full trajectory before the episode ends.
 
197
  Single unified grade function. Scores every scenario identically.
198
 
199
  Total score = diagnosis_score + evidence_score + efficiency_score + fix_bonus
200
+ clamped to [0.10, 0.90].
201
 
202
  Max achievable without fix: 0.70 + 0.15 + 0.15 = 1.00
203
  Max achievable with fix: 0.70 + 0.15 + 0.15 + 0.15 = 1.00 (capped)
 
210
  max_steps = len(required) * 3 + 2 # hard ceiling; exceeding it = total failure
211
 
212
  if steps_taken > max_steps:
213
+ return 0.10
214
 
215
  d_score = _diagnosis_score(diagnosis, scenario)
216
  ed_penalty = _evidence_diagnosis_penalty(diagnosis, scenario, inspection_order)
 
221
 
222
  total = d_score + ed_penalty + e_score + f_score + b_score + o_bonus
223
 
224
+ return round(max(0.10, min(0.90, total)), 2)
server/llm_judge.py CHANGED
@@ -3,16 +3,16 @@ LLM Judge — reasoning quality scorer for WhyDidItFail.
3
 
4
  Called from inference.py after submit_diagnosis.
5
  Uses the same OpenAI-compatible client and model as the agent.
6
- Returns a normalized score in [0.0, 1.0] representing reasoning quality.
7
- Returns 0.0 silently if reasoning is absent or the call fails.
8
 
9
- Scoring criteria (0–5 each, total 0–15 → normalized to 0.0–1.0):
10
  evidence_grounding — does the reasoning cite specific observed values?
11
  causal_chain — does it connect evidence to the failure mode logically?
12
  fix_rationale — is the fix justified by the evidence?
13
 
14
  Final score in inference.py:
15
- total = 0.85 * keyword_score + 0.15 * judge_score → always in [0.0, 1.0]
16
  """
17
 
18
  import json
@@ -90,8 +90,8 @@ def judge(
90
  + data.get("causal_chain", 0)
91
  + data.get("fix_rationale", 0)
92
  )
93
- # normalize: raw 0–15 → 0.01–0.99 (never exact 0 or 1)
94
- return round(max(0.01, min(0.99, raw / 15)), 2)
95
 
96
  except Exception as exc:
97
  print(f" [JUDGE] failed: {exc}", flush=True)
 
3
 
4
  Called from inference.py after submit_diagnosis.
5
  Uses the same OpenAI-compatible client and model as the agent.
6
+ Returns a normalized score in [0.10, 0.90] representing reasoning quality.
7
+ Returns None silently if reasoning is absent or the call fails.
8
 
9
+ Scoring criteria (0–5 each, total 0–15 → normalized to 0.10–0.90):
10
  evidence_grounding — does the reasoning cite specific observed values?
11
  causal_chain — does it connect evidence to the failure mode logically?
12
  fix_rationale — is the fix justified by the evidence?
13
 
14
  Final score in inference.py:
15
+ total = 0.85 * keyword_score + 0.15 * judge_score → always in [0.10, 0.90]
16
  """
17
 
18
  import json
 
90
  + data.get("causal_chain", 0)
91
  + data.get("fix_rationale", 0)
92
  )
93
+ # normalize: raw 0–15 → 0.10–0.90 (never below 0.10 or above 0.90)
94
+ return round(max(0.10, min(0.90, raw / 15)), 2)
95
 
96
  except Exception as exc:
97
  print(f" [JUDGE] failed: {exc}", flush=True)