Spaces:
Sleeping
Sleeping
chore: update doc string
Browse files
- models.py +1 -1
- server/llm_judge.py +1 -1
models.py
CHANGED
|
@@ -29,7 +29,7 @@ class WhyDidItFailState(State):
|
|
| 29 |
required_sources: list[str] = Field(default_factory=list, description=
|
| 30 |
"Sources the agent must inspect before submitting a valid diagnosis.")
|
| 31 |
max_steps: int = Field(0, description=
|
| 32 |
-
"Hard step ceiling for this episode. Exceeding it terminates with score 0.")
|
| 33 |
|
| 34 |
|
| 35 |
class WhyDidItFailObservation(Observation):
|
|
|
|
| 29 |
required_sources: list[str] = Field(default_factory=list, description=
|
| 30 |
"Sources the agent must inspect before submitting a valid diagnosis.")
|
| 31 |
max_steps: int = Field(0, description=
|
| 32 |
+
"Hard step ceiling for this episode. Exceeding it terminates with score 0.10.")
|
| 33 |
|
| 34 |
|
| 35 |
class WhyDidItFailObservation(Observation):
|
server/llm_judge.py
CHANGED
|
@@ -63,7 +63,7 @@ def judge(
|
|
| 63 |
scenario: dict,
|
| 64 |
inspection_order: list[str],
|
| 65 |
) -> float | None:
|
| 66 |
-
"""Score reasoning quality. Returns 0.
|
| 67 |
|
| 68 |
None signals the caller to skip judge weighting entirely and use the
|
| 69 |
keyword score at full weight (1.0) rather than 0.85.
|
|
|
|
| 63 |
scenario: dict,
|
| 64 |
inspection_order: list[str],
|
| 65 |
) -> float | None:
|
| 66 |
+
"""Score reasoning quality. Returns 0.10–0.90, or None if unavailable/failed.
|
| 67 |
|
| 68 |
None signals the caller to skip judge weighting entirely and use the
|
| 69 |
keyword score at full weight (1.0) rather than 0.85.
|