Spaces:
Running
Running
Update baseline scores, fix inference.py, remove junk file
Browse files- README.md +6 -12
- inference.py +2 -2
README.md
CHANGED
|
@@ -463,18 +463,12 @@ Scores produced by `inference.py` using `llama-3.3-70b-versatile` via Groq API (
|
|
| 463 |
|---|---|---|
|
| 464 |
| Single Service Crash | Easy | 1.0000 |
|
| 465 |
| Cascading Failure | Medium | 0.6500 |
|
| 466 |
-
| Silent Degradation | Hard | 0.0000 |
|
| 467 |
-
| **Average** | | **0.5500** |
|
| 468 |
-
|
| 469 |
-
|
| 470 |
-
|
| 471 |
-
-
|
| 472 |
-
- Silent degradation: 0.20–0.40 — **Below range (0.0000 — see note)**
|
| 473 |
-
|
| 474 |
-
> **Note:** LLM-based scoring varies across runs due to non-deterministic model behavior.
|
| 475 |
-
> The Silent Degradation task is hardest — it requires distinguishing signal from 60% noise
|
| 476 |
-
> and making a nuanced P2 judgment (not an outage yet). Scores on this task can range
|
| 477 |
-
> from 0.0 to 0.55 depending on the model's log parsing on that specific run.
|
| 478 |
|
| 479 |
---
|
| 480 |
|
|
|
|
| 463 |
|---|---|---|
|
| 464 |
| Single Service Crash | Easy | 1.0000 |
|
| 465 |
| Cascading Failure | Medium | 0.6500 |
|
| 466 |
+
| Silent Degradation | Hard | 1.0000 |
|
| 467 |
+
| **Average** | | **0.8833** |
|
| 468 |
+
|
| 469 |
+
> **Note:** Silent Degradation (Hard) requires distinguishing signal from 60% noise
|
| 470 |
+
> and making a nuanced P2 judgment. The model successfully filtered noise and identified
|
| 471 |
+
> `payment-db` as root cause with `flush-cache:payment-db` remediation.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 472 |
|
| 473 |
---
|
| 474 |
|
inference.py
CHANGED
|
@@ -258,7 +258,7 @@ def run_task(client: OpenAI, task_id: str, seed: int = 42) -> dict:
|
|
| 258 |
feedback = obs.get("last_action_feedback", "")
|
| 259 |
actions_taken.append(action)
|
| 260 |
print(f" Step {steps_taken}: {action['action_type']}({action['value']}) "
|
| 261 |
-
f"
|
| 262 |
except Exception as e:
|
| 263 |
print(f" Step {steps_taken}: environment error: {e}")
|
| 264 |
break
|
|
@@ -343,7 +343,7 @@ def main():
|
|
| 343 |
score = result["score"]
|
| 344 |
steps = result["steps_taken"]
|
| 345 |
total += score
|
| 346 |
-
bar = "
|
| 347 |
print(f"{task:<25} {score:.4f} [{bar}] ({steps} steps)")
|
| 348 |
for k, v in result.get("breakdown", {}).items():
|
| 349 |
print(f" {k:<20} {v}")
|
|
|
|
| 258 |
feedback = obs.get("last_action_feedback", "")
|
| 259 |
actions_taken.append(action)
|
| 260 |
print(f" Step {steps_taken}: {action['action_type']}({action['value']}) "
|
| 261 |
+
f"-> reward={reward:+.2f} | {feedback[:50]}")
|
| 262 |
except Exception as e:
|
| 263 |
print(f" Step {steps_taken}: environment error: {e}")
|
| 264 |
break
|
|
|
|
| 343 |
score = result["score"]
|
| 344 |
steps = result["steps_taken"]
|
| 345 |
total += score
|
| 346 |
+
bar = "#" * int(score * 20) + "-" * (20 - int(score * 20))
|
| 347 |
print(f"{task:<25} {score:.4f} [{bar}] ({steps} steps)")
|
| 348 |
for k, v in result.get("breakdown", {}).items():
|
| 349 |
print(f" {k:<20} {v}")
|