OGrohit committed on
Commit
1972eae
·
1 Parent(s): 4c76730

Update baseline scores, fix inference.py, remove junk file

Browse files
Files changed (2) hide show
  1. README.md +6 -12
  2. inference.py +2 -2
README.md CHANGED
@@ -463,18 +463,12 @@ Scores produced by `inference.py` using `llama-3.3-70b-versatile` via Groq API (
463
  |---|---|---|
464
  | Single Service Crash | Easy | 1.0000 |
465
  | Cascading Failure | Medium | 0.6500 |
466
- | Silent Degradation | Hard | 0.0000 |
467
- | **Average** | | **0.5500** |
468
-
469
- Expected ranges based on design:
470
- - Single crash: 0.75–0.85 → **Exceeded (1.0000)**
471
- - Cascading failure: 0.45–0.60 → **Exceeded (0.6500)**
472
- - Silent degradation: 0.20–0.40 → **Below range (0.0000 — see note)**
473
-
474
- > **Note:** LLM-based scoring varies across runs due to non-deterministic model behavior.
475
- > The Silent Degradation task is hardest β€” it requires distinguishing signal from 60% noise
476
- > and making a nuanced P2 judgment (not an outage yet). Scores on this task can range
477
- > from 0.0 to 0.55 depending on the model's log parsing on that specific run.
478
 
479
  ---
480
 
 
463
  |---|---|---|
464
  | Single Service Crash | Easy | 1.0000 |
465
  | Cascading Failure | Medium | 0.6500 |
466
+ | Silent Degradation | Hard | 1.0000 |
467
+ | **Average** | | **0.8833** |
468
+
469
+ > **Note:** Silent Degradation (Hard) requires distinguishing signal from 60% noise
470
+ > and making a nuanced P2 judgment. The model successfully filtered noise and identified
471
+ > `payment-db` as root cause with `flush-cache:payment-db` remediation.
 
 
 
 
 
 
472
 
473
  ---
474
 
inference.py CHANGED
@@ -258,7 +258,7 @@ def run_task(client: OpenAI, task_id: str, seed: int = 42) -> dict:
258
  feedback = obs.get("last_action_feedback", "")
259
  actions_taken.append(action)
260
  print(f" Step {steps_taken}: {action['action_type']}({action['value']}) "
261
- f"→ reward={reward:+.2f} | {feedback[:50]}")
262
  except Exception as e:
263
  print(f" Step {steps_taken}: environment error: {e}")
264
  break
@@ -343,7 +343,7 @@ def main():
343
  score = result["score"]
344
  steps = result["steps_taken"]
345
  total += score
346
- bar = "█" * int(score * 20) + "░" * (20 - int(score * 20))
347
  print(f"{task:<25} {score:.4f} [{bar}] ({steps} steps)")
348
  for k, v in result.get("breakdown", {}).items():
349
  print(f" {k:<20} {v}")
 
258
  feedback = obs.get("last_action_feedback", "")
259
  actions_taken.append(action)
260
  print(f" Step {steps_taken}: {action['action_type']}({action['value']}) "
261
+ f"-> reward={reward:+.2f} | {feedback[:50]}")
262
  except Exception as e:
263
  print(f" Step {steps_taken}: environment error: {e}")
264
  break
 
343
  score = result["score"]
344
  steps = result["steps_taken"]
345
  total += score
346
+ bar = "#" * int(score * 20) + "-" * (20 - int(score * 20))
347
  print(f"{task:<25} {score:.4f} [{bar}] ({steps} steps)")
348
  for k, v in result.get("breakdown", {}).items():
349
  print(f" {k:<20} {v}")