--- a/benchmarks/validate.py +++ b/benchmarks/validate.py @@ -107,7 +107,7 @@ def make_mock(task_name): mock = MockLLMBackend() t = TASKS[task_name] def actor(msgs): text = " ".join(m.content for m in msgs) - has_h = "Learned Strategies" in text and "None yet" not in text + has_h = ("Learned Strategies" in text or "When:" in text) and "None yet" not in text code = t["good"] if has_h else t["bad"]