akseljoonas HF Staff commited on
Commit
73882d9
·
1 Parent(s): 477a013

Improve agent loop, tools, and research subagent for long-running autonomous tasks

Browse files

- Add --no-stream and --max-iterations CLI flags for headless operation
- Simplify context manager: remove staged pruning, track actual token usage
- Add doom-loop detection and context budget to research subagent
- Improve file tools: better enforcement, error messages, descriptions
- Improve bash tool: longer timeouts, background execution guidance
- Add training logging and HP sweep guidance to system prompt
- Relax Python requirement to >=3.11
- Remove unused file_content_cache

agent/config.py CHANGED
@@ -23,8 +23,7 @@ class Config(BaseModel):
23
  session_dataset_repo: str = "akseljoonas/hf-agent-sessions"
24
  auto_save_interval: int = 3 # Save every N user turns (0 = disabled)
25
  yolo_mode: bool = False # Auto-approve all tool calls without confirmation
26
- max_tool_failures_per_turn: int = 3 # Disable a tool after this many failures in one turn
27
- max_requests_per_turn: int = 50 # Hard cap on LLM requests per agent turn
28
 
29
  # Permission control parameters
30
  confirm_cpu_jobs: bool = True
 
23
  session_dataset_repo: str = "akseljoonas/hf-agent-sessions"
24
  auto_save_interval: int = 3 # Save every N user turns (0 = disabled)
25
  yolo_mode: bool = False # Auto-approve all tool calls without confirmation
26
+ max_iterations: int = 300 # Max LLM calls per agent turn (-1 = unlimited)
 
27
 
28
  # Permission control parameters
29
  confirm_cpu_jobs: bool = True
agent/context_manager/manager.py CHANGED
@@ -89,7 +89,7 @@ class ContextManager:
89
  )
90
  self.max_context = max_context - 10000
91
  self.compact_size = int(max_context * compact_size)
92
- self.context_length = max_context
93
  self.untouched_messages = untouched_messages
94
  self.items: list[Message] = [Message(role="system", content=self.system_prompt)]
95
 
@@ -243,114 +243,10 @@ class ContextManager:
243
 
244
  return False
245
 
246
- # Tools whose outputs should never be pruned (too valuable to summarise)
247
- _PRUNE_SKIP_TOOLS: set[str] = {"research", "plan_tool"}
248
-
249
- # Tools whose outputs are pruned via a cheap LLM call instead of
250
- # deterministic truncation (the output structure is too complex for
251
- # a fixed head-slice to capture the answer reliably).
252
- _LLM_PRUNE_TOOLS: set[str] = {"hf_jobs"}
253
-
254
- async def prune_old_tool_outputs(self, model_name: str | None = None) -> None:
255
- """Stage 1 compaction: shrink old tool outputs.
256
-
257
- For any tool message older than the last 6 messages whose content
258
- exceeds 500 chars:
259
- - Tools in _LLM_PRUNE_TOOLS get a cheap LLM summarisation (≤600 tokens).
260
- - All other tools get a deterministic one-line summary.
261
- tool_call_id and name are always preserved.
262
- """
263
- if len(self.items) <= 6:
264
- return
265
-
266
- cutoff = len(self.items) - 6
267
-
268
- # Find the preceding assistant tool_call arguments so the LLM
269
- # knows what question the tool output was answering.
270
- def _find_tool_call_args(tool_call_id: str) -> str | None:
271
- for msg in self.items:
272
- if getattr(msg, "role", None) != "assistant":
273
- continue
274
- for tc in getattr(msg, "tool_calls", None) or []:
275
- tc_id = tc.id if hasattr(tc, "id") else tc.get("id")
276
- if tc_id == tool_call_id:
277
- fn = tc.function if hasattr(tc, "function") else tc.get("function", {})
278
- return fn.arguments if hasattr(fn, "arguments") else fn.get("arguments", "")
279
- return None
280
-
281
- for i in range(cutoff - 1, -1, -1):
282
- msg = self.items[i]
283
- if getattr(msg, "role", None) != "tool":
284
- continue
285
- content = getattr(msg, "content", None) or ""
286
- if len(content) <= 500:
287
- continue
288
-
289
- tool_name = getattr(msg, "name", None) or "tool"
290
- if tool_name in self._PRUNE_SKIP_TOOLS:
291
- continue
292
-
293
- # --- LLM-based pruning for complex tool outputs ---
294
- if tool_name in self._LLM_PRUNE_TOOLS and model_name:
295
- call_args = _find_tool_call_args(getattr(msg, "tool_call_id", ""))
296
- context_line = (
297
- f"The tool was called with: {call_args}\n\n" if call_args else ""
298
- )
299
- try:
300
- hf_key = os.environ.get("INFERENCE_TOKEN")
301
- resp = await acompletion(
302
- model=model_name,
303
- messages=[
304
- Message(
305
- role="user",
306
- content=(
307
- f"{context_line}"
308
- f"Below is the raw output of the '{tool_name}' tool.\n"
309
- "Give the answer to the original request unchanged — "
310
- "preserve all job IDs, numbers, status values, error "
311
- "messages, and metrics exactly. Omit filler/boilerplate. "
312
- "Stay under 600 tokens.\n\n"
313
- f"{content}"
314
- ),
315
- )
316
- ],
317
- max_completion_tokens=600,
318
- api_key=hf_key
319
- if hf_key and model_name.startswith("huggingface/")
320
- else None,
321
- )
322
- msg.content = resp.choices[0].message.content
323
- continue
324
- except Exception:
325
- logger.warning(
326
- "LLM prune failed for %s, falling back to deterministic",
327
- tool_name,
328
- )
329
- # fall through to deterministic pruning below
330
-
331
- # --- Deterministic pruning ---
332
- preview = content[:80]
333
- total = len(content)
334
-
335
- if tool_name == "bash":
336
- exit_code_part = ""
337
- if "exit_code" in content[:200]:
338
- for line in content[:200].splitlines():
339
- if "exit_code" in line:
340
- exit_code_part = "exit_code visible if present, "
341
- break
342
- summary = f"[bash: {exit_code_part}{preview}... ({total} chars)]"
343
- else:
344
- summary = f"[{tool_name}: {preview}... ({total} chars)]"
345
-
346
- msg.content = summary
347
-
348
  async def compact(
349
  self, model_name: str, tool_specs: list[dict] | None = None
350
  ) -> None:
351
  """Remove old messages to keep history under target size"""
352
- await self.prune_old_tool_outputs(model_name=model_name)
353
-
354
  if (self.context_length <= self.max_context) or not self.items:
355
  return
356
 
@@ -358,6 +254,15 @@ class ContextManager:
358
  self.items[0] if self.items and self.items[0].role == "system" else None
359
  )
360
 
 
 
 
 
 
 
 
 
 
361
  # Don't summarize a certain number of just-preceding messages
362
  # Walk back to find a user message to make sure we keep an assistant -> user ->
363
  # assistant general conversation structure
@@ -366,7 +271,7 @@ class ContextManager:
366
  idx -= 1
367
 
368
  recent_messages = self.items[idx:]
369
- messages_to_summarize = self.items[1:idx]
370
 
371
  # improbable, messages would have to be very long
372
  if not messages_to_summarize:
@@ -393,11 +298,11 @@ class ContextManager:
393
  role="assistant", content=response.choices[0].message.content
394
  )
395
 
396
- # Reconstruct: system + summary + recent messages (includes tools)
397
- if system_msg:
398
- self.items = [system_msg, summarized_message] + recent_messages
399
- else:
400
- self.items = [summarized_message] + recent_messages
401
 
402
  self.context_length = (
403
  len(self.system_prompt) // 4 + response.usage.completion_tokens
 
89
  )
90
  self.max_context = max_context - 10000
91
  self.compact_size = int(max_context * compact_size)
92
+ self.context_length = 0 # Updated after each LLM call with actual usage
93
  self.untouched_messages = untouched_messages
94
  self.items: list[Message] = [Message(role="system", content=self.system_prompt)]
95
 
 
243
 
244
  return False
245
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
246
  async def compact(
247
  self, model_name: str, tool_specs: list[dict] | None = None
248
  ) -> None:
249
  """Remove old messages to keep history under target size"""
 
 
250
  if (self.context_length <= self.max_context) or not self.items:
251
  return
252
 
 
254
  self.items[0] if self.items and self.items[0].role == "system" else None
255
  )
256
 
257
+ # Preserve the first user message (task prompt) — never summarize it
258
+ first_user_msg = None
259
+ first_user_idx = 0  # 0 = no user msg found: slice below degrades to items[1:idx] as before
260
+ for i in range(1, len(self.items)):
261
+ if getattr(self.items[i], "role", None) == "user":
262
+ first_user_msg = self.items[i]
263
+ first_user_idx = i
264
+ break
265
+
266
  # Don't summarize a certain number of just-preceding messages
267
  # Walk back to find a user message to make sure we keep an assistant -> user ->
268
  # assistant general conversation structure
 
271
  idx -= 1
272
 
273
  recent_messages = self.items[idx:]
274
+ messages_to_summarize = self.items[first_user_idx + 1:idx]
275
 
276
  # improbable, messages would have to very long
277
  if not messages_to_summarize:
 
298
  role="assistant", content=response.choices[0].message.content
299
  )
300
 
301
+ # Reconstruct: system + first user msg + summary + recent messages
302
+ head = [system_msg] if system_msg else []
303
+ if first_user_msg:
304
+ head.append(first_user_msg)
305
+ self.items = head + [summarized_message] + recent_messages
306
 
307
  self.context_length = (
308
  len(self.system_prompt) // 4 + response.usage.completion_tokens
agent/core/agent_loop.py CHANGED
@@ -153,35 +153,6 @@ _MAX_LLM_RETRIES = 3
153
  _LLM_RETRY_DELAYS = [5, 15, 30] # seconds between retries
154
 
155
 
156
- def _append_failure_warning(
157
- output: str,
158
- tool_name: str,
159
- tool_error_counts: dict[str, int],
160
- max_failures: int,
161
- ) -> str:
162
- """Track a tool failure and append a warning to the output.
163
-
164
- Returns the output with an appended warning indicating how many
165
- failures have occurred and whether the LLM should switch approach.
166
- """
167
- tool_error_counts[tool_name] = tool_error_counts.get(tool_name, 0) + 1
168
- count = tool_error_counts[tool_name]
169
- if count >= max_failures:
170
- return output + (
171
- f"\n\n⚠ Tool '{tool_name}' has now failed "
172
- f"{count} times this turn. You should try a "
173
- f"different approach instead of calling this "
174
- f"tool again."
175
- )
176
- remaining = max_failures - count
177
- return output + (
178
- f"\n\n⚠ Tool '{tool_name}' has failed "
179
- f"{count}/{max_failures} times this turn. "
180
- f"{remaining} attempt(s) before you should "
181
- f"switch to a different approach."
182
- )
183
-
184
-
185
  def _is_transient_error(error: Exception) -> bool:
186
  """Return True for errors that are likely transient and worth retrying."""
187
  err_str = str(error).lower()
@@ -200,10 +171,12 @@ def _is_transient_error(error: Exception) -> bool:
200
 
201
  async def _compact_and_notify(session: Session) -> None:
202
  """Run compaction and send event if context was reduced."""
203
- await session.context_manager.prune_old_tool_outputs(
204
- model_name=session.config.model_name,
205
- )
206
  old_length = session.context_manager.context_length
 
 
 
 
 
207
  tool_specs = session.tool_router.get_tool_specs_for_llm()
208
  await session.context_manager.compact(
209
  model_name=session.config.model_name,
@@ -211,6 +184,11 @@ async def _compact_and_notify(session: Session) -> None:
211
  )
212
  new_length = session.context_manager.context_length
213
  if new_length != old_length:
 
 
 
 
 
214
  await session.send_event(
215
  Event(
216
  event_type="compacted",
@@ -446,7 +424,7 @@ class Handlers:
446
 
447
  @staticmethod
448
  async def run_agent(
449
- session: Session, text: str, max_iterations: int = 300
450
  ) -> str | None:
451
  """
452
  Handle user input (like user_input_or_turn in codex.rs:1291)
@@ -474,10 +452,9 @@ class Handlers:
474
  iteration = 0
475
  final_response = None
476
  errored = False
477
- tool_error_counts: dict[str, int] = {}
478
 
479
- effective_max = min(max_iterations, session.config.max_requests_per_turn)
480
- while iteration < effective_max:
481
  # ── Cancellation check: before LLM call ──
482
  if session.is_cancelled:
483
  break
@@ -582,6 +559,34 @@ class Handlers:
582
 
583
  # If no tool calls, add assistant message and we're done
584
  if not tool_calls:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
585
  if content:
586
  assistant_msg = Message(role="assistant", content=content)
587
  session.context_manager.add_message(assistant_msg, token_count)
@@ -722,15 +727,7 @@ class Handlers:
722
  results = gather_task.result()
723
 
724
  # 4. Record results and send outputs (order preserved)
725
- max_failures = session.config.max_tool_failures_per_turn
726
  for tc, tool_name, tool_args, output, success in results:
727
- if not success:
728
- output = _append_failure_warning(
729
- output, tool_name, tool_error_counts, max_failures,
730
- )
731
- else:
732
- tool_error_counts.pop(tool_name, None)
733
-
734
  tool_msg = Message(
735
  role="tool",
736
  content=output,
@@ -788,6 +785,14 @@ class Handlers:
788
 
789
  except ContextWindowExceededError:
790
  # Force compact and retry this iteration
 
 
 
 
 
 
 
 
791
  session.context_manager.context_length = (
792
  session.context_manager.max_context + 1
793
  )
 
153
  _LLM_RETRY_DELAYS = [5, 15, 30] # seconds between retries
154
 
155
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156
  def _is_transient_error(error: Exception) -> bool:
157
  """Return True for errors that are likely transient and worth retrying."""
158
  err_str = str(error).lower()
 
171
 
172
  async def _compact_and_notify(session: Session) -> None:
173
  """Run compaction and send event if context was reduced."""
 
 
 
174
  old_length = session.context_manager.context_length
175
+ max_ctx = session.context_manager.max_context
176
+ logger.debug(
177
+ "Compaction check: context_length=%d, max_context=%d, needs_compact=%s",
178
+ old_length, max_ctx, old_length > max_ctx,
179
+ )
180
  tool_specs = session.tool_router.get_tool_specs_for_llm()
181
  await session.context_manager.compact(
182
  model_name=session.config.model_name,
 
184
  )
185
  new_length = session.context_manager.context_length
186
  if new_length != old_length:
187
+ logger.warning(
188
+ "Context compacted: %d -> %d tokens (max=%d, %d messages)",
189
+ old_length, new_length, max_ctx,
190
+ len(session.context_manager.items),
191
+ )
192
  await session.send_event(
193
  Event(
194
  event_type="compacted",
 
424
 
425
  @staticmethod
426
  async def run_agent(
427
+ session: Session, text: str,
428
  ) -> str | None:
429
  """
430
  Handle user input (like user_input_or_turn in codex.rs:1291)
 
452
  iteration = 0
453
  final_response = None
454
  errored = False
455
+ max_iterations = session.config.max_iterations
456
 
457
+ while max_iterations == -1 or iteration < max_iterations:
 
458
  # ── Cancellation check: before LLM call ──
459
  if session.is_cancelled:
460
  break
 
559
 
560
  # If no tool calls, add assistant message and we're done
561
  if not tool_calls:
562
+ logger.warning(
563
+ "Agent loop ending: no tool calls. "
564
+ "finish_reason=%s, token_count=%d, "
565
+ "context_length=%d, max_context=%d, "
566
+ "iteration=%d/%d, "
567
+ "response_text=%s",
568
+ finish_reason,
569
+ token_count,
570
+ session.context_manager.context_length,
571
+ session.context_manager.max_context,
572
+ iteration,
573
+ max_iterations,
574
+ (content or "")[:500],
575
+ )
576
+ await session.send_event(
577
+ Event(
578
+ event_type="tool_log",
579
+ data={
580
+ "tool": "system",
581
+ "log": (
582
+ f"Loop exit: no tool calls. "
583
+ f"finish_reason={finish_reason}, "
584
+ f"tokens={token_count}/{session.context_manager.max_context}, "
585
+ f"iter={iteration}/{max_iterations}"
586
+ ),
587
+ },
588
+ )
589
+ )
590
  if content:
591
  assistant_msg = Message(role="assistant", content=content)
592
  session.context_manager.add_message(assistant_msg, token_count)
 
727
  results = gather_task.result()
728
 
729
  # 4. Record results and send outputs (order preserved)
 
730
  for tc, tool_name, tool_args, output, success in results:
 
 
 
 
 
 
 
731
  tool_msg = Message(
732
  role="tool",
733
  content=output,
 
785
 
786
  except ContextWindowExceededError:
787
  # Force compact and retry this iteration
788
+ logger.warning(
789
+ "ContextWindowExceededError at iteration %d — forcing compaction "
790
+ "(context_length=%d, max_context=%d, messages=%d)",
791
+ iteration,
792
+ session.context_manager.context_length,
793
+ session.context_manager.max_context,
794
+ len(session.context_manager.items),
795
+ )
796
  session.context_manager.context_length = (
797
  session.context_manager.max_context + 1
798
  )
agent/core/session.py CHANGED
@@ -12,7 +12,6 @@ from typing import Any, Optional
12
 
13
  from agent.config import Config
14
  from agent.context_manager.manager import ContextManager
15
- from agent.tools.file_content_cache import FileContentCache
16
 
17
  logger = logging.getLogger(__name__)
18
 
@@ -110,8 +109,6 @@ class Session:
110
  self.sandbox = None
111
  self._running_job_ids: set[str] = set() # HF job IDs currently executing
112
 
113
- self.file_content_cache = FileContentCache()
114
-
115
  # Session trajectory logging
116
  self.logged_events: list[dict] = []
117
  self.session_start_time = datetime.now().isoformat()
 
12
 
13
  from agent.config import Config
14
  from agent.context_manager.manager import ContextManager
 
15
 
16
  logger = logging.getLogger(__name__)
17
 
 
109
  self.sandbox = None
110
  self._running_job_ids: set[str] = set() # HF job IDs currently executing
111
 
 
 
112
  # Session trajectory logging
113
  self.logged_events: list[dict] = []
114
  self.session_start_time = datetime.now().isoformat()
agent/main.py CHANGED
@@ -858,7 +858,12 @@ async def main():
858
  get_console().print("\n[dim]Bye.[/dim]\n")
859
 
860
 
861
- async def headless_main(prompt: str, model: str | None = None) -> None:
 
 
 
 
 
862
  """Run a single prompt headlessly and exit."""
863
  import logging
864
 
@@ -876,12 +881,13 @@ async def headless_main(prompt: str, model: str | None = None) -> None:
876
  config.yolo_mode = True # Auto-approve everything in headless mode
877
 
878
  if model:
879
- if model not in VALID_MODEL_IDS:
880
- print(f"ERROR: Unknown model '{model}'. Valid: {', '.join(VALID_MODEL_IDS)}", file=sys.stderr)
881
- sys.exit(1)
882
  config.model_name = model
883
 
 
 
 
884
  print(f"Model: {config.model_name}", file=sys.stderr)
 
885
  print(f"Prompt: {prompt}", file=sys.stderr)
886
  print("---", file=sys.stderr)
887
 
@@ -900,7 +906,7 @@ async def headless_main(prompt: str, model: str | None = None) -> None:
900
  session_holder=session_holder,
901
  hf_token=hf_token,
902
  local_mode=True,
903
- stream=True,
904
  )
905
  )
906
 
@@ -922,6 +928,7 @@ async def headless_main(prompt: str, model: str | None = None) -> None:
922
  shimmer = _ThinkingShimmer(console)
923
  stream_buf = _StreamBuffer(console)
924
  _hl_last_tool = [None]
 
925
  shimmer.start()
926
 
927
  while True:
@@ -960,6 +967,26 @@ async def headless_main(prompt: str, model: str | None = None) -> None:
960
  log = event.data.get("log", "") if event.data else ""
961
  if log:
962
  print_tool_log(tool, log)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
963
  elif event.event_type == "compacted":
964
  old_tokens = event.data.get("old_tokens", 0) if event.data else 0
965
  new_tokens = event.data.get("new_tokens", 0) if event.data else 0
@@ -973,6 +1000,8 @@ async def headless_main(prompt: str, model: str | None = None) -> None:
973
  elif event.event_type in ("turn_complete", "interrupted"):
974
  shimmer.stop()
975
  stream_buf.discard()
 
 
976
  break
977
 
978
  # Shutdown
@@ -999,11 +1028,18 @@ if __name__ == "__main__":
999
  parser = argparse.ArgumentParser(description="Hugging Face Agent CLI")
1000
  parser.add_argument("prompt", nargs="?", default=None, help="Run headlessly with this prompt")
1001
  parser.add_argument("--model", "-m", default=None, help=f"Model to use (default: from config)")
 
 
 
 
1002
  args = parser.parse_args()
1003
 
1004
  try:
1005
  if args.prompt:
1006
- asyncio.run(headless_main(args.prompt, model=args.model))
 
 
 
1007
  else:
1008
  asyncio.run(main())
1009
  except KeyboardInterrupt:
 
858
  get_console().print("\n[dim]Bye.[/dim]\n")
859
 
860
 
861
+ async def headless_main(
862
+ prompt: str,
863
+ model: str | None = None,
864
+ max_iterations: int | None = None,
865
+ stream: bool = True,
866
+ ) -> None:
867
  """Run a single prompt headlessly and exit."""
868
  import logging
869
 
 
881
  config.yolo_mode = True # Auto-approve everything in headless mode
882
 
883
  if model:
 
 
 
884
  config.model_name = model
885
 
886
+ if max_iterations is not None:
887
+ config.max_iterations = max_iterations
888
+
889
  print(f"Model: {config.model_name}", file=sys.stderr)
890
+ print(f"Max iterations: {config.max_iterations}", file=sys.stderr)
891
  print(f"Prompt: {prompt}", file=sys.stderr)
892
  print("---", file=sys.stderr)
893
 
 
906
  session_holder=session_holder,
907
  hf_token=hf_token,
908
  local_mode=True,
909
+ stream=stream,
910
  )
911
  )
912
 
 
928
  shimmer = _ThinkingShimmer(console)
929
  stream_buf = _StreamBuffer(console)
930
  _hl_last_tool = [None]
931
+ _hl_sub_id = [1]
932
  shimmer.start()
933
 
934
  while True:
 
967
  log = event.data.get("log", "") if event.data else ""
968
  if log:
969
  print_tool_log(tool, log)
970
+ elif event.event_type == "approval_required":
971
+ # Auto-approve everything in headless mode (safety net if yolo_mode
972
+ # didn't prevent the approval event for some reason)
973
+ tools_data = event.data.get("tools", []) if event.data else []
974
+ approvals = [
975
+ {
976
+ "tool_call_id": t.get("tool_call_id", ""),
977
+ "approved": True,
978
+ "feedback": None,
979
+ }
980
+ for t in tools_data
981
+ ]
982
+ _hl_sub_id[0] += 1
983
+ await submission_queue.put(Submission(
984
+ id=f"hl_approval_{_hl_sub_id[0]}",
985
+ operation=Operation(
986
+ op_type=OpType.EXEC_APPROVAL,
987
+ data={"approvals": approvals},
988
+ ),
989
+ ))
990
  elif event.event_type == "compacted":
991
  old_tokens = event.data.get("old_tokens", 0) if event.data else 0
992
  new_tokens = event.data.get("new_tokens", 0) if event.data else 0
 
1000
  elif event.event_type in ("turn_complete", "interrupted"):
1001
  shimmer.stop()
1002
  stream_buf.discard()
1003
+ history_size = event.data.get("history_size", "?") if event.data else "?"
1004
+ print(f"\n--- Agent {event.event_type} (history_size={history_size}) ---", file=sys.stderr)
1005
  break
1006
 
1007
  # Shutdown
 
1028
  parser = argparse.ArgumentParser(description="Hugging Face Agent CLI")
1029
  parser.add_argument("prompt", nargs="?", default=None, help="Run headlessly with this prompt")
1030
  parser.add_argument("--model", "-m", default=None, help=f"Model to use (default: from config)")
1031
+ parser.add_argument("--max-iterations", type=int, default=None,
1032
+ help="Max LLM requests per turn (default: 300, use -1 for unlimited)")
1033
+ parser.add_argument("--no-stream", action="store_true",
1034
+ help="Disable token streaming (use non-streaming LLM calls)")
1035
  args = parser.parse_args()
1036
 
1037
  try:
1038
  if args.prompt:
1039
+ max_iter = args.max_iterations
1040
+ if max_iter is not None and max_iter < 0:
1041
+ max_iter = -1  # run_agent treats -1 as unlimited — pass it through
1042
+ asyncio.run(headless_main(args.prompt, model=args.model, max_iterations=max_iter, stream=not args.no_stream))
1043
  else:
1044
  asyncio.run(main())
1045
  except KeyboardInterrupt:
agent/prompts/system_prompt_v3.yaml CHANGED
@@ -46,6 +46,8 @@ system_prompt: |
46
  2. Validate dataset: hf_inspect_dataset or hub_repo_details to confirm column names and format
47
  3. Validate model: hub_repo_details to confirm model exists, correct architecture/size/tokenizer
48
 
 
 
49
  Dataset format requirements by training method:
50
  SFT: "messages", "text", or "prompt"/"completion"
51
  DPO: "prompt", "chosen", "rejected"
@@ -112,6 +114,35 @@ system_prompt: |
112
  Do not stop after describing what you plan to do. Continue calling tools until the task is verifiably done.
113
  Do not mark plan tasks as completed if they failed or are only partially done.
114
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
  # Communication
116
 
117
  - Be concise and direct. No filler, no restating what the user said.
 
46
  2. Validate dataset: hf_inspect_dataset or hub_repo_details to confirm column names and format
47
  3. Validate model: hub_repo_details to confirm model exists, correct architecture/size/tokenizer
48
 
49
+ Training logging: always set disable_tqdm=True, logging_strategy="steps", and logging_first_step=True in your TrainingArguments/SFTConfig so loss values are printed as plain text lines you can grep, not hidden inside tqdm progress bars.
50
+
51
  Dataset format requirements by training method:
52
  SFT: "messages", "text", or "prompt"/"completion"
53
  DPO: "prompt", "chosen", "rejected"
 
114
  Do not stop after describing what you plan to do. Continue calling tools until the task is verifiably done.
115
  Do not mark plan tasks as completed if they failed or are only partially done.
116
 
117
+ # Autonomous / headless mode
118
+
119
+ When running autonomously (no human in the loop), you MUST follow these rules:
120
+
121
+ NEVER respond with only text. Every response MUST include at least one tool call. If you have nothing to do, check the plan, check the timer, or verify outputs. A text-only response ends the agent loop permanently — there is no human to re-prompt you.
122
+
123
+ NEVER STOP WORKING. Do NOT decide you are "done" while time remains. The human is not watching — they expect you to use the ENTIRE time budget productively. Do NOT ask "should I continue?" or "is this a good stopping point?" — there is nobody to answer. You run until the timer expires or you are manually killed.
124
+
125
+ Your workflow is a loop, not a checklist. Once you have a working result, KEEP ITERATING:
126
+
127
+ LOOP UNTIL TIME RUNS OUT:
128
+ 1. Research the approach (read docs, find examples, check current APIs)
129
+ 2. Implement the solution (write code, set up training)
130
+ 3. Train and evaluate
131
+ 4. Save the model to the required output location
132
+ 5. Check the timer — if time remains, improve: tune hyperparameters, try different data preprocessing, adjust the training recipe, try a different approach entirely
133
+ 6. Go to step 1
134
+
135
+ HYPERPARAMETER TUNING: Do not tune hyperparameters by hand one-at-a-time. Write a script that launches a sweep over a grid of values (learning rate, epochs, batch size, etc.) and evaluates each run automatically. One well-designed sweep script beats ten manual experiments.
136
+
137
+ If you run out of ideas: research. Use the research tool to find papers on the task or technique — look for recent methods, ablation results, tricks that worked for similar problems. Re-read the task prompt for angles you missed. Re-read the training logs for clues. Try combining approaches from different papers. Try a fundamentally different strategy from the literature. There is always a paper you haven't read yet.
138
+
139
+ Check the remaining time periodically with the timer command specified in the task prompt. Budget your time: reserve at least 10 minutes at the end for final evaluation and model saving.
140
+
141
+ The task is NOT done until:
142
+ - The required output directory exists (e.g. final_model/) with a valid model
143
+ - You have evaluated the model and confirmed it works
144
+ - The timer has expired or is about to expire
145
+
146
  # Communication
147
 
148
  - Be concise and direct. No filler, no restating what the user said.
agent/tools/edit_utils.py CHANGED
@@ -181,7 +181,11 @@ def apply_edit(
181
  if old_str not in content:
182
  original_match, fuzzy_note = fuzzy_find_original_match(content, old_str)
183
  if original_match is None:
184
- raise ValueError("old_str not found in file.")
 
 
 
 
185
  old_str = original_match
186
 
187
  count = content.count(old_str)
@@ -189,8 +193,10 @@ def apply_edit(
189
  if mode == "replace":
190
  if count > 1 and not replace_all:
191
  raise ValueError(
192
- f"old_str appears {count} times. Use replace_all=true to replace all, "
193
- "or provide a more specific old_str."
 
 
194
  )
195
  if replace_all:
196
  new_content = content.replace(old_str, new_str)
 
181
  if old_str not in content:
182
  original_match, fuzzy_note = fuzzy_find_original_match(content, old_str)
183
  if original_match is None:
184
+ raise ValueError(
185
+ "old_str was not found in the file. Make sure old_str matches "
186
+ "the file contents exactly, including whitespace and indentation. "
187
+ "Use the read tool to verify the current file contents before retrying."
188
+ )
189
  old_str = original_match
190
 
191
  count = content.count(old_str)
 
193
  if mode == "replace":
194
  if count > 1 and not replace_all:
195
  raise ValueError(
196
+ f"Found {count} matches of old_str in the file, but replace_all is "
197
+ f"false. To replace all occurrences, set replace_all to true. To "
198
+ f"replace only one, provide a larger old_str with more surrounding "
199
+ f"context to uniquely identify the instance."
200
  )
201
  if replace_all:
202
  new_content = content.replace(old_str, new_str)
agent/tools/file_content_cache.py DELETED
@@ -1,40 +0,0 @@
1
- """Cache for detecting unchanged local file re-reads."""
2
-
3
- from __future__ import annotations
4
-
5
- import hashlib
6
-
7
-
8
- def _short_hash(content: str) -> str:
9
- return hashlib.sha256(content.encode()).hexdigest()[:16]
10
-
11
-
12
- def _resolve(path: str) -> str:
13
- try:
14
- from pathlib import Path
15
- return str(Path(path).resolve())
16
- except Exception:
17
- return path
18
-
19
-
20
- class FileContentCache:
21
- """Tracks file content hashes to skip re-reading unchanged files."""
22
-
23
- def __init__(self) -> None:
24
- self._cache: dict[str, tuple[str, int]] = {}
25
-
26
- def record_read(self, path: str, content: str, turn: int) -> None:
27
- key = _resolve(path)
28
- self._cache[key] = (_short_hash(content), turn)
29
-
30
- def check_unchanged(self, path: str, content: str) -> tuple[bool, int | None]:
31
- key = _resolve(path)
32
- cached = self._cache.get(key)
33
- if cached is None:
34
- return False, None
35
- cached_hash, turn = cached
36
- return _short_hash(content) == cached_hash, turn
37
-
38
- def clear_path(self, path: str) -> None:
39
- key = _resolve(path)
40
- self._cache.pop(key, None)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
agent/tools/local_tools.py CHANGED
@@ -15,16 +15,25 @@ import tempfile
15
  from pathlib import Path
16
  from typing import Any
17
 
18
- from agent.tools.sandbox_client import Sandbox
19
 
20
  MAX_OUTPUT_CHARS = 25_000
21
- MAX_LINE_LENGTH = 2000
22
  DEFAULT_READ_LINES = 2000
23
  DEFAULT_TIMEOUT = 120
24
- MAX_TIMEOUT = 600
25
 
26
  _ANSI_RE = re.compile(r'\x1b\[[0-9;]*[a-zA-Z]|\x1b\].*?\x07')
27
 
 
 
 
 
 
 
 
 
 
 
28
 
29
  def _atomic_write(path: Path, content: str) -> None:
30
  """Write file atomically via temp file + os.replace().
@@ -78,6 +87,7 @@ def _truncate_output(output: str, max_chars: int = MAX_OUTPUT_CHARS, head_ratio:
78
  meta = f"\n\n... ({omitted:,} of {total:,} chars omitted, showing first {head_budget:,} + last {tail_budget:,}) ...\n"
79
  if spill_path:
80
  meta += f"Full output saved to {spill_path} — use the read tool with offset/limit to inspect specific sections.\n"
 
81
  return head + meta + tail
82
 
83
 
@@ -104,7 +114,14 @@ async def _bash_handler(args: dict[str, Any], **_kw) -> tuple[str, bool]:
104
  output = "(no output)"
105
  return output, result.returncode == 0
106
  except subprocess.TimeoutExpired:
107
- return f"Command timed out after {timeout}s.", False
 
 
 
 
 
 
 
108
  except Exception as e:
109
  return f"bash error: {e}", False
110
 
@@ -123,17 +140,7 @@ async def _read_handler(args: dict[str, Any], **_kw) -> tuple[str, bool]:
123
  except Exception as e:
124
  return f"read error: {e}", False
125
 
126
- # Check if file is unchanged since last read
127
- session = _kw.get("session")
128
- if session is not None:
129
- is_unchanged, last_turn = session.file_content_cache.check_unchanged(
130
- file_path, raw_content
131
- )
132
- if is_unchanged:
133
- return (
134
- f"[File unchanged since turn {last_turn}, "
135
- f"content already in context.]"
136
- ), True
137
 
138
  lines = raw_content.splitlines()
139
  offset = max((args.get("offset") or 1), 1)
@@ -146,11 +153,6 @@ async def _read_handler(args: dict[str, Any], **_kw) -> tuple[str, bool]:
146
  line = line[:MAX_LINE_LENGTH] + "..."
147
  numbered.append(f"{i:>6}\t{line}")
148
 
149
- if session is not None:
150
- session.file_content_cache.record_read(
151
- file_path, raw_content, session.turn_count
152
- )
153
-
154
  return "\n".join(numbered), True
155
 
156
 
@@ -160,11 +162,14 @@ async def _write_handler(args: dict[str, Any], **_kw) -> tuple[str, bool]:
160
  if not file_path:
161
  return "No path provided.", False
162
  p = Path(file_path)
 
 
 
 
 
163
  try:
164
  _atomic_write(p, content)
165
- session = _kw.get("session")
166
- if session is not None:
167
- session.file_content_cache.clear_path(file_path)
168
  msg = f"Wrote {len(content)} bytes to {file_path}"
169
  # Syntax validation for Python files
170
  if p.suffix == ".py":
@@ -194,6 +199,11 @@ async def _edit_handler(args: dict[str, Any], **_kw) -> tuple[str, bool]:
194
  p = Path(file_path)
195
  if not p.exists():
196
  return f"File not found: {file_path}", False
 
 
 
 
 
197
 
198
  try:
199
  text = p.read_text()
@@ -212,10 +222,6 @@ async def _edit_handler(args: dict[str, Any], **_kw) -> tuple[str, bool]:
212
  except Exception as e:
213
  return f"edit write error: {e}", False
214
 
215
- session = _kw.get("session")
216
- if session is not None:
217
- session.file_content_cache.clear_path(file_path)
218
-
219
  msg = f"Edited {file_path} ({replacements} replacement{'s' if replacements > 1 else ''})"
220
  if fuzzy_note:
221
  msg += f" {fuzzy_note}"
@@ -234,18 +240,22 @@ _LOCAL_TOOL_SPECS = {
234
  "description": (
235
  "Run a shell command on the local machine and return stdout/stderr.\n"
236
  "\n"
237
- "Commands run in a shell at the working directory (default: current directory). "
238
- "Each invocation is independent.\n"
239
- "\n"
240
- "AVOID using bash for operations covered by specialized tools:\n"
241
- "- File reading: use read (not cat/head/tail)\n"
242
- "- File editing: use edit (not sed/awk)\n"
243
- "- File writing: use write (not echo/cat <<EOF)\n"
244
  "\n"
 
245
  "Chain dependent commands with &&. Independent commands should be "
246
  "separate bash calls (they can run in parallel).\n"
247
  "\n"
248
- "Timeout default 120s, max 600s."
 
 
 
 
 
 
249
  ),
250
  "parameters": {
251
  "type": "object",
@@ -266,22 +276,125 @@ _LOCAL_TOOL_SPECS = {
266
  },
267
  "timeout": {
268
  "type": "integer",
269
- "description": "Timeout in seconds (default: 120, max: 600).",
270
  },
271
  },
272
  },
273
  },
274
  "read": {
275
- "description": Sandbox.TOOLS["read"]["description"],
276
- "parameters": Sandbox.TOOLS["read"]["parameters"],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
277
  },
278
  "write": {
279
- "description": Sandbox.TOOLS["write"]["description"],
280
- "parameters": Sandbox.TOOLS["write"]["parameters"],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
281
  },
282
  "edit": {
283
- "description": Sandbox.TOOLS["edit"]["description"],
284
- "parameters": Sandbox.TOOLS["edit"]["parameters"],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
285
  },
286
  }
287
 
 
15
  from pathlib import Path
16
  from typing import Any
17
 
 
18
 
19
  MAX_OUTPUT_CHARS = 25_000
20
+ MAX_LINE_LENGTH = 4000
21
  DEFAULT_READ_LINES = 2000
22
  DEFAULT_TIMEOUT = 120
23
+ MAX_TIMEOUT = 36000 # 10 hours — needed for long training runs (e.g. PostTrainBench)
24
 
25
  _ANSI_RE = re.compile(r'\x1b\[[0-9;]*[a-zA-Z]|\x1b\].*?\x07')
26
 
27
+ # Track files that have been read this session (enforces read-before-write/edit)
28
+ _files_read: set[str] = set()
29
+
30
+
31
+ def _resolve_path(path: str) -> str:
32
+ try:
33
+ return str(Path(path).resolve())
34
+ except Exception:
35
+ return path
36
+
37
 
38
  def _atomic_write(path: Path, content: str) -> None:
39
  """Write file atomically via temp file + os.replace().
 
87
  meta = f"\n\n... ({omitted:,} of {total:,} chars omitted, showing first {head_budget:,} + last {tail_budget:,}) ...\n"
88
  if spill_path:
89
  meta += f"Full output saved to {spill_path} — use the read tool with offset/limit to inspect specific sections.\n"
90
+ meta += "IMPORTANT: The command has finished. Analyze the output above and continue with your next action.\n"
91
  return head + meta + tail
92
 
93
 
 
114
  output = "(no output)"
115
  return output, result.returncode == 0
116
  except subprocess.TimeoutExpired:
117
+ return (
118
+ f"Command timed out after {timeout}s and was killed.\n\n"
119
+ f"For long-running commands, run in the background and poll:\n"
120
+ f" nohup <command> > /tmp/output.log 2>&1 & echo $!\n"
121
+ f"Then check status with:\n"
122
+ f" kill -0 <PID> 2>/dev/null && echo 'running' || echo 'done'\n"
123
+ f" tail -n 50 /tmp/output.log"
124
+ ), False
125
  except Exception as e:
126
  return f"bash error: {e}", False
127
 
 
140
  except Exception as e:
141
  return f"read error: {e}", False
142
 
143
+ _files_read.add(_resolve_path(file_path))
 
 
 
 
 
 
 
 
 
 
144
 
145
  lines = raw_content.splitlines()
146
  offset = max((args.get("offset") or 1), 1)
 
153
  line = line[:MAX_LINE_LENGTH] + "..."
154
  numbered.append(f"{i:>6}\t{line}")
155
 
 
 
 
 
 
156
  return "\n".join(numbered), True
157
 
158
 
 
162
  if not file_path:
163
  return "No path provided.", False
164
  p = Path(file_path)
165
+ if p.exists() and _resolve_path(file_path) not in _files_read:
166
+ return (
167
+ f"You must read {file_path} before overwriting it. "
168
+ f"Use the read tool first to see current contents."
169
+ ), False
170
  try:
171
  _atomic_write(p, content)
172
+ _files_read.add(_resolve_path(file_path))
 
 
173
  msg = f"Wrote {len(content)} bytes to {file_path}"
174
  # Syntax validation for Python files
175
  if p.suffix == ".py":
 
199
  p = Path(file_path)
200
  if not p.exists():
201
  return f"File not found: {file_path}", False
202
+ if _resolve_path(file_path) not in _files_read:
203
+ return (
204
+ f"You must read {file_path} before editing it. "
205
+ f"Use the read tool first to see current contents."
206
+ ), False
207
 
208
  try:
209
  text = p.read_text()
 
222
  except Exception as e:
223
  return f"edit write error: {e}", False
224
 
 
 
 
 
225
  msg = f"Edited {file_path} ({replacements} replacement{'s' if replacements > 1 else ''})"
226
  if fuzzy_note:
227
  msg += f" {fuzzy_note}"
 
240
  "description": (
241
  "Run a shell command on the local machine and return stdout/stderr.\n"
242
  "\n"
243
+ "IMPORTANT: Do NOT use bash for file operations use the dedicated tools instead:\n"
244
+ "- To read files: use read (not cat/head/tail)\n"
245
+ "- To edit files: use edit (not sed/awk)\n"
246
+ "- To write files: use write (not echo/cat <<EOF)\n"
 
 
 
247
  "\n"
248
+ "Commands run in a shell at the working directory. Each invocation is independent.\n"
249
  "Chain dependent commands with &&. Independent commands should be "
250
  "separate bash calls (they can run in parallel).\n"
251
  "\n"
252
+ "For long-running commands (training, evaluation), run in the background and poll:\n"
253
+ " nohup <command> > /tmp/output.log 2>&1 & echo $!\n"
254
+ "Then check status:\n"
255
+ " kill -0 <PID> 2>/dev/null && echo 'running' || echo 'done'\n"
256
+ " tail -n 50 /tmp/output.log\n"
257
+ "\n"
258
+ "Timeout default 120s, max 36000s."
259
  ),
260
  "parameters": {
261
  "type": "object",
 
276
  },
277
  "timeout": {
278
  "type": "integer",
279
+ "description": "Optional timeout in seconds (default: 120, max: 36000).",
280
  },
281
  },
282
  },
283
  },
284
  "read": {
285
+ "description": (
286
+ "Reads a file from the local filesystem. Returns contents with line numbers "
287
+ "(cat -n format).\n"
288
+ "\n"
289
+ "Usage:\n"
290
+ "- By default, reads up to 2000 lines from the beginning of the file.\n"
291
+ "- You can optionally specify offset and limit for large files, but prefer "
292
+ "reading the whole file first.\n"
293
+ "- Lines longer than 4000 chars are truncated.\n"
294
+ "- Cannot read directories — use bash with 'ls' instead.\n"
295
+ "- You should read multiple potentially useful files in parallel when possible.\n"
296
+ "- IMPORTANT: Always read a file before editing or overwriting it. The edit and "
297
+ "write tools will reject operations on files you haven't read."
298
+ ),
299
+ "parameters": {
300
+ "type": "object",
301
+ "required": ["path"],
302
+ "additionalProperties": False,
303
+ "properties": {
304
+ "path": {
305
+ "type": "string",
306
+ "description": "Absolute path to the file to read.",
307
+ },
308
+ "offset": {
309
+ "type": "integer",
310
+ "description": "The line number to start reading from (1-based). Only provide if the file is too large to read at once.",
311
+ },
312
+ "limit": {
313
+ "type": "integer",
314
+ "description": "The number of lines to read. Only provide if the file is too large to read at once.",
315
+ },
316
+ },
317
+ },
318
  },
319
  "write": {
320
+ "description": (
321
+ "Writes a file to the local filesystem. Overwrites the existing file if one "
322
+ "exists at the path.\n"
323
+ "\n"
324
+ "- If this is an existing file, you MUST use the read tool first. This tool "
325
+ "will fail if you did not read the file first.\n"
326
+ "- ALWAYS prefer editing existing files with the edit tool over overwriting "
327
+ "with write.\n"
328
+ "- Creates parent directories as needed."
329
+ ),
330
+ "parameters": {
331
+ "type": "object",
332
+ "required": ["path", "content"],
333
+ "additionalProperties": False,
334
+ "properties": {
335
+ "path": {
336
+ "type": "string",
337
+ "description": "Absolute path to the file to write.",
338
+ },
339
+ "content": {
340
+ "type": "string",
341
+ "description": "The complete file content to write.",
342
+ },
343
+ },
344
+ },
345
  },
346
  "edit": {
347
+ "description": (
348
+ "Performs string replacements in files. Supports exact matching with "
349
+ "fuzzy fallback.\n"
350
+ "\n"
351
+ "Usage:\n"
352
+ "- You must read the file at least once before editing. This tool will "
353
+ "error if you attempt an edit without reading the file.\n"
354
+ "- The edit will FAIL if old_str is not unique in the file. Either provide "
355
+ "a larger string with more surrounding context to make it unique, or set "
356
+ "replace_all to true.\n"
357
+ "- old_str and new_str must differ.\n"
358
+ "- Preserve indentation exactly as it appears in the file.\n"
359
+ "- Do NOT include line number prefixes from read output in old_str or new_str.\n"
360
+ "- To delete code, set new_str to empty string.\n"
361
+ "- Use replace_all for renaming variables or strings across the file.\n"
362
+ "\n"
363
+ "Modes:\n"
364
+ "- replace (default): replace first occurrence of old_str with new_str.\n"
365
+ "- append_after: insert new_str immediately after old_str (old_str is kept).\n"
366
+ "- prepend_before: insert new_str immediately before old_str (old_str is kept)."
367
+ ),
368
+ "parameters": {
369
+ "type": "object",
370
+ "required": ["path", "old_str", "new_str"],
371
+ "additionalProperties": False,
372
+ "properties": {
373
+ "path": {
374
+ "type": "string",
375
+ "description": "Absolute path to the file to edit.",
376
+ },
377
+ "old_str": {
378
+ "type": "string",
379
+ "description": "The text to find in the file. Must match exactly (fuzzy matching is used as fallback).",
380
+ },
381
+ "new_str": {
382
+ "type": "string",
383
+ "description": "The replacement text. For append_after/prepend_before modes, the text to insert.",
384
+ },
385
+ "replace_all": {
386
+ "type": "boolean",
387
+ "description": "Replace all occurrences of old_str (default: false).",
388
+ "default": False,
389
+ },
390
+ "mode": {
391
+ "type": "string",
392
+ "enum": ["replace", "append_after", "prepend_before"],
393
+ "description": "Edit mode (default: replace).",
394
+ "default": "replace",
395
+ },
396
+ },
397
+ },
398
  },
399
  }
400
 
agent/tools/research_tool.py CHANGED
@@ -14,10 +14,17 @@ from typing import Any
14
 
15
  from litellm import Message, acompletion
16
 
 
17
  from agent.core.session import Event
18
 
19
  logger = logging.getLogger(__name__)
20
 
 
 
 
 
 
 
21
  # Tools the research agent can use (read-only subset)
22
  RESEARCH_TOOL_NAMES = {
23
  "read",
@@ -171,7 +178,7 @@ def _resolve_llm_params(model_name: str) -> dict:
171
  def _get_research_model(main_model: str) -> str:
172
  """Pick a cheaper model for research based on the main model."""
173
  if "anthropic/" in main_model:
174
- return "anthropic/claude-haiku-4-5-20251001"
175
  # For non-Anthropic models (HF router etc.), use the same model
176
  return main_model
177
 
@@ -221,12 +228,60 @@ async def research_handler(
221
 
222
  _tool_uses = 0
223
  _total_tokens = 0
 
224
 
225
  await _log("Starting research sub-agent...")
226
 
227
- # Run the research loop (max 20 iterations research should be focused)
228
- max_iterations = 20
229
  for _iteration in range(max_iterations):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
230
  try:
231
  response = await acompletion(
232
  messages=messages,
@@ -242,7 +297,7 @@ async def research_handler(
242
 
243
  # Track tokens
244
  if response.usage:
245
- _total_tokens += response.usage.total_tokens
246
  await _log(f"tokens:{_total_tokens}")
247
 
248
  choice = response.choices[0]
@@ -308,8 +363,31 @@ async def research_handler(
308
  )
309
  )
310
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
311
  return (
312
- "Research agent hit iteration limit (20). "
313
  "Partial findings may be incomplete — try a more focused task.",
314
  False,
315
  )
 
14
 
15
  from litellm import Message, acompletion
16
 
17
+ from agent.core.doom_loop import check_for_doom_loop
18
  from agent.core.session import Event
19
 
20
  logger = logging.getLogger(__name__)
21
 
22
+ # Context budget for the research subagent (tokens).
23
+ # When usage exceeds WARN threshold, the subagent is told to wrap up.
24
+ # At MAX, the loop is force-stopped and whatever content exists is returned.
25
+ _RESEARCH_CONTEXT_WARN = 170_000 # 85% of 200k
26
+ _RESEARCH_CONTEXT_MAX = 190_000
27
+
28
  # Tools the research agent can use (read-only subset)
29
  RESEARCH_TOOL_NAMES = {
30
  "read",
 
178
  def _get_research_model(main_model: str) -> str:
179
  """Pick a cheaper model for research based on the main model."""
180
  if "anthropic/" in main_model:
181
+ return "anthropic/claude-sonnet-4-6"
182
  # For non-Anthropic models (HF router etc.), use the same model
183
  return main_model
184
 
 
228
 
229
  _tool_uses = 0
230
  _total_tokens = 0
231
+ _warned_context = False
232
 
233
  await _log("Starting research sub-agent...")
234
 
235
+ # Run the research loop context budget is the real limiter
236
+ max_iterations = 60
237
  for _iteration in range(max_iterations):
238
+ # ── Doom-loop detection ──
239
+ doom_prompt = check_for_doom_loop(messages)
240
+ if doom_prompt:
241
+ logger.warning("Research sub-agent doom loop detected at iteration %d", _iteration)
242
+ await _log("Doom loop detected — injecting corrective prompt")
243
+ messages.append(Message(role="user", content=doom_prompt))
244
+
245
+ # ── Context budget: warn at 75%, hard-stop at 95% ──
246
+ if _total_tokens >= _RESEARCH_CONTEXT_MAX:
247
+ logger.warning(
248
+ "Research sub-agent hit context max (%d tokens) — forcing summary",
249
+ _total_tokens,
250
+ )
251
+ await _log(f"Context limit reached ({_total_tokens} tokens) — forcing wrap-up")
252
+ # Ask for a final summary with no tools
253
+ messages.append(Message(
254
+ role="user",
255
+ content=(
256
+ "[SYSTEM: CONTEXT LIMIT REACHED] You have used all available context. "
257
+ "Summarize your findings NOW. Do NOT call any more tools."
258
+ ),
259
+ ))
260
+ try:
261
+ response = await acompletion(
262
+ messages=messages,
263
+ tools=None, # no tools — force text response
264
+ stream=False,
265
+ timeout=120,
266
+ **llm_params,
267
+ )
268
+ content = response.choices[0].message.content or ""
269
+ return content or "Research context exhausted — no summary produced.", bool(content)
270
+ except Exception:
271
+ return "Research context exhausted and summary call failed.", False
272
+
273
+ if not _warned_context and _total_tokens >= _RESEARCH_CONTEXT_WARN:
274
+ _warned_context = True
275
+ await _log(f"Context at {_total_tokens} tokens — nudging to wrap up")
276
+ messages.append(Message(
277
+ role="user",
278
+ content=(
279
+ "[SYSTEM: You have used 75% of your context budget. "
280
+ "Start wrapping up: finish any critical lookups, then "
281
+ "produce your final summary within the next 1-2 iterations.]"
282
+ ),
283
+ ))
284
+
285
  try:
286
  response = await acompletion(
287
  messages=messages,
 
297
 
298
  # Track tokens
299
  if response.usage:
300
+ _total_tokens = response.usage.total_tokens
301
  await _log(f"tokens:{_total_tokens}")
302
 
303
  choice = response.choices[0]
 
363
  )
364
  )
365
 
366
+ # ── Iteration limit: try to salvage findings ──
367
+ await _log("Iteration limit reached — extracting summary")
368
+ messages.append(Message(
369
+ role="user",
370
+ content=(
371
+ "[SYSTEM: ITERATION LIMIT] You have reached the maximum number of research "
372
+ "iterations. Summarize ALL findings so far. Do NOT call any more tools."
373
+ ),
374
+ ))
375
+ try:
376
+ response = await acompletion(
377
+ messages=messages,
378
+ tools=None,
379
+ stream=False,
380
+ timeout=120,
381
+ **llm_params,
382
+ )
383
+ content = response.choices[0].message.content or ""
384
+ if content:
385
+ return content, True
386
+ except Exception as e:
387
+ logger.error("Research summary call failed: %s", e)
388
+
389
  return (
390
+ "Research agent hit iteration limit (60). "
391
  "Partial findings may be incomplete — try a more focused task.",
392
  False,
393
  )
agent/tools/sandbox_client.py CHANGED
@@ -57,7 +57,7 @@ HARDWARE_OPTIONS = [
57
  "a100-large",
58
  ]
59
  OUTPUT_LIMIT = 25000
60
- LINE_LIMIT = 2000
61
  DEFAULT_READ_LIMIT = 2000
62
  DEFAULT_TIMEOUT = 240
63
  MAX_TIMEOUT = 1200
@@ -855,22 +855,23 @@ class Sandbox:
855
  "description": (
856
  "Run a shell command in the remote sandbox and return stdout/stderr.\n"
857
  "\n"
858
- "Commands run in a shell at the working directory (default /app). "
859
- "Each invocation is independent use files in /app to persist state.\n"
860
- "\n"
861
- "AVOID using bash for operations covered by specialized tools:\n"
862
- "- File reading: use read (not cat/head/tail)\n"
863
- "- File editing: use edit (not sed/awk)\n"
864
- "- File writing: use write (not echo/cat <<EOF)\n"
865
- "\n"
866
- "For long-running tasks, background them:\n"
867
- " nohup uv run train.py > /app/train.log 2>&1 &\n"
868
- "Then check with read on the log file.\n"
869
  "\n"
 
 
870
  "Chain dependent commands with &&. Independent commands should be "
871
  "separate bash calls (they can run in parallel).\n"
872
  "\n"
873
- "Timeout default 120s, max 600s."
 
 
 
 
 
 
874
  ),
875
  "parameters": {
876
  "type": "object",
@@ -883,7 +884,7 @@ class Sandbox:
883
  },
884
  "description": {
885
  "type": "string",
886
- "description": "Short description (5-10 words, active voice). E.g. 'Install dependencies', 'Run training script'.",
887
  },
888
  "work_dir": {
889
  "type": "string",
@@ -891,20 +892,25 @@ class Sandbox:
891
  },
892
  "timeout": {
893
  "type": "integer",
894
- "description": "Timeout in seconds (default: 240, max: 1200).",
895
  },
896
  },
897
  },
898
  },
899
  "read": {
900
  "description": (
901
- "Read file contents with line numbers (cat -n format).\n"
902
- "\n"
903
- "Returns the first 2000 lines by default. For large files, use offset/limit "
904
- "to read a specific range. Line numbers always match the original file.\n"
905
  "\n"
906
- "Lines longer than 2000 chars are truncated.\n"
907
- "Cannot read directories use bash with 'ls' instead."
 
 
 
 
 
 
 
908
  ),
909
  "parameters": {
910
  "type": "object",
@@ -917,21 +923,25 @@ class Sandbox:
917
  },
918
  "offset": {
919
  "type": "integer",
920
- "description": "Start from this line (1-based). Only if file is too large.",
921
  },
922
  "limit": {
923
  "type": "integer",
924
- "description": "Number of lines to read. Only if file is too large.",
925
  },
926
  },
927
  },
928
  },
929
  "write": {
930
  "description": (
931
- "Create or overwrite a file. Creates parent directories as needed.\n"
 
932
  "\n"
933
- "For existing files, you MUST read the file first (system enforced). "
934
- "Prefer edit for modifications."
 
 
 
935
  ),
936
  "parameters": {
937
  "type": "object",
@@ -944,32 +954,32 @@ class Sandbox:
944
  },
945
  "content": {
946
  "type": "string",
947
- "description": "Complete file content.",
948
  },
949
  },
950
  },
951
  },
952
  "edit": {
953
  "description": (
954
- "Targeted edit via string replacement with fuzzy matching fallback.\n"
 
955
  "\n"
956
- "Modes:\n"
957
- "- replace (default): replace first occurrence of old_str with new_str.\n"
958
- "- append_after: insert new_str immediately after old_str (old_str is kept).\n"
959
- "- prepend_before: insert new_str immediately before old_str (old_str is kept).\n"
960
- "\n"
961
- "Rules:\n"
962
- "- old_str must appear EXACTLY once (unless replace_all is true).\n"
963
- "- Include enough context in old_str for uniqueness.\n"
964
  "- old_str and new_str must differ.\n"
965
- "- Preserve indentation exactly.\n"
 
966
  "- To delete code, set new_str to empty string.\n"
967
- "- File MUST have been read this session (system enforced).\n"
968
- "- Do NOT include line number prefixes in old_str/new_str.\n"
969
  "\n"
970
- "If exact match fails, the tool automatically tries trimmed/normalized matching.\n"
971
- "Use replace_all=true for batch operations like variable renaming.\n"
972
- "Use append_after/prepend_before to insert code without replacing existing code."
 
973
  ),
974
  "parameters": {
975
  "type": "object",
@@ -978,16 +988,19 @@ class Sandbox:
978
  "properties": {
979
  "path": {
980
  "type": "string",
981
- "description": "Absolute path to the file.",
982
  },
983
  "old_str": {
984
  "type": "string",
985
- "description": "Text to find (fuzzy matching used as fallback).",
 
 
 
 
986
  },
987
- "new_str": {"type": "string", "description": "Replacement text (or text to insert for append_after/prepend_before)."},
988
  "replace_all": {
989
  "type": "boolean",
990
- "description": "Replace all occurrences (default: false).",
991
  "default": False,
992
  },
993
  "mode": {
 
57
  "a100-large",
58
  ]
59
  OUTPUT_LIMIT = 25000
60
+ LINE_LIMIT = 4000
61
  DEFAULT_READ_LIMIT = 2000
62
  DEFAULT_TIMEOUT = 240
63
  MAX_TIMEOUT = 1200
 
855
  "description": (
856
  "Run a shell command in the remote sandbox and return stdout/stderr.\n"
857
  "\n"
858
+ "IMPORTANT: Do NOT use bash for file operations use the dedicated tools instead:\n"
859
+ "- To read files: use read (not cat/head/tail)\n"
860
+ "- To edit files: use edit (not sed/awk)\n"
861
+ "- To write files: use write (not echo/cat <<EOF)\n"
 
 
 
 
 
 
 
862
  "\n"
863
+ "Commands run in a shell at /app. Each invocation is independent — "
864
+ "use files in /app to persist state.\n"
865
  "Chain dependent commands with &&. Independent commands should be "
866
  "separate bash calls (they can run in parallel).\n"
867
  "\n"
868
+ "For long-running commands (training, evaluation), run in the background and poll:\n"
869
+ " nohup <command> > /app/output.log 2>&1 & echo $!\n"
870
+ "Then check status:\n"
871
+ " kill -0 <PID> 2>/dev/null && echo 'running' || echo 'done'\n"
872
+ " tail -n 50 /app/output.log\n"
873
+ "\n"
874
+ "Timeout default 240s, max 1200s."
875
  ),
876
  "parameters": {
877
  "type": "object",
 
884
  },
885
  "description": {
886
  "type": "string",
887
+ "description": "Short description (5-10 words, active voice).",
888
  },
889
  "work_dir": {
890
  "type": "string",
 
892
  },
893
  "timeout": {
894
  "type": "integer",
895
+ "description": "Optional timeout in seconds (default: 240, max: 1200).",
896
  },
897
  },
898
  },
899
  },
900
  "read": {
901
  "description": (
902
+ "Reads a file from the sandbox filesystem. Returns contents with line "
903
+ "numbers (cat -n format).\n"
 
 
904
  "\n"
905
+ "Usage:\n"
906
+ "- By default, reads up to 2000 lines from the beginning of the file.\n"
907
+ "- You can optionally specify offset and limit for large files, but prefer "
908
+ "reading the whole file first.\n"
909
+ "- Lines longer than 4000 chars are truncated.\n"
910
+ "- Cannot read directories — use bash with 'ls' instead.\n"
911
+ "- You should read multiple potentially useful files in parallel when possible.\n"
912
+ "- IMPORTANT: Always read a file before editing or overwriting it. The edit and "
913
+ "write tools will reject operations on files you haven't read."
914
  ),
915
  "parameters": {
916
  "type": "object",
 
923
  },
924
  "offset": {
925
  "type": "integer",
926
+ "description": "The line number to start reading from (1-based). Only provide if the file is too large to read at once.",
927
  },
928
  "limit": {
929
  "type": "integer",
930
+ "description": "The number of lines to read. Only provide if the file is too large to read at once.",
931
  },
932
  },
933
  },
934
  },
935
  "write": {
936
  "description": (
937
+ "Writes a file to the sandbox filesystem. Overwrites the existing file if "
938
+ "one exists at the path.\n"
939
  "\n"
940
+ "- If this is an existing file, you MUST use the read tool first. This tool "
941
+ "will fail if you did not read the file first.\n"
942
+ "- ALWAYS prefer editing existing files with the edit tool over overwriting "
943
+ "with write.\n"
944
+ "- Creates parent directories as needed."
945
  ),
946
  "parameters": {
947
  "type": "object",
 
954
  },
955
  "content": {
956
  "type": "string",
957
+ "description": "The complete file content to write.",
958
  },
959
  },
960
  },
961
  },
962
  "edit": {
963
  "description": (
964
+ "Performs string replacements in files. Supports exact matching with "
965
+ "fuzzy fallback.\n"
966
  "\n"
967
+ "Usage:\n"
968
+ "- You must read the file at least once before editing. This tool will "
969
+ "error if you attempt an edit without reading the file.\n"
970
+ "- The edit will FAIL if old_str is not unique in the file. Either provide "
971
+ "a larger string with more surrounding context to make it unique, or set "
972
+ "replace_all to true.\n"
 
 
973
  "- old_str and new_str must differ.\n"
974
+ "- Preserve indentation exactly as it appears in the file.\n"
975
+ "- Do NOT include line number prefixes from read output in old_str or new_str.\n"
976
  "- To delete code, set new_str to empty string.\n"
977
+ "- Use replace_all for renaming variables or strings across the file.\n"
 
978
  "\n"
979
+ "Modes:\n"
980
+ "- replace (default): replace first occurrence of old_str with new_str.\n"
981
+ "- append_after: insert new_str immediately after old_str (old_str is kept).\n"
982
+ "- prepend_before: insert new_str immediately before old_str (old_str is kept)."
983
  ),
984
  "parameters": {
985
  "type": "object",
 
988
  "properties": {
989
  "path": {
990
  "type": "string",
991
+ "description": "Absolute path to the file to edit.",
992
  },
993
  "old_str": {
994
  "type": "string",
995
+ "description": "The text to find in the file. Must match exactly (fuzzy matching is used as fallback).",
996
+ },
997
+ "new_str": {
998
+ "type": "string",
999
+ "description": "The replacement text. For append_after/prepend_before modes, the text to insert.",
1000
  },
 
1001
  "replace_all": {
1002
  "type": "boolean",
1003
+ "description": "Replace all occurrences of old_str (default: false).",
1004
  "default": False,
1005
  },
1006
  "mode": {
agent/tools/sandbox_tool.py CHANGED
@@ -245,25 +245,6 @@ def _make_tool_handler(sandbox_tool_name: str):
245
  result = await asyncio.to_thread(sb.call_tool, sandbox_tool_name, args)
246
  if result.success:
247
  output = result.output or "(no output)"
248
- cache = getattr(session, "file_content_cache", None)
249
- file_path = args.get("path", "")
250
-
251
- if sandbox_tool_name == "read" and cache and file_path:
252
- is_unchanged, last_turn = cache.check_unchanged(
253
- f"sandbox:{file_path}", output
254
- )
255
- if is_unchanged:
256
- return (
257
- f"[File unchanged since turn {last_turn}, "
258
- f"content already in context.]"
259
- ), True
260
- cache.record_read(
261
- f"sandbox:{file_path}", output, session.turn_count
262
- )
263
-
264
- if sandbox_tool_name in ("write", "edit") and cache and file_path:
265
- cache.clear_path(f"sandbox:{file_path}")
266
-
267
  return output, True
268
  else:
269
  error_msg = result.error or "Unknown error"
 
245
  result = await asyncio.to_thread(sb.call_tool, sandbox_tool_name, args)
246
  if result.success:
247
  output = result.output or "(no output)"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
248
  return output, True
249
  else:
250
  error_msg = result.error or "Unknown error"
pyproject.toml CHANGED
@@ -3,7 +3,7 @@ name = "hf-agent"
3
  version = "0.1.0"
4
  description = "Add your description here"
5
  readme = "README.md"
6
- requires-python = ">=3.12"
7
  dependencies = [
8
  "datasets>=4.4.1",
9
  # Core dependencies (always required)
@@ -49,3 +49,13 @@ dev = [
49
  all = [
50
  "hf-agent[agent,eval,dev]",
51
  ]
 
 
 
 
 
 
 
 
 
 
 
3
  version = "0.1.0"
4
  description = "Add your description here"
5
  readme = "README.md"
6
+ requires-python = ">=3.11"
7
  dependencies = [
8
  "datasets>=4.4.1",
9
  # Core dependencies (always required)
 
49
  all = [
50
  "hf-agent[agent,eval,dev]",
51
  ]
52
+
53
+ [build-system]
54
+ requires = ["setuptools>=64"]
55
+ build-backend = "setuptools.build_meta"
56
+
57
+ [tool.setuptools.packages.find]
58
+ include = ["agent*"]
59
+
60
+ [tool.uv]
61
+ package = true
uv.lock CHANGED
@@ -871,7 +871,7 @@ wheels = [
871
  [[package]]
872
  name = "hf-agent"
873
  version = "0.1.0"
874
- source = { virtual = "." }
875
  dependencies = [
876
  { name = "datasets" },
877
  { name = "pydantic" },
@@ -890,6 +890,7 @@ agent = [
890
  { name = "nbformat" },
891
  { name = "prompt-toolkit" },
892
  { name = "requests" },
 
893
  { name = "thefuzz" },
894
  { name = "uvicorn", extra = ["standard"] },
895
  { name = "websockets" },
@@ -909,6 +910,7 @@ all = [
909
  { name = "prompt-toolkit" },
910
  { name = "pytest" },
911
  { name = "requests" },
 
912
  { name = "tenacity" },
913
  { name = "thefuzz" },
914
  { name = "uvicorn", extra = ["standard"] },
@@ -945,6 +947,7 @@ requires-dist = [
945
  { name = "pytest", marker = "extra == 'dev'", specifier = ">=9.0.2" },
946
  { name = "python-dotenv", specifier = ">=1.2.1" },
947
  { name = "requests", marker = "extra == 'agent'", specifier = ">=2.32.5" },
 
948
  { name = "tenacity", marker = "extra == 'eval'", specifier = ">=8.0.0" },
949
  { name = "thefuzz", marker = "extra == 'agent'", specifier = ">=0.22.1" },
950
  { name = "uvicorn", extras = ["standard"], marker = "extra == 'agent'", specifier = ">=0.32.0" },
 
871
  [[package]]
872
  name = "hf-agent"
873
  version = "0.1.0"
874
+ source = { editable = "." }
875
  dependencies = [
876
  { name = "datasets" },
877
  { name = "pydantic" },
 
890
  { name = "nbformat" },
891
  { name = "prompt-toolkit" },
892
  { name = "requests" },
893
+ { name = "rich" },
894
  { name = "thefuzz" },
895
  { name = "uvicorn", extra = ["standard"] },
896
  { name = "websockets" },
 
910
  { name = "prompt-toolkit" },
911
  { name = "pytest" },
912
  { name = "requests" },
913
+ { name = "rich" },
914
  { name = "tenacity" },
915
  { name = "thefuzz" },
916
  { name = "uvicorn", extra = ["standard"] },
 
947
  { name = "pytest", marker = "extra == 'dev'", specifier = ">=9.0.2" },
948
  { name = "python-dotenv", specifier = ">=1.2.1" },
949
  { name = "requests", marker = "extra == 'agent'", specifier = ">=2.32.5" },
950
+ { name = "rich", marker = "extra == 'agent'", specifier = ">=13.0.0" },
951
  { name = "tenacity", marker = "extra == 'eval'", specifier = ">=8.0.0" },
952
  { name = "thefuzz", marker = "extra == 'agent'", specifier = ">=0.22.1" },
953
  { name = "uvicorn", extras = ["standard"], marker = "extra == 'agent'", specifier = ">=0.32.0" },