TomLii Claude Sonnet 4.6 committed on
Commit
76004d7
·
1 Parent(s): bc3f415

Streamline Settings panel: Quest-4B only, raise Max Turns, add Memory Strategy

Browse files

- Drop the free-model fallback list; the Space is Quest-4B only now and the
Model field is a read-only display.
- Raise Max Turns ceiling from 20 to 100 to match long-horizon runs.
- Remove the Search Results Per Query slider; the agent now defaults and caps
max_results at DEFAULT_MAX_SEARCH_RESULTS (10).
- Add a Memory Strategy dropdown with condenser (default), discard-all, and
hide-tool-results. Strategy is applied per turn — discard-all resets the
history every 8 turns and hide-tool-results collapses older tool responses
to a stub, mirroring the QUEST research repo's env-var switch.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (1) hide show
  1. app.py +79 -45
app.py CHANGED
@@ -24,25 +24,13 @@ QUEST_BASE_URL = os.getenv("QUEST_BASE_URL", "").strip()
24
  # want the original repo id. QUEST_ENDPOINT_MODEL overrides this if needed.
25
  QUEST_ENDPOINT_MODEL = os.getenv("QUEST_ENDPOINT_MODEL", "tgi").strip() or "tgi"
26
 
27
- # Shared HF Inference API fallbacks (free, rate-limited). These are used when
28
- # the user picks one of these from the Model dropdown; they do NOT go through
29
- # the private endpoint.
30
- FREE_FALLBACK_MODELS = [
31
- "Qwen/Qwen3-8B",
32
- "google/gemma-3-12b-it",
33
- "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
34
- "Qwen/Qwen2.5-7B-Instruct",
35
- "meta-llama/Llama-3.1-8B-Instruct",
36
- ]
37
 
38
- # Quest-4B shows up first when the endpoint is wired; otherwise we still list
39
- # it so you can see what the target model is, but it will only work after the
40
- # QUEST_BASE_URL secret is configured.
41
- DEFAULT_MODEL_CHOICES = [QUEST_MODEL_ID] + FREE_FALLBACK_MODELS
42
- DEFAULT_MODEL = os.getenv(
43
- "DEFAULT_MODEL",
44
- QUEST_MODEL_ID if QUEST_BASE_URL else FREE_FALLBACK_MODELS[0],
45
- )
46
 
47
  PAPER_URL = os.getenv("PAPER_URL", "#")
48
  CODE_URL = os.getenv("CODE_URL", "#")
@@ -1200,8 +1188,7 @@ def _build_client_for_model(model: str) -> Tuple[InferenceClient, str, List[str]
1200
  )
1201
  return client, QUEST_ENDPOINT_MODEL, []
1202
  client = InferenceClient(token=token, timeout=60)
1203
- fallbacks = [m for m in FREE_FALLBACK_MODELS if m != model]
1204
- return client, model, fallbacks
1205
 
1206
 
1207
  def call_model(
@@ -1267,12 +1254,60 @@ def _trace_to_json(state: "AgentState", used_model: str) -> str:
1267
  )
1268
 
1269
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1270
  def build_research_agent(
1271
  question: str,
1272
  model: str,
1273
  max_turns: int,
1274
- max_search_results: int,
1275
  temperature: float,
 
1276
  ):
1277
  """Run the ReAct research loop as a generator.
1278
 
@@ -1305,8 +1340,12 @@ def build_research_agent(
1305
  status_lines.append("πŸš€ Starting research agent")
1306
  yield _emit()
1307
 
 
 
 
1308
  for turn in range(1, max_turns + 1):
1309
- if state.trusted_notes and turn > 1 and turn % 3 == 0:
 
1310
  summary_lines = "\n".join(f"- {n}" for n in state.trusted_notes[-6:])
1311
  messages.append(
1312
  {
@@ -1383,8 +1422,8 @@ def build_research_agent(
1383
  queries = [str(q).strip() for q in raw_query if str(q).strip()]
1384
  else:
1385
  queries = [str(raw_query).strip()] if str(raw_query).strip() else []
1386
- max_results = int(tool_args.get("max_results", max_search_results))
1387
- max_results = max(1, min(max_results, 10))
1388
 
1389
  queries_preview = ", ".join(f"`{q}`" for q in queries) or "_(empty)_"
1390
  status_lines.append(f"πŸ” turn {turn}: searching {queries_preview}")
@@ -1513,9 +1552,8 @@ def build_research_agent(
1513
 
1514
  def run_ui(
1515
  question: str,
1516
- model: str,
1517
  max_turns: int,
1518
- max_search_results: int,
1519
  temperature: float,
1520
  ):
1521
  if not question.strip():
@@ -1528,22 +1566,21 @@ def run_ui(
1528
  )
1529
  yield warning, json.dumps({"error": warning}, ensure_ascii=False, indent=2)
1530
  return
1531
- if model == QUEST_MODEL_ID and not QUEST_BASE_URL:
1532
  warning = (
1533
- f"`{QUEST_MODEL_ID}` is private and not available via the free HF Inference API. "
1534
- "Create a dedicated HF Inference Endpoint for it (https://ui.endpoints.huggingface.co/), "
1535
- "then set `QUEST_BASE_URL` in Space Secrets to the endpoint's `/v1/` URL. "
1536
- "In the meantime you can pick one of the open-weights models in the dropdown."
1537
  )
1538
  yield warning, json.dumps({"error": warning}, ensure_ascii=False, indent=2)
1539
  return
1540
  try:
1541
  for partial_answer, partial_trace in build_research_agent(
1542
  question=question,
1543
- model=model,
1544
  max_turns=max_turns,
1545
- max_search_results=max_search_results,
1546
  temperature=temperature,
 
1547
  ):
1548
  yield partial_answer, partial_trace
1549
  except Exception as exc:
@@ -1650,25 +1687,22 @@ with gr.Blocks(
1650
 
1651
  with gr.Group(elem_classes="section-card"):
1652
  gr.HTML('<div class="section-heading">Settings</div>')
1653
- model = gr.Dropdown(
1654
  label="Model",
1655
- choices=DEFAULT_MODEL_CHOICES,
1656
- value=DEFAULT_MODEL if DEFAULT_MODEL in DEFAULT_MODEL_CHOICES else DEFAULT_MODEL_CHOICES[0],
1657
- allow_custom_value=True,
1658
  )
1659
  max_turns = gr.Slider(
1660
  label="Max Turns",
1661
  minimum=2,
1662
- maximum=20,
1663
  value=6,
1664
  step=1,
1665
  )
1666
- max_search_results = gr.Slider(
1667
- label="Search Results Per Query",
1668
- minimum=1,
1669
- maximum=10,
1670
- value=5,
1671
- step=1,
1672
  )
1673
  temperature = gr.Slider(
1674
  label="Temperature",
@@ -1692,7 +1726,7 @@ with gr.Blocks(
1692
 
1693
  run_event = run_btn.click(
1694
  fn=run_ui,
1695
- inputs=[question, model, max_turns, max_search_results, temperature],
1696
  outputs=[answer, trace],
1697
  )
1698
  for btn, ex in zip(example_buttons, EXAMPLES):
 
24
  # want the original repo id. QUEST_ENDPOINT_MODEL overrides this if needed.
25
  QUEST_ENDPOINT_MODEL = os.getenv("QUEST_ENDPOINT_MODEL", "tgi").strip() or "tgi"
26
 
27
+ # This Space runs exclusively on Quest-4B served via the private HF Inference
28
+ # Endpoint pointed to by QUEST_BASE_URL. No public fallback list — the model
29
+ # field in the UI is display-only.
30
+ DEFAULT_MODEL = QUEST_MODEL_ID
 
 
 
 
 
 
31
 
32
+ # Internal defaults. Search budget is no longer user-tunable.
33
+ DEFAULT_MAX_SEARCH_RESULTS = 10
 
 
 
 
 
 
34
 
35
  PAPER_URL = os.getenv("PAPER_URL", "#")
36
  CODE_URL = os.getenv("CODE_URL", "#")
 
1188
  )
1189
  return client, QUEST_ENDPOINT_MODEL, []
1190
  client = InferenceClient(token=token, timeout=60)
1191
+ return client, model, []
 
1192
 
1193
 
1194
  def call_model(
 
1254
  )
1255
 
1256
 
1257
+ MEMORY_STRATEGIES = ("condenser", "discard-all", "hide-tool-results")
1258
+
1259
+
1260
+ def _normalize_memory_strategy(strategy: str) -> str:
1261
+ s = (strategy or "condenser").strip().lower().replace("_", "-")
1262
+ return s if s in MEMORY_STRATEGIES else "condenser"
1263
+
1264
+
1265
+ def _apply_memory_strategy(messages: List[Dict[str, str]], strategy: str, turn: int) -> None:
1266
+ """Keep the message history inside a manageable context budget.
1267
+
1268
+ - condenser: no-op (the main loop also injects a periodic trusted-note
1269
+ summary; that is the light "condenser" this Space ships with).
1270
+ - discard-all: every 8 turns, reset history to [system, user question]
1271
+ so the model pays for fresh context rather than replaying old tool
1272
+ results.
1273
+ - hide-tool-results: cap the number of surviving tool-response user
1274
+ messages at 3 — older ones get their content replaced with a stub.
1275
+ """
1276
+ if strategy == "discard-all":
1277
+ if turn > 1 and turn % 8 == 0 and len(messages) > 2:
1278
+ system_msg = messages[0]
1279
+ question_msg = messages[1]
1280
+ messages.clear()
1281
+ messages.append(system_msg)
1282
+ messages.append(question_msg)
1283
+ messages.append(
1284
+ {
1285
+ "role": "user",
1286
+ "content": "[memory discarded at turn "
1287
+ f"{turn} β€” continue the research from the original question]",
1288
+ }
1289
+ )
1290
+ elif strategy == "hide-tool-results":
1291
+ keep_tail = 3
1292
+ tool_indices = [
1293
+ i for i, m in enumerate(messages)
1294
+ if m.get("role") == "user" and str(m.get("content", "")).startswith("<tool_response>")
1295
+ ]
1296
+ if len(tool_indices) > keep_tail:
1297
+ for i in tool_indices[:-keep_tail]:
1298
+ if messages[i]["content"] != "<tool_response>[hidden]</tool_response>":
1299
+ messages[i] = {
1300
+ "role": "user",
1301
+ "content": "<tool_response>[hidden]</tool_response>",
1302
+ }
1303
+
1304
+
1305
  def build_research_agent(
1306
  question: str,
1307
  model: str,
1308
  max_turns: int,
 
1309
  temperature: float,
1310
+ memory_strategy: str = "condenser",
1311
  ):
1312
  """Run the ReAct research loop as a generator.
1313
 
 
1340
  status_lines.append("πŸš€ Starting research agent")
1341
  yield _emit()
1342
 
1343
+ strategy = _normalize_memory_strategy(memory_strategy)
1344
+ os.environ["MEMORY_STRATEGY"] = strategy
1345
+
1346
  for turn in range(1, max_turns + 1):
1347
+ _apply_memory_strategy(messages, strategy, turn)
1348
+ if strategy == "condenser" and state.trusted_notes and turn > 1 and turn % 3 == 0:
1349
  summary_lines = "\n".join(f"- {n}" for n in state.trusted_notes[-6:])
1350
  messages.append(
1351
  {
 
1422
  queries = [str(q).strip() for q in raw_query if str(q).strip()]
1423
  else:
1424
  queries = [str(raw_query).strip()] if str(raw_query).strip() else []
1425
+ max_results = int(tool_args.get("max_results", DEFAULT_MAX_SEARCH_RESULTS))
1426
+ max_results = max(1, min(max_results, DEFAULT_MAX_SEARCH_RESULTS))
1427
 
1428
  queries_preview = ", ".join(f"`{q}`" for q in queries) or "_(empty)_"
1429
  status_lines.append(f"πŸ” turn {turn}: searching {queries_preview}")
 
1552
 
1553
  def run_ui(
1554
  question: str,
 
1555
  max_turns: int,
1556
+ memory_strategy: str,
1557
  temperature: float,
1558
  ):
1559
  if not question.strip():
 
1566
  )
1567
  yield warning, json.dumps({"error": warning}, ensure_ascii=False, indent=2)
1568
  return
1569
+ if not QUEST_BASE_URL:
1570
  warning = (
1571
+ f"`{QUEST_MODEL_ID}` needs a private HF Inference Endpoint. "
1572
+ "Create one at https://ui.endpoints.huggingface.co/, then set "
1573
+ "`QUEST_BASE_URL` in Space Secrets to the endpoint's `/v1/` URL."
 
1574
  )
1575
  yield warning, json.dumps({"error": warning}, ensure_ascii=False, indent=2)
1576
  return
1577
  try:
1578
  for partial_answer, partial_trace in build_research_agent(
1579
  question=question,
1580
+ model=QUEST_MODEL_ID,
1581
  max_turns=max_turns,
 
1582
  temperature=temperature,
1583
+ memory_strategy=memory_strategy,
1584
  ):
1585
  yield partial_answer, partial_trace
1586
  except Exception as exc:
 
1687
 
1688
  with gr.Group(elem_classes="section-card"):
1689
  gr.HTML('<div class="section-heading">Settings</div>')
1690
+ gr.Textbox(
1691
  label="Model",
1692
+ value=QUEST_MODEL_ID,
1693
+ interactive=False,
 
1694
  )
1695
  max_turns = gr.Slider(
1696
  label="Max Turns",
1697
  minimum=2,
1698
+ maximum=100,
1699
  value=6,
1700
  step=1,
1701
  )
1702
+ memory_strategy = gr.Dropdown(
1703
+ label="Memory Strategy",
1704
+ choices=list(MEMORY_STRATEGIES),
1705
+ value="condenser",
 
 
1706
  )
1707
  temperature = gr.Slider(
1708
  label="Temperature",
 
1726
 
1727
  run_event = run_btn.click(
1728
  fn=run_ui,
1729
+ inputs=[question, max_turns, memory_strategy, temperature],
1730
  outputs=[answer, trace],
1731
  )
1732
  for btn, ex in zip(example_buttons, EXAMPLES):