Streamline Settings panel: Quest-4B only, raise Max Turns, add Memory Strategy
Browse files
- Drop the free-model fallback list; the Space is Quest-4B only now and the
Model field is a read-only display.
- Raise Max Turns ceiling from 20 to 100 to match long-horizon runs.
- Remove the Search Results Per Query slider; the agent pins max_results to
DEFAULT_MAX_SEARCH_RESULTS (10).
- Add a Memory Strategy dropdown with condenser (default), discard-all, and
hide-tool-results. Strategy is applied per turn — discard-all resets the
history every 8 turns and hide-tool-results collapses older tool responses
to a stub, mirroring the QUEST research repo's env-var switch.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
app.py
CHANGED
|
@@ -24,25 +24,13 @@ QUEST_BASE_URL = os.getenv("QUEST_BASE_URL", "").strip()
|
|
| 24 |
# want the original repo id. QUEST_ENDPOINT_MODEL overrides this if needed.
|
| 25 |
QUEST_ENDPOINT_MODEL = os.getenv("QUEST_ENDPOINT_MODEL", "tgi").strip() or "tgi"
|
| 26 |
|
| 27 |
-
#
|
| 28 |
-
#
|
| 29 |
-
# the
|
| 30 |
-
|
| 31 |
-
"Qwen/Qwen3-8B",
|
| 32 |
-
"google/gemma-3-12b-it",
|
| 33 |
-
"deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
|
| 34 |
-
"Qwen/Qwen2.5-7B-Instruct",
|
| 35 |
-
"meta-llama/Llama-3.1-8B-Instruct",
|
| 36 |
-
]
|
| 37 |
|
| 38 |
-
#
|
| 39 |
-
|
| 40 |
-
# QUEST_BASE_URL secret is configured.
|
| 41 |
-
DEFAULT_MODEL_CHOICES = [QUEST_MODEL_ID] + FREE_FALLBACK_MODELS
|
| 42 |
-
DEFAULT_MODEL = os.getenv(
|
| 43 |
-
"DEFAULT_MODEL",
|
| 44 |
-
QUEST_MODEL_ID if QUEST_BASE_URL else FREE_FALLBACK_MODELS[0],
|
| 45 |
-
)
|
| 46 |
|
| 47 |
PAPER_URL = os.getenv("PAPER_URL", "#")
|
| 48 |
CODE_URL = os.getenv("CODE_URL", "#")
|
|
@@ -1200,8 +1188,7 @@ def _build_client_for_model(model: str) -> Tuple[InferenceClient, str, List[str]
|
|
| 1200 |
)
|
| 1201 |
return client, QUEST_ENDPOINT_MODEL, []
|
| 1202 |
client = InferenceClient(token=token, timeout=60)
|
| 1203 |
-
|
| 1204 |
-
return client, model, fallbacks
|
| 1205 |
|
| 1206 |
|
| 1207 |
def call_model(
|
|
@@ -1267,12 +1254,60 @@ def _trace_to_json(state: "AgentState", used_model: str) -> str:
|
|
| 1267 |
)
|
| 1268 |
|
| 1269 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1270 |
def build_research_agent(
|
| 1271 |
question: str,
|
| 1272 |
model: str,
|
| 1273 |
max_turns: int,
|
| 1274 |
-
max_search_results: int,
|
| 1275 |
temperature: float,
|
|
|
|
| 1276 |
):
|
| 1277 |
"""Run the ReAct research loop as a generator.
|
| 1278 |
|
|
@@ -1305,8 +1340,12 @@ def build_research_agent(
|
|
| 1305 |
status_lines.append("π Starting research agent")
|
| 1306 |
yield _emit()
|
| 1307 |
|
|
|
|
|
|
|
|
|
|
| 1308 |
for turn in range(1, max_turns + 1):
|
| 1309 |
-
|
|
|
|
| 1310 |
summary_lines = "\n".join(f"- {n}" for n in state.trusted_notes[-6:])
|
| 1311 |
messages.append(
|
| 1312 |
{
|
|
@@ -1383,8 +1422,8 @@ def build_research_agent(
|
|
| 1383 |
queries = [str(q).strip() for q in raw_query if str(q).strip()]
|
| 1384 |
else:
|
| 1385 |
queries = [str(raw_query).strip()] if str(raw_query).strip() else []
|
| 1386 |
-
max_results = int(tool_args.get("max_results",
|
| 1387 |
-
max_results = max(1, min(max_results,
|
| 1388 |
|
| 1389 |
queries_preview = ", ".join(f"`{q}`" for q in queries) or "_(empty)_"
|
| 1390 |
status_lines.append(f"π turn {turn}: searching {queries_preview}")
|
|
@@ -1513,9 +1552,8 @@ def build_research_agent(
|
|
| 1513 |
|
| 1514 |
def run_ui(
|
| 1515 |
question: str,
|
| 1516 |
-
model: str,
|
| 1517 |
max_turns: int,
|
| 1518 |
-
|
| 1519 |
temperature: float,
|
| 1520 |
):
|
| 1521 |
if not question.strip():
|
|
@@ -1528,22 +1566,21 @@ def run_ui(
|
|
| 1528 |
)
|
| 1529 |
yield warning, json.dumps({"error": warning}, ensure_ascii=False, indent=2)
|
| 1530 |
return
|
| 1531 |
-
if
|
| 1532 |
warning = (
|
| 1533 |
-
f"`{QUEST_MODEL_ID}`
|
| 1534 |
-
"Create
|
| 1535 |
-
"
|
| 1536 |
-
"In the meantime you can pick one of the open-weights models in the dropdown."
|
| 1537 |
)
|
| 1538 |
yield warning, json.dumps({"error": warning}, ensure_ascii=False, indent=2)
|
| 1539 |
return
|
| 1540 |
try:
|
| 1541 |
for partial_answer, partial_trace in build_research_agent(
|
| 1542 |
question=question,
|
| 1543 |
-
model=
|
| 1544 |
max_turns=max_turns,
|
| 1545 |
-
max_search_results=max_search_results,
|
| 1546 |
temperature=temperature,
|
|
|
|
| 1547 |
):
|
| 1548 |
yield partial_answer, partial_trace
|
| 1549 |
except Exception as exc:
|
|
@@ -1650,25 +1687,22 @@ with gr.Blocks(
|
|
| 1650 |
|
| 1651 |
with gr.Group(elem_classes="section-card"):
|
| 1652 |
gr.HTML('<div class="section-heading">Settings</div>')
|
| 1653 |
-
|
| 1654 |
label="Model",
|
| 1655 |
-
|
| 1656 |
-
|
| 1657 |
-
allow_custom_value=True,
|
| 1658 |
)
|
| 1659 |
max_turns = gr.Slider(
|
| 1660 |
label="Max Turns",
|
| 1661 |
minimum=2,
|
| 1662 |
-
maximum=
|
| 1663 |
value=6,
|
| 1664 |
step=1,
|
| 1665 |
)
|
| 1666 |
-
|
| 1667 |
-
label="
|
| 1668 |
-
|
| 1669 |
-
|
| 1670 |
-
value=5,
|
| 1671 |
-
step=1,
|
| 1672 |
)
|
| 1673 |
temperature = gr.Slider(
|
| 1674 |
label="Temperature",
|
|
@@ -1692,7 +1726,7 @@ with gr.Blocks(
|
|
| 1692 |
|
| 1693 |
run_event = run_btn.click(
|
| 1694 |
fn=run_ui,
|
| 1695 |
-
inputs=[question,
|
| 1696 |
outputs=[answer, trace],
|
| 1697 |
)
|
| 1698 |
for btn, ex in zip(example_buttons, EXAMPLES):
|
|
|
|
| 24 |
# want the original repo id. QUEST_ENDPOINT_MODEL overrides this if needed.
|
| 25 |
QUEST_ENDPOINT_MODEL = os.getenv("QUEST_ENDPOINT_MODEL", "tgi").strip() or "tgi"
|
| 26 |
|
| 27 |
+
# This Space runs exclusively on Quest-4B served via the private HF Inference
|
| 28 |
+
# Endpoint pointed to by QUEST_BASE_URL. No public fallback list — the model
|
| 29 |
+
# field in the UI is display-only.
|
| 30 |
+
DEFAULT_MODEL = QUEST_MODEL_ID
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
|
| 32 |
+
# Internal defaults. Search budget is no longer user-tunable.
|
| 33 |
+
DEFAULT_MAX_SEARCH_RESULTS = 10
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
|
| 35 |
PAPER_URL = os.getenv("PAPER_URL", "#")
|
| 36 |
CODE_URL = os.getenv("CODE_URL", "#")
|
|
|
|
| 1188 |
)
|
| 1189 |
return client, QUEST_ENDPOINT_MODEL, []
|
| 1190 |
client = InferenceClient(token=token, timeout=60)
|
| 1191 |
+
return client, model, []
|
|
|
|
| 1192 |
|
| 1193 |
|
| 1194 |
def call_model(
|
|
|
|
| 1254 |
)
|
| 1255 |
|
| 1256 |
|
| 1257 |
+
MEMORY_STRATEGIES = ("condenser", "discard-all", "hide-tool-results")
|
| 1258 |
+
|
| 1259 |
+
|
| 1260 |
+
def _normalize_memory_strategy(strategy: str) -> str:
|
| 1261 |
+
s = (strategy or "condenser").strip().lower().replace("_", "-")
|
| 1262 |
+
return s if s in MEMORY_STRATEGIES else "condenser"
|
| 1263 |
+
|
| 1264 |
+
|
| 1265 |
+
def _apply_memory_strategy(messages: List[Dict[str, str]], strategy: str, turn: int) -> None:
|
| 1266 |
+
"""Keep the message history inside a manageable context budget.
|
| 1267 |
+
|
| 1268 |
+
- condenser: no-op (the main loop also injects a periodic trusted-note
|
| 1269 |
+
summary; that is the light "condenser" this Space ships with).
|
| 1270 |
+
- discard-all: every 8 turns, reset history to [system, user question]
|
| 1271 |
+
so the model pays for fresh context rather than replaying old tool
|
| 1272 |
+
results.
|
| 1273 |
+
- hide-tool-results: cap the number of surviving tool-response user
|
| 1274 |
+
messages at 3 — older ones get their content replaced with a stub.
|
| 1275 |
+
"""
|
| 1276 |
+
if strategy == "discard-all":
|
| 1277 |
+
if turn > 1 and turn % 8 == 0 and len(messages) > 2:
|
| 1278 |
+
system_msg = messages[0]
|
| 1279 |
+
question_msg = messages[1]
|
| 1280 |
+
messages.clear()
|
| 1281 |
+
messages.append(system_msg)
|
| 1282 |
+
messages.append(question_msg)
|
| 1283 |
+
messages.append(
|
| 1284 |
+
{
|
| 1285 |
+
"role": "user",
|
| 1286 |
+
"content": "[memory discarded at turn "
|
| 1287 |
+
f"{turn} — continue the research from the original question]",
|
| 1288 |
+
}
|
| 1289 |
+
)
|
| 1290 |
+
elif strategy == "hide-tool-results":
|
| 1291 |
+
keep_tail = 3
|
| 1292 |
+
tool_indices = [
|
| 1293 |
+
i for i, m in enumerate(messages)
|
| 1294 |
+
if m.get("role") == "user" and str(m.get("content", "")).startswith("<tool_response>")
|
| 1295 |
+
]
|
| 1296 |
+
if len(tool_indices) > keep_tail:
|
| 1297 |
+
for i in tool_indices[:-keep_tail]:
|
| 1298 |
+
if messages[i]["content"] != "<tool_response>[hidden]</tool_response>":
|
| 1299 |
+
messages[i] = {
|
| 1300 |
+
"role": "user",
|
| 1301 |
+
"content": "<tool_response>[hidden]</tool_response>",
|
| 1302 |
+
}
|
| 1303 |
+
|
| 1304 |
+
|
| 1305 |
def build_research_agent(
|
| 1306 |
question: str,
|
| 1307 |
model: str,
|
| 1308 |
max_turns: int,
|
|
|
|
| 1309 |
temperature: float,
|
| 1310 |
+
memory_strategy: str = "condenser",
|
| 1311 |
):
|
| 1312 |
"""Run the ReAct research loop as a generator.
|
| 1313 |
|
|
|
|
| 1340 |
status_lines.append("π Starting research agent")
|
| 1341 |
yield _emit()
|
| 1342 |
|
| 1343 |
+
strategy = _normalize_memory_strategy(memory_strategy)
|
| 1344 |
+
os.environ["MEMORY_STRATEGY"] = strategy
|
| 1345 |
+
|
| 1346 |
for turn in range(1, max_turns + 1):
|
| 1347 |
+
_apply_memory_strategy(messages, strategy, turn)
|
| 1348 |
+
if strategy == "condenser" and state.trusted_notes and turn > 1 and turn % 3 == 0:
|
| 1349 |
summary_lines = "\n".join(f"- {n}" for n in state.trusted_notes[-6:])
|
| 1350 |
messages.append(
|
| 1351 |
{
|
|
|
|
| 1422 |
queries = [str(q).strip() for q in raw_query if str(q).strip()]
|
| 1423 |
else:
|
| 1424 |
queries = [str(raw_query).strip()] if str(raw_query).strip() else []
|
| 1425 |
+
max_results = int(tool_args.get("max_results", DEFAULT_MAX_SEARCH_RESULTS))
|
| 1426 |
+
max_results = max(1, min(max_results, DEFAULT_MAX_SEARCH_RESULTS))
|
| 1427 |
|
| 1428 |
queries_preview = ", ".join(f"`{q}`" for q in queries) or "_(empty)_"
|
| 1429 |
status_lines.append(f"π turn {turn}: searching {queries_preview}")
|
|
|
|
| 1552 |
|
| 1553 |
def run_ui(
|
| 1554 |
question: str,
|
|
|
|
| 1555 |
max_turns: int,
|
| 1556 |
+
memory_strategy: str,
|
| 1557 |
temperature: float,
|
| 1558 |
):
|
| 1559 |
if not question.strip():
|
|
|
|
| 1566 |
)
|
| 1567 |
yield warning, json.dumps({"error": warning}, ensure_ascii=False, indent=2)
|
| 1568 |
return
|
| 1569 |
+
if not QUEST_BASE_URL:
|
| 1570 |
warning = (
|
| 1571 |
+
f"`{QUEST_MODEL_ID}` needs a private HF Inference Endpoint. "
|
| 1572 |
+
"Create one at https://ui.endpoints.huggingface.co/, then set "
|
| 1573 |
+
"`QUEST_BASE_URL` in Space Secrets to the endpoint's `/v1/` URL."
|
|
|
|
| 1574 |
)
|
| 1575 |
yield warning, json.dumps({"error": warning}, ensure_ascii=False, indent=2)
|
| 1576 |
return
|
| 1577 |
try:
|
| 1578 |
for partial_answer, partial_trace in build_research_agent(
|
| 1579 |
question=question,
|
| 1580 |
+
model=QUEST_MODEL_ID,
|
| 1581 |
max_turns=max_turns,
|
|
|
|
| 1582 |
temperature=temperature,
|
| 1583 |
+
memory_strategy=memory_strategy,
|
| 1584 |
):
|
| 1585 |
yield partial_answer, partial_trace
|
| 1586 |
except Exception as exc:
|
|
|
|
| 1687 |
|
| 1688 |
with gr.Group(elem_classes="section-card"):
|
| 1689 |
gr.HTML('<div class="section-heading">Settings</div>')
|
| 1690 |
+
gr.Textbox(
|
| 1691 |
label="Model",
|
| 1692 |
+
value=QUEST_MODEL_ID,
|
| 1693 |
+
interactive=False,
|
|
|
|
| 1694 |
)
|
| 1695 |
max_turns = gr.Slider(
|
| 1696 |
label="Max Turns",
|
| 1697 |
minimum=2,
|
| 1698 |
+
maximum=100,
|
| 1699 |
value=6,
|
| 1700 |
step=1,
|
| 1701 |
)
|
| 1702 |
+
memory_strategy = gr.Dropdown(
|
| 1703 |
+
label="Memory Strategy",
|
| 1704 |
+
choices=list(MEMORY_STRATEGIES),
|
| 1705 |
+
value="condenser",
|
|
|
|
|
|
|
| 1706 |
)
|
| 1707 |
temperature = gr.Slider(
|
| 1708 |
label="Temperature",
|
|
|
|
| 1726 |
|
| 1727 |
run_event = run_btn.click(
|
| 1728 |
fn=run_ui,
|
| 1729 |
+
inputs=[question, max_turns, memory_strategy, temperature],
|
| 1730 |
outputs=[answer, trace],
|
| 1731 |
)
|
| 1732 |
for btn, ex in zip(example_buttons, EXAMPLES):
|