Spaces:

osunlp
/

QUEST

Running

TomLii Claude Sonnet 4.6 committed on Apr 21

Commit

76004d7

1 Parent(s): bc3f415

Streamline Settings panel: Quest-4B only, raise Max Turns, add Memory Strategy

- Drop the free-model fallback list; the Space is Quest-4B only now and the
Model field is a read-only display.
- Raise Max Turns ceiling from 20 to 100 to match long-horizon runs.
- Remove the Search Results Per Query slider; max_results now defaults to, and
is capped at, DEFAULT_MAX_SEARCH_RESULTS (10).
- Add a Memory Strategy dropdown with condenser (default), discard-all, and
hide-tool-results. The strategy is applied at the start of every turn:
discard-all resets the history every 8 turns, and hide-tool-results collapses
all but the 3 most recent tool responses to a stub, mirroring the QUEST
research repo's env-var switch.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (1) hide show

app.py +79 -45

app.py CHANGED Viewed

@@ -24,25 +24,13 @@ QUEST_BASE_URL = os.getenv("QUEST_BASE_URL", "").strip()
 # want the original repo id. QUEST_ENDPOINT_MODEL overrides this if needed.
 QUEST_ENDPOINT_MODEL = os.getenv("QUEST_ENDPOINT_MODEL", "tgi").strip() or "tgi"
-# Shared HF Inference API fallbacks (free, rate-limited). These are used when
-# the user picks one of these from the Model dropdown; they do NOT go through
-# the private endpoint.
-FREE_FALLBACK_MODELS = [
-    "Qwen/Qwen3-8B",
-    "google/gemma-3-12b-it",
-    "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
-    "Qwen/Qwen2.5-7B-Instruct",
-    "meta-llama/Llama-3.1-8B-Instruct",
-]
-# Quest-4B shows up first when the endpoint is wired; otherwise we still list
-# it so you can see what the target model is, but it will only work after the
-# QUEST_BASE_URL secret is configured.
-DEFAULT_MODEL_CHOICES = [QUEST_MODEL_ID] + FREE_FALLBACK_MODELS
-DEFAULT_MODEL = os.getenv(
-    "DEFAULT_MODEL",
-    QUEST_MODEL_ID if QUEST_BASE_URL else FREE_FALLBACK_MODELS[0],
-)
 PAPER_URL = os.getenv("PAPER_URL", "#")
 CODE_URL = os.getenv("CODE_URL", "#")
@@ -1200,8 +1188,7 @@ def _build_client_for_model(model: str) -> Tuple[InferenceClient, str, List[str]
         )
         return client, QUEST_ENDPOINT_MODEL, []
     client = InferenceClient(token=token, timeout=60)
-    fallbacks = [m for m in FREE_FALLBACK_MODELS if m != model]
-    return client, model, fallbacks
 def call_model(
@@ -1267,12 +1254,60 @@ def _trace_to_json(state: "AgentState", used_model: str) -> str:
     )
 def build_research_agent(
     question: str,
     model: str,
     max_turns: int,
-    max_search_results: int,
     temperature: float,
 ):
     """Run the ReAct research loop as a generator.
@@ -1305,8 +1340,12 @@ def build_research_agent(
     status_lines.append("🚀 Starting research agent")
     yield _emit()
     for turn in range(1, max_turns + 1):
-        if state.trusted_notes and turn > 1 and turn % 3 == 0:
             summary_lines = "\n".join(f"- {n}" for n in state.trusted_notes[-6:])
             messages.append(
                 {
@@ -1383,8 +1422,8 @@ def build_research_agent(
                     queries = [str(q).strip() for q in raw_query if str(q).strip()]
                 else:
                     queries = [str(raw_query).strip()] if str(raw_query).strip() else []
-                max_results = int(tool_args.get("max_results", max_search_results))
-                max_results = max(1, min(max_results, 10))
                 queries_preview = ", ".join(f"`{q}`" for q in queries) or "_(empty)_"
                 status_lines.append(f"🔍 turn {turn}: searching {queries_preview}")
@@ -1513,9 +1552,8 @@ def build_research_agent(
 def run_ui(
     question: str,
-    model: str,
     max_turns: int,
-    max_search_results: int,
     temperature: float,
 ):
     if not question.strip():
@@ -1528,22 +1566,21 @@ def run_ui(
         )
         yield warning, json.dumps({"error": warning}, ensure_ascii=False, indent=2)
         return
-    if model == QUEST_MODEL_ID and not QUEST_BASE_URL:
         warning = (
-            f"`{QUEST_MODEL_ID}` is private and not available via the free HF Inference API. "
-            "Create a dedicated HF Inference Endpoint for it (https://ui.endpoints.huggingface.co/), "
-            "then set `QUEST_BASE_URL` in Space Secrets to the endpoint's `/v1/` URL. "
-            "In the meantime you can pick one of the open-weights models in the dropdown."
         )
         yield warning, json.dumps({"error": warning}, ensure_ascii=False, indent=2)
         return
     try:
         for partial_answer, partial_trace in build_research_agent(
             question=question,
-            model=model,
             max_turns=max_turns,
-            max_search_results=max_search_results,
             temperature=temperature,
         ):
             yield partial_answer, partial_trace
     except Exception as exc:
@@ -1650,25 +1687,22 @@ with gr.Blocks(
             with gr.Group(elem_classes="section-card"):
                 gr.HTML('<div class="section-heading">Settings</div>')
-                model = gr.Dropdown(
                     label="Model",
-                    choices=DEFAULT_MODEL_CHOICES,
-                    value=DEFAULT_MODEL if DEFAULT_MODEL in DEFAULT_MODEL_CHOICES else DEFAULT_MODEL_CHOICES[0],
-                    allow_custom_value=True,
                 )
                 max_turns = gr.Slider(
                     label="Max Turns",
                     minimum=2,
-                    maximum=20,
                     value=6,
                     step=1,
                 )
-                max_search_results = gr.Slider(
-                    label="Search Results Per Query",
-                    minimum=1,
-                    maximum=10,
-                    value=5,
-                    step=1,
                 )
                 temperature = gr.Slider(
                     label="Temperature",
@@ -1692,7 +1726,7 @@ with gr.Blocks(
     run_event = run_btn.click(
         fn=run_ui,
-        inputs=[question, model, max_turns, max_search_results, temperature],
         outputs=[answer, trace],
     )
     for btn, ex in zip(example_buttons, EXAMPLES):

 # want the original repo id. QUEST_ENDPOINT_MODEL overrides this if needed.
 QUEST_ENDPOINT_MODEL = os.getenv("QUEST_ENDPOINT_MODEL", "tgi").strip() or "tgi"
+# This Space runs exclusively on Quest-4B served via the private HF Inference
+# Endpoint pointed to by QUEST_BASE_URL. No public fallback list — the model
+# field in the UI is display-only.
+DEFAULT_MODEL = QUEST_MODEL_ID
+# Internal defaults. Search budget is no longer user-tunable.
+DEFAULT_MAX_SEARCH_RESULTS = 10
 PAPER_URL = os.getenv("PAPER_URL", "#")
 CODE_URL = os.getenv("CODE_URL", "#")
         )
         return client, QUEST_ENDPOINT_MODEL, []
     client = InferenceClient(token=token, timeout=60)
+    return client, model, []
 def call_model(
     )
# Canonical names of the supported memory strategies; the first is the default.
MEMORY_STRATEGIES = ("condenser", "discard-all", "hide-tool-results")


def _normalize_memory_strategy(strategy: str) -> str:
    """Return the canonical memory-strategy name for *strategy*.

    Matching ignores case and surrounding whitespace and treats underscores
    as hyphens, so ``"Discard_All "`` maps to ``"discard-all"``.

    Anything that is empty, unrecognised, or not a string at all falls back
    to ``"condenser"`` instead of raising.
    """
    if not isinstance(strategy, str):
        # None or an unexpected payload type (int, list, ...): calling
        # .strip() on it would raise AttributeError, so use the default.
        return "condenser"
    s = strategy.strip().lower().replace("_", "-")
    return s if s in MEMORY_STRATEGIES else "condenser"
def _apply_memory_strategy(
    messages: List[Dict[str, str]],
    strategy: str,
    turn: int,
    *,
    discard_every: int = 8,
    keep_tool_results: int = 3,
) -> None:
    """Trim *messages* in place so the history stays within a context budget.

    Strategies (``strategy`` must already be in canonical, hyphenated form;
    any other value is a no-op):

    - ``condenser``: no-op here.
    - ``discard-all``: on every ``discard_every``-th turn (and never on
      turn 1), drop everything after the first two messages and append a
      short user note saying memory was discarded. The first two messages
      are assumed to be the system prompt and the user question.
    - ``hide-tool-results``: keep the content of only the last
      ``keep_tool_results`` user messages that start with
      ``<tool_response>``; older ones are replaced by a fixed stub.

    Args:
        messages: Chat history, mutated in place.
        strategy: Canonical strategy name.
        turn: 1-based index of the turn about to run.
        discard_every: Reset cadence for ``discard-all``; must be >= 1.
        keep_tool_results: Number of most recent tool responses left intact
            by ``hide-tool-results``; must be >= 0.

    Raises:
        ValueError: If ``discard_every`` < 1 or ``keep_tool_results`` < 0.
    """
    if discard_every < 1:
        raise ValueError("discard_every must be >= 1")
    if keep_tool_results < 0:
        raise ValueError("keep_tool_results must be >= 0")

    if strategy == "discard-all":
        if turn > 1 and turn % discard_every == 0 and len(messages) > 2:
            # Slice assignment keeps the caller's list object and the first
            # two message dicts, replacing only the tail.
            messages[2:] = [
                {
                    "role": "user",
                    "content": "[memory discarded at turn "
                    f"{turn} — continue the research from the original question]",
                }
            ]
    elif strategy == "hide-tool-results":
        hidden_stub = "<tool_response>[hidden]</tool_response>"
        tool_indices = [
            i for i, m in enumerate(messages)
            if m.get("role") == "user"
            and str(m.get("content", "")).startswith("<tool_response>")
        ]
        # Compute the cut-off explicitly: tool_indices[:-keep] would be an
        # empty slice when keep == 0 and hide nothing.
        stale_count = max(0, len(tool_indices) - keep_tool_results)
        for i in tool_indices[:stale_count]:
            if messages[i]["content"] != hidden_stub:
                messages[i] = {"role": "user", "content": hidden_stub}
 def build_research_agent(
     question: str,
     model: str,
     max_turns: int,
     temperature: float,
+    memory_strategy: str = "condenser",
 ):
     """Run the ReAct research loop as a generator.
     status_lines.append("🚀 Starting research agent")
     yield _emit()
+    strategy = _normalize_memory_strategy(memory_strategy)
+    os.environ["MEMORY_STRATEGY"] = strategy
     for turn in range(1, max_turns + 1):
+        _apply_memory_strategy(messages, strategy, turn)
+        if strategy == "condenser" and state.trusted_notes and turn > 1 and turn % 3 == 0:
             summary_lines = "\n".join(f"- {n}" for n in state.trusted_notes[-6:])
             messages.append(
                 {
                     queries = [str(q).strip() for q in raw_query if str(q).strip()]
                 else:
                     queries = [str(raw_query).strip()] if str(raw_query).strip() else []
+                max_results = int(tool_args.get("max_results", DEFAULT_MAX_SEARCH_RESULTS))
+                max_results = max(1, min(max_results, DEFAULT_MAX_SEARCH_RESULTS))
                 queries_preview = ", ".join(f"`{q}`" for q in queries) or "_(empty)_"
                 status_lines.append(f"🔍 turn {turn}: searching {queries_preview}")
 def run_ui(
     question: str,
     max_turns: int,
+    memory_strategy: str,
     temperature: float,
 ):
     if not question.strip():
         )
         yield warning, json.dumps({"error": warning}, ensure_ascii=False, indent=2)
         return
+    if not QUEST_BASE_URL:
         warning = (
+            f"`{QUEST_MODEL_ID}` needs a private HF Inference Endpoint. "
+            "Create one at https://ui.endpoints.huggingface.co/, then set "
+            "`QUEST_BASE_URL` in Space Secrets to the endpoint's `/v1/` URL."
         )
         yield warning, json.dumps({"error": warning}, ensure_ascii=False, indent=2)
         return
     try:
         for partial_answer, partial_trace in build_research_agent(
             question=question,
+            model=QUEST_MODEL_ID,
             max_turns=max_turns,
             temperature=temperature,
+            memory_strategy=memory_strategy,
         ):
             yield partial_answer, partial_trace
     except Exception as exc:
             with gr.Group(elem_classes="section-card"):
                 gr.HTML('<div class="section-heading">Settings</div>')
+                gr.Textbox(
                     label="Model",
+                    value=QUEST_MODEL_ID,
+                    interactive=False,
                 )
                 max_turns = gr.Slider(
                     label="Max Turns",
                     minimum=2,
+                    maximum=100,
                     value=6,
                     step=1,
                 )
+                memory_strategy = gr.Dropdown(
+                    label="Memory Strategy",
+                    choices=list(MEMORY_STRATEGIES),
+                    value="condenser",
                 )
                 temperature = gr.Slider(
                     label="Temperature",
     run_event = run_btn.click(
         fn=run_ui,
+        inputs=[question, max_turns, memory_strategy, temperature],
         outputs=[answer, trace],
     )
     for btn, ex in zip(example_buttons, EXAMPLES):