command (without additional steps). If it does, please also provide a link to its PyPI page; if not, explicitly note that this installation method isn't mentioned.",
},
]
def _example_label(ex: Dict[str, str]) -> str:
return f"{ex['icon']} {ex['category']} — {ex['text']}"
# Build the whole UI inside a single gr.Blocks context. The resulting app
# object is bound to `demo`, which is queued and launched further down.
with gr.Blocks(
title="QUEST · Deep Research by OSU NLP",
theme=APP_THEME,
css=CUSTOM_CSS,
fill_width=True,
) as demo:
# --- Quest-style header (Q mark + title + byline) ---
gr.HTML(
"""
"""
)
# --- Main two-column layout ---
with gr.Row(elem_classes="layout-gap"):
# First column (scale=6): question input, example prompts and output tabs.
with gr.Column(scale=6, min_width=420):
with gr.Group(elem_classes="section-card"):
gr.HTML(
'Ask the agent
'
'QUEST: What can I research for you?
'
)
# Free-text question box; its value is the first input handed to run_ui.
question = gr.Textbox(
show_label=False,
placeholder="Ask anything you want to research in depth...",
lines=6,
)
# Action buttons. Their handlers are attached in the event-wiring
# section near the end of this block.
with gr.Row(elem_classes="action-row"):
run_btn = gr.Button("Run Research", variant="primary", size="lg")
stop_btn = gr.Button("Stop", variant="stop", size="lg")
clear_btn = gr.Button("Clear", variant="secondary", size="lg")
with gr.Group(elem_classes="section-card"):
gr.HTML(
'Try examples
'
'These are simple starting points; feel free to ask your own complex research questions.
'
)
# One button per entry in EXAMPLES, collected in a list so the click
# handlers can be attached later by zipping with EXAMPLES again.
with gr.Column(elem_classes="example-buttons"):
example_buttons = [
gr.Button(_example_label(ex), variant="secondary", elem_classes="example-btn")
for ex in EXAMPLES
]
with gr.Group(elem_classes="section-card"):
gr.HTML(
''
'Output'
''
'If you see a connection error, please wait a moment and retry.'
''
'
'
)
# Output area: `answer` and `trace` are the two outputs written by run_ui.
with gr.Tabs():
with gr.TabItem("Result"):
answer = gr.Markdown(label="Final Answer")
with gr.TabItem("Record"):
trace = gr.Code(label="Execution Trace (JSON)", language="json")
# Second column (scale=4, CSS class "right-stack"): release card and settings.
with gr.Column(scale=4, min_width=340, elem_classes="right-stack"):
with gr.Group(elem_classes="section-card"):
gr.HTML(
f"""
Open release
"""
)
with gr.Group(elem_classes="section-card"):
gr.HTML('Settings
')
# Read-only display of QUEST_MODEL_ID. The component is not bound to a
# name, so nothing in this block reads it back or wires events to it.
gr.Textbox(
label="Model",
value=QUEST_MODEL_ID,
interactive=False,
elem_id="quest-model",
)
# Choices are (display label, value) pairs; the value string is what
# is passed to run_ui as the memory strategy.
memory_strategy = gr.Radio(
label="Memory Strategy",
choices=[
("Condenser (default)", "condenser"),
("Vanilla", "vanilla"),
("Discard-all", "discard_all"),
("Hide-tool-result", "hide_tool_result"),
],
value="condenser",
elem_id="quest-memory-strategy",
)
# Static help text describing each memory strategy option above.
gr.HTML(
''
'Condenser (default) — when context grows large, a State Summarizer LLM compresses earlier turns into a structured JSON of trusted/untrusted/uncertain claims, visited sources, and prior search queries; the agent continues with that compact state.
'
'Vanilla — memory management disabled; the full conversation history is kept.
'
'Discard-all — when context grows large, the entire message history is reset, restarting the agent from the original question with no accumulated context.
'
'Hide-tool-result — when context grows large, older tool responses are pruned; only the most recent tool result is kept.'
'
'
)
# Max Turns slider: whole numbers from 2 to 50, default 25.
max_turns = gr.Slider(
label="Max Turns",
minimum=2,
maximum=50,
value=25,
step=1,
elem_id="quest-max-turns",
)
# Temperature slider: 0.0 to 1.5 in steps of 0.1, default 1.0.
temperature = gr.Slider(
label="Temperature",
minimum=0.0,
maximum=1.5,
value=1.0,
step=0.1,
elem_id="quest-temperature",
)
gr.HTML(
"""
"""
)
# --- Event wiring ---
# IMPORTANT: keep `run_event` pointing at the .click() (the long-running
# generator), not at the chained follow-up (.success() below). The Stop
# button's cancels=[run_event] must target the generator for Stop to
# actually interrupt it; if we captured the chained event instead, Stop
# would cancel the instant clear lambda and the agent would keep running.
run_event = run_btn.click(
fn=run_ui,
inputs=[question, max_turns, memory_strategy, temperature],
outputs=[answer, trace],
)
# .success() (not .then()) so the textbox is cleared ONLY on a clean run.
# If Stop cancels the generator we leave the question intact, so the user
# can tweak it and re-run without retyping.
run_event.success(
fn=lambda: "",
inputs=[],
outputs=[question],
)
# Clicking an example copies its text into the question box. Binding the
# text as a lambda default (text=ex["text"]) captures each example's own
# value at definition time instead of the loop variable's final value.
for btn, ex in zip(example_buttons, EXAMPLES):
btn.click(
fn=(lambda text=ex["text"]: text),
inputs=[],
outputs=[question],
)
# fn=None: Stop runs no handler of its own; it only cancels run_event.
stop_btn.click(fn=None, cancels=[run_event])
# Clear resets the question and answer to empty strings and the trace to
# the string "{}".
clear_btn.click(
fn=lambda: ("", "", "{}"),
inputs=[],
outputs=[question, answer, trace],
)
# The research agent is almost entirely I/O-bound (it waits on the OSC vLLM
# endpoint, Serper, Jina and Azure over HTTP), so many runs can proceed in
# parallel on even a small CPU box. Gradio's default_concurrency_limit is 1,
# which serialises every run_ui call and is what produces the "long queue of
# requests pending" warning. Lift it; the real ceiling is the 8 OSC vLLM
# instances behind the nginx load balancer. Tunable via the QUEST_CONCURRENCY
# and QUEST_QUEUE_MAX Space variables without a code change.
def _env_int(name: str, default: int, minimum: int = 1) -> int:
    """Read an integer setting from the environment, falling back safely.

    A blank, non-numeric or too-small value must not take the app down at
    import time or configure a queue that can never run a job, so every such
    case logs a warning (except "unset"/blank, which is the normal default
    path) and returns ``default`` instead of raising.

    Args:
        name: Environment variable to read.
        default: Value used when the variable is unset, blank or invalid.
        minimum: Smallest accepted value; anything lower is rejected.

    Returns:
        The parsed integer when it is valid and ``>= minimum``, otherwise
        ``default``.
    """
    # Local import: the file's top-level import block is outside this chunk.
    import logging

    raw = os.getenv(name)
    if raw is None or not raw.strip():
        return default
    try:
        value = int(raw)
    except ValueError:
        logging.getLogger(__name__).warning(
            "Ignoring %s=%r (not an integer); using default %d", name, raw, default
        )
        return default
    if value < minimum:
        logging.getLogger(__name__).warning(
            "Ignoring %s=%d (must be >= %d); using default %d",
            name,
            value,
            minimum,
            default,
        )
        return default
    return value


QUEST_CONCURRENCY = _env_int("QUEST_CONCURRENCY", 12)
QUEST_QUEUE_MAX = _env_int("QUEST_QUEUE_MAX", 80)
# Turn on Gradio's request queue: event handlers default to a concurrency
# limit of QUEST_CONCURRENCY, and the queue holds at most QUEST_QUEUE_MAX
# events at a time.
demo.queue(max_size=QUEST_QUEUE_MAX, default_concurrency_limit=QUEST_CONCURRENCY)

if __name__ == "__main__":
    # Start the server only when executed as a script. The thread pool gets
    # three threads per concurrent run, with a floor of 40.
    demo.launch(max_threads=max(QUEST_CONCURRENCY * 3, 40))