Route Quest-4B through dedicated HF Inference Endpoint; ship QUEST prompt/schema
Browse files- .env.example +11 -3
- README.md +96 -40
- app.py +192 -69
.env.example
CHANGED
|
@@ -1,5 +1,13 @@
|
|
| 1 |
-
#
|
| 2 |
HF_TOKEN=hf_xxx
|
| 3 |
|
| 4 |
-
#
|
| 5 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Required: personal HF token with read access to osunlp/Quest-4B.
|
| 2 |
HF_TOKEN=hf_xxx
|
| 3 |
|
| 4 |
+
# Dedicated HF Inference Endpoint URL that serves osunlp/Quest-4B.
|
| 5 |
+
# Must end with /v1/.
|
| 6 |
+
QUEST_BASE_URL=https://your-endpoint-id.aws.endpoints.huggingface.cloud/v1/
|
| 7 |
+
|
| 8 |
+
# Model name the endpoint responds to. TGI containers usually use "tgi";
|
| 9 |
+
# vLLM containers usually use the original repo id ("osunlp/Quest-4B").
|
| 10 |
+
QUEST_ENDPOINT_MODEL=tgi
|
| 11 |
+
|
| 12 |
+
# Default model preselected in the dropdown.
|
| 13 |
+
DEFAULT_MODEL=osunlp/Quest-4B
|
README.md
CHANGED
|
@@ -9,58 +9,114 @@ app_file: app.py
|
|
| 9 |
pinned: false
|
| 10 |
---
|
| 11 |
|
| 12 |
-
# DeepResearch Space
|
| 13 |
|
| 14 |
-
|
|
|
|
|
|
|
|
|
|
| 15 |
|
| 16 |
-
|
| 17 |
-
-
|
| 18 |
-
- `
|
| 19 |
-
-
|
| 20 |
-
-
|
| 21 |
-
- easy model replacement later
|
| 22 |
|
| 23 |
-
|
| 24 |
|
| 25 |
-
``
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
```
|
| 31 |
|
| 32 |
-
##
|
| 33 |
|
| 34 |
-
1.
|
| 35 |
-
2.
|
| 36 |
-
3.
|
| 37 |
-
|
| 38 |
-
4.
|
| 39 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
|
| 41 |
-
###
|
| 42 |
|
| 43 |
-
|
| 44 |
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
|
|
|
|
|
|
| 49 |
|
| 50 |
-
|
|
|
|
|
|
|
| 51 |
|
| 52 |
-
|
|
|
|
|
|
|
|
|
|
| 53 |
|
| 54 |
-
-
|
| 55 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
|
| 57 |
-
|
| 58 |
-
1. keep tool protocol unchanged (`<tool_call>`, `<tool_response>`, `<answer>`)
|
| 59 |
-
2. replace only model adapter (`call_model`)
|
| 60 |
-
3. keep UI and tool chain unchanged
|
| 61 |
|
| 62 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
pinned: false
|
| 10 |
---
|
| 11 |
|
| 12 |
+
# DeepResearch Space
|
| 13 |
|
| 14 |
+
An interactive Hugging Face Space for a **Quest DeepResearch** agent. The app
|
| 15 |
+
can either talk to **`osunlp/Quest-4B`** (our own fine-tuned research model,
|
| 16 |
+
routed through a private HF Inference Endpoint) or fall back to open-weights
|
| 17 |
+
models through the shared HF Inference API.
|
| 18 |
|
| 19 |
+
Supported tools:
|
| 20 |
+
- `search` (DuckDuckGo, multi-query)
|
| 21 |
+
- `visit` (HTTP fetch + text extraction, multi-URL)
|
| 22 |
+
- lightweight research-state summary to cut repeated work
|
| 23 |
+
- `<answer>` extraction for the final response
|
|
|
|
| 24 |
|
| 25 |
+
---
|
| 26 |
|
| 27 |
+
## 1) Use our own `osunlp/Quest-4B` model (recommended)
|
| 28 |
+
|
| 29 |
+
Because the model is **private** during the beta, it is not on the free
|
| 30 |
+
Inference API. You host it yourself on a dedicated HF Inference Endpoint
|
| 31 |
+
(pay-as-you-go, scale-to-zero), and point this Space at it.
|
|
|
|
| 32 |
|
| 33 |
+
### 1a) Create the endpoint once
|
| 34 |
|
| 35 |
+
1. Open <https://ui.endpoints.huggingface.co/> and click **"New endpoint"**.
|
| 36 |
+
2. **Model repository**: `osunlp/Quest-4B` (use a token with access).
|
| 37 |
+
3. **Hardware**: `1x Nvidia L4 (24GB)` is usually the sweet spot for a 4B
|
| 38 |
+
model. `Nvidia T4 small (16GB)` works too and is cheaper.
|
| 39 |
+
4. **Advanced → Container Type**: keep `Text Generation Inference` (TGI) or
|
| 40 |
+
pick `vLLM`. Both expose an OpenAI-compatible `/v1/` route.
|
| 41 |
+
5. **Autoscaling → Scale-to-Zero**: enable it so you only pay when the
|
| 42 |
+
endpoint is serving traffic.
|
| 43 |
+
6. Hit **Create endpoint**. After ~1–2 minutes it turns `Running` and shows a
|
| 44 |
+
base URL like `https://abcdef.us-east-1.aws.endpoints.huggingface.cloud` (append `/v1/` to it when you set `QUEST_BASE_URL` in step 1b).
|
| 45 |
|
| 46 |
+
### 1b) Tell the Space how to reach it
|
| 47 |
|
| 48 |
+
In this Space's **Settings → Secrets / Variables**:
|
| 49 |
|
| 50 |
+
| Name | Value | Why |
|
| 51 |
+
|---|---|---|
|
| 52 |
+
| `HF_TOKEN` | your personal HF token with read access to `osunlp/Quest-4B` | pulls private weights & authenticates the endpoint call |
|
| 53 |
+
| `QUEST_BASE_URL` | the endpoint URL **ending with `/v1/`** (e.g. `https://abcdef.us-east-1.aws.endpoints.huggingface.cloud/v1/`) | tells the app to route chat completions to your endpoint |
|
| 54 |
+
| `QUEST_ENDPOINT_MODEL` | `tgi` (default; set to the original repo id `osunlp/Quest-4B` if you deployed with vLLM) | some containers need the exact model name |
|
| 55 |
+
| `DEFAULT_MODEL` | `osunlp/Quest-4B` | preselects the right option in the UI |
|
| 56 |
|
| 57 |
+
Click **Restart this Space**. The `Model` dropdown now shows
|
| 58 |
+
`osunlp/Quest-4B` at the top; selecting it routes requests through your
|
| 59 |
+
endpoint.
|
| 60 |
|
| 61 |
+
> Cost reality-check: on a 1× L4 at `$0.80/hr` with Scale-to-Zero, a small
|
| 62 |
+
> internal beta (a handful of testers, dozens of queries per day) typically
|
| 63 |
+
> stays under **\$100/month**. You can stop the endpoint manually from the UI
|
| 64 |
+
> any time to freeze costs.
|
| 65 |
|
| 66 |
+
---
|
| 67 |
+
|
| 68 |
+
## 2) Fallback: free open-weights models
|
| 69 |
+
|
| 70 |
+
If you just want to try the UI without spinning up an endpoint, pick any of
|
| 71 |
+
these in the dropdown. They run through the shared HF Inference API.
|
| 72 |
+
|
| 73 |
+
- `Qwen/Qwen3-8B`
|
| 74 |
+
- `google/gemma-3-12b-it`
|
| 75 |
+
- `deepseek-ai/DeepSeek-R1-Distill-Qwen-7B`
|
| 76 |
+
- `Qwen/Qwen2.5-7B-Instruct`
|
| 77 |
+
- `meta-llama/Llama-3.1-8B-Instruct`
|
| 78 |
+
|
| 79 |
+
Only `HF_TOKEN` is required for this path.
|
| 80 |
+
|
| 81 |
+
---
|
| 82 |
|
| 83 |
+
## 3) Share the beta with org members (without paying for Team)
|
|
|
|
|
|
|
|
|
|
| 84 |
|
| 85 |
+
Option A (simplest, **\$0** for access, Space Hardware stays on free CPU):
|
| 86 |
+
|
| 87 |
+
1. Keep the Space under your personal account.
|
| 88 |
+
2. **Settings → Visibility → Private**.
|
| 89 |
+
3. **Settings → Collaborators** → add each tester by HF username.
|
| 90 |
+
4. The endpoint lives under your personal namespace too, so the bill goes to
|
| 91 |
+
your personal payment method (you can expense invoices from
|
| 92 |
+
<https://huggingface.co/settings/billing>).
|
| 93 |
+
|
| 94 |
+
Option B (org-level billing): upgrade the organization to a Team plan and
|
| 95 |
+
recreate both the Space and the endpoint under the org namespace.
|
| 96 |
+
|
| 97 |
+
---
|
| 98 |
+
|
| 99 |
+
## 4) Local development
|
| 100 |
+
|
| 101 |
+
```bash
|
| 102 |
+
python -m venv .venv
|
| 103 |
+
source .venv/bin/activate
|
| 104 |
+
pip install -r requirements.txt
|
| 105 |
+
export HF_TOKEN=... # required
|
| 106 |
+
export QUEST_BASE_URL=https://.../v1/ # optional; only if testing against the endpoint
|
| 107 |
+
python app.py
|
| 108 |
+
```
|
| 109 |
+
|
| 110 |
+
---
|
| 111 |
|
| 112 |
+
## 5) Architecture notes
|
| 113 |
+
|
| 114 |
+
- `app.py` uses `huggingface_hub.InferenceClient(base_url=QUEST_BASE_URL, ...)`
|
| 115 |
+
for the private-endpoint path and the same client without `base_url` for the
|
| 116 |
+
shared API path.
|
| 117 |
+
- The system prompt matches the schema Quest-4B was trained on (array-based
|
| 118 |
+
`search` / `visit` with an explicit `goal`), so the private model stays
|
| 119 |
+
in-distribution. The open-weights fallbacks also follow the same schema.
|
| 120 |
+
- Visited URLs and search queries are cached in-process so repeated tool
|
| 121 |
+
calls don't re-hit the network.
|
| 122 |
+
- `<answer>...</answer>` terminates the ReAct loop.
|
app.py
CHANGED
|
@@ -2,8 +2,9 @@ import json
|
|
| 2 |
import os
|
| 3 |
import re
|
| 4 |
from dataclasses import dataclass, field
|
|
|
|
| 5 |
from pathlib import Path
|
| 6 |
-
from typing import Any, Dict, List, Optional, Set, Tuple
|
| 7 |
|
| 8 |
import gradio as gr
|
| 9 |
import requests
|
|
@@ -12,44 +13,73 @@ from duckduckgo_search import DDGS
|
|
| 12 |
from huggingface_hub import InferenceClient
|
| 13 |
|
| 14 |
|
| 15 |
-
|
| 16 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
"Qwen/Qwen3-8B",
|
| 18 |
"google/gemma-3-12b-it",
|
| 19 |
"deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
|
| 20 |
-
# Fallback older but usually reliable
|
| 21 |
"Qwen/Qwen2.5-7B-Instruct",
|
| 22 |
"meta-llama/Llama-3.1-8B-Instruct",
|
| 23 |
]
|
| 24 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
PAPER_URL = os.getenv("PAPER_URL", "#")
|
| 26 |
CODE_URL = os.getenv("CODE_URL", "#")
|
| 27 |
DATASET_URL = os.getenv("DATASET_URL", "#")
|
| 28 |
MODEL_URL = os.getenv("MODEL_URL", "#")
|
| 29 |
|
| 30 |
-
SYSTEM_PROMPT = """You are a Deep Research assistant.
|
| 31 |
-
You can think step by step, use tools, and then return a final answer.
|
| 32 |
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
</
|
| 38 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
<tool_call>
|
| 40 |
-
{"name":
|
| 41 |
</tool_call>
|
| 42 |
|
| 43 |
-
|
| 44 |
-
<answer>
|
| 45 |
-
...final answer...
|
| 46 |
-
</answer>
|
| 47 |
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
- If a tool fails, recover and continue.
|
| 52 |
-
"""
|
| 53 |
|
| 54 |
|
| 55 |
TOOL_RESPONSE_TEMPLATE = """<tool_response>
|
|
@@ -682,7 +712,7 @@ def parse_tool_call(text: str) -> Tuple[Optional[str], Optional[Dict[str, Any]],
|
|
| 682 |
return name, arguments, None
|
| 683 |
|
| 684 |
|
| 685 |
-
def
|
| 686 |
if not query.strip():
|
| 687 |
return {"ok": False, "error": "Search query cannot be empty."}
|
| 688 |
cache_key = f"{query.strip().lower()}::{max_results}"
|
|
@@ -704,6 +734,22 @@ def run_search(query: str, max_results: int = 5) -> Dict[str, Any]:
|
|
| 704 |
return payload
|
| 705 |
|
| 706 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 707 |
def _clean_html_to_text(html: str, max_chars: int) -> str:
|
| 708 |
soup = BeautifulSoup(html, "html.parser")
|
| 709 |
for tag in soup(["script", "style", "noscript"]):
|
|
@@ -713,12 +759,12 @@ def _clean_html_to_text(html: str, max_chars: int) -> str:
|
|
| 713 |
return text[:max_chars]
|
| 714 |
|
| 715 |
|
| 716 |
-
def
|
| 717 |
if not url.strip():
|
| 718 |
return {"ok": False, "error": "URL cannot be empty."}
|
| 719 |
cache_key = f"{url.strip()}::{max_chars}"
|
| 720 |
if cache_key in VISIT_CACHE:
|
| 721 |
-
return {**VISIT_CACHE[cache_key], "cached": True}
|
| 722 |
try:
|
| 723 |
resp = requests.get(
|
| 724 |
url,
|
|
@@ -731,11 +777,47 @@ def run_visit(url: str, max_chars: int = 6000) -> Dict[str, Any]:
|
|
| 731 |
text = _clean_html_to_text(resp.text, max_chars=max_chars)
|
| 732 |
else:
|
| 733 |
text = resp.text[:max_chars]
|
| 734 |
-
payload = {"ok": True, "url": url, "content": text, "cached": False}
|
| 735 |
VISIT_CACHE[cache_key] = payload
|
| 736 |
return payload
|
| 737 |
except Exception as exc:
|
| 738 |
-
return {"ok": False, "url": url, "error": str(exc)}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 739 |
|
| 740 |
|
| 741 |
def call_model(
|
|
@@ -774,14 +856,14 @@ def build_research_agent(
|
|
| 774 |
max_search_results: int,
|
| 775 |
temperature: float,
|
| 776 |
) -> Tuple[str, str]:
|
| 777 |
-
|
| 778 |
-
|
|
|
|
| 779 |
state = AgentState()
|
| 780 |
-
used_model =
|
| 781 |
-
recent_model_candidates = [m for m in DEFAULT_FREE_MODELS if m != model]
|
| 782 |
|
| 783 |
messages: List[Dict[str, str]] = [
|
| 784 |
-
{"role": "system", "content":
|
| 785 |
{"role": "user", "content": question},
|
| 786 |
]
|
| 787 |
|
|
@@ -797,14 +879,18 @@ def build_research_agent(
|
|
| 797 |
}
|
| 798 |
)
|
| 799 |
|
| 800 |
-
|
| 801 |
client=client,
|
| 802 |
messages=messages,
|
| 803 |
-
preferred_model=
|
| 804 |
-
candidate_models=
|
| 805 |
temperature=temperature,
|
| 806 |
max_new_tokens=1400,
|
| 807 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 808 |
messages.append({"role": "assistant", "content": model_output})
|
| 809 |
state.trace.append({"turn": turn, "assistant": model_output})
|
| 810 |
|
|
@@ -827,48 +913,77 @@ def build_research_agent(
|
|
| 827 |
continue
|
| 828 |
else:
|
| 829 |
if tool_name == "search":
|
| 830 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 831 |
max_results = int(tool_args.get("max_results", max_search_results))
|
| 832 |
max_results = max(1, min(max_results, 10))
|
| 833 |
-
|
| 834 |
-
|
| 835 |
-
|
| 836 |
-
|
| 837 |
-
|
| 838 |
-
|
| 839 |
-
|
| 840 |
-
|
| 841 |
-
|
| 842 |
-
|
| 843 |
-
|
| 844 |
-
|
| 845 |
-
|
| 846 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 847 |
if first_titles:
|
| 848 |
state.trusted_notes.append(
|
| 849 |
-
f"Searched '{
|
| 850 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 851 |
elif tool_name == "visit":
|
| 852 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 853 |
max_chars = int(tool_args.get("max_chars", 6000))
|
| 854 |
max_chars = max(500, min(max_chars, 20000))
|
| 855 |
-
|
| 856 |
-
|
| 857 |
-
|
| 858 |
-
|
| 859 |
-
|
| 860 |
-
|
| 861 |
-
|
| 862 |
-
|
| 863 |
-
|
| 864 |
-
|
| 865 |
-
|
| 866 |
-
|
| 867 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 868 |
if snippet:
|
| 869 |
state.trusted_notes.append(
|
| 870 |
-
f"Visited {
|
| 871 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 872 |
else:
|
| 873 |
tool_response = {"ok": False, "error": f"Unknown tool: {tool_name}"}
|
| 874 |
|
|
@@ -922,6 +1037,14 @@ def run_ui(
|
|
| 922 |
"Go to Settings -> Secrets -> add `HF_TOKEN`, then retry."
|
| 923 |
)
|
| 924 |
return warning, json.dumps({"error": warning}, ensure_ascii=False, indent=2)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 925 |
try:
|
| 926 |
return build_research_agent(
|
| 927 |
question=question,
|
|
@@ -997,8 +1120,8 @@ with gr.Blocks(
|
|
| 997 |
gr.HTML('<div class="section-heading">Settings</div>')
|
| 998 |
model = gr.Dropdown(
|
| 999 |
label="Model",
|
| 1000 |
-
choices=
|
| 1001 |
-
value=DEFAULT_MODEL if DEFAULT_MODEL in
|
| 1002 |
allow_custom_value=True,
|
| 1003 |
)
|
| 1004 |
max_turns = gr.Slider(
|
|
|
|
| 2 |
import os
|
| 3 |
import re
|
| 4 |
from dataclasses import dataclass, field
|
| 5 |
+
from datetime import date
|
| 6 |
from pathlib import Path
|
| 7 |
+
from typing import Any, Dict, List, Optional, Set, Tuple, Union
|
| 8 |
|
| 9 |
import gradio as gr
|
| 10 |
import requests
|
|
|
|
| 13 |
from huggingface_hub import InferenceClient
|
| 14 |
|
| 15 |
|
| 16 |
+
# --- Model configuration ---------------------------------------------------
|
| 17 |
+
# Our own DeepResearch model. When QUEST_BASE_URL is configured in Space
|
| 18 |
+
# Secrets, the app will route requests to that dedicated HF Inference Endpoint
|
| 19 |
+
# instead of the shared HF Inference API.
|
| 20 |
+
QUEST_MODEL_ID = "osunlp/Quest-4B"
|
| 21 |
+
QUEST_BASE_URL = os.getenv("QUEST_BASE_URL", "").strip()
|
| 22 |
+
# Endpoints built from the TGI image expose a single-model OpenAI route; the
|
| 23 |
+
# model name passed to chat_completion is usually "tgi". vLLM endpoints usually
|
| 24 |
+
# want the original repo id. QUEST_ENDPOINT_MODEL overrides this if needed.
|
| 25 |
+
QUEST_ENDPOINT_MODEL = os.getenv("QUEST_ENDPOINT_MODEL", "tgi").strip() or "tgi"
|
| 26 |
+
|
| 27 |
+
# Shared HF Inference API fallbacks (free, rate-limited). These are used when
|
| 28 |
+
# the user picks one of these from the Model dropdown; they do NOT go through
|
| 29 |
+
# the private endpoint.
|
| 30 |
+
FREE_FALLBACK_MODELS = [
|
| 31 |
"Qwen/Qwen3-8B",
|
| 32 |
"google/gemma-3-12b-it",
|
| 33 |
"deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
|
|
|
|
| 34 |
"Qwen/Qwen2.5-7B-Instruct",
|
| 35 |
"meta-llama/Llama-3.1-8B-Instruct",
|
| 36 |
]
|
| 37 |
+
|
| 38 |
+
# Quest-4B is always listed first and is the default selection when the endpoint is wired; without the endpoint we still list
|
| 39 |
+
# it so you can see what the target model is, but it will only work after the
|
| 40 |
+
# QUEST_BASE_URL secret is configured.
|
| 41 |
+
DEFAULT_MODEL_CHOICES = [QUEST_MODEL_ID] + FREE_FALLBACK_MODELS
|
| 42 |
+
DEFAULT_MODEL = os.getenv(
|
| 43 |
+
"DEFAULT_MODEL",
|
| 44 |
+
QUEST_MODEL_ID if QUEST_BASE_URL else FREE_FALLBACK_MODELS[0],
|
| 45 |
+
)
|
| 46 |
+
|
| 47 |
PAPER_URL = os.getenv("PAPER_URL", "#")
|
| 48 |
CODE_URL = os.getenv("CODE_URL", "#")
|
| 49 |
DATASET_URL = os.getenv("DATASET_URL", "#")
|
| 50 |
MODEL_URL = os.getenv("MODEL_URL", "#")
|
| 51 |
|
|
|
|
|
|
|
| 52 |
|
| 53 |
+
# --- System prompt ---------------------------------------------------------
|
| 54 |
+
# Full QUEST SYSTEM_PROMPT (mirrors inference/prompt.py in the research repo)
|
| 55 |
+
# so that Quest-4B sees the exact tool schema it was trained with. Other
|
| 56 |
+
# models still follow this schema just fine in practice.
|
| 57 |
+
QUEST_SYSTEM_PROMPT = """You are a deep research assistant. Your core function is to conduct thorough, multi-source investigations into any topic. You must handle both broad, open-domain inquiries and queries within specialized academic fields. For every request, synthesize information from credible, diverse sources to deliver a comprehensive, accurate, and objective response. When you have gathered sufficient information and are ready to provide the definitive response, you must enclose the entire final answer within <answer></answer> tags.
|
| 58 |
+
|
| 59 |
+
# Tools
|
| 60 |
+
|
| 61 |
+
You may call one or more functions to assist with the user query.
|
| 62 |
+
|
| 63 |
+
You are provided with function signatures within <tools></tools> XML tags:
|
| 64 |
+
<tools>
|
| 65 |
+
{"type": "function", "function": {"name": "search", "description": "Perform Google web searches then returns a string of the top search results. Accepts multiple queries.", "parameters": {"type": "object", "properties": {"query": {"type": "array", "items": {"type": "string", "description": "The search query."}, "minItems": 1, "description": "The list of search queries."}}, "required": ["query"]}}}
|
| 66 |
+
{"type": "function", "function": {"name": "visit", "description": "Visit webpage(s) and return the summary of the content.", "parameters": {"type": "object", "properties": {"url": {"type": "array", "items": {"type": "string"}, "description": "The URL(s) of the webpage(s) to visit. Can be a single URL or an array of URLs."}, "goal": {"type": "string", "description": "The specific information goal for visiting webpage(s)."}}, "required": ["url", "goal"]}}}
|
| 67 |
+
</tools>
|
| 68 |
+
|
| 69 |
+
# Using prev_state (Research State Summary)
|
| 70 |
+
|
| 71 |
+
If you see a "RESEARCH STATE SUMMARY (prev_state)" section in the user message, it contains a compressed summary of previous research progress. Use it to avoid repeating searches/visits that have already been executed, use verified information directly in your answer, and follow up on uncertain claims only when needed.
|
| 72 |
+
|
| 73 |
+
For each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:
|
| 74 |
<tool_call>
|
| 75 |
+
{"name": <function-name>, "arguments": <args-json-object>}
|
| 76 |
</tool_call>
|
| 77 |
|
| 78 |
+
Current date: """
|
|
|
|
|
|
|
|
|
|
| 79 |
|
| 80 |
+
|
| 81 |
+
def build_system_prompt() -> str:
    """Return the QUEST system prompt with today's date appended.

    ``QUEST_SYSTEM_PROMPT`` ends with ``"Current date: "``, so the ISO-8601
    date (``YYYY-MM-DD``) is concatenated directly after it. The date is
    computed on every call rather than once at import time.
    """
    today_iso = date.today().isoformat()
    return f"{QUEST_SYSTEM_PROMPT}{today_iso}"
|
|
|
|
|
|
|
| 83 |
|
| 84 |
|
| 85 |
TOOL_RESPONSE_TEMPLATE = """<tool_response>
|
|
|
|
| 712 |
return name, arguments, None
|
| 713 |
|
| 714 |
|
| 715 |
+
def _run_search_single(query: str, max_results: int) -> Dict[str, Any]:
|
| 716 |
if not query.strip():
|
| 717 |
return {"ok": False, "error": "Search query cannot be empty."}
|
| 718 |
cache_key = f"{query.strip().lower()}::{max_results}"
|
|
|
|
| 734 |
return payload
|
| 735 |
|
| 736 |
|
| 737 |
+
def run_search(query: Union[str, List[str]], max_results: int = 5) -> Dict[str, Any]:
    """Run one or more search queries through ``_run_search_single``.

    QUEST's schema passes ``query`` as an array of strings, while the simpler
    starter schema used a single string. Both shapes are accepted.

    Args:
        query: A single query string, or a list of query strings. In the list
            form, non-string and blank entries are dropped and the remaining
            entries are stripped of surrounding whitespace.
        max_results: Forwarded unchanged to ``_run_search_single`` for every
            query.

    Returns:
        For a scalar ``query`` (including ``None``), whatever
        ``_run_search_single`` returns for the stripped string.
        For a list, ``{"ok": True, "queries": [...], "results": [...]}`` where
        ``queries`` holds only the queries that were actually run, so that
        ``queries[i]`` corresponds to ``results[i]``.
        If a list contains no usable query, the same error payload the
        single-query path uses for an empty query.
    """
    if isinstance(query, list):
        # Filter first so the echoed "queries" list lines up 1:1 with "results".
        cleaned = [q.strip() for q in query if isinstance(q, str) and q.strip()]
        if not cleaned:
            # Mirror the scalar path instead of reporting an empty "success".
            return {"ok": False, "error": "Search query cannot be empty."}
        sub_results: List[Dict[str, Any]] = [
            _run_search_single(q, max_results) for q in cleaned
        ]
        return {"ok": True, "queries": cleaned, "results": sub_results}
    return _run_search_single(str(query or "").strip(), max_results)
|
| 751 |
+
|
| 752 |
+
|
| 753 |
def _clean_html_to_text(html: str, max_chars: int) -> str:
|
| 754 |
soup = BeautifulSoup(html, "html.parser")
|
| 755 |
for tag in soup(["script", "style", "noscript"]):
|
|
|
|
| 759 |
return text[:max_chars]
|
| 760 |
|
| 761 |
|
| 762 |
+
def _run_visit_single(url: str, max_chars: int, goal: str = "") -> Dict[str, Any]:
|
| 763 |
if not url.strip():
|
| 764 |
return {"ok": False, "error": "URL cannot be empty."}
|
| 765 |
cache_key = f"{url.strip()}::{max_chars}"
|
| 766 |
if cache_key in VISIT_CACHE:
|
| 767 |
+
return {**VISIT_CACHE[cache_key], "cached": True, "goal": goal}
|
| 768 |
try:
|
| 769 |
resp = requests.get(
|
| 770 |
url,
|
|
|
|
| 777 |
text = _clean_html_to_text(resp.text, max_chars=max_chars)
|
| 778 |
else:
|
| 779 |
text = resp.text[:max_chars]
|
| 780 |
+
payload = {"ok": True, "url": url, "content": text, "cached": False, "goal": goal}
|
| 781 |
VISIT_CACHE[cache_key] = payload
|
| 782 |
return payload
|
| 783 |
except Exception as exc:
|
| 784 |
+
return {"ok": False, "url": url, "error": str(exc), "goal": goal}
|
| 785 |
+
|
| 786 |
+
|
| 787 |
+
def run_visit(
    url: Union[str, List[str]],
    max_chars: int = 6000,
    goal: str = "",
) -> Dict[str, Any]:
    """Fetch one or more URLs through ``_run_visit_single``.

    Accepts either a single URL string or a list of URLs (the QUEST schema).

    Args:
        url: A single URL, or a list of URLs. In the list form, non-string and
            blank entries are dropped and the remaining entries are stripped of
            surrounding whitespace, matching what the scalar path does.
        max_chars: Forwarded unchanged to ``_run_visit_single`` for every URL.
        goal: The information goal for the visit; forwarded to each
            single-URL fetch and echoed in the batch payload.

    Returns:
        For a scalar ``url`` (including ``None``), whatever
        ``_run_visit_single`` returns for the stripped string.
        For a list, ``{"ok": True, "goal": goal, "results": [...]}`` with one
        sub-result per usable URL, in input order.
        If a list contains no usable URL, the same error payload the
        single-URL path uses for an empty URL.
    """
    if isinstance(url, list):
        cleaned = [u.strip() for u in url if isinstance(u, str) and u.strip()]
        if not cleaned:
            # Mirror the scalar path instead of reporting an empty "success".
            return {"ok": False, "error": "URL cannot be empty."}
        sub_results: List[Dict[str, Any]] = [
            _run_visit_single(u, max_chars, goal) for u in cleaned
        ]
        return {"ok": True, "goal": goal, "results": sub_results}
    return _run_visit_single(str(url or "").strip(), max_chars, goal)
|
| 801 |
+
|
| 802 |
+
|
| 803 |
+
def _build_client_for_model(model: str) -> Tuple[InferenceClient, str, List[str]]:
    """Pick the inference client and model ids for the selected model.

    Args:
        model: The model id chosen by the caller (e.g. from the Model dropdown).

    Returns:
        A ``(client, primary_model_id, fallback_model_ids)`` tuple.

        * Quest model selected and ``QUEST_BASE_URL`` set: the client targets
          the dedicated endpoint (120 s timeout), the primary id is
          ``QUEST_ENDPOINT_MODEL`` and there are no fallbacks.
        * Anything else: a client without ``base_url`` (60 s timeout), the
          primary id is ``model`` itself, and the fallbacks are every entry of
          ``FREE_FALLBACK_MODELS`` other than ``model``.

    The ``HF_TOKEN`` environment variable is passed as the token in both cases.
    """
    hf_token = os.getenv("HF_TOKEN")
    routes_to_endpoint = model == QUEST_MODEL_ID and bool(QUEST_BASE_URL)

    if not routes_to_endpoint:
        shared_client = InferenceClient(token=hf_token, timeout=60)
        other_free_models = [
            candidate for candidate in FREE_FALLBACK_MODELS if candidate != model
        ]
        return shared_client, model, other_free_models

    endpoint_client = InferenceClient(
        base_url=QUEST_BASE_URL,
        token=hf_token,
        timeout=120,
    )
    return endpoint_client, QUEST_ENDPOINT_MODEL, []
|
| 821 |
|
| 822 |
|
| 823 |
def call_model(
|
|
|
|
| 856 |
max_search_results: int,
|
| 857 |
temperature: float,
|
| 858 |
) -> Tuple[str, str]:
|
| 859 |
+
client, primary_model, fallback_models = _build_client_for_model(model)
|
| 860 |
+
# Display label: the real HF repo id is nicer than the TGI shim name.
|
| 861 |
+
display_primary = model if (model == QUEST_MODEL_ID) else primary_model
|
| 862 |
state = AgentState()
|
| 863 |
+
used_model = display_primary
|
|
|
|
| 864 |
|
| 865 |
messages: List[Dict[str, str]] = [
|
| 866 |
+
{"role": "system", "content": build_system_prompt()},
|
| 867 |
{"role": "user", "content": question},
|
| 868 |
]
|
| 869 |
|
|
|
|
| 879 |
}
|
| 880 |
)
|
| 881 |
|
| 882 |
+
raw_output, endpoint_model = call_model(
|
| 883 |
client=client,
|
| 884 |
messages=messages,
|
| 885 |
+
preferred_model=primary_model,
|
| 886 |
+
candidate_models=fallback_models,
|
| 887 |
temperature=temperature,
|
| 888 |
max_new_tokens=1400,
|
| 889 |
)
|
| 890 |
+
model_output = raw_output
|
| 891 |
+
# Preserve the human-friendly model id for the trace even if the
|
| 892 |
+
# endpoint ignores the "model" param and returns the TGI shim name.
|
| 893 |
+
used_model = display_primary if endpoint_model == primary_model == QUEST_ENDPOINT_MODEL else endpoint_model
|
| 894 |
messages.append({"role": "assistant", "content": model_output})
|
| 895 |
state.trace.append({"turn": turn, "assistant": model_output})
|
| 896 |
|
|
|
|
| 913 |
continue
|
| 914 |
else:
|
| 915 |
if tool_name == "search":
|
| 916 |
+
raw_query = tool_args.get("query", "")
|
| 917 |
+
queries: List[str]
|
| 918 |
+
if isinstance(raw_query, list):
|
| 919 |
+
queries = [str(q).strip() for q in raw_query if str(q).strip()]
|
| 920 |
+
else:
|
| 921 |
+
queries = [str(raw_query).strip()] if str(raw_query).strip() else []
|
| 922 |
max_results = int(tool_args.get("max_results", max_search_results))
|
| 923 |
max_results = max(1, min(max_results, 10))
|
| 924 |
+
|
| 925 |
+
per_query: List[Dict[str, Any]] = []
|
| 926 |
+
for q in queries:
|
| 927 |
+
if q in state.searched_query_set:
|
| 928 |
+
per_query.append({
|
| 929 |
+
"ok": True,
|
| 930 |
+
"query": q,
|
| 931 |
+
"cached": True,
|
| 932 |
+
"note": "Already searched; reusing cached result.",
|
| 933 |
+
"results": [],
|
| 934 |
+
})
|
| 935 |
+
continue
|
| 936 |
+
state.searched_queries.append(q)
|
| 937 |
+
state.searched_query_set.add(q)
|
| 938 |
+
single = _run_search_single(q, max_results)
|
| 939 |
+
per_query.append(single)
|
| 940 |
+
if single.get("ok"):
|
| 941 |
+
first_titles = [r.get("title", "") for r in single.get("results", [])[:2]]
|
| 942 |
if first_titles:
|
| 943 |
state.trusted_notes.append(
|
| 944 |
+
f"Searched '{q}' and found leads: {', '.join(t for t in first_titles if t)}"
|
| 945 |
)
|
| 946 |
+
tool_response = (
|
| 947 |
+
per_query[0]
|
| 948 |
+
if len(per_query) == 1
|
| 949 |
+
else {"ok": True, "queries": queries, "results": per_query}
|
| 950 |
+
)
|
| 951 |
elif tool_name == "visit":
|
| 952 |
+
raw_url = tool_args.get("url", "")
|
| 953 |
+
urls: List[str]
|
| 954 |
+
if isinstance(raw_url, list):
|
| 955 |
+
urls = [str(u).strip() for u in raw_url if str(u).strip()]
|
| 956 |
+
else:
|
| 957 |
+
urls = [str(raw_url).strip()] if str(raw_url).strip() else []
|
| 958 |
+
goal = str(tool_args.get("goal", "")).strip()
|
| 959 |
max_chars = int(tool_args.get("max_chars", 6000))
|
| 960 |
max_chars = max(500, min(max_chars, 20000))
|
| 961 |
+
|
| 962 |
+
per_url: List[Dict[str, Any]] = []
|
| 963 |
+
for u in urls:
|
| 964 |
+
if u in state.visited_url_set:
|
| 965 |
+
per_url.append({
|
| 966 |
+
"ok": True,
|
| 967 |
+
"url": u,
|
| 968 |
+
"cached": True,
|
| 969 |
+
"note": "Already visited; reusing cached result.",
|
| 970 |
+
})
|
| 971 |
+
continue
|
| 972 |
+
state.visited_urls.append(u)
|
| 973 |
+
state.visited_url_set.add(u)
|
| 974 |
+
single = _run_visit_single(u, max_chars, goal)
|
| 975 |
+
per_url.append(single)
|
| 976 |
+
if single.get("ok"):
|
| 977 |
+
snippet = str(single.get("content", ""))[:180]
|
| 978 |
if snippet:
|
| 979 |
state.trusted_notes.append(
|
| 980 |
+
f"Visited {u} and extracted key context: {snippet}"
|
| 981 |
)
|
| 982 |
+
tool_response = (
|
| 983 |
+
per_url[0]
|
| 984 |
+
if len(per_url) == 1
|
| 985 |
+
else {"ok": True, "goal": goal, "results": per_url}
|
| 986 |
+
)
|
| 987 |
else:
|
| 988 |
tool_response = {"ok": False, "error": f"Unknown tool: {tool_name}"}
|
| 989 |
|
|
|
|
| 1037 |
"Go to Settings -> Secrets -> add `HF_TOKEN`, then retry."
|
| 1038 |
)
|
| 1039 |
return warning, json.dumps({"error": warning}, ensure_ascii=False, indent=2)
|
| 1040 |
+
if model == QUEST_MODEL_ID and not QUEST_BASE_URL:
|
| 1041 |
+
warning = (
|
| 1042 |
+
f"`{QUEST_MODEL_ID}` is private and not available via the free HF Inference API. "
|
| 1043 |
+
"Create a dedicated HF Inference Endpoint for it (https://ui.endpoints.huggingface.co/), "
|
| 1044 |
+
"then set `QUEST_BASE_URL` in Space Secrets to the endpoint's `/v1/` URL. "
|
| 1045 |
+
"In the meantime you can pick one of the open-weights models in the dropdown."
|
| 1046 |
+
)
|
| 1047 |
+
return warning, json.dumps({"error": warning}, ensure_ascii=False, indent=2)
|
| 1048 |
try:
|
| 1049 |
return build_research_agent(
|
| 1050 |
question=question,
|
|
|
|
| 1120 |
gr.HTML('<div class="section-heading">Settings</div>')
|
| 1121 |
model = gr.Dropdown(
|
| 1122 |
label="Model",
|
| 1123 |
+
choices=DEFAULT_MODEL_CHOICES,
|
| 1124 |
+
value=DEFAULT_MODEL if DEFAULT_MODEL in DEFAULT_MODEL_CHOICES else DEFAULT_MODEL_CHOICES[0],
|
| 1125 |
allow_custom_value=True,
|
| 1126 |
)
|
| 1127 |
max_turns = gr.Slider(
|