Spaces:

osunlp
/

QUEST

Running

Lzy01241010 Claude Opus 4.7 committed 10 days ago

Commit

0bfaafb

1 Parent(s): 70bbd7b

agent: wire research-repo Jina + LLM extractor + scholar + real condenser

Brings the Space inference path into functional parity with
OSU-NLP-Group/QUEST/inference/:

- System prompt now advertises a third tool `google_scholar` (Serper /scholar).
- `visit` first tries r.jina.ai (with `JINA_API_KEYS`); on a hit it runs the
SUMMARY model (`SUMMARY_MODEL_NAME`, `API_KEY`/`API_BASE`) with the
research-repo EXTRACTOR_PROMPT for goal-directed distillation. If the LLM is
unavailable the raw Jina markdown is returned; if Jina itself fails, `visit`
falls back to the existing BeautifulSoup path.
- New `_run_scholar_single` mirrors inference/tool_scholar.py (Serper
/scholar endpoint, same row fields).
- `condenser` strategy now invokes a real State Summarizer: when the
in-context token estimate crosses MEMORY_TOKEN_THRESHOLD (default 16000),
the MEMORY model is called with the verbatim MEMORY_SYSTEM_PROMPT from
inference/tool_memory.py to produce the structured trusted/untrusted/
uncertain JSON, which is then injected as RESEARCH STATE SUMMARY and
replaces the long history. The legacy turn-count heuristic stays as a
fallback if the MEMORY model is not configured.
- requirements.txt: openai>=1.40, tiktoken>=0.7

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>

Files changed (2) hide show

app.py +384 -3
requirements.txt +2 -0

app.py CHANGED Viewed

@@ -52,11 +52,27 @@ You are provided with function signatures within <tools></tools> XML tags:
 <tools>
 {"type": "function", "function": {"name": "search", "description": "Perform Google web searches then returns a string of the top search results. Accepts multiple queries.", "parameters": {"type": "object", "properties": {"query": {"type": "array", "items": {"type": "string", "description": "The search query."}, "minItems": 1, "description": "The list of search queries."}}, "required": ["query"]}}}
 {"type": "function", "function": {"name": "visit", "description": "Visit webpage(s) and return the summary of the content.", "parameters": {"type": "object", "properties": {"url": {"type": "array", "items": {"type": "string"}, "description": "The URL(s) of the webpage(s) to visit. Can be a single URL or an array of URLs."}, "goal": {"type": "string", "description": "The specific information goal for visiting webpage(s)."}}, "required": ["url", "goal"]}}}
 </tools>
 # Using prev_state (Research State Summary)
-If you see a "RESEARCH STATE SUMMARY (prev_state)" section in the user message, it contains a compressed summary of previous research progress. Use it to avoid repeating searches/visits that have already been executed, use verified information directly in your answer, and follow up on uncertain claims only when needed.
 For each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:
 <tool_call>
@@ -66,6 +82,116 @@ For each function call, return a json object with function name and arguments wi
 Current date: """
 def build_system_prompt() -> str:
     """Return QUEST_SYSTEM_PROMPT with today's date (ISO format) appended."""
     today_iso = date.today().isoformat()
     return QUEST_SYSTEM_PROMPT + today_iso
@@ -1417,12 +1543,195 @@ def _clean_html_to_text(html: str, max_chars: int) -> str:
     return text[:max_chars]
 def _run_visit_single(url: str, max_chars: int, goal: str = "") -> Dict[str, Any]:
     if not url.strip():
         return {"ok": False, "error": "URL cannot be empty."}
-    cache_key = f"{url.strip()}::{max_chars}"
     if cache_key in VISIT_CACHE:
         return {**VISIT_CACHE[cache_key], "cached": True, "goal": goal}
     try:
         resp = requests.get(
             url,
@@ -1634,6 +1943,13 @@ def build_research_agent(
     ]
     final_answer: Optional[str] = None
     status_lines.append("🚀 Starting research agent")
     yield _emit()
@@ -1643,7 +1959,53 @@ def build_research_agent(
     for turn in range(1, max_turns + 1):
         _apply_memory_strategy(messages, strategy, turn)
-        if strategy == "condenser" and state.trusted_notes and turn > 1 and turn % 3 == 0:
             summary_lines = "\n".join(f"- {n}" for n in state.trusted_notes[-6:])
             messages.append(
                 {
@@ -1816,6 +2178,25 @@ def build_research_agent(
                     f"✅ turn {turn}: read {visit_ok}/{len(urls)} page(s)"
                 )
                 yield _emit()
             else:
                 tool_response = {"ok": False, "error": f"Unknown tool: {tool_name}"}
                 status_lines.append(f"⚠️ turn {turn}: unknown tool `{tool_name}`")

 <tools>
 {"type": "function", "function": {"name": "search", "description": "Perform Google web searches then returns a string of the top search results. Accepts multiple queries.", "parameters": {"type": "object", "properties": {"query": {"type": "array", "items": {"type": "string", "description": "The search query."}, "minItems": 1, "description": "The list of search queries."}}, "required": ["query"]}}}
 {"type": "function", "function": {"name": "visit", "description": "Visit webpage(s) and return the summary of the content.", "parameters": {"type": "object", "properties": {"url": {"type": "array", "items": {"type": "string"}, "description": "The URL(s) of the webpage(s) to visit. Can be a single URL or an array of URLs."}, "goal": {"type": "string", "description": "The specific information goal for visiting webpage(s)."}}, "required": ["url", "goal"]}}}
+{"type": "function", "function": {"name": "google_scholar", "description": "Leverage Google Scholar to retrieve relevant information from academic publications. Accepts multiple queries.", "parameters": {"type": "object", "properties": {"query": {"type": "array", "items": {"type": "string", "description": "The search query."}, "minItems": 1, "description": "The list of search queries for Google Scholar."}}, "required": ["query"]}}}
 </tools>
 # Using prev_state (Research State Summary)
+If you see a "RESEARCH STATE SUMMARY (prev_state)" section in the user message, it contains a compressed summary of previous research progress. Use it to:
+1. **Avoid redundant work**:
+   - Check `search_queries` to avoid repeating searches that have already been executed.
+   - Check `visited_sources` to avoid visiting URLs that have already been visited.
+2. **Use verified information**:
+   - Check `information_state.trusted` for facts that have been verified from visited sources. You can use these directly in your answer without re-searching or re-visiting.
+   - Check `information_state.untrusted` for claims that have been contradicted or proven unreliable.
+3. **Follow up on uncertain information**:
+   - Check `information_state.uncertain` for claims that need more evidence. The `need` field specifies the exact next action (e.g., "visit <URL>" or "search <query>") to resolve the uncertainty.
+IMPORTANT: Do NOT search for or visit information that is already in `prev_state`, unless it's insufficient to answer the user's question. Only in this case, you are encouraged to search for more information or even visit the same URL. Instead, use the information from `prev_state` directly, or follow the specific actions suggested in `information_state.uncertain.need` if more information is needed.
+The final answer must exclude any information that remains uncertain or pending. All statements included must be fully verified.
 For each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:
 <tool_call>
 Current date: """
+# ---------------------------------------------------------------------------
+# Vendored prompts from OSU-NLP-Group/QUEST (inference/prompt.py and
+# inference/tool_memory.py). Kept verbatim so the secondary-LLM behaviour
+# (visit extractor + condenser State Summarizer) matches the research code.
+# ---------------------------------------------------------------------------
+# Template for the single user message built in `_llm_extract`. It is rendered
+# with `str.format(webpage_content=..., goal=...)`, so `{webpage_content}` and
+# `{goal}` must stay the only braces; a literal brace added here has to be
+# doubled. NOTE(review): the "rational" / "feilds" spellings are part of the
+# vendored prompt text and are intentionally left untouched.
+EXTRACTOR_PROMPT = """Please process the following webpage content and user goal to extract relevant information:
+## **Webpage Content**
+{webpage_content}
+## **User Goal**
+{goal}
+## **Task Guidelines**
+1. **Content Scanning for Rationale**: Locate the **specific sections/data** directly related to the user's goal within the webpage content
+2. **Key Extraction for Evidence**: Identify and extract the **most relevant information** from the content, you never miss any important information, output the **full original context** of the content as far as possible, it can be more than three paragraphs.
+3. **Summary Output for Summary**: Organize into a concise paragraph with logical flow, prioritizing clarity and judge the contribution of the information to the goal.
+**Final Output Format using JSON format has "rational", "evidence", "summary" feilds**
+"""
+# System message sent to the MEMORY model by `_llm_condense`. The accompanying
+# user message is a JSON object with "events" and "prev_state" keys, and the
+# agent loop later reads `information_state.trusted` / `.uncertain` from the
+# reply when it reports the condensed state.
+# NOTE(review): the prompt describes `events` as a chronological list, while
+# `_llm_condense` sends one concatenated string — confirm this is intended.
+MEMORY_SYSTEM_PROMPT = """You are a State Summarizer for a DeepResearch agent.
+Your ONLY job is to maintain a compact, parseable, context-aware state JSON for memory management.
+Your primary objective is to prevent redundant search and redundant visit actions by
+extracting useful, answer-ready information from tool responses and preserving it
+in a structured state.
+You will be given:
+1) events: a chronological list of interaction events (user/assistant messages and tool calls/responses)
+2) prev_state: the previous state JSON (may be empty or null)
+You MUST output ONLY a single JSON object that conforms EXACTLY to the schema below.
+No markdown, no extra text, no code fences, no explanations.
+========================
+OUTPUT JSON SCHEMA (STRICT)
+{
+  "version": "dr_state",
+  "search_queries": [
+    { "q": "string", "intent": "string" }
+  ],
+  "visited_sources": [
+    { "url": "string", "note": "string" }
+  ],
+  "information_state": {
+    "trusted": [
+      { "id": "T1", "claim": "string", "sources": ["string"], "reason": "string" }
+    ],
+    "untrusted": [
+      { "id": "U1", "claim": "string", "sources": ["string"], "reason": "string" }
+    ],
+    "uncertain": [
+      { "id": "C1", "claim": "string", "sources": ["string"], "reason": "string", "need": "string" }
+    ]
+  }
+}
+========================
+TRIGGER NOTE (IMPORTANT)
+This summarizer is invoked automatically when CONTEXT_THRESHOLD is reached:
+- The system invokes summarization when context tokens reach a threshold.
+- Focus on extracting evidence, deduplicating tool usage, and making the state more actionable.
+Note: Agent-initiated condenser tool calls are ignored for memory updates.
+Only automatic CONTEXT_THRESHOLD triggers will update the memory state.
+========================
+CORE PRINCIPLE (CRITICAL)
+Visited pages alone are NOT useful memory.
+For every visit() tool_response, you MUST attempt to extract at least one
+useful, concrete fact into information_state unless the page is irrelevant.
+The goal is that the DeepResearch agent can rely on information_state.trusted
+to answer questions directly, and rely on information_state.uncertain.need
+to know the exact next step without re-searching.
+========================
+UPDATE RULES (IMPORTANT)
+0) Anti-redundancy objective:
+- The state must clearly encode:
+  a) what is already verified and final (trusted),
+  b) what is false or contradicted (untrusted),
+  c) what is missing AND the exact next action to resolve it (uncertain.need).
+- Prefer concrete actions such as:
+  "visit <exact URL>" or "search <exact query>".
+1) Merge with prev_state:
+- Start from prev_state if provided; update it using new events.
+- Never delete past entries except for:
+  a) exact duplicates, or
+  b) bucket migration (moving the same claim between uncertain/trusted/untrusted).
+2) De-duplication:
+- search_queries: dedupe by exact "q" string.
+- visited_sources: dedupe by exact "url".
+- information_state: dedupe by exact "claim" string ACROSS ALL BUCKETS with priority:
+  trusted > untrusted > uncertain.
+3) Output ONLY the JSON object. No markdown, no extra text.
+Return ONLY the updated JSON object."""
 def build_system_prompt() -> str:
     """Return QUEST_SYSTEM_PROMPT with today's date (ISO format) appended."""
     today_iso = date.today().isoformat()
     return QUEST_SYSTEM_PROMPT + today_iso
     return text[:max_chars]
+# ---------------------------------------------------------------------------
+# Secondary-LLM helpers (visit extractor, condenser State Summarizer, scholar).
+# Mirror inference/tool_visit.py + inference/tool_memory.py + inference/tool_scholar.py.
+# Each helper is best-effort: if the relevant env vars are missing it returns
+# None / falls through to the legacy behaviour so the Space still works.
+# ---------------------------------------------------------------------------
+# Jina Reader settings (page fetch). An empty JINA_API_KEYS disables the path.
+JINA_API_KEYS = os.getenv("JINA_API_KEYS", "").strip()
+# `or` instead of a getenv default: a variable that is set but blank falls back
+# to 60000 rather than crashing the import with int("").
+WEBCONTENT_MAXLENGTH = int((os.getenv("WEBCONTENT_MAXLENGTH") or "").strip() or "60000")
+# SUMMARY model settings (visit extractor). API_KEY / API_BASE take precedence
+# over the SUMMARY_OPENAI_* names; a blank base URL becomes None.
+SUMMARY_MODEL_NAME = os.getenv("SUMMARY_MODEL_NAME", "").strip()
+SUMMARY_API_KEY = (os.getenv("API_KEY") or os.getenv("SUMMARY_OPENAI_API_KEY") or "").strip()
+SUMMARY_API_BASE = (os.getenv("API_BASE") or os.getenv("SUMMARY_OPENAI_BASE_URL") or "").strip() or None
+# MEMORY model settings (condenser). Key and base URL inherit the SUMMARY
+# values; stripping happens before the fallback so a whitespace-only MEMORY_*
+# variable still inherits instead of yielding an empty key or a blank URL.
+MEMORY_MODEL_NAME = os.getenv("MEMORY_MODEL_NAME", "").strip()
+MEMORY_API_KEY = (os.getenv("MEMORY_OPENAI_API_KEY") or "").strip() or SUMMARY_API_KEY
+MEMORY_API_BASE = (os.getenv("MEMORY_OPENAI_BASE_URL") or "").strip() or SUMMARY_API_BASE
+# Token estimate above which the condenser runs. The first non-empty variable
+# wins; 16000 is used when none is set.
+MEMORY_TOKEN_THRESHOLD = int(
+    os.getenv("MEMORY_THRESHOLD")
+    or os.getenv("MEMORY_CONTEXT_THRESHOLD")
+    or os.getenv("MEMORY_TOKEN_THRESHOLD")
+    or "16000"
+)
+def _get_openai_client(api_key: str, base_url: Optional[str]):
+    """Build an OpenAI SDK client, or return None when that is not possible.
+    The `openai` package is imported inside the function so this module can
+    still be imported when the package is not installed. None is returned when
+    the import fails or when `api_key` is empty. `base_url` is forwarded to the
+    constructor only when it is truthy; otherwise the key alone is passed."""
+    try:
+        from openai import OpenAI
+    except Exception:
+        return None
+    if not api_key:
+        return None
+    client_kwargs: Dict[str, Any] = {"api_key": api_key}
+    if base_url:
+        client_kwargs["base_url"] = base_url
+    return OpenAI(**client_kwargs)
+def _approx_token_count(text: str) -> int:
+    """Estimate how many tokens `text` occupies.
+    Uses tiktoken's `cl100k_base` encoding when the package can be imported and
+    the encoding loaded; otherwise falls back to a ~4 characters/token
+    heuristic that never returns less than 1. The value is used for threshold
+    gating, where being off by 20% is harmless.
+    Args:
+        text: Text to measure.
+    Returns:
+        The tiktoken token count, or the heuristic estimate on any failure."""
+    try:
+        import tiktoken
+        # disallowed_special=() makes strings such as "<|endoftext|>" inside
+        # scraped pages encode as ordinary text instead of raising ValueError,
+        # which would silently drop the whole message onto the coarse fallback.
+        return len(tiktoken.get_encoding("cl100k_base").encode(text, disallowed_special=()))
+    except Exception:
+        # Deliberately broad: tiktoken may be missing or unable to load the
+        # encoding; either way the estimate below is good enough.
+        return max(1, len(text) // 4)
+def _messages_token_count(messages: List[Dict[str, str]]) -> int:
+    """Add up the estimated token counts of every message's `content`.
+    A message without a `content` key contributes the estimate for an empty
+    string; non-string content is measured through its `str()` form."""
+    total = 0
+    for message in messages:
+        total += _approx_token_count(str(message.get("content", "")))
+    return total
+def _jina_readpage(url: str) -> Optional[str]:
+    """Download `url` through the Jina Reader proxy (r.jina.ai).
+    Makes up to three attempts and returns the response text, truncated to
+    WEBCONTENT_MAXLENGTH characters, on the first HTTP 200 with a non-empty
+    body. Returns None when JINA_API_KEYS is unset or no attempt succeeds, so
+    the caller can fall back to its own fetch path."""
+    if not JINA_API_KEYS:
+        return None
+    reader_url = f"https://r.jina.ai/{url}"
+    auth_headers = {"Authorization": f"Bearer {JINA_API_KEYS}"}
+    for _ in range(3):
+        try:
+            response = requests.get(reader_url, headers=auth_headers, timeout=50)
+            if response.status_code == 200 and response.text:
+                return response.text[:WEBCONTENT_MAXLENGTH]
+        except Exception:
+            # Best-effort: a failed request simply uses up one attempt.
+            continue
+    return None
+def _llm_extract(webpage_content: str, goal: str) -> Optional[str]:
+    """Ask the SUMMARY model to distil `webpage_content` for `goal`.
+    EXTRACTOR_PROMPT is rendered with the page content and the goal ("general
+    overview" when `goal` is empty) and sent as a single user message with a
+    120 s timeout. Returns the stripped reply, or None when no client or model
+    is configured, the request raises, or the reply is empty."""
+    client = _get_openai_client(SUMMARY_API_KEY, SUMMARY_API_BASE)
+    if client is None:
+        return None
+    if not SUMMARY_MODEL_NAME:
+        return None
+    try:
+        prompt = EXTRACTOR_PROMPT.format(
+            webpage_content=webpage_content, goal=goal or "general overview"
+        )
+        completion = client.chat.completions.create(
+            model=SUMMARY_MODEL_NAME,
+            messages=[{"role": "user", "content": prompt}],
+            timeout=120,
+        )
+        reply = completion.choices[0].message.content
+        cleaned = (reply or "").strip()
+        return cleaned or None
+    except Exception:
+        # Best-effort helper: any failure means "no extract available".
+        return None
+def _llm_condense(events_text: str, prev_state: Optional[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
+    """Run the MEMORY model as the State Summarizer.
+    Sends MEMORY_SYSTEM_PROMPT as the system message and a JSON payload with
+    the recent events plus the previous state as the user message (180 s
+    timeout), then parses the reply as JSON.
+    Args:
+        events_text: Flattened interaction history; only the last 30000
+            characters are sent.
+        prev_state: State returned by the previous condensation; an empty or
+            missing state is sent as null.
+    Returns:
+        The parsed state dict, or None when no client or model is configured,
+        the request fails, or the reply is not a JSON object."""
+    client = _get_openai_client(MEMORY_API_KEY, MEMORY_API_BASE)
+    if client is None or not MEMORY_MODEL_NAME:
+        return None
+    user_payload = json.dumps(
+        {
+            "events": events_text[-30000:],  # cap input, keeping the newest events
+            "prev_state": prev_state or None,
+        },
+        ensure_ascii=False,
+    )
+    try:
+        resp = client.chat.completions.create(
+            model=MEMORY_MODEL_NAME,
+            messages=[
+                {"role": "system", "content": MEMORY_SYSTEM_PROMPT},
+                {"role": "user", "content": user_payload},
+            ],
+            timeout=180,
+        )
+        raw = (resp.choices[0].message.content or "").strip()
+        # the prompt says no code fences, but be defensive anyway
+        if raw.startswith("```"):
+            raw = re.sub(r"^```(?:json)?\s*|\s*```$", "", raw, flags=re.DOTALL)
+        parsed = json.loads(raw)
+    except Exception:
+        # Best-effort helper: any failure means "no new state".
+        return None
+    # A JSON array, string or number parses fine but is not a state; the agent
+    # loop calls `.get()` on the result, so anything but an object is rejected.
+    return parsed if isinstance(parsed, dict) else None
+def _run_scholar_single(query: str) -> Dict[str, Any]:
+    """Query Google Scholar through Serper's /scholar endpoint.
+    Mirrors inference/tool_scholar.py. Makes up to three attempts with a 20 s
+    timeout each.
+    Args:
+        query: Search query; surrounding whitespace is ignored.
+    Returns:
+        On success: {"ok": True, "query", "results", "backend"} where each
+        result row has title, link, year, publicationInfo, snippet and citedBy.
+        On failure: {"ok": False, "error", ...} for an empty query, a missing
+        SERPER_API_KEY, or when no attempt succeeds."""
+    q = (query or "").strip()
+    if not q:
+        return {"ok": False, "error": "Scholar query cannot be empty."}
+    if not SERPER_API_KEY:
+        return {
+            "ok": False,
+            "query": q,
+            "error": "SERPER_API_KEY missing — scholar tool unavailable.",
+        }
+    headers = {"X-API-KEY": SERPER_API_KEY, "Content-Type": "application/json"}
+    payload = json.dumps({"q": q})
+    last_err: Optional[str] = None
+    for _ in range(3):
+        try:
+            r = requests.post(
+                "https://google.serper.dev/scholar",
+                data=payload,
+                headers=headers,
+                timeout=20,
+            )
+            if r.status_code == 200:
+                data = r.json()
+                rows = [
+                    {
+                        "title": page.get("title", ""),
+                        "link": page.get("link", ""),
+                        "year": page.get("year"),
+                        "publicationInfo": page.get("publicationInfo"),
+                        "snippet": page.get("snippet", ""),
+                        "citedBy": page.get("citedBy"),
+                    }
+                    for page in data.get("organic", []) or []
+                ]
+                return {"ok": True, "query": q, "results": rows, "backend": "serper-scholar"}
+            last_err = f"HTTP {r.status_code}: {r.text[:200]}"
+            # A 4xx (bad key, malformed request) will not succeed when the same
+            # request is repeated; only 408 and 429 are worth another attempt.
+            if 400 <= r.status_code < 500 and r.status_code not in (408, 429):
+                break
+        except Exception as exc:
+            # Network errors and unparsable bodies use up one attempt.
+            last_err = f"{type(exc).__name__}: {exc}"
+    return {"ok": False, "query": q, "error": f"Serper scholar failed ({last_err})."}
 def _run_visit_single(url: str, max_chars: int, goal: str = "") -> Dict[str, Any]:
     if not url.strip():
         return {"ok": False, "error": "URL cannot be empty."}
+    cache_key = f"{url.strip()}::{max_chars}::{goal[:60]}"
     if cache_key in VISIT_CACHE:
         return {**VISIT_CACHE[cache_key], "cached": True, "goal": goal}
+    # Preferred path: Jina Reader for clean markdown → LLM extractor distils
+    # the page content against the requested goal. Matches the research repo's
+    # inference/tool_visit.py behaviour. If only the extractor fails, the raw
+    # Jina markdown is returned; if Jina fails, execution falls through to the
+    # legacy requests + BeautifulSoup path.
+    jina_md = _jina_readpage(url)
+    if jina_md:
+        extract = _llm_extract(jina_md, goal) if SUMMARY_MODEL_NAME else None
+        result = {
+            "ok": True,
+            "url": url,
+            "goal": goal,
+            "content": (extract or jina_md)[:max_chars],
+            "extractor": "llm" if extract else "jina-raw",
+        }
+        VISIT_CACHE[cache_key] = result
+        return result
     try:
         resp = requests.get(
             url,
     ]
     final_answer: Optional[str] = None
+    # `prev_state` holds the JSON returned by the State Summarizer LLM. It is
+    # refreshed each time the context tokens cross MEMORY_TOKEN_THRESHOLD and
+    # then injected into the model's next user message as a RESEARCH STATE
+    # SUMMARY block. Matches inference/react_agent.py + inference/tool_memory.py
+    # behaviour.
+    prev_state: Optional[Dict[str, Any]] = None
+    condenser_runs = 0
     status_lines.append("🚀 Starting research agent")
     yield _emit()
     for turn in range(1, max_turns + 1):
         _apply_memory_strategy(messages, strategy, turn)
+        # Real LLM-based condenser: when tokens cross the threshold, call the
+        # MEMORY model to produce the structured state JSON, then rebuild the
+        # context as [system, original_question, RESEARCH_STATE_SUMMARY].
+        if (
+            strategy == "condenser"
+            and MEMORY_MODEL_NAME
+            and MEMORY_API_KEY
+            and turn > 1
+            and _messages_token_count(messages) > MEMORY_TOKEN_THRESHOLD
+        ):
+            status_lines.append(
+                f"🗜️ turn {turn}: condensing context (tokens > {MEMORY_TOKEN_THRESHOLD})"
+            )
+            yield _emit()
+            events_text = "\n\n".join(
+                f"[{m.get('role')}] {str(m.get('content',''))[:2000]}"
+                for m in messages[2:]  # skip system + original question
+            )
+            new_state = _llm_condense(events_text, prev_state)
+            if new_state:
+                prev_state = new_state
+                condenser_runs += 1
+                state.trace.append(
+                    {"turn": turn, "condenser_run": condenser_runs, "prev_state": prev_state}
+                )
+                # Reset history to system + question + state summary
+                summary_block = (
+                    "RESEARCH STATE SUMMARY (prev_state)\n"
+                    + json.dumps(prev_state, ensure_ascii=False, indent=2)
+                    + "\n\nUse this summary to avoid redundant work and "
+                    "follow `information_state.uncertain.need` for next steps."
+                )
+                messages[:] = [messages[0], messages[1], {"role": "user", "content": summary_block}]
+                status_lines[-1] = (
+                    f"🗜️ turn {turn}: condensed → "
+                    f"{len(prev_state.get('information_state', {}).get('trusted', []))} trusted, "
+                    f"{len(prev_state.get('information_state', {}).get('uncertain', []))} uncertain"
+                )
+                yield _emit()
+        elif (
+            strategy == "condenser"
+            and (not MEMORY_MODEL_NAME or not MEMORY_API_KEY)
+            and state.trusted_notes
+            and turn > 1
+            and turn % 3 == 0
+        ):
+            # Fallback heuristic when the MEMORY model is not configured.
             summary_lines = "\n".join(f"- {n}" for n in state.trusted_notes[-6:])
             messages.append(
                 {
                     f"✅ turn {turn}: read {visit_ok}/{len(urls)} page(s)"
                 )
                 yield _emit()
+            elif tool_name in ("google_scholar", "scholar"):
+                raw_query = tool_args.get("query", "")
+                queries: List[str]
+                if isinstance(raw_query, list):
+                    queries = [str(q).strip() for q in raw_query if str(q).strip()]
+                else:
+                    queries = [str(raw_query).strip()] if str(raw_query).strip() else []
+                queries_preview = ", ".join(f"`{q}`" for q in queries) or "_(empty)_"
+                status_lines.append(f"🎓 turn {turn}: scholar {queries_preview}")
+                yield _emit()
+                per_q = [_run_scholar_single(q) for q in queries]
+                tool_response = (
+                    per_q[0] if len(per_q) == 1 else {"ok": True, "results": per_q}
+                )
+                ok_count = sum(1 for r in per_q if r.get("ok"))
+                status_lines.append(
+                    f"📚 turn {turn}: scholar {ok_count}/{len(per_q)} ok"
+                )
+                yield _emit()
             else:
                 tool_response = {"ok": False, "error": f"Unknown tool: {tool_name}"}
                 status_lines.append(f"⚠️ turn {turn}: unknown tool `{tool_name}`")

requirements.txt CHANGED Viewed

@@ -3,3 +3,5 @@ huggingface_hub==0.31.2
 duckduckgo_search==8.0.1
 requests==2.32.3
 beautifulsoup4==4.12.3

 duckduckgo_search==8.0.1
 requests==2.32.3
 beautifulsoup4==4.12.3
+openai>=1.40.0
+tiktoken>=0.7.0