Fix empty "..." Markdown output by reworking the <answer> fallback prompt
Browse files
The "no tool call detected" fallback literally contained
`<answer>...</answer>`, so Quest-4B echoed the template verbatim and
extract_answer captured `...` as the final answer. Reword the prompt,
reject placeholder-only content (ASCII and unicode ellipses), strip
<think> blocks before parsing, handle truncated <answer>, guarantee a
blank line before pipe tables so GFM renders them, and raise the
generation budget from 1400 to 4096 tokens (env-configurable via
QUEST_MAX_NEW_TOKENS).
Made-with: Cursor
app.py
CHANGED
|
@@ -805,13 +805,99 @@ class AgentState:
|
|
| 805 |
trace: List[Dict[str, Any]] = field(default_factory=list)
|
| 806 |
|
| 807 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 808 |
def extract_answer(text: str) -> Optional[str]:
|
| 809 |
-
|
| 810 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 811 |
|
| 812 |
|
| 813 |
def parse_tool_call(text: str) -> Tuple[Optional[str], Optional[Dict[str, Any]], Optional[str]]:
|
| 814 |
-
|
|
|
|
| 815 |
if not match:
|
| 816 |
return None, None, None
|
| 817 |
payload = match.group(1).strip()
|
|
@@ -1000,7 +1086,7 @@ def build_research_agent(
|
|
| 1000 |
preferred_model=primary_model,
|
| 1001 |
candidate_models=fallback_models,
|
| 1002 |
temperature=temperature,
|
| 1003 |
-
max_new_tokens=
|
| 1004 |
)
|
| 1005 |
model_output = raw_output
|
| 1006 |
# Preserve the human-friendly model id for the trace even if the
|
|
@@ -1019,10 +1105,25 @@ def build_research_agent(
|
|
| 1019 |
tool_response = {"ok": False, "error": tool_err}
|
| 1020 |
elif not tool_name:
|
| 1021 |
# No explicit tool call and no final answer: force finalization.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1022 |
messages.append(
|
| 1023 |
{
|
| 1024 |
"role": "user",
|
| 1025 |
-
"content":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1026 |
}
|
| 1027 |
)
|
| 1028 |
continue
|
|
@@ -1117,6 +1218,8 @@ def build_research_agent(
|
|
| 1117 |
"I could not finish a complete research answer within the configured turns. "
|
| 1118 |
"Try increasing max turns or switching to a stronger model."
|
| 1119 |
)
|
|
|
|
|
|
|
| 1120 |
|
| 1121 |
citations = "\n".join(f"- {url}" for url in sorted(set(state.visited_urls)))
|
| 1122 |
final_answer = f"**Model used:** `{used_model}`\n\n{final_answer}"
|
|
|
|
| 805 |
trace: List[Dict[str, Any]] = field(default_factory=list)
|
| 806 |
|
| 807 |
|
| 808 |
+
# Matches answers that carry no real content: the empty string or any mix of
# whitespace, ASCII dots (`...`), the unicode ellipsis `…` (U+2026) and the
# interpunct `·` (U+00B7). These show up when the model echoes a literal
# `<answer>...</answer>` template from the prompt instead of producing a real
# answer.
_PLACEHOLDER_ANSWER_RE = re.compile(r"^[\s.\u2026\u00b7]*$")

# Pipe-table delimiter row, e.g. `| --- | :---: |`. The outer pipes are
# optional in GFM, so both forms are accepted. A delimiter cell needs only ONE
# hyphen (`|:-:|-:|` is a valid row), hence `-+` rather than a longer minimum.
# At least one inner pipe is required so that a bare `---` (thematic break or
# setext underline) is never mistaken for a table.
_TABLE_SEPARATOR_RE = re.compile(
    r"^\s*\|?\s*:?-+:?(?:\s*\|\s*:?-+:?)+\s*\|?\s*$"
)
|
| 819 |
+
|
| 820 |
+
|
| 821 |
+
def strip_think_blocks(text: str) -> str:
    """Remove `<think>` reasoning from raw model output.

    Quest-4B (Qwen3 family) emits `<think>` reasoning before the final
    answer. When the endpoint is deployed without a reasoning parser, the raw
    tags leak into chat completion `content`; stripping them here keeps the
    extracted answer clean for Markdown rendering.

    Three shapes are handled, in this order:

    1. Balanced `<think>...</think>` blocks are removed.
    2. An orphan `</think>` (the opening tag was supplied by the chat
       template, so only the closing tag appears in the completion): all
       text up to and including the last such tag is reasoning and is
       dropped.
    3. A dangling `<think>` with no closing tag (generation was cut off
       mid-reasoning): everything from the tag to the end is dropped.

    Tag matching is case-insensitive. Surrounding whitespace is left as-is;
    callers strip it themselves.

    Args:
        text: Raw completion text. Falsy input yields an empty string.

    Returns:
        The text with all reasoning removed.
    """
    if not text:
        return ""

    flags = re.DOTALL | re.IGNORECASE

    # 1. Well-formed blocks (non-greedy so separate blocks stay separate).
    cleaned = re.sub(r"<think>.*?</think>", "", text, flags=flags)

    # 2. Orphan closing tag: greedy match from the start of the text swallows
    #    everything through the LAST remaining `</think>`.
    cleaned = re.sub(r"^.*</think>", "", cleaned, count=1, flags=flags)

    # 3. Unterminated opening tag: whatever follows is unfinished reasoning,
    #    never a final answer or a deliberate tool call.
    cleaned = re.sub(r"<think>.*$", "", cleaned, count=1, flags=flags)

    return cleaned
|
| 832 |
+
|
| 833 |
+
|
| 834 |
+
def _is_placeholder_answer(text: str) -> bool:
    """Tell whether *text* is placeholder-only rather than a real answer.

    A falsy value (``None`` or the empty string) is checked as ``""``; the
    decision itself is delegated to ``_PLACEHOLDER_ANSWER_RE``.
    """
    candidate = text if text else ""
    return _PLACEHOLDER_ANSWER_RE.match(candidate) is not None
|
| 836 |
+
|
| 837 |
+
|
| 838 |
+
def ensure_markdown_table_blank_lines(text: str) -> str:
    """Insert a blank line before any pipe-table header row.

    GitHub-Flavored Markdown requires a pipe table to be preceded by a
    paragraph break; otherwise the header row is folded into the previous
    paragraph and the whole table renders as raw text. Models sometimes put
    the header row on the line directly under a sentence, so we fix that up
    defensively.

    A line is treated as a header row when it contains a pipe and the next
    line matches ``_TABLE_SEPARATOR_RE``. Lines inside fenced code blocks
    (``` or ~~~ fences) are copied through untouched: their content is
    literal, and adding a blank line there would change what the reader sees.

    Args:
        text: Markdown text, lines separated by ``\\n``.

    Returns:
        The same text with an empty line added in front of every table
        header row that was not already preceded by a blank line.
    """
    # Opening/closing fence: up to three spaces of indent, then a run of at
    # least three backticks or tildes.
    fence_re = re.compile(r"^ {0,3}(`{3,}|~{3,})")

    lines = text.split("\n")
    out: List[str] = []
    open_fence = ""  # marker run of the fence we are inside; "" when outside
    for idx, line in enumerate(lines):
        fence = fence_re.match(line)

        if open_fence:
            # A fence closes only on the same marker character, at least as
            # long as the opener, with nothing but whitespace after it.
            closes = (
                fence is not None
                and fence.group(1)[0] == open_fence[0]
                and len(fence.group(1)) >= len(open_fence)
                and not line[fence.end():].strip()
            )
            if closes:
                open_fence = ""
            out.append(line)
            continue

        if fence is not None:
            open_fence = fence.group(1)
            out.append(line)
            continue

        is_header = (
            "|" in line
            and idx + 1 < len(lines)
            and _TABLE_SEPARATOR_RE.match(lines[idx + 1]) is not None
        )
        # Only add the break when the previous emitted line has content;
        # the very first line and already-separated tables are left alone.
        if is_header and out and out[-1].strip() != "":
            out.append("")
        out.append(line)
    return "\n".join(out)
|
| 859 |
+
|
| 860 |
+
|
| 861 |
def extract_answer(text: str) -> Optional[str]:
    """Return the first real answer found in `<answer>` tags, if any.

    Reasoning blocks are stripped first via ``strip_think_blocks``. Two
    strategies are then tried, in order, and placeholder-only content (bare
    ellipses that the model sometimes echoes from the prompt) is discarded:

    1. Well-formed `<answer>...</answer>` blocks, scanned left to right.
       The first block whose content is not a placeholder wins, so an
       echoed template followed by the real answer still yields the real
       answer.
    2. Truncated `<answer>...` with no closing tag (tokens ran out); in
       that case we take everything after the opening tag.

    Args:
        text: Raw model output; falsy input is treated as empty.

    Returns:
        The stripped answer text, or ``None`` when no non-placeholder
        answer is present.
    """
    cleaned = strip_think_blocks(text or "")
    flags = re.DOTALL | re.IGNORECASE

    # Offset just past the last closed block that was rejected.
    tail_start = 0
    for block in re.finditer(
        r"<answer>\s*(.*?)\s*</answer>", cleaned, flags=flags
    ):
        candidate = block.group(1).strip()
        if candidate and not _is_placeholder_answer(candidate):
            return candidate
        tail_start = block.end()

    # Open-ended strategy. Search ONLY after the rejected closed blocks:
    # searching the whole text would re-match a rejected opening tag and
    # incorrectly capture `...</answer>` as the answer.
    open_match = re.search(
        r"<answer>\s*(.*)$", cleaned[tail_start:], flags=flags
    )
    if open_match is not None:
        candidate = open_match.group(1).strip()
        if candidate and not _is_placeholder_answer(candidate):
            return candidate

    return None
|
| 896 |
|
| 897 |
|
| 898 |
def parse_tool_call(text: str) -> Tuple[Optional[str], Optional[Dict[str, Any]], Optional[str]]:
|
| 899 |
+
cleaned = strip_think_blocks(text or "")
|
| 900 |
+
match = re.search(r"<tool_call>\s*(.*?)\s*</tool_call>", cleaned, flags=re.DOTALL | re.IGNORECASE)
|
| 901 |
if not match:
|
| 902 |
return None, None, None
|
| 903 |
payload = match.group(1).strip()
|
|
|
|
| 1086 |
preferred_model=primary_model,
|
| 1087 |
candidate_models=fallback_models,
|
| 1088 |
temperature=temperature,
|
| 1089 |
+
max_new_tokens=int(os.getenv("QUEST_MAX_NEW_TOKENS", "4096")),
|
| 1090 |
)
|
| 1091 |
model_output = raw_output
|
| 1092 |
# Preserve the human-friendly model id for the trace even if the
|
|
|
|
| 1105 |
tool_response = {"ok": False, "error": tool_err}
|
| 1106 |
elif not tool_name:
|
| 1107 |
# No explicit tool call and no final answer: force finalization.
|
| 1108 |
+
# IMPORTANT: do not write the literal characters `<answer>...</answer>`
|
| 1109 |
+
# here. Some models (notably the Qwen3 family that Quest-4B is
|
| 1110 |
+
# built on) will echo the template verbatim, which means the
|
| 1111 |
+
# extracted answer ends up being the three-dot placeholder `...`
|
| 1112 |
+
# and the user sees an empty-looking result.
|
| 1113 |
messages.append(
|
| 1114 |
{
|
| 1115 |
"role": "user",
|
| 1116 |
+
"content": (
|
| 1117 |
+
"You did not call a tool and did not produce a final "
|
| 1118 |
+
"answer. Please now write your best final answer, "
|
| 1119 |
+
"wrapped between an opening <answer> tag and a "
|
| 1120 |
+
"closing </answer> tag. Put the real answer text "
|
| 1121 |
+
"between those tags; do not write a literal ellipsis "
|
| 1122 |
+
"or other placeholder. If the question asks for "
|
| 1123 |
+
"tabular data, use GitHub-Flavored Markdown pipe "
|
| 1124 |
+
"tables (`| col1 | col2 |` + `|---|---|`) and put a "
|
| 1125 |
+
"blank line before the first row so the table renders."
|
| 1126 |
+
),
|
| 1127 |
}
|
| 1128 |
)
|
| 1129 |
continue
|
|
|
|
| 1218 |
"I could not finish a complete research answer within the configured turns. "
|
| 1219 |
"Try increasing max turns or switching to a stronger model."
|
| 1220 |
)
|
| 1221 |
+
else:
|
| 1222 |
+
final_answer = ensure_markdown_table_blank_lines(final_answer)
|
| 1223 |
|
| 1224 |
citations = "\n".join(f"- {url}" for url in sorted(set(state.visited_urls)))
|
| 1225 |
final_answer = f"**Model used:** `{used_model}`\n\n{final_answer}"
|