Spaces:
Running
Running
Strip <thought> tags from final answer token stream
Browse files
Gemma 4 (and some other models) emit <thought>...</thought> XML tags
at the start of their streamed final answer. These tags were leaking through
to the UI as raw text.
Added a stateful buffer filter in _stream_final_answer that strips any
<thought>...</thought> blocks from the token stream, handling tags that
span multiple chunk boundaries.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
- backend/services/agent.py +51 -2
backend/services/agent.py
CHANGED
|
@@ -842,12 +842,61 @@ class AgentService:
|
|
| 842 |
# to propagate any exception raised inside _run_sync.
|
| 843 |
task = loop.run_in_executor(None, _run_sync)
|
| 844 |
|
| 845 |
-
# Consume tokens as they arrive from the background thread
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 846 |
while True:
|
| 847 |
token = await queue.get()
|
| 848 |
if token is None:
|
|
|
|
|
|
|
|
|
|
| 849 |
break
|
| 850 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 851 |
|
| 852 |
await task # re-raises any exception from the streaming thread
|
| 853 |
|
|
|
|
| 842 |
# to propagate any exception raised inside _run_sync.
|
| 843 |
task = loop.run_in_executor(None, _run_sync)
|
| 844 |
|
| 845 |
+
# Consume tokens as they arrive from the background thread.
|
| 846 |
+
# Some models (Gemma 4) emit <thought>...</thought> tags at the start
|
| 847 |
+
# of their final answer. We strip them here with a stateful buffer so
|
| 848 |
+
# the UI never renders raw XML thought tags.
|
| 849 |
+
buf = "" # accumulates partial text while we check for tags
|
| 850 |
+
in_thought = False
|
| 851 |
+
OPEN_TAG = "<thought>"
|
| 852 |
+
CLOSE_TAG = "</thought>"
|
| 853 |
+
|
| 854 |
while True:
|
| 855 |
token = await queue.get()
|
| 856 |
if token is None:
|
| 857 |
+
# End of stream: the held-back tail can no longer grow into a tag, so flush it (text inside an unclosed <thought> block is dropped)
|
| 858 |
+
if buf and not in_thought:
|
| 859 |
+
yield buf
|
| 860 |
break
|
| 861 |
+
|
| 862 |
+
buf += token
|
| 863 |
+
|
| 864 |
+
# Process buf until no more complete decisions can be made
|
| 865 |
+
while buf:
|
| 866 |
+
if in_thought:
|
| 867 |
+
# Looking for </thought>
|
| 868 |
+
idx = buf.find(CLOSE_TAG)
|
| 869 |
+
if idx != -1:
|
| 870 |
+
# Found the close tag — discard everything up to and including it
|
| 871 |
+
buf = buf[idx + len(CLOSE_TAG):]
|
| 872 |
+
in_thought = False
|
| 873 |
+
else:
|
| 874 |
+
# Might be a partial </thought> at the end — keep the last
|
| 875 |
+
# len(CLOSE_TAG)-1 chars buffered in case the tag spans chunks
|
| 876 |
+
safe = len(buf) - (len(CLOSE_TAG) - 1)
|
| 877 |
+
if safe > 0:
|
| 878 |
+
buf = buf[safe:] # discard confirmed-inside-thought text
|
| 879 |
+
break
|
| 880 |
+
else:
|
| 881 |
+
# Looking for <thought>
|
| 882 |
+
idx = buf.find(OPEN_TAG)
|
| 883 |
+
if idx == 0:
|
| 884 |
+
# Tag starts right here — enter thought mode, discard the tag
|
| 885 |
+
buf = buf[len(OPEN_TAG):]
|
| 886 |
+
in_thought = True
|
| 887 |
+
elif idx > 0:
|
| 888 |
+
# Emit everything before the tag, then enter thought mode
|
| 889 |
+
yield buf[:idx]
|
| 890 |
+
buf = buf[idx + len(OPEN_TAG):]
|
| 891 |
+
in_thought = True
|
| 892 |
+
else:
|
| 893 |
+
# No open tag found — safe to emit, but keep a small tail
|
| 894 |
+
# in case <thought> is split across chunks
|
| 895 |
+
safe = len(buf) - (len(OPEN_TAG) - 1)
|
| 896 |
+
if safe > 0:
|
| 897 |
+
yield buf[:safe]
|
| 898 |
+
buf = buf[safe:]
|
| 899 |
+
break
|
| 900 |
|
| 901 |
await task # re-raises any exception from the streaming thread
|
| 902 |
|