openhands committed on
Commit
4d2228f
·
1 Parent(s): 96721e2

Update to v1.1 template structure

Browse files
v1.0_rebuild_qwen3.5_and_3.6_template.jinja → qwen3-5_6-template_v1.1.jinja RENAMED
@@ -28,13 +28,18 @@
28
 
29
  {#- ===== SECTION 2: NAMESPACE INITIALISATION =====
30
  Single ns object for all mutable state.
31
- enable_thinking default=true; overridden by template parameter (BUG-003 fix).
 
 
 
32
  -#}
33
  {%- set ns = namespace(
34
  enable_thinking=true,
35
  image_count=0,
36
  video_count=0
37
  ) -%}
 
 
38
  {%- if enable_thinking is defined -%}
39
  {%- if enable_thinking -%}
40
  {%- set ns.enable_thinking = true -%}
@@ -43,28 +48,49 @@
43
  {%- endif -%}
44
  {%- endif -%}
45
 
 
 
 
 
 
 
 
 
 
46
  {#- ===== SECTION 3: PRE-SCAN =====
47
  Track last /no_think or /think flag in user messages.
 
 
48
  The model follows the last flag encountered in multi-turn conversations.
49
  -#}
50
  {%- for i in range(messages | length) -%}
51
- {%- if messages[i].role == 'user' -%}
52
- {%- set _u = messages[i].content if messages[i].content is string else '' -%}
 
53
  {%- if _u.rstrip().endswith('/no_think') -%}
54
  {%- set ns.enable_thinking = false -%}
55
  {%- elif _u.rstrip().endswith('/think') -%}
56
  {%- set ns.enable_thinking = true -%}
57
  {%- endif -%}
 
 
 
 
 
 
 
58
  {%- endif -%}
59
  {%- endfor -%}
60
 
61
  {#- ===== SECTION 4: COLLECT SYSTEM CONTENT =====
62
  Merge all system/developer messages with \n\n separator (BUG-004 fix).
 
63
  -#}
64
  {%- set ns_sys = namespace(content='') -%}
65
  {%- for msg in messages -%}
66
  {%- if msg.role == 'system' or msg.role == 'developer' -%}
67
  {%- set _c = render_content(msg.content | default('')) | trim -%}
 
68
  {%- if _c -%}
69
  {%- if ns_sys.content == '' -%}
70
  {%- set ns_sys.content = _c -%}
@@ -127,7 +153,9 @@
127
 
128
  {#- 7c: Assistant messages -#}
129
  {%- elif message.role == 'assistant' -%}
130
- {#- Safely extract content as string — guard against absent key (BUG-002 fix) -#}
 
 
131
  {%- if message.content is defined and message.content is string -%}
132
  {%- set _ac = message.content -%}
133
  {%- elif message.content is defined and message.content is iterable and message.content is not mapping -%}
@@ -136,6 +164,15 @@
136
  {%- set _ac = '' -%}
137
  {%- endif -%}
138
 
 
 
 
 
 
 
 
 
 
139
  {#- Collect tool_calls if present -#}
140
  {%- set _tc = message.tool_calls if message.tool_calls is defined and message.tool_calls else [] -%}
141
 
@@ -154,15 +191,27 @@
154
  {#- Think-block handling (BUG-001 fix + last-turn preservation):
155
  - Tool-call turns : never strip (think block is part of the tool-call format)
156
  - Last-history turn : preserve; inject non-thinking prefill when absent
157
- - Historical turns : strip the think block -#}
 
158
  {%- if not _tc -%}
159
  {%- if _is_last_hist -%}
160
  {%- if '<think>' not in _ac and not ns.enable_thinking -%}
161
  {%- set _ac = '<think>\n\n</think>\n\n' + _ac -%}
162
  {%- endif -%}
163
  {%- else -%}
 
 
164
  {%- if '</think>' in _ac -%}
165
- {%- set _ac = _ac.split('</think>')[-1].lstrip('\n') -%}
 
 
 
 
 
 
 
 
 
166
  {%- endif -%}
167
  {%- endif -%}
168
  {%- endif -%}
@@ -205,8 +254,10 @@
205
  {%- endif -%}
206
  {{- '<tool_response>\n' -}}
207
  {{- message.content | default('') -}}
208
- {{- '\n</tool_response>' -}}
209
- {%- if _next_role != 'tool' -%}
 
 
210
  {{- '<|im_end|>\n' -}}
211
  {%- endif -%}
212
 
@@ -218,12 +269,22 @@
218
  {%- endfor -%}
219
 
220
  {#- ===== SECTION 8: GENERATION PROMPT =====
221
- enable_thinking=True → no prefill (model generates <think> itself)
222
- enable_thinking=False exact 19-char non-thinking prefill (BUG-005 fix)
 
 
 
 
 
 
 
 
223
  -#}
224
  {%- if add_generation_prompt -%}
225
  {{- '<|im_start|>assistant\n' -}}
226
- {%- if not ns.enable_thinking -%}
 
 
227
  {{- '<think>\n\n</think>\n\n' -}}
228
  {%- endif -%}
229
  {%- endif -%}
 
28
 
29
  {#- ===== SECTION 2: NAMESPACE INITIALISATION =====
30
  Single ns object for all mutable state.
31
+ enable_thinking default=true (BUG-003 fix)
32
+ preserve_thinking default=true: when false, forces ns.enable_thinking to false so the
33
+ generation prompt emits the closed empty think block (Section 3 flags can still override).
34
+ Passed via --chat-template-kwargs {"preserve_thinking":false}.
35
  -#}
36
  {%- set ns = namespace(
37
  enable_thinking=true,
38
  image_count=0,
39
  video_count=0
40
  ) -%}
41
+
42
+ {#- Resolve enable_thinking kwarg -#}
43
  {%- if enable_thinking is defined -%}
44
  {%- if enable_thinking -%}
45
  {%- set ns.enable_thinking = true -%}
 
48
  {%- endif -%}
49
  {%- endif -%}
50
 
51
+ {#- Resolve preserve_thinking kwarg.
52
+ preserve_thinking=false => force non-thinking mode (same as enable_thinking=false).
53
+ preserve_thinking=true => default, no override (thinking controlled by enable_thinking).
54
+ When not defined => default, no override.
55
+ -#}
56
+ {%- if preserve_thinking is defined and not preserve_thinking -%}
57
+ {%- set ns.enable_thinking = false -%}
58
+ {%- endif -%}
59
+
60
  {#- ===== SECTION 3: PRE-SCAN =====
61
  Track last /no_think or /think flag in user messages.
62
+ Also scan system messages for <|think_off|> / <|think_on|> markers
63
+ (allows apps to control thinking mode via system prompt injection).
64
  The model follows the last flag encountered in multi-turn conversations.
65
  -#}
66
  {%- for i in range(messages | length) -%}
67
+ {%- set _msg = messages[i] -%}
68
+ {%- if _msg.role == 'user' -%}
69
+ {%- set _u = _msg.content if _msg.content is string else '' -%}
70
  {%- if _u.rstrip().endswith('/no_think') -%}
71
  {%- set ns.enable_thinking = false -%}
72
  {%- elif _u.rstrip().endswith('/think') -%}
73
  {%- set ns.enable_thinking = true -%}
74
  {%- endif -%}
75
+ {%- elif _msg.role == 'system' or _msg.role == 'developer' -%}
76
+ {%- set _s = _msg.content if _msg.content is string else '' -%}
77
+ {%- if '<|think_off|>' in _s -%}
78
+ {%- set ns.enable_thinking = false -%}
79
+ {%- elif '<|think_on|>' in _s -%}
80
+ {%- set ns.enable_thinking = true -%}
81
+ {%- endif -%}
82
  {%- endif -%}
83
  {%- endfor -%}
84
 
85
  {#- ===== SECTION 4: COLLECT SYSTEM CONTENT =====
86
  Merge all system/developer messages with \n\n separator (BUG-004 fix).
87
+ <|think_off|> / <|think_on|> markers are stripped from output.
88
  -#}
89
  {%- set ns_sys = namespace(content='') -%}
90
  {%- for msg in messages -%}
91
  {%- if msg.role == 'system' or msg.role == 'developer' -%}
92
  {%- set _c = render_content(msg.content | default('')) | trim -%}
93
+ {%- set _c = _c | replace('<|think_off|>', '') | replace('<|think_on|>', '') | trim -%}
94
  {%- if _c -%}
95
  {%- if ns_sys.content == '' -%}
96
  {%- set ns_sys.content = _c -%}
 
153
 
154
  {#- 7c: Assistant messages -#}
155
  {%- elif message.role == 'assistant' -%}
156
+ {#- Safely extract content as string — guard against absent key (BUG-002 fix).
157
+ Also support message.reasoning_content as an explicit think-block source
158
+ (used by some frameworks that store thinking separately from content). -#}
159
  {%- if message.content is defined and message.content is string -%}
160
  {%- set _ac = message.content -%}
161
  {%- elif message.content is defined and message.content is iterable and message.content is not mapping -%}
 
164
  {%- set _ac = '' -%}
165
  {%- endif -%}
166
 
167
+ {#- Reconstruct content from reasoning_content + content when the framework
168
+ stores thinking separately (e.g. OpenAI-style reasoning_content field).
169
+ Only apply when no think-block already present in _ac. -#}
170
+ {%- if message.reasoning_content is defined and message.reasoning_content is string
171
+ and message.reasoning_content | trim
172
+ and '<think>' not in _ac -%}
173
+ {%- set _ac = '<think>\n' + message.reasoning_content | trim + '\n</think>\n\n' + _ac -%}
174
+ {%- endif -%}
175
+
176
  {#- Collect tool_calls if present -#}
177
  {%- set _tc = message.tool_calls if message.tool_calls is defined and message.tool_calls else [] -%}
178
 
 
191
  {#- Think-block handling (BUG-001 fix + last-turn preservation):
192
  - Tool-call turns : never strip (think block is part of the tool-call format)
193
  - Last-history turn : preserve; inject non-thinking prefill when absent
194
+ - Historical turns : strip using fuzzy end-tag matching to handle
195
+ </think>, </thinking>, </ think>, </think > variants -#}
196
  {%- if not _tc -%}
197
  {%- if _is_last_hist -%}
198
  {%- if '<think>' not in _ac and not ns.enable_thinking -%}
199
  {%- set _ac = '<think>\n\n</think>\n\n' + _ac -%}
200
  {%- endif -%}
201
  {%- else -%}
202
+ {#- Fuzzy end-tag detection for historical turn stripping -#}
203
+ {%- set _think_end = '' -%}
204
  {%- if '</think>' in _ac -%}
205
+ {%- set _think_end = '</think>' -%}
206
+ {%- elif '</thinking>' in _ac -%}
207
+ {%- set _think_end = '</thinking>' -%}
208
+ {%- elif '</ think>' in _ac -%}
209
+ {%- set _think_end = '</ think>' -%}
210
+ {%- elif '</think >' in _ac -%}
211
+ {%- set _think_end = '</think >' -%}
212
+ {%- endif -%}
213
+ {%- if _think_end -%}
214
+ {%- set _ac = _ac.split(_think_end)[-1].lstrip('\n') -%}
215
  {%- endif -%}
216
  {%- endif -%}
217
  {%- endif -%}
 
254
  {%- endif -%}
255
  {{- '<tool_response>\n' -}}
256
  {{- message.content | default('') -}}
257
+ {%- if _next_role == 'tool' -%}
258
+ {{- '\n</tool_response>\n' -}}
259
+ {%- else -%}
260
+ {{- '\n</tool_response>' -}}
261
  {{- '<|im_end|>\n' -}}
262
  {%- endif -%}
263
 
 
269
  {%- endfor -%}
270
 
271
  {#- ===== SECTION 8: GENERATION PROMPT =====
272
+ enable_thinking=True → open <think>\n prefill so llama.cpp reasoning-budget
273
+ and other inference engines can hook into the think-stream.
274
+ The model continues generating inside the open block.
275
+ enable_thinking=False → exact non-thinking prefill: <think>\n\n</think>\n\n
276
+ (19-char closed block, BUG-005 fix)
277
+
278
+ NOTE: The <think>\n opener is EPHEMERAL — it lives only in the generation
279
+ prompt, never in chat history. Historical think-block stripping (BUG-001)
280
+ is handled in Section 7c and is entirely unaffected by this change.
281
+ No context poisoning risk.
282
  -#}
283
  {%- if add_generation_prompt -%}
284
  {{- '<|im_start|>assistant\n' -}}
285
+ {%- if ns.enable_thinking -%}
286
+ {{- '<think>\n' -}}
287
+ {%- else -%}
288
  {{- '<think>\n\n</think>\n\n' -}}
289
  {%- endif -%}
290
  {%- endif -%}
v1.0_writeup.md DELETED
@@ -1,646 +0,0 @@
1
- # Qwen3.5 / Qwen3.6 Jinja2 Chat Template — Implementation Writeup
2
-
3
- **File:** `qwen3_5-template.jinja`
4
- **Validation:** `validate_template.py` (17 fixtures, 0 failures)
5
- **Bugs fixed:** BUG-001 through BUG-006
6
-
7
- ---
8
-
9
- ## Table of Contents
10
-
11
- 1. [Why a New Template?](#1-why-a-new-template)
12
- 2. [Research Basis](#2-research-basis)
13
- 3. [Model Format Fundamentals](#3-model-format-fundamentals)
14
- 4. [Implementation Premises](#4-implementation-premises)
15
- 5. [enable_thinking Behavior](#5-enable_thinking-behavior)
16
- 6. [Tool Call Rendering](#6-tool-call-rendering)
17
- 7. [Bug Analysis and Fixes](#7-bug-analysis-and-fixes)
18
- 8. [Template Architecture](#8-template-architecture)
19
- 9. [Test Coverage](#9-test-coverage)
20
- 10. [Tool Ecosystem Compatibility](#10-tool-ecosystem-compatibility)
21
-
22
- ---
23
-
24
- ## 1. Why a New Template?
25
-
26
- The official Qwen3.5/3.6 chat template (as shipped with the HuggingFace model
27
- checkpoints) contains at least six correctness bugs that cause silent failures in
28
- production agent loops. These bugs were independently reported across GitHub
29
- issues, HuggingFace discussions, Reddit threads, and llama.cpp/vLLM bug trackers
30
- between early 2025 and mid-2026.
31
-
32
- An analysis of approximately five widely-used community replacement templates
33
- showed that each one fixed a different subset of the bugs while introducing new
34
- ones. None were derived systematically from the model's training format as
35
- documented in the official technical report.
36
-
37
- This template was written from scratch, grounded in:
38
-
39
- - **Qwen3 Technical Report** (arXiv:2505.09388) — authoritative description of
40
- the training format, thinking mechanism, and tool-calling protocol.
41
- - **Mid-Think Paper** (arXiv:2601.07036) — phase structure of reasoning chains and
42
- budget-stop format.
43
- - **Hermes tool-call format spec** (Nous Research / NousHermes) — the XML-based
44
- tool-call format on which Qwen3 tool-calling is modelled.
45
- - Community bug reports and vLLM/llama.cpp/Ollama source code analysis.
46
-
47
- ---
48
-
49
- ## 2. Research Basis
50
-
51
- ### 2.1 Qwen3 Technical Report (arXiv:2505.09388)
52
-
53
- Key facts extracted for template construction:
54
-
55
- - No BOS token. The model was trained without one; inserting one degrades output.
56
- - `<think>` and `</think>` are **regular BPE text tokens**, not special tokens.
57
- Tokenizer ID 151644 = `<|im_start|>`, 151645 = `<|im_end|>`.
58
- - Non-thinking mode is implemented by prepending an **empty think block** to the
59
- assistant generation: `<think>\n\n</think>\n\n`. The report states explicitly:
60
- *"For non-thinking mode samples, we retain an empty thinking block in the
61
- assistant's response. This design ensures internal format consistency."*
62
- - `/think` and `/no_think` are plain text suffixes in user messages, not special
63
- tokens. The model was fine-tuned to follow the **last** such flag encountered in
64
- a multi-turn conversation.
65
-
66
- ### 2.2 Vocab and Tokenizer Notes
67
-
68
- ```
69
- Token ID Note
70
- <|endoftext|> 151643 End-of-document / pad fallback
71
- <|im_start|> 151644 Begin-of-turn
72
- <|im_end|> 151645 End-of-turn, eos_token
73
- ```
74
-
75
- Qwen3.5/3.6 both use a padded vocabulary of 248,320 entries; tokens above 151,646
76
- are padding with no semantics. The tokenizer class is `Qwen2Tokenizer` (BBPE,
77
- no `<unk>`).
78
-
79
- ### 2.3 Tool-Call Format Origin
80
-
81
- Qwen3 tool-calling uses the **Hermes-2 XML format** (NousResearch):
82
-
83
- ```
84
- <tool_call>
85
- {"name": "function_name", "arguments": {"key": "value"}}
86
- </tool_call>
87
- ```
88
-
89
- This is identical to vLLM's `hermes` parser target and is the format recognised
90
- by Ollama's `parseTag()` heuristic (first text node following `.ToolCalls`).
91
-
92
- ---
93
-
94
- ## 3. Model Format Fundamentals
95
-
96
- ### 3.1 ChatML Base Structure
97
-
98
- Every conversation is encoded as a sequence of turns delimited by im-start/end
99
- control tokens. No newline appears before `<|im_end|>`.
100
-
101
- ```
102
- <|im_start|>system
103
- {system_content}<|im_end|>
104
- <|im_start|>user
105
- {user_content}<|im_end|>
106
- <|im_start|>assistant
107
- <think>
108
- {thinking}
109
- </think>
110
-
111
- {response}<|im_end|>
112
- ```
113
-
114
- The blank line between `</think>` and the response is mandatory. The model was
115
- trained on this exact whitespace layout.
116
-
117
- ### 3.2 Non-Thinking Prefill (Character-Exact)
118
-
119
- The non-thinking generation prefix is exactly 19 characters:
120
-
121
- ```
122
- <think>\n\n</think>\n\n
123
- ```
124
-
125
- Decomposed: `<think>` (7) + `\n` (1) + `\n` (1) + `</think>` (8) + `\n` (1) +
126
- `\n` (1) = 19. Any deviation (extra space, missing newline) moves the model off
127
- its training distribution.
128
-
129
- ### 3.3 Think-Block Scope Rules
130
-
131
- | Turn type | Think-block treatment |
132
- |---|---|
133
- | Historical assistant turn (non-last, no tool_calls) | **Strip entirely** — `split('</think>')[-1].lstrip('\n')` |
134
- | Historical assistant turn (has tool_calls) | **Preserve** — think block is part of the tool-call format |
135
- | Last assistant turn in history (`add_generation_prompt=False`) | **Preserve verbatim** |
136
- | Last assistant turn, no existing think, `enable_thinking=False` | **Inject** `<think>\n\n</think>\n\n` prefix |
137
- | Generation prompt, `enable_thinking=True` | **No prefix** — model generates its own `<think>` |
138
- | Generation prompt, `enable_thinking=False` | **Inject** `<think>\n\n</think>\n\n` prefix |
139
-
140
- ---
141
-
142
- ## 4. Implementation Premises
143
-
144
- ### 4.1 Single Namespace Object
145
-
146
- Cross-section mutable state (the thinking flag and the media counters) lives in one `ns` namespace object, avoiding
147
- Jinja2's scoping trap (variables set inside `{% for %}` blocks are not visible
148
- outside without a namespace):
149
-
150
- ```jinja2
151
- {%- set ns = namespace(
152
- enable_thinking=true,
153
- image_count=0,
154
- video_count=0
155
- ) -%}
156
- ```
157
-
158
- ### 4.2 Pre-Scan Before Rendering
159
-
160
- The template performs a full pre-scan of all messages before emitting any output.
161
- This is necessary because `/no_think` or `/think` can appear in any user message,
162
- and the final flag determines the generation prompt behaviour. A single-pass loop
163
- that both renders and tracks flags would have to look ahead, which Jinja2 cannot
164
- do.
165
-
166
- ```jinja2
167
- {%- for i in range(messages | length) -%}
168
- {%- if messages[i].role == 'user' -%}
169
- {%- set _u = messages[i].content if messages[i].content is string else '' -%}
170
- {%- if _u.rstrip().endswith('/no_think') -%}
171
- {%- set ns.enable_thinking = false -%}
172
- {%- elif _u.rstrip().endswith('/think') -%}
173
- {%- set ns.enable_thinking = true -%}
174
- {%- endif -%}
175
- {%- endif -%}
176
- {%- endfor -%}
177
- ```
178
-
179
- ### 4.3 Separate `{{ }}` Blocks for `tojson` Output
180
-
181
- Jinja2's `tojson` filter returns a `Markup` object (already HTML-safe). When a
182
- `Markup` value is concatenated with a plain string using `+`, MarkupSafe
183
- HTML-escapes the plain string, so its quotes and angle brackets come out as entities (`&quot;`,
184
- `&#34;`, etc.). This is BUG-003.
185
-
186
- The fix is to never concatenate `tojson` output with plain strings inside a
187
- Jinja2 expression. Each fragment is emitted through its own `{{ }}` block:
188
-
189
- ```jinja2
190
- {# WRONG — triggers HTML-escaping of the plain string #}
191
- {{- '{"name": ' + tc.function.name | tojson + '}' -}}
192
-
193
- {# CORRECT — separate blocks, no Python concatenation #}
194
- {{- '{"name": ' -}}{{- tc.function.name | tojson -}}{{- '}' -}}
195
- ```
196
-
197
- ### 4.4 System Message Collection Phase
198
-
199
- Multiple system messages are merged into a single `<|im_start|>system` turn
200
- with `\n\n` as separator (BUG-004 fix). This is done as a separate pre-pass
201
- (Section 4 in the template), so the main loop can unconditionally skip all
202
- `role == 'system'` messages.
203
-
204
- The user's system content always appears **before** the tools block in the
205
- system turn, matching the training format.
206
-
207
- ### 4.5 Tool Normalisation
208
-
209
- Some frameworks pass tool definitions with a top-level `function` key
210
- (`{"type": "function", "function": {...}}`), while others pass the function
211
- schema directly (`{"name": ..., "parameters": ...}`). The template normalises
212
- all entries to the canonical form before serialisation:
213
-
214
- ```jinja2
215
- {%- if tool.function is defined -%}
216
- {%- set ns_tb.list = ns_tb.list + [tool] -%}
217
- {%- else -%}
218
- {%- set ns_tb.list = ns_tb.list + [{"type": "function", "function": tool}] -%}
219
- {%- endif -%}
220
- ```
221
-
222
- ---
223
-
224
- ## 5. `enable_thinking` Behavior
225
-
226
- ### 5.1 Resolution Priority (Highest to Lowest)
227
-
228
- 1. **`/no_think` or `/think` text suffix** in the last user message that contains
229
- one. This is the highest priority because it represents the most recent
230
- explicit user intent and mirrors the model's fine-tuning data.
231
- 2. **`enable_thinking` template variable** passed at render time (e.g., via
232
- `tokenizer.apply_chat_template(..., enable_thinking=False)`).
233
- 3. **Default value** of `true` (thinking on by default, consistent with the model's
234
- training distribution).
235
-
236
- ### 5.2 Generation Prompt Behaviour
237
-
238
- When `add_generation_prompt=True`:
239
-
240
- ```
241
- enable_thinking=True → <|im_start|>assistant\n
242
- (model generates <think> itself)
243
-
244
- enable_thinking=False → <|im_start|>assistant\n<think>\n\n</think>\n\n
245
- (forces non-thinking mode by pre-filling empty block)
246
- ```
247
-
248
- ### 5.3 Last-History-Turn Behaviour (add_generation_prompt=False)
249
-
250
- When the conversation ends with an assistant message and no generation prompt
251
- is requested — typical when scoring a complete conversation or when the
252
- assistant message is being appended to the prompt for continuation:
253
-
254
- - **Think block present:** preserved verbatim regardless of `enable_thinking`.
255
- - **No think block, `enable_thinking=True`:** content left as-is (historical turns
256
- are already stripped; the last one is the current generation context).
257
- - **No think block, `enable_thinking=False`:** inject `<think>\n\n</think>\n\n`
258
- before the content.
259
-
260
- ### 5.4 Historical Think-Block Stripping (BUG-001)
261
-
262
- The official template collapses think blocks in historical turns to
263
- `<think>\n\n</think>` instead of removing them. In a long agentic loop this
264
- produces an ever-growing sequence of empty think blocks that degrades prompt
265
- quality ("prompt poisoning").
266
-
267
- The correct operation is full removal:
268
-
269
- ```python
270
- # Python equivalent
271
- content = content.split('</think>')[-1].lstrip('\n') if '</think>' in content else content
272
- ```
273
-
274
- ```jinja2
275
- {# Jinja2 equivalent #}
276
- {%- if '</think>' in _ac -%}
277
- {%- set _ac = _ac.split('</think>')[-1].lstrip('\n') -%}
278
- {%- endif -%}
279
- ```
280
-
281
- **Exception:** turns that also carry `tool_calls` keep their think block intact.
282
- The model is trained to produce thinking before tool invocations, and stripping
283
- the think block from a historical tool-call turn would misrepresent the prompt.
284
-
285
- ---
286
-
287
- ## 6. Tool Call Rendering
288
-
289
- ### 6.1 System Turn Tool Block Format
290
-
291
- The exact text injected into the system message when tools are present matches
292
- the Qwen3 Hermes training format:
293
-
294
- ```
295
- # Tools
296
-
297
- You may call one or more functions to assist with the user query.
298
-
299
- You are provided with function signatures within <tools></tools> XML tags:
300
- <tools>
301
- {"type": "function", "function": {"name": "...", ...}}
302
- </tools>
303
-
304
- For each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:
305
- <tool_call>
306
- {"name": <function-name>, "arguments": <args-json-object>}
307
- </tool_call>
308
- ```
309
-
310
- All text — including the instruction sentences — is literal and must not be
311
- modified. The model was trained on this exact phrasing.
312
-
313
- ### 6.2 Assistant Tool-Call Block
314
-
315
- Each tool call is rendered as:
316
-
317
- ```
318
- <tool_call>
319
- {"name": "function_name", "arguments": {JSON_OBJECT}}
320
- </tool_call>
321
- ```
322
-
323
- Multiple parallel calls appear as consecutive blocks separated by `\n`:
324
-
325
- ```
326
- <tool_call>
327
- {"name": "f1", "arguments": {...}}
328
- </tool_call>
329
- <tool_call>
330
- {"name": "f2", "arguments": {...}}
331
- </tool_call><|im_end|>
332
- ```
333
-
334
- Note: the final `</tool_call>` is immediately followed by `<|im_end|>` with no
335
- intervening newline. This matches the training format.
336
-
337
- ### 6.3 Arguments: String vs Object (BUG-006)
338
-
339
- Some frameworks (notably older OpenAI-compatible clients and some streaming
340
- implementations) serialise tool-call arguments as a JSON string
341
- (`"{\"location\": \"Berlin\"}"`) rather than as an object
342
- (`{"location": "Berlin"}`). The template handles both:
343
-
344
- ```jinja2
345
- {%- if tc.function.arguments is string -%}
346
- {{- ', "arguments": ' + tc.function.arguments -}}
347
- {%- else -%}
348
- {{- ', "arguments": ' -}}{{- tc.function.arguments | tojson -}}
349
- {%- endif -%}
350
- ```
351
-
352
- When arguments are already a string they are passed through as-is (the caller
353
- is responsible for valid JSON). When they are a dict/object, `tojson` serialises
354
- them correctly including Unicode escaping and quote escaping.
355
-
356
- This arrangement also prevents the `"""` crash (BUG-006): templates that build the
357
- arguments JSON by string interpolation break when an argument value contains a
358
- quote sequence such as `"""`. Because dict arguments go through `tojson`
359
- (which produces a properly escaped JSON literal), the crash cannot occur.
360
-
361
- ### 6.4 Tool Results
362
-
363
- Tool results are wrapped in a user turn using `<tool_response>`:
364
-
365
- ```
366
- <|im_start|>user
367
- <tool_response>
368
- {result_content}
369
- </tool_response><|im_end|>
370
- ```
371
-
372
- Consecutive tool-response messages are merged into a single user turn — the
373
- template checks whether the previous message's role was also `tool` and
374
- suppresses the `<|im_start|>user\n` header if so.
375
-
376
- ---
377
-
378
- ## 7. Bug Analysis and Fixes
379
-
380
- ### BUG-001 — Historical Think Blocks Leaked (CRITICAL)
381
-
382
- **Symptom:** In multi-turn conversations with `enable_thinking=True`, every
383
- historical assistant message retains a collapsed `<think>\n\n</think>` block.
384
- Over many turns the prompt accumulates dozens of empty think blocks, degrading
385
- model performance.
386
-
387
- **Root cause:** Official template strips think content but leaves the surrounding
388
- `<think>` tags.
389
-
390
- **Fix:** Strip the entire block by splitting on `</think>` and taking the tail:
391
-
392
- ```jinja2
393
- {%- set _ac = _ac.split('</think>')[-1].lstrip('\n') -%}
394
- ```
395
-
396
- **Tests:** T10, T13, T16
397
-
398
- ---
399
-
400
- ### BUG-002 — KeyError on content=None / Missing content Key (HIGH)
401
-
402
- **Symptom:** When an assistant message contains only `tool_calls` and no `content`
403
- (or `content=None`, which is the OpenAI convention for pure tool-call responses),
404
- the template throws `UndefinedError` or `KeyError`.
405
-
406
- **Root cause:** Official template accesses `message.content` directly.
407
-
408
- **Fix:** Guard the access:
409
-
410
- ```jinja2
411
- {%- if message.content is defined and message.content is string -%}
412
- {%- set _ac = message.content -%}
413
- {%- elif message.content is defined and message.content is iterable ... -%}
414
- {%- set _ac = render_content(message.content) -%}
415
- {%- else -%}
416
- {%- set _ac = '' -%}
417
- {%- endif -%}
418
- ```
419
-
420
- **Tests:** T04, T11
421
-
422
- ---
423
-
424
- ### BUG-003 — Markup HTML-Escaping in Tool JSON (MEDIUM)
425
-
426
- **Symptom:** Tool definitions or tool-call arguments with characters like `<`, `>`,
427
- `&`, or `"` appear HTML-escaped in the rendered prompt (`&lt;`, `&gt;`, `&amp;`,
428
- `&#34;`). This causes the model to misread the tool schema.
429
-
430
- **Root cause:** `tojson` returns a Jinja2 `Markup` object. When `Markup` is
431
- concatenated with a plain Python string using `+` inside a Jinja2 expression,
432
- the plain string is auto-escaped and then concatenated with the already-safe
433
- `Markup` value.
434
-
435
- **Fix:** Never use `+` to join `tojson` output with plain strings. Emit each
436
- fragment through a separate `{{ }}` block:
437
-
438
- ```jinja2
439
- {# Every fragment in its own block #}
440
- {{- '{"name": ' -}}{{- tc.function.name | tojson -}}
441
- ```
442
-
443
- **Tests:** T03, T04, T12
444
-
445
- ---
446
-
447
- ### BUG-004 — Multiple System Messages Not Handled (MEDIUM)
448
-
449
- **Symptom:** Frameworks such as Open WebUI send more than one `role: system`
450
- message. The official template either crashes or emits multiple system turns,
451
- both of which confuse the model.
452
-
453
- **Root cause:** No merging logic for multiple system messages.
454
-
455
- **Fix:** Pre-scan all messages and concatenate system content with `\n\n`:
456
-
457
- ```jinja2
458
- {%- if ns_sys.content == '' -%}
459
- {%- set ns_sys.content = _c -%}
460
- {%- else -%}
461
- {%- set ns_sys.content = ns_sys.content + '\n\n' + _c -%}
462
- {%- endif -%}
463
- ```
464
-
465
- **Tests:** T02, T14
466
-
467
- ---
468
-
469
- ### BUG-005 — Wrong Non-Thinking Prefill Whitespace (LOW-MEDIUM)
470
-
471
- **Symptom:** Non-thinking mode produces a think block with incorrect whitespace,
472
- moving the model off its training distribution and causing output quality
473
- degradation or refusal to honour the non-thinking instruction.
474
-
475
- **Root cause:** The official template uses `<think>\n</think>\n\n` (missing the
476
- second newline inside the block), which does not match the format described in
477
- the technical report.
478
-
479
- **Fix:** Use the exact 19-character sequence:
480
-
481
- ```
482
- <think>\n\n</think>\n\n
483
- ```
484
-
485
- **Tests:** T08, T17
486
-
487
- ---
488
-
489
- ### BUG-006 — Triple-Quote Crash on Python String Arguments (MEDIUM)
490
-
491
- **Symptom:** Jinja2 raises a `TemplateSyntaxError` or produces garbled output when
492
- tool-call arguments contain triple-quote sequences (`"""` or `'''`) because the
493
- template previously embedded argument values using Python string literal
494
- concatenation.
495
-
496
- **Root cause:** Some community templates build the tool-call JSON via string
497
- interpolation (`'{"arguments": "' + args + '"}'`), which breaks for argument
498
- values containing quote characters.
499
-
500
- **Fix:** Use `tojson` for all non-string arguments (produces well-formed JSON) and
501
- pass string arguments through unchanged (caller provides valid JSON strings):
502
-
503
- ```jinja2
504
- {%- if tc.function.arguments is string -%}
505
- {{- ', "arguments": ' + tc.function.arguments -}}
506
- {%- else -%}
507
- {{- ', "arguments": ' -}}{{- tc.function.arguments | tojson -}}
508
- {%- endif -%}
509
- ```
510
-
511
- **Tests:** T12
512
-
513
- ---
514
-
515
- ## 8. Template Architecture
516
-
517
- The template is divided into eight clearly delimited sections, each with a
518
- comment header:
519
-
520
- ```
521
- Section 1 render_content macro
522
- Handles str / list (image/video/text) / None → plain text.
523
- Increments ns.image_count / ns.video_count for vision tokens.
524
-
525
- Section 2 Namespace initialisation
526
- Single ns object; enable_thinking defaults to true.
527
-
528
- Section 3 Pre-scan
529
- Walk all user messages; last /no_think or /think wins.
530
-
531
- Section 4 Collect system content
532
- Merge all system / developer messages with \n\n.
533
-
534
- Section 5 Build tools list
535
- Normalise every tool to {"type":"function","function":{...}}.
536
-
537
- Section 6 Output system turn
538
- Emit one <|im_start|>system turn (user content + tools block).
539
-
540
- Section 7 Main message loop
541
- 7a system/developer → skip (already emitted)
542
- 7b user → render with vision support
543
- 7c assistant → render with think-block logic + tool_calls
544
- 7d tool → group into user turns
545
- 7e unknown role → raise_exception
546
-
547
- Section 8 Generation prompt
548
- enable_thinking=True → bare <|im_start|>assistant\n
549
- enable_thinking=False → add <think>\n\n</think>\n\n prefix
550
- ```
551
-
552
- ### Design Decisions
553
-
554
- **No default system prompt.** Unlike some community templates, this template does
555
- not inject a default system prompt when none is provided. The model performs well
556
- without one, and injecting one would cause conflicts for applications that rely on
557
- the system prompt being exactly what they set.
558
-
559
- **No BOS token.** The Qwen3 family was trained without a BOS token. Adding one
560
- would consume a context window slot unnecessarily and may harm performance.
561
-
562
- **No `<|endoftext|>` in conversation.** This token is reserved for
563
- end-of-document signalling in the pre-training phase, not for conversation
564
- boundaries.
565
-
566
- ---
567
-
568
- ## 9. Test Coverage
569
-
570
- The 17 test fixtures in `validate_template.py` cover:
571
-
572
- | ID | Scenario | Key assertion |
573
- |---|---|---|
574
- | T01 | Simple user/assistant, no system, no tools | Exact ChatML output |
575
- | T02 | System message | System turn before user turn |
576
- | T03 | Tools defined, `enable_thinking=True` | Tools block in system; no prefill |
577
- | T04 | Tool call, `content=None` | No crash; `<tool_call>` present |
578
- | T05 | Parallel tool calls | `</tool_call>\n<tool_call>` separator |
579
- | T06 | Tool result (role=tool) | `<\|im_start\|>user\n<tool_response>` |
580
- | T07 | `enable_thinking=True` generation prompt | No think prefix emitted |
581
- | T08 | `enable_thinking=False` generation prompt | Exact 19-char prefill |
582
- | T09 | `/no_think` flag in user message | Non-thinking prefill applied |
583
- | T10 | Historical think blocks | Fully stripped, not collapsed |
584
- | T11 | Missing `content` key on assistant | No KeyError / UndefinedError |
585
- | T12 | Special chars in arguments | Correctly JSON-escaped |
586
- | T13 | Historical tool-call turn with think | Think block preserved |
587
- | T14 | Multiple system messages | Merged with `\n\n`; single system turn |
588
- | T15 | Parallel tool responses | Both inside single user turn |
589
- | T16 | Last history turn with existing think | Preserved verbatim |
590
- | T17 | Last history turn, no think, `enable_thinking=False` | Prefill injected |
591
-
592
- Run the suite:
593
-
594
- ```bash
595
- cd /workspace/project/qwen3_5-template
596
- python validate_template.py
597
- # Expected: 17 passed, 0 failed
598
- ```
599
-
600
- ---
601
-
602
- ## 10. Tool Ecosystem Compatibility
603
-
604
- An analysis of 51 tool-calling frameworks and inference backends was conducted to
605
- verify that the template's output is consumable by the broadest possible set of
606
- tools. Key findings:
607
-
608
- ### 10.1 OpenAI JSON Format Dominance
609
-
610
- 31 of the 51 analysed tools use the **OpenAI-compatible JSON function-call API**
611
- (Group A). These tools pass tool definitions as a `tools` array and receive tool
612
- calls back as `message.tool_calls` objects. The template's input format is fully
613
- compatible with this convention.
614
-
615
- Notable Group A members: OpenHands, LangChain, LangGraph, LiteLLM, CrewAI,
616
- Pydantic AI, Open WebUI, LibreChat, LM Studio, LlamaIndex, AutoGen.
617
-
618
- ### 10.2 Inference Server Compatibility
619
-
620
- | Backend | Compatibility note |
621
- |---|---|
622
- | **vLLM** | Uses the `hermes` tool parser for Qwen models, matching this template's `<tool_call>` format exactly. |
623
- | **llama.cpp** | Recognises `<tool_call>` via the `--jinja` flag + chat template loading. Note: `--jinja` disables GBNF grammar (Issue #12204). |
624
- | **Ollama** | Auto-detects the tool-call tag via `parseTag()` which reads the first text node after `.ToolCalls` in the Go template tree — `<tool_call>` is one of the three known tags. |
625
- | **LM Studio** | Passes tool definitions as the `tools` API field; receives tool calls in `message.tool_calls`. |
626
- | **TabbyAPI** | Full OpenAI-compatible API; correct chat template is the only requirement. |
627
-
628
- ### 10.3 Non-Native Tool-Calling Frameworks
629
-
630
- Three framework groups (Cline/Roo Code XML, OpenCode `<parameter>`, Aider
631
- SEARCH/REPLACE) do not use the OpenAI tool-calling API at all. They inject their
632
- own tool descriptions into the system prompt and parse the model's text output
633
- directly. These frameworks do not interact with the chat template's tool-calling
634
- sections — they send no `tools` array and the template therefore emits no tool
635
- block.
636
-
637
- ### 10.4 Arguments as JSON String
638
-
639
- Several frameworks (notably some streaming clients and older OpenAI SDK versions)
640
- serialise `tool_calls[].function.arguments` as a JSON string rather than a parsed
641
- object. The template's dual-path arguments handling (Section 6.3) accommodates
642
- both cases transparently.
643
-
644
- ---
645
-
646
- *Generated as part of the `fix/qwen3-template-bugs` implementation.*