ScienceOne-AI committed on
Commit
816198f
·
verified ·
1 Parent(s): 6d30fe1

Upload 61 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +17 -0
  2. LICENSE +201 -0
  3. README.md +306 -3
  4. README_en.md +306 -0
  5. assets/benchmark_performance.png +3 -0
  6. cases/case_DeepResearchIF_general_en_01.png +3 -0
  7. cases/case_DeepResearchIF_general_zh_01.png +3 -0
  8. cases/case_DeepResearchIF_science_en_01.png +3 -0
  9. cases/case_DeepResearchIF_science_zh_01.png +3 -0
  10. cases/case_deepresearch-report-writing_general_zh_01.png +3 -0
  11. cases/case_deepresearch-report-writing_science_en_01.png +3 -0
  12. cases/case_deepresearch-report-writing_science_zh_01.png +3 -0
  13. cases/case_file-understanding-generation_general_en_01.png +3 -0
  14. cases/case_file-understanding-generation_science_zh_01.png +3 -0
  15. cases/case_long-horizon-reasoning_general_en_01.png +3 -0
  16. cases/case_long-horizon-reasoning_general_en_02.png +3 -0
  17. cases/case_long-horizon-reasoning_general_zh_01.png +3 -0
  18. cases/case_long-horizon-reasoning_general_zh_02.png +3 -0
  19. cases/case_skills_science_en_01.png +3 -0
  20. cases/case_skills_science_zh_01.png +3 -0
  21. inference/README.md +224 -0
  22. inference/README_en.md +226 -0
  23. inference/inference/run_batch_inference.py +373 -0
  24. inference/inference/run_single_inference.py +354 -0
  25. inference/models/tokenizer/added_tokens.json +28 -0
  26. inference/models/tokenizer/chat_template.jinja +89 -0
  27. inference/models/tokenizer/config.json +41 -0
  28. inference/models/tokenizer/merges.txt +0 -0
  29. inference/models/tokenizer/special_tokens_map.json +31 -0
  30. inference/models/tokenizer/tokenizer.json +3 -0
  31. inference/models/tokenizer/tokenizer_config.json +239 -0
  32. inference/models/tokenizer/vocab.json +0 -0
  33. inference/requirements.txt +176 -0
  34. inference/run_batch_inference_demo.sh +150 -0
  35. inference/run_batch_inference_online_demo.sh +183 -0
  36. inference/server/llm_api.py +665 -0
  37. inference/server/tool_api.py +59 -0
  38. inference/server/tool_execution.py +73 -0
  39. inference/test_all_tools.py +123 -0
  40. inference/tool_kits/__init__.py +48 -0
  41. inference/tool_kits/ask_question_about_image_toolkit.py +32 -0
  42. inference/tool_kits/ask_question_about_video_toolkit.py +32 -0
  43. inference/tool_kits/base.py +183 -0
  44. inference/tool_kits/bash_toolkit.py +30 -0
  45. inference/tool_kits/execute_code_toolkit.py +23 -0
  46. inference/tool_kits/file_wide_parse_toolkit.py +27 -0
  47. inference/tool_kits/image_search_toolkit.py +30 -0
  48. inference/tool_kits/scholar_search_toolkit.py +29 -0
  49. inference/tool_kits/wide_search_toolkit.py +29 -0
  50. inference/tool_kits/wide_visit_toolkit.py +32 -0
.gitattributes CHANGED
@@ -34,3 +34,20 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  tokenizer.json filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  tokenizer.json filter=lfs diff=lfs merge=lfs -text
37
+ assets/benchmark_performance.png filter=lfs diff=lfs merge=lfs -text
38
+ cases/case_deepresearch-report-writing_general_zh_01.png filter=lfs diff=lfs merge=lfs -text
39
+ cases/case_deepresearch-report-writing_science_en_01.png filter=lfs diff=lfs merge=lfs -text
40
+ cases/case_deepresearch-report-writing_science_zh_01.png filter=lfs diff=lfs merge=lfs -text
41
+ cases/case_DeepResearchIF_general_en_01.png filter=lfs diff=lfs merge=lfs -text
42
+ cases/case_DeepResearchIF_general_zh_01.png filter=lfs diff=lfs merge=lfs -text
43
+ cases/case_DeepResearchIF_science_en_01.png filter=lfs diff=lfs merge=lfs -text
44
+ cases/case_DeepResearchIF_science_zh_01.png filter=lfs diff=lfs merge=lfs -text
45
+ cases/case_file-understanding-generation_general_en_01.png filter=lfs diff=lfs merge=lfs -text
46
+ cases/case_file-understanding-generation_science_zh_01.png filter=lfs diff=lfs merge=lfs -text
47
+ cases/case_long-horizon-reasoning_general_en_01.png filter=lfs diff=lfs merge=lfs -text
48
+ cases/case_long-horizon-reasoning_general_en_02.png filter=lfs diff=lfs merge=lfs -text
49
+ cases/case_long-horizon-reasoning_general_zh_01.png filter=lfs diff=lfs merge=lfs -text
50
+ cases/case_long-horizon-reasoning_general_zh_02.png filter=lfs diff=lfs merge=lfs -text
51
+ cases/case_skills_science_en_01.png filter=lfs diff=lfs merge=lfs -text
52
+ cases/case_skills_science_zh_01.png filter=lfs diff=lfs merge=lfs -text
53
+ inference/models/tokenizer/tokenizer.json filter=lfs diff=lfs merge=lfs -text
LICENSE ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
README.md CHANGED
@@ -1,3 +1,306 @@
1
- ---
2
- license: apache-2.0
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ <div align="center">
3
+
4
+ # S1-DeepResearch:面向长程深度研究的端到端模型
5
+
6
+ [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg?style=for-the-badge)](./LICENSE)
7
+ [![HuggingFace](https://img.shields.io/badge/🤗%20HuggingFace-S1--DeepResearch--15k-0040A1?style=for-the-badge)](https://huggingface.co/datasets/ScienceOne-AI/S1-DeepResearch-15k)
8
+ [![HuggingFace](https://img.shields.io/badge/🤗%20HuggingFace-S1--DeepResearch--32B-ffd21e?style=for-the-badge)](https://huggingface.co/ScienceOne-AI/S1-DeepResearch-32B)
9
+ [![ModelScope](https://img.shields.io/badge/🤖%20ModelScope-S1--DeepResearch--32B-mediumpurple?style=for-the-badge)](https://modelscope.cn/models/ScienceOne-AI/S1-DeepResearch-32B)
10
+
11
+ [English](./README_en.md) | 中文
12
+
13
+ </div>
14
+
15
+ <hr>
16
+
17
+ ## 🔥 最新动态 (News & Updates)
18
+
19
+ - **[2026/04/04]** 🎉 发布 [**S1-DeepResearch-32B**](https://huggingface.co/ScienceOne-AI/S1-DeepResearch-32B):面向长程深度研究的端到端旗舰模型,更侧重**真实场景落地**——在**长链复杂推理**之外,重点强化**深度研究指令遵循**、**深度调研报告写作**、**文件理解与生成**、**技能调用**等能力。在 20 项智能体基准能力评测中,相对基座 **Qwen3-32B** 全方位显著领先,整体性能接近主流闭源旗舰模型(**GPT 5.2**、**Claude 4.6**、**GLM-5**)。推理代码及 [15K 智能体训练轨迹数据(开源版本)](https://huggingface.co/datasets/ScienceOne-AI/S1-DeepResearch-15k)同步发布。
20
+ - **[2025/12/31]** 我们开源了 [**S1-DeepResearch-8B-Preview**](https://huggingface.co/ScienceOne-AI/S1-DeepResearch-8B-Preview):聚焦**通用长链路复杂推理**,以轻量参数探索深度研究场景下的可用空间。
21
+
22
+ ## 📝 概述 (Overview)
23
+
24
+ **S1-DeepResearch-32B** 是磐石团队(ScienceOne AI)研发的面向 **长程深度研究(Long-Horizon Deep Research)** 的端到端模型,其核心能力可概括为 **五大维度**:
25
+
26
+ - **长链复杂推理**:支持多阶段、多跳任务中的持续推理与行动推进,突破单步问答范式。通过跨文档检索、证据聚合、状态记忆与策略迭代,实现复杂任务中的路径规划、信息整合与结果收敛,确保推理过程的稳定性与结论的可靠性。
27
+
28
+ - **深度研究指令遵循**:精准解析深度研究场景下的多约束复杂指令,构建围绕「任务定义—方法机理—工具执行—结果呈现」等深度研究全链路的指令理解范式;并在认知、产物、执行与环境四层上协同约束,让复杂任务可控、过程可预期、结果与意图一致。
29
+
30
+ - **深度调研报告写作**:在信息整合之上输出可论证、可引用的报告体例;支持多源材料组织与证据核对,兼顾论述结构、可读性与事实可追溯,直接服务科研写作与决策研判。
31
+
32
+ - **文件理解与生成**:覆盖 PDF、表格、网页等多形态输入的理解,以及结构化、可交付的输出生成。在多轮工具增强交互中尽量保持语义与执行一致,形成「解析—加工—生成」的闭环,减轻科研与数据密集型流程中的重复手工环节。
33
+
34
+ - **技能使用(Skills)**:将文献检索、数据分析、实验设计、计算建模、可视化与报告生成等以可调用模块形式组织,按任务目标进行动态装配与渐进式加载,支撑从数据获取到结果呈现的连续工作流。
35
+
36
+ ### ✨ 核心特性
37
+
38
+ - **超长上下文建模**:支持 128K 上下文窗口,单会话承载更长证据链与多轮交互历史,适配长程研究任务。
39
+ - **长程工具调用**:可稳定执行 **150+** 轮连续工具调用,构建基于推理驱动的工具编排与决策闭环,实现多阶段任务的持续规划、执行与自我校正。
40
+ - **原生工具体系**:内置 **9** 种常用工具(如搜索、网页浏览、代码执行、命令行等),开箱即用。
41
+
42
+ ## 🚀 模型下载 (Model Download)
43
+
44
+ <div align="center">
45
+
46
+ | 模型名称 | 参数量 | 上下文长度 | 下载链接 |
47
+ | :---: | :---: | :---: | :---: |
48
+ | **S1-DeepResearch-32B** | 32B | 128k | [🤗 HuggingFace](https://huggingface.co/ScienceOne-AI/S1-DeepResearch-32B) \| [🤖 ModelScope](https://modelscope.cn/models/ScienceOne-AI/S1-DeepResearch-32B) |
49
+ | **S1-DeepResearch-8B-Preview** | 8B | 128k | [🤗 HuggingFace](https://huggingface.co/ScienceOne-AI/S1-DeepResearch-8B-Preview) \| [🤖 ModelScope](https://modelscope.cn/models/ScienceOne-AI/S1-DeepResearch-8B-Preview) |
50
+
51
+ </div>
52
+
53
+ ## 📊 性能评估 (Evaluation)
54
+
55
+ 我们在与模型 **五大能力** 相对应的 **5 个维度、共 20 项智能体能力基准** 上对 **S1-DeepResearch-32B** 进行了系统评估,各维度与基准对应关系如下:
56
+
57
+ - **长链复杂推理**:文本模态包括 GAIA (text)、BrowseComp、BrowseComp-ZH、XBench-DeepSearch、HLE (text);图文模态包括 LiveVQA、MM-Search、BrowseComp-VL、RealX-Bench、HLE-VL、MM-BrowseComp。
58
+ **深度研究指令遵循**:ComplexBench、DeepResearchIF (in-house)。
59
+ - **深度调研报告写作**:DeepResearch Bench、DeepResearch Bench II、Research Rubrics。
60
+ - **文件理解与生成**:GAIA (file)、GTA、FileSys (in-house)。
61
+ - **技能调用**:SkillsUse (in-house)。
62
+
63
+ <div align="center">
64
+
65
+ <img src="./assets/benchmark_performance.png" alt="S1-DeepResearch-32B 与基座及闭源旗舰在 20 项智能体基准上的性能对比" width="800" />
66
+
67
+ </div>
68
+
69
+ **S1-DeepResearch-32B** 在所有榜单上相对基座 **Qwen3-32B** 及更大参数量模型 **Qwen3-235B** 均取得显著优势;在深度研究指令遵循、文件理解与生成、技能调用等维度的内部榜单中,亦超越 **Qwen3.5-397B**。整体性能接近主流闭源旗舰(**GPT 5.2**、**Claude 4.6**、**GLM-5**、**Kimi-K2.5**)。开放榜单与内部任务的结果相互印证,表明 S1-DeepResearch-32B 已具备面向真实业务场景部署与落地的能力。
70
+
71
+ ## 📂 任务样例 (Cases)
72
+
73
+ 以下展示 S1-DeepResearch-32B 在技能调用方面的案例,模型在进行材料建模的过程中,首先调用了科学技能`scientific-skills/pymatgen`补充专业知识,然后根据技能的指导,使用`pymatgen`完成建模,并输出cif文件。
74
+
75
+ <div align="center">
76
+
77
+ <img src="./cases/case_skills_science_en_01.png" alt="英文科学 Skills 协同任务样例节选" width="600" />
78
+
79
+ </div>
80
+
81
+ 更多案例将持续补充至 `cases/` 目录。
82
+
83
+ ## 🚀 快速开始
84
+
85
+ ### 环境配置
86
+
87
+ 1. **安装依赖**:
88
+
89
+ ```bash
90
+ pip install -r requirements.txt
91
+ ```
92
+
93
+ 2. **Docker 配置**:
94
+
95
+ 项目提供官方预构建 Docker 镜像,支持快速部署与运行。系统包含两个核心镜像:
96
+
97
+ - **toolkits-api**:工具服务主容器(对外提供 API 能力)
98
+ - **code-sandbox**:代码执行沙箱镜像(由服务按需创建,用于隔离执行任务)
99
+
100
+ 当前执行类工具(`execute_code`、`bash`)采用 **Docker-outside-of-Docker(DooD)** 模式:通过挂载宿主机 Docker socket,由工具容器直接调用宿主机 Docker daemon,按需创建隔离的沙箱容器执行任务。
101
+
102
+ **镜像地址:**
103
+
104
+ ```text
105
+ ghcr.io/wenge-research/toolkits-api:v2.0.260403
106
+ ghcr.io/wenge-research/code-sandbox:v1.0.260403
107
+ ```
108
+
109
+ **拉取镜像:**
110
+
111
+ ```text
112
+ docker pull ghcr.io/wenge-research/toolkits-api:v2.0.260403
113
+ docker pull ghcr.io/wenge-research/code-sandbox:v1.0.260403
114
+ ```
115
+
116
+ **运行容器:**
117
+
118
+ 运行容器时需要挂载配置文件 `src/config.yaml`、Docker socket(用于沙箱执行),以及日志和缓存目录(可选):
119
+
120
+ ```bash
121
+ docker run -d \
122
+ --name toolkits-api \
123
+ --network host \
124
+ -e API_PORT=8080 \
125
+ -e API_WORKERS=4 \
126
+ -e HOST_LOG_DIR=$(pwd)/logs \
127
+ -e SANDBOX_MODE=docker \
128
+ -e HTTP_PROXY=http://your-proxy:port \
129
+ -e HTTPS_PROXY=http://your-proxy:port \
130
+ -e PROXY_URL=http://your-proxy:port \
131
+ -v /etc/localtime:/etc/localtime:ro \
132
+ -v /etc/timezone:/etc/timezone:ro \
133
+ -v /var/run/docker.sock:/var/run/docker.sock \
134
+ -v $(pwd)/src/config.yaml:/app/src/config.yaml \
135
+ -v $(pwd)/logs:/app/logs \
136
+ -v $(pwd)/cache:/app/cache \
137
+ ghcr.io/wenge-research/toolkits-api:v2.0.260403
138
+ ```
139
+
140
+ **参数说明**:
141
+
142
+ | 参数 | 说明 |
143
+ |------|------|
144
+ | `-e API_PORT` | 服务监听端口,默认 8080 |
145
+ | `-e API_WORKERS` | worker 进程数,根据并发需求调整,默认 1 |
146
+ | `-e SANDBOX_MODE=docker` | 启用 Docker 沙箱模式(否则为 subprocess) |
147
+ | `-e HOST_LOG_DIR` | 当启用 Docker 沙箱模式时,需要传入宿主机日志目录,供沙箱容器挂载 |
148
+ | `-e HTTP_PROXY / HTTPS_PROXY / PROXY_URL` | 代理配置(可选) |
149
+ | `--network host` | 如果使用宿主机的代理端口,需要设置此参数(可选) |
150
+ | `-v /etc/localtime:/etc/localtime:ro` | 同步宿主机时区(只读) |
151
+ | `-v /etc/timezone:/etc/timezone:ro` | 同步宿主机时区文件(只读) |
152
+ | `-v /var/run/docker.sock` | 当启用 Docker 沙箱模式时,需要挂载宿主机 Docker socket,用于调度沙箱容器 |
153
+ | `-v config.yaml` | 挂载配置文件(API Key、模型配置、沙箱配置等) |
154
+ | `-v logs` | 挂载日志目录(可选) |
155
+ | `-v cache` | 挂载缓存目录,缓存数据形式参考容器内 /app/cache 中文件进行构造(可选) |
156
+
157
+
158
+ 3. **配置工具服务地址**
159
+
160
+ 推荐通过 JSON 配置文件或环境变量覆盖默认项。不建议直接编辑 `utils/configs.py`。
161
+
162
+ **方式一(推荐):本地 JSON 配置**
163
+
164
+ 从示例文件复制并生成本地配置:
165
+
166
+ ```bash
167
+ cp utils/config/config.example.json utils/config/config.local.json
168
+ ```
169
+
170
+ 在 `utils/config/config.local.json` 中设置工具服务基地址,例如:
171
+
172
+ ```json
173
+ {
174
+ "TOOLS_SERVER_BASE_ENDPOINT_URL": [
175
+ "http://127.0.0.1:8080"
176
+ ]
177
+ }
178
+ ```
179
+
180
+ **方式二:环境变量**
181
+
182
+ 指定配置文件路径,或对单项进行覆盖:
183
+
184
+ ```bash
185
+ export S1_DR_CONFIG_JSON="utils/config/config.local.json"
186
+ # 或仅覆盖 TOOLS_SERVER_BASE_ENDPOINT_URL
187
+ export TOOLS_SERVER_BASE_ENDPOINT_URL='["http://127.0.0.1:8080"]'
188
+ ```
189
+
190
+ 4. **配置 API 密钥**
191
+
192
+ 建议通过 `utils/config/config.local.json` 配置各服务商密钥,或覆盖同名环境变量:
193
+
194
+ ```json
195
+ {
196
+ "AIHUBMIX_KEY": "<your_aihubmix_key>",
197
+ "AZURE_KEY": "<your_azure_key>",
198
+ "VOLCANO_KEY": "<your_volcano_key>",
199
+ "ALIYUN_KEY": "<your_aliyun_key>"
200
+ }
201
+ ```
202
+
203
+ 环境变量示例:
204
+
205
+ ```bash
206
+ export AIHUBMIX_KEY="<your_aihubmix_key>"
207
+ export AZURE_KEY="<your_azure_key>"
208
+ export VOLCANO_KEY="<your_volcano_key>"
209
+ export ALIYUN_KEY="<your_aliyun_key>"
210
+ ```
211
+
212
+ ### 单条推理示例
213
+
214
+ ```python
215
+ import asyncio
216
+
217
+ from server.llm_api import LLMClient
218
+ from server.tool_api import return_all_tools
219
+ from inference.run_single_inference import run_one_query
220
+ from utils.prompts import DEEPRESEARCH_SYSTEM_PROMPT
221
+
222
+
223
+ async def main():
224
+ llm_client_urls = ["http://127.0.0.1:10777/v1/chat/completions"]
225
+ llm_client_models = ["S1-DeepResearch-32B"]
226
+ llm_client = LLMClient(llm_client_urls, llm_client_models)
227
+
228
+ all_tools = return_all_tools()
229
+
230
+ result = await run_one_query(
231
+ llm=llm_client,
232
+ user_query="阿里巴巴成立时,18位创始团队成员中,姓马、姓蔡、姓张的创始人的平均年龄,保留一位小数",
233
+ file_path=[],
234
+ system=DEEPRESEARCH_SYSTEM_PROMPT,
235
+ max_rounds=15,
236
+ temperature=0.4,
237
+ top_p=0.95,
238
+ extra_payload={},
239
+ debug=True,
240
+ all_tools=all_tools,
241
+ system_format="deep_research",
242
+ log_label="quick_start_single",
243
+ )
244
+
245
+ final_answer = result[-1]["final_answer"] if result else ""
246
+ print(final_answer)
247
+
248
+
249
+ if __name__ == "__main__":
250
+ asyncio.run(main())
251
+ ```
252
+
253
+ 说明:
254
+
255
+ - `file_path` 在当前实现中应传 `list`(如 `[]` 或 `['/path/a.pdf']`)。
256
+ - `system_format` 可选:`deep_research`、`azure`、`aihubmix`、`aihubmix_claude`、`aihubmix_glm`、`volcano`、`aliyun`。
257
+
258
+ ### 批量推理示例
259
+
260
+ 本地/vLLM:
261
+
262
+ ```bash
263
+ cd inference
264
+ cp run_batch_inference_demo.sh run_batch_local.sh
265
+ # 编辑 run_batch_local.sh 里的参数(LLM_CLIENT_URLS、LLM_CLIENT_MODELS、TEST_DATA_FILE 等)
266
+ bash run_batch_local.sh
267
+ ```
268
+
269
+ 在线平台:
270
+
271
+ ```bash
272
+ cd inference
273
+ cp run_batch_inference_online_demo.sh run_batch_online.sh
274
+ # 编辑 run_batch_online.sh 里的参数(LLM_CLIENT_URLS、LLM_CLIENT_MODELS、SYSTEM_FORMAT 等)
275
+ bash run_batch_online.sh
276
+ ```
277
+
278
+ 日志查看:
279
+
280
+ ```bash
281
+ tail -f run_logs/*.log
282
+ ```
283
+
284
+ 更多推理功能详见 📖 **[进阶使用方法](./inference/README.md)**。
285
+
286
+ ## 🔭 未来工作 (Future Work)
287
+
288
+ - **S1-DeepResearch 论文:** 预计两周内发布S1-DeepResearch论文,详细介绍支撑 S1-DeepResearch 五大能力特性的数据合成策略、模型训练与推理机制设计,以及推理时扩展等关键评测结论与实践经验。
289
+ - **S1-DeepResearch-VL 版本:** 2026年上半年,将推出支持视觉理解与跨模态推理的 S1-DeepResearch-VL 模型,以覆盖更丰富的研究型任务场景。
290
+
291
+ ## 📜 协议 (License)
292
+
293
+ 本项目采用 **[Apache License 2.0](./LICENSE)** 开源协议。
294
+
295
+ ## 引用 (Citation)
296
+
297
+ 如果您觉得 S1-DeepResearch 对您的工作有帮助,请考虑引用我们的工作:
298
+
299
+ ```bibtex
300
+ @software{s1deepresearch2026,
301
+ title={S1-DeepResearch: End-to-End Deep Research Models},
302
+ author={ScienceOne Team},
303
+ year={2026},
304
+ url={https://github.com/ScienceOne-AI/S1-DeepResearch},
305
+ }
306
+ ```
README_en.md ADDED
@@ -0,0 +1,306 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ <div align="center">
3
+
4
+ # S1-DeepResearch: End-to-End Models for Long-Horizon Deep Research
5
+
6
+ [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg?style=for-the-badge)](./LICENSE)
7
+ [![HuggingFace](https://img.shields.io/badge/🤗%20HuggingFace-S1--DeepResearch--15k-0040A1?style=for-the-badge)](https://huggingface.co/datasets/ScienceOne-AI/S1-DeepResearch-15k)
8
+ [![HuggingFace](https://img.shields.io/badge/🤗%20HuggingFace-S1--DeepResearch--32B-ffd21e?style=for-the-badge)](https://huggingface.co/ScienceOne-AI/S1-DeepResearch-32B)
9
+ [![ModelScope](https://img.shields.io/badge/🤖%20ModelScope-S1--DeepResearch--32B-mediumpurple?style=for-the-badge)](https://modelscope.cn/models/ScienceOne-AI/S1-DeepResearch-32B)
10
+
11
+ English | [中文](./README.md)
12
+
13
+ </div>
14
+
15
+ <hr>
16
+
17
+ ## 🔥 News & Updates
18
+
19
+ - **[2026/04/04]** 🎉 We release [**S1-DeepResearch-32B**](https://huggingface.co/ScienceOne-AI/S1-DeepResearch-32B), an end-to-end agentic model for long-horizon deep research, with stronger emphasis on **real-world deployment**—beyond **long-chain complex reasoning**, it focuses on **deep-research instruction following**, **deep research report writing**, **file understanding and generation**, and **skills using**. On **20 agentic capability benchmarks**, it **outperforms the base model Qwen3-32B by a clear margin across the board**, and overall performance is close to mainstream closed-source flagship models (**GPT 5.2**, **Claude 4.6**, **GLM-5**). Inference code and the [**15K agent training trajectory dataset**](https://huggingface.co/datasets/ScienceOne-AI/S1-DeepResearch-15k) (a subset of the full training data) are released together.
20
+ - **[2025/12/31]** We open-sourced [**S1-DeepResearch-8B-Preview**](https://huggingface.co/ScienceOne-AI/S1-DeepResearch-8B-Preview), focusing on **general long-chain complex reasoning** and exploring what is feasible in deep research at a smaller parameter scale.
21
+
22
+ ## 📝 Overview
23
+
24
+ **S1-DeepResearch-32B** is an end-to-end model developed by the ScienceOne AI team for **long-horizon deep research**. Its core capabilities span **five dimensions**:
25
+
26
+ - **Long-chain complex reasoning**: Supports sustained reasoning and action across multi-stage, multi-hop tasks, going beyond single-step Q&A. Through cross-document retrieval, evidence aggregation, state memory, and policy iteration, it plans paths, integrates information, and converges results in complex settings, keeping the reasoning process stable and conclusions reliable.
27
+
28
+ - **Deep research instruction following**: Parses multi-constraint instructions in deep research scenarios and builds an instruction-understanding paradigm along the full research chain—**task definition → mechanisms → tool execution → result presentation**—with coordinated constraints across cognition, artifacts, execution, and environment so complex tasks stay controllable, processes predictable, and outputs aligned with intent.
29
+
30
+ - **Deep research report writing**: Produces arguable, citable report-style outputs on top of information integration; organizes multi-source material and evidence checks while balancing structure, readability, and traceability—suited for scientific writing and decision support.
31
+
32
+ - **File understanding and generation**: Covers PDFs, tables, web pages, and other modalities for input understanding, plus structured, deliverable outputs. In multi-turn tool-augmented interaction, it keeps semantics and execution aligned, closing the loop **parse → process → generate** and reducing repetitive manual work in research and data-heavy workflows.
33
+
34
+ - **Skills Using**: Organizes literature search, data analysis, experiment design, computational modeling, visualization, report generation, and more as callable modules, dynamically assembled and progressively loaded toward task goals, supporting continuous workflows from data acquisition to presentation.
35
+
36
+ ### ✨ Key Features
37
+
38
+ - **Ultra-long context modeling**: A **128K** context window lets a single session hold longer evidence chains and multi-turn interaction history, suited to long-horizon research tasks.
39
+ - **Long-horizon tool calling**: Stably runs **150+** consecutive tool-call rounds, building reasoning-driven tool orchestration and a decision closed loop—enabling continuous planning, execution, and self-correction across multi-stage tasks.
40
+ - **Native tool ecosystem**: **9** built-in common tools (e.g., search, web browsing, code execution, command line) ready to use out of the box.
41
+
42
+ ## 🚀 Model Download
43
+
44
+ <div align="center">
45
+
46
+ | Model | Parameters | Context length | Download |
47
+ | :---: | :---: | :---: | :---: |
48
+ | **S1-DeepResearch-32B** | 32B | 128k | [🤗 HuggingFace](https://huggingface.co/ScienceOne-AI/S1-DeepResearch-32B) \| [🤖 ModelScope](https://modelscope.cn/models/ScienceOne-AI/S1-DeepResearch-32B) |
49
+ | **S1-DeepResearch-8B-Preview** | 8B | 128k | [🤗 HuggingFace](https://huggingface.co/ScienceOne-AI/S1-DeepResearch-8B-Preview) \| [🤖 ModelScope](https://modelscope.cn/models/ScienceOne-AI/S1-DeepResearch-8B-Preview) |
50
+
51
+ </div>
52
+
53
+ ## 📊 Evaluation
54
+
55
+ We systematically evaluated **S1-DeepResearch-32B** on **20 agentic capability benchmarks** grouped into **5 dimensions** aligned with the five capability areas:
56
+
57
+ - **Long-chain complex reasoning**: Text—GAIA (text), BrowseComp, BrowseComp-ZH, XBench-DeepSearch, HLE (text); vision-language—LiveVQA, MM-Search, BrowseComp-VL, RealX-Bench, HLE-VL, MM-BrowseComp.
58
+ - **Deep research instruction following**: ComplexBench, DeepResearchIF (in-house).
59
+ - **Deep research report writing**: DeepResearch Bench, DeepResearch Bench II, Research Rubrics.
60
+ - **File understanding and generation**: GAIA (file), GTA, FileSys (in-house).
61
+ - **Skills Using**: SkillsUse (in-house).
62
+
63
+ <div align="center">
64
+
65
+ <img src="./assets/benchmark_performance.png" alt="S1-DeepResearch-32B vs. base and closed-source flagships on 20 agentic benchmarks" width="800" />
66
+
67
+ </div>
68
+
69
+ **S1-DeepResearch-32B** gains a **clear advantage** over the base **Qwen3-32B** and the larger **Qwen3-235B** on all listed benchmarks; on in-house leaderboards for deep-research instruction following, file understanding and generation, and skill invocation, it also **surpasses Qwen3.5-397B**. Overall performance is close to mainstream closed-source flagships (**GPT 5.2**, **Claude 4.6**, **GLM-5**, **Kimi-K2.5**). Public benchmarks and internal tasks are mutually consistent, indicating that S1-DeepResearch-32B is **ready for real business deployment**.
70
+
71
+ ## 📂 Example Cases
72
+
73
+ Below is an example of **S1-DeepResearch-32B** using skills: during materials modeling, the model first invokes the scientific skill `scientific-skills/pymatgen` for domain knowledge, then follows the skill guidance to run modeling with `pymatgen` and outputs a CIF file.
74
+
75
+ <div align="center">
76
+
77
+ <img src="./cases/case_skills_science_en_01.png" alt="English scientific skills collaboration example" width="600" />
78
+
79
+ </div>
80
+
81
+ More cases will be added under the `cases/` directory.
82
+
83
+ ## 🚀 Quick Start
84
+
85
+ ### Environment setup
86
+
87
+ 1. **Install dependencies**:
88
+
89
+ ```bash
90
+ pip install -r requirements.txt
91
+ ```
92
+
93
+ 2. **Docker setup**
94
+
95
+ The project provides official pre-built Docker images for fast deployment. There are two core images:
96
+
97
+ - **toolkits-api**: Main tool-service container (exposes API capabilities)
98
+ - **code-sandbox**: Code-execution sandbox image (created on demand by the service for isolated runs)
99
+
100
+ Execution-oriented tools (`execute_code`, `bash`) use **Docker-outside-of-Docker (DooD)**: by mounting the host Docker socket, the tool container talks to the host Docker daemon and creates isolated sandbox containers as needed.
101
+
102
+ **Image tags:**
103
+
104
+ ```text
105
+ ghcr.io/wenge-research/toolkits-api:v2.0.260403
106
+ ghcr.io/wenge-research/code-sandbox:v1.0.260403
107
+ ```
108
+
109
+ **Pull images:**
110
+
111
+ ```text
112
+ docker pull ghcr.io/wenge-research/toolkits-api:v2.0.260403
113
+ docker pull ghcr.io/wenge-research/code-sandbox:v1.0.260403
114
+ ```
115
+
116
+ **Run the container**
117
+
118
+ Mount `src/config.yaml`, the Docker socket (for sandbox execution), and optionally log and cache directories:
119
+
120
+ ```bash
121
+ docker run -d \
122
+ --name toolkits-api \
123
+ --network host \
124
+ -e API_PORT=8080 \
125
+ -e API_WORKERS=4 \
126
+ -e HOST_LOG_DIR=$(pwd)/logs \
127
+ -e SANDBOX_MODE=docker \
128
+ -e HTTP_PROXY=http://your-proxy:port \
129
+ -e HTTPS_PROXY=http://your-proxy:port \
130
+ -e PROXY_URL=http://your-proxy:port \
131
+ -v /etc/localtime:/etc/localtime:ro \
132
+ -v /etc/timezone:/etc/timezone:ro \
133
+ -v /var/run/docker.sock:/var/run/docker.sock \
134
+ -v $(pwd)/src/config.yaml:/app/src/config.yaml \
135
+ -v $(pwd)/logs:/app/logs \
136
+ -v $(pwd)/cache:/app/cache \
137
+ ghcr.io/wenge-research/toolkits-api:v2.0.260403
138
+ ```
139
+
140
+ **Parameter reference**
141
+
142
+ | Flag / env | Description |
143
+ |------|------|
144
+ | `-e API_PORT` | Listen port, default 8080 |
145
+ | `-e API_WORKERS` | Number of worker processes; tune for concurrency, default 1 |
146
+ | `-e SANDBOX_MODE=docker` | Enable Docker sandbox mode (otherwise subprocess) |
147
+ | `-e HOST_LOG_DIR` | Host log directory for sandbox mounts when Docker sandbox is enabled |
148
+ | `-e HTTP_PROXY / HTTPS_PROXY / PROXY_URL` | Proxy settings (optional) |
149
+ | `--network host` | Use if you rely on a proxy bound on the host (optional) |
150
+ | `-v /etc/localtime:/etc/localtime:ro` | Sync host timezone (read-only) |
151
+ | `-v /etc/timezone:/etc/timezone:ro` | Sync host timezone file (read-only) |
152
+ | `-v /var/run/docker.sock` | Required for Docker sandbox mode to schedule sandbox containers |
153
+ | `-v config.yaml` | Mount config (API keys, model and sandbox settings) |
154
+ | `-v logs` | Mount log directory (optional) |
155
+ | `-v cache` | Mount cache directory; structure mirrors `/app/cache` inside the container (optional) |
156
+
157
+
158
+ 3. **Configure the tool service URL**
159
+
160
+ Prefer JSON config or environment variables to override defaults. Avoid editing `utils/configs.py` directly.
161
+
162
+ **Option A (recommended): local JSON**
163
+
164
+ Copy from the example and edit locally:
165
+
166
+ ```bash
167
+ cp utils/config/config.example.json utils/config/config.local.json
168
+ ```
169
+
170
+ Set the tool service base URL in `utils/config/config.local.json`, for example:
171
+
172
+ ```json
173
+ {
174
+ "TOOLS_SERVER_BASE_ENDPOINT_URL": [
175
+ "http://127.0.0.1:8080"
176
+ ]
177
+ }
178
+ ```
179
+
180
+ **Option B: environment variables**
181
+
182
+ Point to a config file or override individual keys:
183
+
184
+ ```bash
185
+ export S1_DR_CONFIG_JSON="utils/config/config.local.json"
186
+ # or override TOOLS_SERVER_BASE_ENDPOINT_URL only
187
+ export TOOLS_SERVER_BASE_ENDPOINT_URL='["http://127.0.0.1:8080"]'
188
+ ```
189
+
190
+ 4. **API keys**
191
+
192
+ Prefer `utils/config/config.local.json` for provider keys, or mirror the same names with environment variables:
193
+
194
+ ```json
195
+ {
196
+ "AIHUBMIX_KEY": "<your_aihubmix_key>",
197
+ "AZURE_KEY": "<your_azure_key>",
198
+ "VOLCANO_KEY": "<your_volcano_key>",
199
+ "ALIYUN_KEY": "<your_aliyun_key>"
200
+ }
201
+ ```
202
+
203
+ Environment variables:
204
+
205
+ ```bash
206
+ export AIHUBMIX_KEY="<your_aihubmix_key>"
207
+ export AZURE_KEY="<your_azure_key>"
208
+ export VOLCANO_KEY="<your_volcano_key>"
209
+ export ALIYUN_KEY="<your_aliyun_key>"
210
+ ```
211
+
212
+ ### Single-query inference
213
+
214
+ ```python
215
+ import asyncio
216
+
217
+ from server.llm_api import LLMClient
218
+ from server.tool_api import return_all_tools
219
+ from inference.run_single_inference import run_one_query
220
+ from utils.prompts import DEEPRESEARCH_SYSTEM_PROMPT
221
+
222
+
223
+ async def main():
224
+ llm_client_urls = ["http://127.0.0.1:10777/v1/chat/completions"]
225
+ llm_client_models = ["S1-DeepResearch-32B"]
226
+ llm_client = LLMClient(llm_client_urls, llm_client_models)
227
+
228
+ all_tools = return_all_tools()
229
+
230
+ result = await run_one_query(
231
+ llm=llm_client,
232
+ user_query="阿里巴巴成立时,18位创始团队成员中,姓马、姓蔡、姓张的创始人的平均年龄,保留一位小数",
233
+ file_path=[],
234
+ system=DEEPRESEARCH_SYSTEM_PROMPT,
235
+ max_rounds=15,
236
+ temperature=0.4,
237
+ top_p=0.95,
238
+ extra_payload={},
239
+ debug=True,
240
+ all_tools=all_tools,
241
+ system_format="deep_research",
242
+ log_label="quick_start_single",
243
+ )
244
+
245
+ final_answer = result[-1]["final_answer"] if result else ""
246
+ print(final_answer)
247
+
248
+
249
+ if __name__ == "__main__":
250
+ asyncio.run(main())
251
+ ```
252
+
253
+ Notes:
254
+
255
+ - `file_path` must be a `list` in the current implementation (e.g. `[]` or `['/path/a.pdf']`).
256
+ - `system_format` options: `deep_research`, `azure`, `aihubmix`, `aihubmix_claude`, `aihubmix_glm`, `volcano`, `aliyun`.
257
+
258
+ ### Batch inference
259
+
260
+ Local / vLLM:
261
+
262
+ ```bash
263
+ cd inference
264
+ cp run_batch_inference_demo.sh run_batch_local.sh
265
+ # Edit run_batch_local.sh (LLM_CLIENT_URLS, LLM_CLIENT_MODELS, TEST_DATA_FILE, etc.)
266
+ bash run_batch_local.sh
267
+ ```
268
+
269
+ Hosted APIs:
270
+
271
+ ```bash
272
+ cd inference
273
+ cp run_batch_inference_online_demo.sh run_batch_online.sh
274
+ # Edit run_batch_online.sh (LLM_CLIENT_URLS, LLM_CLIENT_MODELS, SYSTEM_FORMAT, etc.)
275
+ bash run_batch_online.sh
276
+ ```
277
+
278
+ Logs:
279
+
280
+ ```bash
281
+ tail -f run_logs/*.log
282
+ ```
283
+
284
+ 📖 **[Advanced usage](./inference/README.md)**.
285
+
286
+ ## 🔭 Future Work
287
+
288
+ - **S1-DeepResearch Paper:** We expect to release the paper within about two weeks, covering data synthesis for the five capability areas, training and inference design, test-time scaling, and key evaluation takeaways.
289
+ - **S1-DeepResearch-VL:** In the first half of 2026, we plan to release **S1-DeepResearch-VL** with vision understanding and cross-modal reasoning for richer research-style tasks.
290
+
291
+ ## 📜 License
292
+
293
+ This project is licensed under the **[Apache License 2.0](./LICENSE)**.
294
+
295
+ ## Citation
296
+
297
+ If S1-DeepResearch is useful to your work, please consider citing:
298
+
299
+ ```bibtex
300
+ @software{s1deepresearch2026,
301
+ title={S1-DeepResearch: End-to-End Deep Research Models},
302
+ author={ScienceOne Team},
303
+ year={2026},
304
+ url={https://github.com/ScienceOne-AI/S1-DeepResearch},
305
+ }
306
+ ```
assets/benchmark_performance.png ADDED

Git LFS Details

  • SHA256: e321dc036928d58c2d0491fc86d3b368b903da2d8c9f04e2aaaae1e1873d4e09
  • Pointer size: 132 Bytes
  • Size of remote file: 1.36 MB
cases/case_DeepResearchIF_general_en_01.png ADDED

Git LFS Details

  • SHA256: 3464e28b961abc897e18b5936ff64205c3788db7e063880273e061c6c81b5e56
  • Pointer size: 132 Bytes
  • Size of remote file: 2.16 MB
cases/case_DeepResearchIF_general_zh_01.png ADDED

Git LFS Details

  • SHA256: 99b2338a3b7897b89f65a4351250ab795fbdc0652e839e92544009e6436b78c8
  • Pointer size: 132 Bytes
  • Size of remote file: 2.08 MB
cases/case_DeepResearchIF_science_en_01.png ADDED

Git LFS Details

  • SHA256: ad1fad3f60ba985fb39e0f11d7ad8aa1c3a9484b249ecbd09c6b0347ce696ebc
  • Pointer size: 132 Bytes
  • Size of remote file: 2.18 MB
cases/case_DeepResearchIF_science_zh_01.png ADDED

Git LFS Details

  • SHA256: 99b55c00fbf5ff3ec9ba4273e51299b0b215c5b5ac2db1a9d899e735390b1c54
  • Pointer size: 132 Bytes
  • Size of remote file: 2.4 MB
cases/case_deepresearch-report-writing_general_zh_01.png ADDED

Git LFS Details

  • SHA256: ca784be83c6940f8a3cf445e071938c37c6ec0d58c932d15aee6d8eca41f5c11
  • Pointer size: 132 Bytes
  • Size of remote file: 2.63 MB
cases/case_deepresearch-report-writing_science_en_01.png ADDED

Git LFS Details

  • SHA256: 29f2e2dfb62696329af323cac80c4c4b734ba5facc7372d5e4e17ddde3440da2
  • Pointer size: 132 Bytes
  • Size of remote file: 2.35 MB
cases/case_deepresearch-report-writing_science_zh_01.png ADDED

Git LFS Details

  • SHA256: c71665a8ef51a5ca3a5871e35dff6ac554e5dcdf0136a58947e8adb09e542a75
  • Pointer size: 132 Bytes
  • Size of remote file: 2.62 MB
cases/case_file-understanding-generation_general_en_01.png ADDED

Git LFS Details

  • SHA256: cea3755b7d5c696cf547b9212a3ad5665398a8318ac97d183381a4d2069b9bca
  • Pointer size: 132 Bytes
  • Size of remote file: 1.84 MB
cases/case_file-understanding-generation_science_zh_01.png ADDED

Git LFS Details

  • SHA256: 76845d578b3cbdd7be5334a22dfedca6d44180ef22d67b63ceaf97c7de766322
  • Pointer size: 132 Bytes
  • Size of remote file: 2.4 MB
cases/case_long-horizon-reasoning_general_en_01.png ADDED

Git LFS Details

  • SHA256: 18c574807d8a8551303aff231ccd173fe5cfcc48569cc1a3562abbbf448b3514
  • Pointer size: 132 Bytes
  • Size of remote file: 1.95 MB
cases/case_long-horizon-reasoning_general_en_02.png ADDED

Git LFS Details

  • SHA256: e35aea9449dc126867159be6fe7ba4bf86e9e30c0c59c006a8402fa0213ee67b
  • Pointer size: 132 Bytes
  • Size of remote file: 2.08 MB
cases/case_long-horizon-reasoning_general_zh_01.png ADDED

Git LFS Details

  • SHA256: de7da110e0ff422e5e781491ca0d9b11495ae3a8fde0537ff8271c82dd3f6b2f
  • Pointer size: 132 Bytes
  • Size of remote file: 2.19 MB
cases/case_long-horizon-reasoning_general_zh_02.png ADDED

Git LFS Details

  • SHA256: 9818e91488abb2567d3b39de9f2cd2a947128b7652b72b68b8cf749fa224ad5b
  • Pointer size: 132 Bytes
  • Size of remote file: 2.07 MB
cases/case_skills_science_en_01.png ADDED

Git LFS Details

  • SHA256: 31b8e4bf2026f880100bb80bd999b2e4320ccc6b162f9eca2c6565391f954520
  • Pointer size: 132 Bytes
  • Size of remote file: 2.05 MB
cases/case_skills_science_zh_01.png ADDED

Git LFS Details

  • SHA256: b2310377245f09ca0fd8912cafb0556347a705812cb7a22fed968f5ea73c7e98
  • Pointer size: 132 Bytes
  • Size of remote file: 2.24 MB
inference/README.md ADDED
@@ -0,0 +1,224 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 中文 | [English](./README_en.md)
2
+
3
+ # S1-deepresearch 推理框架
4
+
5
+ ## 核心特性
6
+
7
+ - **多 LLM 客户端**: 支持 vLLM、Azure OpenAI、AIHubMix 等多种 LLM 服务
8
+ - **丰富的工具集**: 提供 9 种工具,涵盖搜索、网页访问、文件解析、代码执行、多模态问答、bash 等
9
+ - **批量推理**: 支持并发批量推理,自动断点续传,定期保存结果
10
+ - **单条推理**: 支持单条查询的详细调试和测试
11
+ - **负载均衡**: 支持多 LLM 节点的负载均衡和一致性调度
12
+ - **详细日志**: 为每个查询生成独立的日志文件,便于问题追踪和分析
13
+
14
+ ## 项目结构(当前)
15
+
16
+ ```text
17
+ ./
18
+ ├── run_batch_inference_demo.sh # 本地/vLLM 脚本模板
19
+ ├── run_batch_inference_online_demo.sh # 在线平台脚本模板
20
+ ├── inference/
21
+ │ ├── run_batch_inference.py
22
+ │ └── run_single_inference.py
23
+ ├── server/
24
+ ├── tool_kits/
25
+ ├── utils/
26
+ │ └── config/
27
+ │ ├── config.example.json
28
+ │ └── README.md
29
+ ├── models/tokenizer/
30
+ └── test_all_tools.py
31
+ ```
32
+
33
+ ## 快速开始
34
+
35
+ ### 1. 安装依赖
36
+
37
+ ```bash
38
+ pip install -r requirements.txt
39
+ ```
40
+
41
+ ### 2. 配置(推荐 JSON 或环境变量)
42
+
43
+ 配置优先级:`自定义 JSON > 环境变量 > utils/config.py 默认值`。
44
+
45
+ 常用做法:
46
+
47
+ ```bash
48
+ cp utils/config/config.example.json utils/config/config.local.json
49
+ ```
50
+
51
+ 然后按需修改 `config.local.json`,例如:
52
+
53
+ - `TOOLS_SERVER_BASE_ENDPOINT_URL`
54
+ - `AIHUBMIX_KEY` / `AZURE_KEY` / `VOLCANO_KEY` / `ALIYUN_KEY`
55
+ - `CLIENT_TIMEOUT`
56
+
57
+ 也可以通过环境变量覆盖,例如:
58
+
59
+ ```bash
60
+ export S1_DR_CONFIG_JSON="utils/config/config.local.json"
61
+ ```
62
+
63
+ ### 3. 准备输入 JSONL
64
+
65
+ 输入文件每行一个 JSON。最少建议包含 `question`,通常同时包含 `id` 与 `file_path`。
66
+
67
+ #### 3.1 jsonl 示例(涉及文件输入)
68
+
69
+ ```json
70
+ {"id":"query_001","question":"阿里巴巴成立时,18位创始团队成员中,姓马、姓蔡、姓张的创始人的平均年龄,保留一位小数","file_path":[]}
71
+ {"id":"query_002","question":"阅读当前说明书,大疆发布的起飞重量最大的AIR系列无人机飞完半程马拉松,电池还剩多少毫安时的电能?(注1:假设水平无风,最低耗能的情况为最大航速的60%飞行;注2:耗电可以按最长飞行时间换算)","file_path":["/path/to/file.pdf"]}
72
+ ```
73
+
74
+ #### 3.2 jsonl 示例(涉及 Skill 使用)
75
+
76
+ ```json
77
+ {"id":"query_003","question":"Use pymatgen to build a simple TiO2 surface slab. Please generate a common low-index surface, report the Miller index, slab thickness, and vacuum size, and briefly describe the resulting surface structure.","skills":[{"name": "skill_name1", "description": "description1", "skill_path": "skill_path1"}, {"name": "skill_name2", "description": "description2", "skill_path": "skill_path2"}]}
78
+ ```
79
+
80
+ ## 推荐启动方式:复制脚本后运行
81
+
82
+ ### A. 本地 / vLLM(`run_batch_inference_demo.sh`)
83
+
84
+ ```bash
85
+ cp run_batch_inference_demo.sh run_batch_local.sh
86
+ mkdir -p run_logs
87
+ # 编辑 run_batch_local.sh 中的参数
88
+ bash run_batch_local.sh
89
+ ```
90
+
91
+ 说明:
92
+
93
+ - 脚本内部已使用 `nohup ... &` 启动 Python,会打印后台 PID。
94
+ - 查看日志:`tail -f run_logs/run.log`
95
+
96
+ ### B. 在线平台(`run_batch_inference_online_demo.sh`)
97
+
98
+ ```bash
99
+ cp run_batch_inference_online_demo.sh run_batch_online.sh
100
+ mkdir -p run_logs
101
+ # 编辑 run_batch_online.sh 中的参数
102
+ bash run_batch_online.sh
103
+ ```
104
+
105
+ 说明:
106
+
107
+ - 重点修改:`LLM_CLIENT_URLS`、`LLM_CLIENT_MODELS`、`SYSTEM_FORMAT`
108
+ - 查看日志:`tail -f run_logs/run_batch_*.log`
109
+
110
+ ## 脚本参数说明
111
+
112
+ ### 基础参数
113
+
114
+ - `LLM_CLIENT_URLS`:模型服务地址,多个地址用空格分隔(与模型列表一一对应)
115
+ - `LLM_CLIENT_MODELS`:模型名列表,多个模型用空格分隔
116
+ - `TEST_DATA_FILE`:输入 JSONL 路径
117
+ - `OUTPUT_FILE`:`ROLLOUT_NUM=1` 时的输出文件
118
+ - `OUTPUT_DIR`:`ROLLOUT_NUM>1` 时输出目录(生成 `rollout_01.jsonl` 等)
119
+ - `ROLLOUT_NUM`:每条样本重复推理次数
120
+ - `RESUME_FROM_FILE`:断点续跑文件(可空)
121
+ - `AVAILABLE_TOOLS`:启用工具列表(空格分隔)
122
+ - `TASK_TYPE`:是否按“输入仅文本”场景处理,默认 `input_only`
123
+
124
+ ### 推理控制参数
125
+
126
+ - `MAX_ROUNDS`:单 query 最大轮次
127
+ - `CONCURRENCY_WORKERS`:并发 worker 数
128
+ - `SAVE_BATCH_SIZE`:每处理多少条就自动落盘一次
129
+ - `TEMPERATURE`:采样温度
130
+ - `TOP_P`:top-p(`run_batch_inference_demo.sh` 已包含)
131
+ - `EXTRA_PAYLOAD`:额外模型 payload(JSON 字符串,`run_batch_inference_demo.sh` 已包含)
132
+ - `TIMEOUT_FOR_ONE_QUERY`:单 query 超时时间(秒)
133
+ - `LLM_API_RETRY_TIMES`:LLM 请求失败后的重试次数(不含首次)
134
+ - `SYSTEM_PROMPT`:自定义 system prompt;留空时使用内置默认 prompt
135
+ - `SYSTEM_FORMAT`:平台格式(主要在 `run_batch_inference_online_demo.sh`)
136
+
137
+ ### 上下文截断相关参数
138
+
139
+ - `DISCARD_ALL_MODE`:是否启用 discard-all(`true/false`)
140
+ - `MODEL_MAX_CONTEXT_TOKENS`:模型最大上下文长度
141
+ - `DISCARD_RATIO`:触发 discard 的比例阈值
142
+ - `TOKENIZER_PATH`:token 统计所用 tokenizer 路径
143
+
144
+ ### 日志参数
145
+
146
+ - `LOG_LABEL`:日志标签,目录形如 `logs/YYYY_MM_DD_<LOG_LABEL>/`
147
+ - `LOG_FILE`:脚本启动日志文件(`run_logs/*.log`)
148
+ - `LOGGING_ROOT`:日志根路径(`run_batch_inference_demo.sh` 已包含,可空)
149
+
150
+ ## `SYSTEM_FORMAT` 可选值
151
+
152
+ `SYSTEM_FORMAT` 将对应不同的平台处理逻辑,根据该关键词进入不同的处理分支。
153
+
154
+ - `deep_research`:本地 deep research 格式(vLLM 部署)
155
+ - `azure`:Azure OpenAI
156
+ - `aihubmix`:AIHubMix(OpenAI 兼容)
157
+ - `aihubmix_claude`:AIHubMix Claude 格式
158
+ - `aihubmix_glm`:AIHubMix GLM 格式
159
+ - `volcano`:火山引擎
160
+ - `aliyun`:阿里云百炼平台格式
161
+
162
+ ## 当前默认可用工具(9 个)
163
+
164
+ - `wide_search`:基于 Serp 进行通用网页搜索,支持一轮提交多个 query
165
+ - `scholar_search`:基于 Google Scholar 进行学术检索(+ web 结果)
166
+ - `image_search`:图片检索,支持多 query。
167
+ - `wide_visit`:访问网页并按目标 `goal` 产出摘要
168
+ - `file_wide_parse`:解析本地/在线文件(PDF、DOCX、MD、CSV等)
169
+ - `execute_code`:执行 Python 代码
170
+ - `ask_question_about_image`:图像理解与问答
171
+ - `ask_question_about_video`:视频理解与问答
172
+ - `bash`:执行 shell 脚本
173
+
174
+ 各工具对应的 schema 定义详见 utils/prompts.py 下的 `DEEPRESEARCH_SYSTEM_PROMPT`
175
+
176
+ ## 输出与日志
177
+
178
+ ### 输出 JSONL(字段详解)
179
+
180
+ `run_batch_inference.py` 写出的每行字段如下:
181
+
182
+ - `time_stamp`:该行结果写入时的时间戳(`YYYY-MM-DD HH:MM:SS`)。
183
+ - `query_id`:批处理层生成的 query 标识(基于 `question` 哈希)。
184
+ - `query`:本条输入的 `question` 文本。
185
+ - `result`:单个 segment 的详细结果对象(来自 `run_single_inference.py`)。
186
+ - `status`:任务状态,`success` / `timeout` / `error`。
187
+ - `discard_segments`:被 `discard-all` 截断并做 summary 的段数(不含最终段)。
188
+ - `elapsed_sec`:该 query 本次 rollout 的总耗时(秒)。
189
+ - `rollout_idx`:第几次 rollout(从 1 开始)。
190
+ - `src`:原始输入行完整内容(通常含 `id`、`question`、`file_path`、skills 等)。
191
+ - `segment_idx`:当前是第几个 segment(从 1 开始)。
192
+ - `segment_total`:该 query 共拆成多少个 segment。若无有效 `result`,会写成 `0`。
193
+
194
+ 其中 `result` 常见字段(`run_single_inference.py`):
195
+
196
+ - `query_id`:单次运行实例 ID(含时间后缀)。
197
+ - `tools`:本次启用的 tools schema(字符串形式)。
198
+ - `messages`:用于模型推理与工具交互的日志消息。
199
+ - `final_answer`:当前 segment 的回答文本。
200
+ - `transcript`:更完整的对话轨迹(含工具回填)。
201
+ - `rounds`:该 segment 执行到的轮数。
202
+ - `stopped_reason`:停止原因(如 `no_tool_calls`、`discard_all_01`、`discard_all_final`、`max_rounds_exceeded`)。
203
+ - `error`:仅在异常时可能出现。
204
+
205
+ ### 日志目录
206
+
207
+ 默认日志结构如下(`LOGGING_ROOT` 为空时):
208
+
209
+ ```text
210
+ logs/
211
+ └── YYYY_MM_DD_<LOG_LABEL>/
212
+ ├── collect.log
213
+ └── <query_id>/
214
+ ├── run.log
215
+ └── result.json
216
+ ```
217
+
218
+ ## 工具测试
219
+
220
+ 运行工具测试脚本:
221
+ ```bash
222
+ python test_all_tools.py
223
+ ```
224
+ 该脚本会测试所有注册的工具,验证其基本功能是否正常。
inference/README_en.md ADDED
@@ -0,0 +1,226 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [中文](./README.md) | English
2
+
3
+ # S1-DeepResearch Inference Framework
4
+
5
+ ## Key Features
6
+
7
+ - **Multiple LLM clients**: Supports vLLM, Azure OpenAI, AIHubMix, and other LLM services
8
+ - **Rich toolset**: Nine tools covering search, web browsing, file parsing, code execution, multimodal Q&A, bash, and more
9
+ - **Batch inference**: Concurrent batch inference with resume-from-checkpoint and periodic result saving
10
+ - **Single-query inference**: Detailed debugging and testing for individual queries
11
+ - **Load balancing**: Multi-node LLM load balancing and consistent scheduling
12
+ - **Detailed logging**: Per-query log files for easier troubleshooting and analysis
13
+
14
+ ## Project Layout (current)
15
+
16
+ ```text
17
+ ./
18
+ ├── run_batch_inference_demo.sh # Local / vLLM script template
19
+ ├── run_batch_inference_online_demo.sh # Online platform script template
20
+ ├── inference/
21
+ │ ├── run_batch_inference.py
22
+ │ └── run_single_inference.py
23
+ ├── server/
24
+ ├── tool_kits/
25
+ ├── utils/
26
+ │ └── config/
27
+ │ ├── config.example.json
28
+ │ └── README.md
29
+ ├── models/tokenizer/
30
+ └── test_all_tools.py
31
+ ```
32
+
33
+ ## Quick Start
34
+
35
+ ### 1. Install dependencies
36
+
37
+ ```bash
38
+ pip install -r requirements.txt
39
+ ```
40
+
41
+ ### 2. Configuration (JSON or environment variables recommended)
42
+
43
+ Precedence: **custom JSON > environment variables > defaults in `utils/config.py`**.
44
+
45
+ Typical workflow:
46
+
47
+ ```bash
48
+ cp utils/config/config.example.json utils/config/config.local.json
49
+ ```
50
+
51
+ Edit `config.local.json` as needed, for example:
52
+
53
+ - `TOOLS_SERVER_BASE_ENDPOINT_URL`
54
+ - `AIHUBMIX_KEY` / `AZURE_KEY` / `VOLCANO_KEY` / `ALIYUN_KEY`
55
+ - `CLIENT_TIMEOUT`
56
+
57
+ You can also override via environment variables, for example:
58
+
59
+ ```bash
60
+ export S1_DR_CONFIG_JSON="utils/config/config.local.json"
61
+ ```
62
+
63
+ ### 3. Prepare input JSONL
64
+
65
+ Each line is one JSON object. At minimum include `question`; usually also `id` and `file_path`.
66
+
67
+ #### 3.1 JSONL example (file inputs)
68
+
69
+ ```json
70
+ {"id":"query_001","question":"When Alibaba was founded, what was the average age of the founders whose surnames are Ma, Cai, or Zhang among the 18 co-founders? Round to one decimal place.","file_path":[]}
71
+ {"id":"query_002","question":"According to the manual, for DJI's heaviest AIR-series drone by takeoff weight, how many mAh of battery energy remain after flying half a marathon? (Note 1: assume calm air; minimum energy use is flying at 60% of max speed. Note 2: power draw can be converted from max flight time.)","file_path":["/path/to/file.pdf"]}
72
+ ```
73
+
74
+ #### 3.2 JSONL example (using Skills)
75
+
76
+ ```json
77
+ {"id":"query_003","question":"Use pymatgen to build a simple TiO2 surface slab. Please generate a common low-index surface, report the Miller index, slab thickness, and vacuum size, and briefly describe the resulting surface structure.","skills":[{"name": "skill_name1", "description": "description1", "skill_path": "skill_path1"}, {"name": "skill_name2", "description": "description2", "skill_path": "skill_path2"}]}
78
+ ```
79
+
80
+ ## Recommended workflow: copy a script, then run
81
+
82
+ ### A. Local / vLLM (`run_batch_inference_demo.sh`)
83
+
84
+ ```bash
85
+ cp run_batch_inference_demo.sh run_batch_local.sh
86
+ mkdir -p run_logs
87
+ # Edit parameters inside run_batch_local.sh
88
+ bash run_batch_local.sh
89
+ ```
90
+
91
+ Notes:
92
+
93
+ - The script starts Python with `nohup ... &` and prints the background PID.
94
+ - Tail logs: `tail -f run_logs/run.log`
95
+
96
+ ### B. Online platform (`run_batch_inference_online_demo.sh`)
97
+
98
+ ```bash
99
+ cp run_batch_inference_online_demo.sh run_batch_online.sh
100
+ mkdir -p run_logs
101
+ # Edit parameters inside run_batch_online.sh
102
+ bash run_batch_online.sh
103
+ ```
104
+
105
+ Notes:
106
+
107
+ - Focus on: `LLM_CLIENT_URLS`, `LLM_CLIENT_MODELS`, `SYSTEM_FORMAT`
108
+ - Tail logs: `tail -f run_logs/run_batch_*.log`
109
+
110
+ ## Script parameters
111
+
112
+ ### Basic
113
+
114
+ - `LLM_CLIENT_URLS`: Model service URLs, space-separated (paired with the model list)
115
+ - `LLM_CLIENT_MODELS`: Model names, space-separated
116
+ - `TEST_DATA_FILE`: Input JSONL path
117
+ - `OUTPUT_FILE`: Output file when `ROLLOUT_NUM=1`
118
+ - `OUTPUT_DIR`: Output directory when `ROLLOUT_NUM>1` (e.g. `rollout_01.jsonl`, …)
119
+ - `ROLLOUT_NUM`: Number of rollouts per sample
120
+ - `RESUME_FROM_FILE`: Resume checkpoint file (may be empty)
121
+ - `AVAILABLE_TOOLS`: Enabled tools, space-separated
122
+ - `TASK_TYPE`: Whether to treat input as text-only; default `input_only`
123
+
124
+ ### Inference control
125
+
126
+ - `MAX_ROUNDS`: Max rounds per query
127
+ - `CONCURRENCY_WORKERS`: Number of concurrent workers
128
+ - `SAVE_BATCH_SIZE`: Flush results to disk every N samples
129
+ - `TEMPERATURE`: Sampling temperature
130
+ - `TOP_P`: Top-p (included in `run_batch_inference_demo.sh`)
131
+ - `EXTRA_PAYLOAD`: Extra model payload (JSON string; included in `run_batch_inference_demo.sh`)
132
+ - `TIMEOUT_FOR_ONE_QUERY`: Per-query timeout (seconds)
133
+ - `LLM_API_RETRY_TIMES`: Retries after LLM failure (not counting the first attempt)
134
+ - `SYSTEM_PROMPT`: Custom system prompt; empty uses the built-in default
135
+ - `SYSTEM_FORMAT`: Platform format (mainly in `run_batch_inference_online_demo.sh`)
136
+
137
+ ### Context truncation
138
+
139
+ - `DISCARD_ALL_MODE`: Enable discard-all (`true`/`false`)
140
+ - `MODEL_MAX_CONTEXT_TOKENS`: Model max context length
141
+ - `DISCARD_RATIO`: Threshold ratio to trigger discard
142
+ - `TOKENIZER_PATH`: Path to tokenizer used for token counting
143
+
144
+ ### Logging
145
+
146
+ - `LOG_LABEL`: Log label; directory shape `logs/YYYY_MM_DD_<LOG_LABEL>/`
147
+ - `LOG_FILE`: Script log file under `run_logs/*.log`
148
+ - `LOGGING_ROOT`: Log root (set in `run_batch_inference_demo.sh`; may be empty)
149
+
150
+ ## `SYSTEM_FORMAT` values
151
+
152
+ `SYSTEM_FORMAT` selects platform-specific handling via keyword branches.
153
+
154
+ - `deep_research`: Local deep-research format (vLLM deployment)
155
+ - `azure`: Azure OpenAI
156
+ - `aihubmix`: AIHubMix (OpenAI-compatible)
157
+ - `aihubmix_claude`: AIHubMix Claude format
158
+ - `aihubmix_glm`: AIHubMix GLM format
159
+ - `volcano`: Volcano Engine
160
+ - `aliyun`: Alibaba Cloud Bailian format
161
+
162
+ ## Currently available tools (9)
163
+
164
+ - `wide_search`: General web search via Serp; multiple queries in one round
165
+ - `scholar_search`: Google Scholar academic search (+ web results)
166
+ - `image_search`: Image search; multiple queries supported
167
+ - `wide_visit`: Visit pages and summarize toward a `goal`
168
+ - `file_wide_parse`: Parse local/remote files (PDF, DOCX, MD, CSV, etc.)
169
+ - `execute_code`: Run Python code
170
+ - `ask_question_about_image`: Image understanding and Q&A
171
+ - `ask_question_about_video`: Video understanding and Q&A
172
+ - `bash`: Run shell commands
173
+
174
+ Tool schemas are defined in `DEEPRESEARCH_SYSTEM_PROMPT` in `utils/prompts.py`.
175
+
176
+ ## Outputs and logs
177
+
178
+ ### Output JSONL fields
179
+
180
+ Each line written by `run_batch_inference.py` contains:
181
+
182
+ - `time_stamp`: Write time for that row (`YYYY-MM-DD HH:MM:SS`).
183
+ - `query_id`: Batch-level query id (hash of `question`).
184
+ - `query`: This row’s `question` text.
185
+ - `result`: Detailed result object for one segment (from `run_single_inference.py`).
186
+ - `status`: `success` / `timeout` / `error`.
187
+ - `discard_segments`: Segments truncated by discard-all and summarized (excluding the final segment).
188
+ - `elapsed_sec`: Total seconds for this rollout of the query.
189
+ - `rollout_idx`: Rollout index (1-based).
190
+ - `src`: Full original input line (often includes `id`, `question`, `file_path`, skills, etc.).
191
+ - `segment_idx`: Current segment index (1-based).
192
+ - `segment_total`: Total segments for this query; `0` if there is no valid `result`.
193
+
194
+ Common fields inside `result` (`run_single_inference.py`):
195
+
196
+ - `query_id`: Single-run instance id (includes a time suffix).
197
+ - `tools`: Enabled tool schemas (string form).
198
+ - `messages`: Messages for model reasoning and tool interaction.
199
+ - `final_answer`: Answer text for this segment.
200
+ - `transcript`: Fuller trajectory (including tool returns).
201
+ - `rounds`: Rounds executed in this segment.
202
+ - `stopped_reason`: Why it stopped (e.g. `no_tool_calls`, `discard_all_01`, `discard_all_final`, `max_rounds_exceeded`).
203
+ - `error`: Present only on failure.
204
+
205
+ ### Log directories
206
+
207
+ Default layout when `LOGGING_ROOT` is empty:
208
+
209
+ ```text
210
+ logs/
211
+ └── YYYY_MM_DD_<LOG_LABEL>/
212
+ ├── collect.log
213
+ └── <query_id>/
214
+ ├── run.log
215
+ └── result.json
216
+ ```
217
+
218
+ ## Tool tests
219
+
220
+ Run the tool test script:
221
+
222
+ ```bash
223
+ python test_all_tools.py
224
+ ```
225
+
226
+ This exercises all registered tools and checks that basic behavior works.
inference/inference/run_batch_inference.py ADDED
@@ -0,0 +1,373 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import asyncio
3
+ import datetime
4
+ import json
5
+ import os
6
+ import sys
7
+ import time
8
+
9
+ from anyio import Path
10
+ from numpy._core.numerictypes import str_
11
+
12
+
13
+ # 获取项目根目录路径,并加入 sys.path
14
+ project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
15
+ sys.path.append(project_root)
16
+
17
+ from inference.run_single_inference import run_one_query
18
+ from server.llm_api import LLMClient
19
+ from server.tool_api import return_all_tools
20
+ from utils.configs import LITERATURE_SEED_DATA_DIR
21
+
22
+ from tqdm import tqdm # pyright: ignore[reportMissingModuleSource]
23
+ from typing import Any, Dict
24
+ from utils.common import _to_bool, get_query_uuid, load_jsonl
25
+ from utils.logger import setup_collect_logger
26
+ from utils.prompts import DEEPRESEARCH_SYSTEM_PROMPT
27
+ from utils.skill_prompt import build_skills_system_text, extract_skills_from_row, resolve_skill_source_dirs
28
+ from utils.build_prompt import build_openai_schema
29
+
30
def parse_args():
    """Parse command-line arguments for the batch inference script.

    Returns:
        argparse.Namespace holding every batch-inference setting: LLM endpoints
        and model names, data paths, tool selection, concurrency, sampling
        parameters, discard-all (context-compaction) options and logging paths.
    """
    parser = argparse.ArgumentParser(description="批量推理脚本")
    parser.add_argument("--llm_client_urls", type=str, nargs='+', default=["http://10.20.4.18:10777/vllm_generate"], help="vllm 远程挂载的模型 URL (可传多个,用空格分隔)")
    parser.add_argument("--llm_client_models", type=str, nargs='+', default=["LLM_CLIENT_NAME"], help="vllm 远程挂载的模型名称 (可传多个,用空格分隔)")
    parser.add_argument("--test_data_file", type=str, default="test_files/test.jsonl", help="测试需要生成答案的文件(.jsonl)")
    parser.add_argument("--available_tools", type=str, nargs="+", default=["web_search", "visit_url", "execute_code"], help="可用的tool名称(列表)")
    parser.add_argument("--resume_from_file", type=str, default="test_files/test_result_20251112.jsonl", help="已完成结果的本地文件(可选),自动跳过已完成样本")
    parser.add_argument("--concurrency_workers", type=int, default=10, help="并发进程数量")
    parser.add_argument("--save_batch_size", type=int, default=1, help="每得到多少条数据结果就存储一次")
    parser.add_argument("--rollout_num", type=int, default=1, help="每条数据的推理次数,每次推理结果以rollout_xx.jsonl命名保存到output_dir中")
    parser.add_argument("--max_rounds", type=int, default=100, help="与模型交互的最大轮数")
    parser.add_argument("--temperature", type=float, default=0.7, help="采样温度")
    parser.add_argument("--top_p", type=float, default=0.95, help="nucleus sampling 的 top_p 参数")
    parser.add_argument("--extra_payload", type=str, default="{}", help="额外的 payload 参数(JSON 字符串),如 '{\"presence_penalty\": 1.1}'")
    parser.add_argument("--timeout_for_one_query", type=int, default=7200, help="单个query最大执行时长(秒)")
    parser.add_argument("--llm_api_retry_times", type=int, default=2, help="LLM API 请求失败后的重试次数,不含首次请求")
    parser.add_argument("--output_file", type=str, default="test_files/test_result_today.jsonl", help="结果输出文件路径")
    parser.add_argument("--output_dir", type=str, default="test_files/output", help="结果输出目录路径(每个rollout结果以rollout_xx.jsonl保存)")
    parser.add_argument('--system_format', type=str, default="deep_research", help="采用什么模型的prompt拼接方式(默认用 deep_research 的)")
    parser.add_argument('--log_label', type=str, default="", help=f"log 路径加入自定义文本标记,同时也是附件类数据暂存附件的存储路径 {LITERATURE_SEED_DATA_DIR}/{{log_label}}")
    parser.add_argument('--system_prompt', type=str, default=None, help="自定义全局system_prompt的文件路径或字符串(默认用DEEPRESEARCH)")
    # BUG FIX: the original used action='store_true' together with default=True,
    # which made the flag a no-op (verbose was always True and could never be
    # disabled). BooleanOptionalAction keeps `--verbose` working exactly as
    # before and additionally accepts `--no-verbose` to turn it off.
    parser.add_argument('--verbose', action=argparse.BooleanOptionalAction, default=True, help="是否输出debug日志")
    parser.add_argument('--clean_files_copy_dir', action='store_true', default=False, help="执行完后是否删除files_copy_dir临时文件夹")
    parser.add_argument("--discard_all_mode", type=str, default="false", help="是否开启 discard-all 模式(true/false)")
    parser.add_argument("--model_max_context_tokens", type=int, default=128000, help="模型最大上下文长度")
    parser.add_argument("--discard_ratio", type=float, default=0.8, help="触发 discard 的上下文比例阈值")
    parser.add_argument("--tokenizer_path", type=str, default="models/tokenizer", help="用于 token 统计的 tokenizer 路径")
    parser.add_argument("--logging_root", type=str, default=None, help="用于自定义 log 存储路径")
    return parser.parse_args()
59
+
60
+ # Tool registry built once at module-import time (global) so each query avoids the lookup latency.
61
+ ALL_TOOLS = return_all_tools()
62
+
63
async def main_async(args):
    """Async entry point: run batch inference over a JSONL dataset.

    For every rollout this function:
      1. resumes from previously finished results (skipping completed queries),
      2. runs the remaining queries concurrently through ``run_one_query``
         with a per-query timeout,
      3. expands each query's multi-segment result into one output row per
         segment, and
      4. periodically persists progress to the rollout's JSONL output file.

    Args:
        args: argparse.Namespace produced by ``parse_args()``.
    """
    # --- logging
    logging_root = args.logging_root if args.logging_root else project_root
    logger, log_path = setup_collect_logger(logging_root, args.log_label)
    logger.info(f"[Collector] Script Start. Log file: {log_path}")

    # --------- argument handling & initialization ---------
    def abs_path_if_needed(path):
        # Resolve relative paths: "./x" or "../x" resolve against the CWD,
        # any other relative path is treated as project-root-relative.
        if not path:
            return path
        if not os.path.isabs(path):
            if path.startswith("./") or path.startswith("../"):
                return os.path.abspath(path)
            else:
                return os.path.join(project_root, path)
        return path

    llm_client_urls = args.llm_client_urls
    llm_client_models = args.llm_client_models
    test_data_file = abs_path_if_needed(args.test_data_file)
    available_tools = args.available_tools
    resume_from_file = abs_path_if_needed(args.resume_from_file)
    concurrency_workers = args.concurrency_workers
    save_batch_size = args.save_batch_size
    max_rounds = args.max_rounds
    temperature = args.temperature
    top_p = args.top_p
    _extra_raw = (args.extra_payload or "").strip()
    extra_payload = json.loads(_extra_raw if _extra_raw else "{}")
    system_format = args.system_format
    timeout_for_one_query = args.timeout_for_one_query
    llm_api_retry_times = max(0, args.llm_api_retry_times)
    output_file = abs_path_if_needed(args.output_file)
    output_dir = abs_path_if_needed(args.output_dir)
    rollout_num = args.rollout_num
    discard_all_mode = _to_bool(args.discard_all_mode)
    model_max_context_tokens = args.model_max_context_tokens
    discard_ratio = args.discard_ratio
    tokenizer_path = abs_path_if_needed(args.tokenizer_path)
    # Create the parent directory of output_file (and output_dir) if missing.
    output_dir_from_file = os.path.dirname(output_file)
    if output_dir_from_file and not os.path.exists(output_dir_from_file):
        logger.warning(f"[Save Dir created] Make the dir {output_dir_from_file}")
        os.makedirs(output_dir_from_file, exist_ok=True)
    if output_dir and not os.path.exists(output_dir):
        logger.warning(f"[Save Dir created] Make the dir {output_dir}")
        os.makedirs(output_dir, exist_ok=True)
    verbose = args.verbose
    # System-prompt precedence: CLI value (file path or literal) > default constant.
    if args.system_prompt:
        if os.path.isfile(args.system_prompt):
            with open(args.system_prompt, encoding="utf-8") as f:
                system_prompt = f.read()
        else:
            system_prompt = args.system_prompt
    else:
        system_prompt = DEEPRESEARCH_SYSTEM_PROMPT

    # Keep only the tools that were explicitly enabled on the CLI.
    selected_tools = {name: spec for name, spec in ALL_TOOLS.items() if name in available_tools}

    logger.info(f"[Selected_tools] {build_openai_schema(selected_tools)}")

    llm_client = LLMClient(
        llm_client_urls,
        llm_client_models,
        max_retries=llm_api_retry_times,
    )
    data_list = load_jsonl(test_data_file)  # load every pending inference row

    logger.info(f"Number of rollouts per query: {rollout_num}")
    logger.info(f"LLM API retry times: {llm_api_retry_times}")

    # Prepare one output file path per rollout.
    rollout_output_files = {}
    if rollout_num > 1:
        for rollout_idx in range(1, rollout_num + 1):
            rollout_output_file = os.path.join(output_dir, f"rollout_{rollout_idx:02d}.jsonl")
            rollout_output_files[rollout_idx] = rollout_output_file
            logger.info(f"Rollout {rollout_idx}: output_file={rollout_output_file}")
    else:
        rollout_output_files[1] = output_file

    # Directory holding per-run attachment copies; optionally removed at the end.
    files_copy_dir = None
    if args.log_label:
        files_copy_dir = f"data/{args.log_label}"

    # Process each rollout.
    for rollout_idx in range(1, rollout_num + 1):
        logger.info(f"{'='*50}")
        logger.info(f"Starting Rollout {rollout_idx}/{rollout_num}")
        logger.info(f"{'='*50}")

        rollout_output_file = rollout_output_files.get(rollout_idx, output_file)

        results = []
        finished_keys = set()

        def _get_finish_key_from_item(item: Dict[str, Any]) -> str:
            """Key used to decide whether a query has already been completed.

            Combines `id` and `question` so that datasets with dirty or
            duplicated ids are still deduplicated correctly.
            """
            if not isinstance(item, dict):
                return ""
            now_id = ""
            _id = item.get("id", None)
            if _id is not None and str(_id) != "":
                now_id += str(_id)
            q = item.get("question", None)
            if q is not None and str(q) != "":
                now_id += "__" + str(q)
            return now_id

        # --------- resume previously finished data ---------
        # Prefer args.resume_from_file; otherwise fall back to this rollout's
        # own output file (so an interrupted run can be restarted in place).
        resume_path = None
        if resume_from_file and os.path.isfile(resume_from_file):
            resume_path = resume_from_file
        elif os.path.isfile(rollout_output_file):
            resume_path = rollout_output_file

        if resume_path:
            logger.info(f"[Resume Rollout {rollout_idx}] Loading finished IDs from: {resume_path}")
            with open(resume_path, "r", encoding="utf-8") as f:
                for line in f:
                    try:
                        obj = json.loads(line)
                        src = obj.get("src") or {}
                        # Backward compatibility: older outputs may lack `id`
                        # inside src; fall back to obj["query"].
                        key = _get_finish_key_from_item(src)
                        if not key:
                            q = obj.get("query", None)
                            key = "" if q is None else str(q)
                        # Keep only rows that succeeded AND carry a final answer.
                        if key and obj.get("status", "") == "success" and obj.get("result", {}).get("final_answer"):
                            finished_keys.add(key)
                            results.append(obj)
                    except Exception:
                        continue
        # Keep only unfinished items.
        original_num = len(data_list)
        data_list_for_rollout = [item for item in data_list if _get_finish_key_from_item(item) not in finished_keys]
        logger.info(f"[Resume Rollout {rollout_idx}] Skipped {len(finished_keys)} finished items, {len(data_list_for_rollout)} remaining (total={original_num}).")

        if not data_list_for_rollout:
            logger.info(f"[Rollout {rollout_idx}] All queries already processed, skipping.")
            continue

        # --------- concurrency control ---------
        sem = asyncio.Semaphore(concurrency_workers)
        save_every = save_batch_size

        async def _worker(idx: int, item: Dict[str, Any], rollout_idx: int):
            """Run one query (one rollout) under the concurrency semaphore."""
            async with sem:
                query = item.get("question")
                # Renamed from `id` — the original shadowed the builtin; the
                # logged value is unchanged.
                item_id = item.get("id")
                query_id = get_query_uuid(str(query))
                file_path = item.get("file_path", "")

                # Normalize file_path into a list (accepts str, list or None).
                file_paths = []
                if isinstance(file_path, list):
                    file_paths = file_path
                elif isinstance(file_path, str) and file_path:
                    file_paths = [file_path]

                start = time.time()
                progress = {}
                # Extract the row's skill metadata.
                row_skills = extract_skills_from_row(item)
                # Build the "# Skill" section text for the system prompt.
                system_skill_text = build_skills_system_text(row_skills) if row_skills else None
                # Absolute paths of every referenced skill directory.
                skill_source_dirs = resolve_skill_source_dirs(row_skills, project_root) if row_skills else []
                try:
                    result = await asyncio.wait_for(
                        run_one_query(
                            llm=llm_client,
                            user_query=str(query),
                            file_path=file_paths,  # real paths; copied internally
                            system=system_prompt,
                            max_rounds=max_rounds,
                            temperature=temperature,
                            top_p=top_p,
                            extra_payload=extra_payload,
                            debug=verbose,
                            progress=progress,
                            all_tools=selected_tools,
                            system_format=system_format,
                            log_label=args.log_label,
                            # Set internally once query_id (the tools'
                            # conversation_id) is known.
                            file_prefix="",
                            discard_all_mode=discard_all_mode,
                            model_max_context_tokens=model_max_context_tokens,
                            discard_ratio=discard_ratio,
                            tokenizer_path=tokenizer_path,
                            logging_root=logging_root,
                            skill_source_dirs=skill_source_dirs,
                            system_skill_text=system_skill_text,
                        ),
                        timeout=timeout_for_one_query
                    )
                    status = "success"
                except asyncio.TimeoutError:
                    status = "timeout"
                    result = progress.get('result', [])
                    logger.error(f"[Timeout] id={item_id}, query_id={query_id}, elapsed={round(time.time() - start, 3)}s")
                except Exception as e:
                    status = "error"
                    result = progress.get('result', [])
                    if isinstance(result, list):
                        if not result:
                            result = [{"error": str(e)}]
                        else:
                            result[-1]["error"] = str(e)
                    logger.error(f"[Error] id={item_id}, query_id={query_id}, err={e}")
                elapsed = time.time() - start
                llm_client.pop_query_id(query_id)  # release this query from the client's load tracking
                logger.info(f"[Finish] id={item_id}, query_id={query_id}, status={status}, elapsed={round(elapsed,3)}s")
                discard_segments = sum(
                    1
                    for rr in (result or [])
                    if isinstance(rr, dict) and str(rr.get("stopped_reason", "")).startswith("discard_all_")
                    and rr.get("stopped_reason") != "discard_all_final"
                )
                return {
                    "time_stamp": datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                    "query_id": query_id,
                    "query": query,
                    "result": result,
                    "status": status,
                    # Count of summary fragments: segments that were force-truncated
                    # (no <answer>..</answer>), excluding the final segment.
                    "discard_segments": discard_segments,
                    "elapsed_sec": round(elapsed, 3),
                    "rollout_idx": rollout_idx,
                    "src": item,
                }

        # --------- dispatch tasks & tqdm progress ---------
        tasks = [asyncio.create_task(_worker(i, item, rollout_idx)) for i, item in enumerate(data_list_for_rollout)]
        pbar = tqdm(total=len(tasks), desc=f"Rollout {rollout_idx}/{rollout_num}", ncols=80)
        finished = 0

        # --------- consume results & periodic saving ---------
        for coro in asyncio.as_completed(tasks):
            r = await coro
            seg_results = r.get("result", [])
            if isinstance(seg_results, dict):
                seg_results = [seg_results]
            if not isinstance(seg_results, list):
                seg_results = []

            if not seg_results:
                row = dict(r)
                row["result"] = {}
                row["segment_idx"] = 1
                row["segment_total"] = 0
                results.append(row)
            else:
                total = len(seg_results)
                for seg_idx, seg in enumerate(seg_results, start=1):
                    row = dict(r)
                    row["result"] = seg
                    row["segment_idx"] = seg_idx  # 1-based segment index
                    row["segment_total"] = total  # total segments (incl. the final <answer> one)
                    results.append(row)

            finished += 1
            pbar.update(1)
            # Periodic save (rewrites the whole file so the output stays consistent).
            if rollout_output_file and save_every and finished % save_every == 0:
                with open(rollout_output_file, "w", encoding="utf-8") as f:
                    for rr in results:
                        f.write(json.dumps(rr, ensure_ascii=False) + "\n")
                logger.info(f"[AutoSave Rollout {rollout_idx}] Progress saved to: {rollout_output_file} ({finished}/{len(tasks)})")
        pbar.close()

        # --------- final save ---------
        if rollout_output_file:
            with open(rollout_output_file, "w", encoding="utf-8") as f:
                for r in results:
                    f.write(json.dumps(r, ensure_ascii=False) + "\n")
            logger.info(f"[Rollout {rollout_idx}] Wrote results to: {rollout_output_file}")

    logger.info(f"{'='*50}")
    logger.info(f"All {rollout_num} rollouts completed!")
    logger.info(f"{'='*50}")
    logger.info("[Collector] Script finished.")

    # ===== Final clean-up: remove the directory of copied attachment files =====
    if args.clean_files_copy_dir:  # only when explicitly enabled
        if files_copy_dir and os.path.exists(files_copy_dir):
            try:
                import shutil
                shutil.rmtree(files_copy_dir)
                logger.info(f"[Cleanup] Removed copied files directory: {files_copy_dir}")
            except Exception as e:
                logger.error(f"[Cleanup] Failed to remove directory {files_copy_dir}: {e}")
363
+
364
+
365
def main():
    """Synchronous CLI entry point: parse arguments and drive the async runner."""
    parsed_args = parse_args()
    try:
        asyncio.run(main_async(parsed_args))
    except KeyboardInterrupt:
        # Graceful exit on Ctrl-C instead of a traceback.
        print("Interrupted by user.")


if __name__ == "__main__":
    main()
inference/inference/run_single_inference.py ADDED
@@ -0,0 +1,354 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import copy
3
+ import logging
4
+ import sys
5
+ import os
6
+ import time
7
+ import datetime
8
+ import json
9
+
10
+
11
+
12
+ # 获取项目根目录路径,并加入 sys.path
13
+ PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
14
+ sys.path.append(PROJECT_ROOT)
15
+
16
+ from typing import Any, Dict, List, Tuple
17
+ from anyio import Path
18
+
19
+ from server.llm_api import LLMClient
20
+ from server.tool_api import return_all_tools
21
+ from server.tool_execution import execute_tool_call
22
+ from utils.configs import LITERATURE_SEED_DATA_DIR, ONLINE_PLATFORM
23
+
24
+ from utils.build_prompt import (
25
+ _build_summary_message,
26
+ build_initial_messages,
27
+ build_openai_schema,
28
+ build_tongyi_schema,
29
+ build_user_payload,
30
+ get_tools_json,
31
+ wrap_tool_responses_into_user_message,
32
+ )
33
+ from utils.common import _estimate_message_tokens, count_tokens, get_query_uuid
34
+ from utils.extract_schemas_nlp import extract_nlp_tool_calls # 更新工具读取逻辑
35
+ from utils.extract_schemas_online import extract_aihubmix_tool_calls # 新增 aihubmix 的工具读取逻辑
36
+ from utils.logger import save_result_to_log_dir, setup_logger_for_query # 新增:用于结果的json序列化
37
+ from utils.skill_prompt import normalize_skill_dir_path
38
+
39
# Executes one QA row from the input JSONL: a full multi-round tool-calling session.
async def run_one_query(
    llm: LLMClient,
    user_query: str,
    file_path: List,
    system: str,
    max_rounds: int,
    temperature: float,
    top_p: float = 0.95,
    extra_payload: dict = {},
    debug: bool = False,
    args=None,
    progress: dict = None,  # BUG FIX: was a mutable default `{}` that this function mutates, leaking state across calls
    all_tools: Dict = {},
    system_format: str = "deep_research",
    log_label: str = "",
    file_prefix: str = "",
    discard_all_mode: bool = False,
    model_max_context_tokens: int = 128000,
    discard_ratio: float = 0.8,
    tokenizer_path: str = "models/tokenizer",
    logging_root=None,
    skill_source_dirs: List[str] | None = None,
    system_skill_text: str | None = None,
    # Default system_format is "deep_research"; other models use system + tool-list-schema concatenation.
) -> List[Dict[str, Any]]:
    """
    Run a complete multi-step tool-calling session until the model stops calling tools
    or max_rounds is reached.

    Returns:
        A list of result dicts, one per "segment". When discard-all mode triggers,
        each truncated context becomes its own segment (stopped_reason
        ``discard_all_NN``); the final segment carries ``discard_all_final``,
        ``no_tool_calls`` or ``max_rounds_exceeded``. Each dict holds the
        query_id, tool schemas, messages, final_answer, transcript and rounds.
    """
    query_id = get_query_uuid(user_query) + "-" + datetime.datetime.now().strftime("%Y%m%d%H%M%S")

    project_root = logging_root if logging_root else PROJECT_ROOT

    logger, log_file_path = setup_logger_for_query(query_id, project_root, log_label)
    if debug:
        print(f"[run_single_inference.py] Logging configured. Query log file: {log_file_path}")

    def copy_file_to_docker_path(file_paths: List) -> List:
        """
        Check attachment existence and copy them into the docker-mounted path,
        returning only the bare file names.
        e.g.
          process: /test/test_file.jsonl -> /LITERATURE_SEED_DATA_DIR/log_label/query_id/test_file.jsonl
          returns: [test_file.jsonl]

        Skill directories are handled the same way: any existing skills/skill_name
        directory is copied under data/{log_label}/{query_id}/skills/.
        """
        import shutil  # hoisted: used by both the file copy and the skill-dir copy below

        file_names = []
        dest_dir = f"data/{log_label}/{query_id}"
        os.makedirs(dest_dir, exist_ok=True)
        for fp in file_paths:
            if fp:
                if os.path.isfile(fp):
                    # BUG FIX: the original messages interpolated `{id}` with no such
                    # local in scope, so the *builtin* function `id` was logged.
                    logger.info(f"[File load] Get file `{fp}` for query_id={query_id}")
                    dest_path = os.path.join(dest_dir, Path(fp).name)
                    try:
                        shutil.copy2(fp, dest_path)
                        logger.info(f"[File copy] Copied file to `{dest_path}`")
                    except Exception as e:
                        logger.error(f"[File copy error] Failed to copy file `{fp}` to `{dest_path}`: {e}")
                    file_names.append(Path(fp).name)
                else:
                    logger.warning(f"File not found for query_id={query_id}: expected {fp}")

        copied_skill_dirs = set()
        for skill_dir in (skill_source_dirs or []):
            if not skill_dir:
                continue
            abs_skill_dir = skill_dir if os.path.isabs(skill_dir) else os.path.join(PROJECT_ROOT, skill_dir)
            # Accept a direct SKILL.md path by stepping up to its directory.
            if os.path.isfile(abs_skill_dir) and os.path.basename(abs_skill_dir) == "SKILL.md":
                abs_skill_dir = os.path.dirname(abs_skill_dir)
            if not os.path.isdir(abs_skill_dir):
                logger.warning(f"[Skill copy] Skill abs_skill_dir not found: {abs_skill_dir}")
                continue
            normalized_skill_dir = normalize_skill_dir_path(abs_skill_dir)
            rel_after_skills = normalized_skill_dir
            if rel_after_skills.startswith("skills/"):
                rel_after_skills = rel_after_skills[len("skills/"):]
            if not rel_after_skills:
                logger.warning(f"[Skill copy] Skill rel_after_skills not found: {rel_after_skills}")
                # The skill does not exist, so skip it.
                continue
            dest_skill_dir = os.path.join(dest_dir, "skills", rel_after_skills)
            if dest_skill_dir in copied_skill_dirs:
                continue
            copied_skill_dirs.add(dest_skill_dir)
            try:
                shutil.copytree(abs_skill_dir, dest_skill_dir, dirs_exist_ok=True)
                logger.info(f"[Skill copy] Copied skill dir `{abs_skill_dir}` -> `{dest_skill_dir}`")
            except Exception as e:
                logger.error(f"[Skill copy error] Failed to copy skill dir `{abs_skill_dir}` -> `{dest_skill_dir}`: {e}")
        # Return [] when no files were copied, otherwise the list of file names.
        send_file_path = file_names if file_names else []

        return send_file_path

    # Copy attachments into the docker-mounted directory.
    file_path = copy_file_to_docker_path(file_path)
    # Prefix used by tools (ask_xxx, parse_file, ...) to locate the attachments.
    file_prefix = os.path.join(LITERATURE_SEED_DATA_DIR, log_label, query_id)

    messages = build_initial_messages(
        user_query,
        file_path,
        system=system,
        system_format=system_format,
        tool_mapping=all_tools,
        system_skill_text=system_skill_text,
    )
    processed_system_start = copy.deepcopy(messages[0])
    # First user-role message in messages (fallback: the raw query).
    processed_user_start = next((copy.deepcopy(msg) for msg in messages if msg.get('role') == 'user'), None)
    if processed_user_start is None:
        processed_user_start = {"role": "user", "content": user_query}
    transcript: List[Dict[str, Any]] = list(messages)  # shallow copy
    result_objs: List[Dict[str, Any]] = []
    discard_count = 0
    discard_threshold = int(model_max_context_tokens * discard_ratio)

    # For the JSON log: the undecorated system prompt plus the user payload.
    log_messages: List[Dict[str, Any]] = []
    log_messages.append({"role": "system", "content": system})  # raw system prompt, without dynamic additions
    log_messages.append({"role": "user", "content": build_user_payload(user_query, file_path, system_format)})

    round_idx = 1
    while round_idx <= max_rounds:
        # PERF FIX: the original tokenized log_messages twice per round; compute once.
        tmp_token_numbers = _estimate_message_tokens(log_messages, tokenizer_path)
        if discard_all_mode and tmp_token_numbers >= discard_threshold:
            print(f"当前 token 数量:{tmp_token_numbers} > 阈值:{discard_threshold}...")
            discard_count += 1
            summary_dict = await _build_summary_message(llm, messages, temperature, logger, query_id, system_format)
            # NOTE(review): the arguments payload below contains Python-literal
            # `True`, not JSON `true` — confirm downstream consumers never
            # json-parse this marker before changing it.
            discard_marker = {
                "role": "assistant",
                "content": '<tool_call>{"name": "new_context_tool", "arguments": {"begin_new_context": True}}</tool_call>',
            }
            discard_tool_result = {"role": "tool", "content": summary_dict['content'], "usage": summary_dict['usage']}
            discard_tool_result_for_transcript = {"role": "user", "content": f"{summary_dict['content']}", "usage": summary_dict['usage']}
            discard_follow_up = {"role": "assistant", "content": "Start new conversation to continue the task..."}
            discard_log_messages = copy.deepcopy(log_messages) + [discard_marker, discard_tool_result, discard_follow_up]
            # In the transcript the `tool` role is replaced by `user`.
            discard_transcript = copy.deepcopy(transcript) + [discard_marker, discard_tool_result_for_transcript, discard_follow_up]
            discard_result = {
                "query_id": query_id,
                "tools": get_tools_json(all_tools) if all_tools is not None else "[]",
                "messages": discard_log_messages,
                "final_answer": discard_follow_up["content"],
                "transcript": discard_transcript,
                "rounds": round_idx,
                "stopped_reason": f"discard_all_{discard_count:02d}",
            }
            result_objs.append(discard_result)
            if progress is not None:
                progress["result"] = copy.deepcopy(result_objs)
            # Fallback: the summary itself may have blown the context (empty
            # content) — restart from the original user query instead.
            summary_start = {"role": "user", "content": summary_dict['content'] if summary_dict['content'] else processed_user_start['content']}
            if not summary_dict['content']:
                logger.info(f"Summary failed due to exceeding max context length, fallback to user query: {processed_user_start['content']}")
            log_system_start = {"role": "system", "content": system}
            transcript = copy.deepcopy([processed_system_start, summary_start] if processed_system_start['role'] == 'system' else [summary_start])
            log_messages = copy.deepcopy([log_system_start, summary_start])
            # fix: messages must be reset too, otherwise its length keeps growing
            messages = copy.deepcopy([processed_system_start, summary_start] if processed_system_start['role'] == 'system' else [summary_start])
            # round_idx also restarts for the fresh context
            round_idx = 1
            continue

        llm_start = time.time()
        logger.info(f"[round {round_idx}] Round {round_idx} starting...")
        tool_call_ids = []  # tool-call id bookkeeping for online platforms
        # Different backends use different chat calls: tongyi-style needs no
        # tool_list in the message; aihubmix-style goes through aihubmix_chat.
        response = {}
        if system_format == "deep_research":
            response = await llm.chat(messages, temperature=temperature, top_p=top_p, extra_payload=extra_payload, logger=logger, query_id=query_id)
            assistant_text = response['content']
            usage = response['usage']
            # NOTE(review): this path indexes response['error'] directly while the
            # online path uses `'error' in response` — assumes llm.chat always
            # returns an 'error' key; confirm in LLMClient.
            if debug and response['error']:
                logger.info(f"[round {round_idx}] llm.chat error: {response['error']}")
            llm_elapsed_time = time.time() - llm_start
            # Some models pre-seed `<think>` server-side; restore it when missing.
            assistant_fix_prefix_think_text = assistant_text if assistant_text.lstrip().startswith("<think>") else "<think>\n" + assistant_text.lstrip()
            transcript.append({"role": "assistant", "content": assistant_fix_prefix_think_text, "elapsed_time": llm_elapsed_time, "usage": usage})
            log_messages.append({"role": "assistant", "content": assistant_text, "elapsed_time": llm_elapsed_time, "usage": usage})
            tool_calls = extract_nlp_tool_calls(assistant_text, file_prefix=file_prefix, prefix_mode="benchmark")
        elif system_format in ONLINE_PLATFORM:
            if system_format == "azure":
                response = await llm.azure_chat(messages, temperature=temperature, tool_list=build_openai_schema(all_tools), logger=logger, query_id=query_id)
            elif system_format in ["aihubmix", "aihubmix_claude"]:
                response = await llm.aihubmix_chat(messages, temperature=temperature, tool_list=build_openai_schema(all_tools), logger=logger, query_id=query_id)
            elif system_format in ["aihubmix_glm"]:
                response = await llm.aihubmix_chat(messages, temperature=temperature, tool_list=build_tongyi_schema(all_tools), logger=logger, query_id=query_id)
            elif system_format == "volcano":
                response = await llm.volcano_chat(messages, temperature=temperature, tool_list=build_tongyi_schema(all_tools), logger=logger, query_id=query_id)
            elif system_format == "aliyun":
                response = await llm.aliyun_chat(messages, temperature=temperature, tool_list=build_tongyi_schema(all_tools), logger=logger, query_id=query_id)

            usage = response['usage']
            messages = response['next_messages']  # OpenAI-style next messages sent back to the online platform
            now_log_messages = response['log_messages']  # local record incl. timings etc. (persisted to disk)
            tool_call_ids = response['tool_call_ids']
            meta_data = response['meta_data']  # manually assembled {"role": "...", "content": "..."}
            llm_elapsed_time = time.time() - llm_start

            if debug and 'error' in response:
                logger.info(f"[round {round_idx}] llm.chat error: {response['error']}")

            transcript.append({**meta_data, "elapsed_time": llm_elapsed_time, "usage": usage})
            log_messages.extend(now_log_messages)
            tool_calls = extract_aihubmix_tool_calls(meta_data['content'], all_tools, file_prefix=file_prefix, prefix_mode="benchmark")
            assistant_text = meta_data['content']
        else:
            raise ValueError(f"[system_format={system_format} failed] Please define a function to extract calls like `utils -> extract_schemas -> extract_nlp_tool_calls`")

        if debug:
            logger.info(f"[round {round_idx}] tool_calls: {tool_calls}")

        result_obj = {
            "query_id": query_id,
            "tools": get_tools_json(all_tools) if all_tools is not None else "[]",
            "messages": copy.deepcopy(log_messages),
            "final_answer": assistant_text,
            "transcript": copy.deepcopy(transcript),
            "rounds": round_idx,
            "stopped_reason": "no_tool_calls" if not tool_calls else None
        }

        # Update progress['result'] every round so callers can observe partial results.
        if progress is not None:
            progress['result'] = copy.deepcopy(result_objs + [result_obj])

        # No tool calls means the session is finished.
        if not tool_calls:
            logger.info("[run_one_query] Stopping: no tool calls in round %d", round_idx)
            result_obj["stopped_reason"] = "discard_all_final" if discard_count > 0 else "no_tool_calls"
            result_objs.append(result_obj)
            # Persist the final result to the logs directory.
            save_result_to_log_dir(query_id, result_objs, project_root, log_label)
            return result_objs

        # Execute all tool calls sequentially for this assistant turn (the model may emit multiple).
        responses: List[Tuple[str, str]] = []
        tool_total_time = 0.0
        for idx, call in enumerate(tool_calls):
            name = call.get("name")
            # Renamed from `args` — the original shadowed the function parameter.
            tool_args = call.get("arguments", {})
            tool_start = time.time()
            # conversation id matches the skill-upload path so tools can find the files
            resp = await execute_tool_call(name, tool_args, all_tools, logger, None, f"{log_label}/{query_id}")
            tool_elapsed = time.time() - tool_start
            tool_total_time += tool_elapsed
            responses.append(resp)
            # Log each tool response as a message.
            if system_format in ONLINE_PLATFORM:
                tool_response = {}
                if system_format in ["azure", "aihubmix"]:
                    tool_response = {
                        "type": "function_call_output",
                        "call_id": tool_call_ids[idx],  # online platforms bind output to the tool-call id
                        "output": resp[1],  # tool execution result
                    }
                elif system_format in ["aihubmix_claude"]:
                    tool_response = {
                        "role": "user",
                        "content": [{
                            "type": "tool_result",
                            "tool_use_id": tool_call_ids[idx],  # online platforms bind output to the tool-call id
                            "content": resp[1],  # tool execution result
                        }]
                    }
                elif system_format in ["volcano", "aihubmix_glm"]:
                    tool_response = {
                        "role": "tool",
                        "tool_call_id": tool_call_ids[idx],  # online platforms bind output to the tool-call id
                        "content": resp[1],  # tool execution result
                    }
                elif system_format in ["aliyun"]:
                    tool_response = {
                        "role": "tool",
                        "tool_call_id": tool_call_ids[idx],  # online platforms bind output to the tool-call id
                        "content": resp[1],  # tool execution result
                    }
                messages.append(copy.deepcopy(tool_response))
                tool_response["elapsed_time"] = tool_elapsed  # local-only field; must not be sent to the platform
                log_messages.append(tool_response)
            else:
                log_messages.append({"role": "tool", "content": resp[1], "elapsed_time": tool_elapsed})
        # Feed tool responses back as a single 'user' message (matching the template behavior).
        tool_user_msg = wrap_tool_responses_into_user_message(responses)
        if system_format not in ONLINE_PLATFORM:
            messages.extend([{"role": "assistant", "content": assistant_text}, tool_user_msg])

        transcript.extend([tool_user_msg])
        round_idx += 1

    # If we get here, we hit the max rounds without a clean finish.
    logger.info("[run_one_query] Max rounds (%d) exceeded for query: %s", max_rounds, user_query)

    result_obj = {
        "query_id": query_id,
        "tools": get_tools_json(all_tools) if all_tools is not None else "[]",
        "messages": copy.deepcopy(log_messages),
        "final_answer": transcript[-1]["content"] if transcript else "",
        "transcript": copy.deepcopy(transcript),
        "rounds": max_rounds,
        "stopped_reason": "discard_all_final" if discard_count > 0 else "max_rounds_exceeded"
    }
    if progress is not None:
        progress['result'] = copy.deepcopy(result_objs + [result_obj])
    result_objs.append(result_obj)
    # Persist the final result to the logs directory.
    save_result_to_log_dir(query_id, result_objs, project_root, log_label)
    return result_objs
inference/models/tokenizer/added_tokens.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</think>": 151668,
3
+ "</tool_call>": 151658,
4
+ "</tool_response>": 151666,
5
+ "<think>": 151667,
6
+ "<tool_call>": 151657,
7
+ "<tool_response>": 151665,
8
+ "<|box_end|>": 151649,
9
+ "<|box_start|>": 151648,
10
+ "<|endoftext|>": 151643,
11
+ "<|file_sep|>": 151664,
12
+ "<|fim_middle|>": 151660,
13
+ "<|fim_pad|>": 151662,
14
+ "<|fim_prefix|>": 151659,
15
+ "<|fim_suffix|>": 151661,
16
+ "<|im_end|>": 151645,
17
+ "<|im_start|>": 151644,
18
+ "<|image_pad|>": 151655,
19
+ "<|object_ref_end|>": 151647,
20
+ "<|object_ref_start|>": 151646,
21
+ "<|quad_end|>": 151651,
22
+ "<|quad_start|>": 151650,
23
+ "<|repo_name|>": 151663,
24
+ "<|video_pad|>": 151656,
25
+ "<|vision_end|>": 151653,
26
+ "<|vision_pad|>": 151654,
27
+ "<|vision_start|>": 151652
28
+ }
inference/models/tokenizer/chat_template.jinja ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
{#- ChatML / Qwen3-style chat template: renders an optional system prompt,
    tool signatures in <tools> tags, assistant <tool_call> blocks, tool
    responses wrapped as user turns, and <think> reasoning sections.
    All inserted comments use {#- -#} trims so rendered output is unchanged. -#}
{%- if tools %}
    {{- '<|im_start|>system\n' }}
    {%- if messages[0].role == 'system' %}
        {{- messages[0].content + '\n\n' }}
    {%- endif %}
    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
    {%- for tool in tools %}
        {{- "\n" }}
        {{- tool | tojson }}
    {%- endfor %}
    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
{%- else %}
    {%- if messages[0].role == 'system' %}
        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
    {%- endif %}
{%- endif %}
{#- Walk messages in reverse to find the index of the last "real" user turn,
    i.e. one that is not a wrapped <tool_response> payload. Reasoning content
    is only emitted for assistant turns after this index. -#}
{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
{%- for message in messages[::-1] %}
    {%- set index = (messages|length - 1) - loop.index0 %}
    {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
        {%- set ns.multi_step_tool = false %}
        {%- set ns.last_query_index = index %}
    {%- endif %}
{%- endfor %}
{#- Render each message; for assistant turns, reasoning may come from a
    dedicated reasoning_content field or be split out of inline <think> tags. -#}
{%- for message in messages %}
    {%- if message.content is string %}
        {%- set content = message.content %}
    {%- else %}
        {%- set content = '' %}
    {%- endif %}
    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
        {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
    {%- elif message.role == "assistant" %}
        {%- set reasoning_content = '' %}
        {%- if message.reasoning_content is string %}
            {%- set reasoning_content = message.reasoning_content %}
        {%- else %}
            {%- if '</think>' in content %}
                {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
                {%- set content = content.split('</think>')[-1].lstrip('\n') %}
            {%- endif %}
        {%- endif %}
        {%- if loop.index0 > ns.last_query_index %}
            {%- if loop.last or (not loop.last and reasoning_content) %}
                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
            {%- else %}
                {{- '<|im_start|>' + message.role + '\n' + content }}
            {%- endif %}
        {%- else %}
            {{- '<|im_start|>' + message.role + '\n' + content }}
        {%- endif %}
        {%- if message.tool_calls %}
            {%- for tool_call in message.tool_calls %}
                {%- if (loop.first and content) or (not loop.first) %}
                    {{- '\n' }}
                {%- endif %}
                {%- if tool_call.function %}
                    {%- set tool_call = tool_call.function %}
                {%- endif %}
                {{- '<tool_call>\n{"name": "' }}
                {{- tool_call.name }}
                {{- '", "arguments": ' }}
                {%- if tool_call.arguments is string %}
                    {{- tool_call.arguments }}
                {%- else %}
                    {{- tool_call.arguments | tojson }}
                {%- endif %}
                {{- '}\n</tool_call>' }}
            {%- endfor %}
        {%- endif %}
        {{- '<|im_end|>\n' }}
    {%- elif message.role == "tool" %}
        {#- Consecutive tool messages are merged into a single user turn. -#}
        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
            {{- '<|im_start|>user' }}
        {%- endif %}
        {{- '\n<tool_response>\n' }}
        {{- content }}
        {{- '\n</tool_response>' }}
        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
            {{- '<|im_end|>\n' }}
        {%- endif %}
    {%- endif %}
{%- endfor %}
{#- Open the assistant turn; an explicit empty think block disables reasoning. -#}
{%- if add_generation_prompt %}
    {{- '<|im_start|>assistant\n' }}
    {%- if enable_thinking is defined and enable_thinking is false %}
        {{- '<think>\n\n</think>\n\n' }}
    {%- endif %}
{%- endif %}
inference/models/tokenizer/config.json ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen3MoeForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "decoder_sparse_step": 1,
8
+ "dtype": "bfloat16",
9
+ "eos_token_id": 151645,
10
+ "head_dim": 128,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 2048,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 5472,
15
+ "max_position_embeddings": 131072,
16
+ "max_window_layers": 28,
17
+ "mlp_only_layers": [],
18
+ "model_type": "qwen3_moe",
19
+ "moe_intermediate_size": 768,
20
+ "norm_topk_prob": true,
21
+ "num_attention_heads": 32,
22
+ "num_experts": 128,
23
+ "num_experts_per_tok": 8,
24
+ "num_hidden_layers": 48,
25
+ "num_key_value_heads": 4,
26
+ "output_router_logits": false,
27
+ "pad_token_id": 151643,
28
+ "qkv_bias": false,
29
+ "rms_norm_eps": 1e-06,
30
+ "rope_scaling": null,
31
+ "rope_theta": 5000000,
32
+ "router_aux_loss_coef": 0.0,
33
+ "sliding_window": null,
34
+ "tie_word_embeddings": false,
35
+ "torch_dtype": "bfloat16",
36
+ "transformers_version": "4.51.3",
37
+ "use_cache": true,
38
+ "use_qk_norm": true,
39
+ "use_sliding_window": false,
40
+ "vocab_size": 151936
41
+ }
inference/models/tokenizer/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
inference/models/tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
inference/models/tokenizer/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aeb13307a71acd8fe81861d94ad54ab689df773318809eed3cbe794b4492dae4
3
+ size 11422654
inference/models/tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,239 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "151665": {
182
+ "content": "<tool_response>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": false
188
+ },
189
+ "151666": {
190
+ "content": "</tool_response>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": false
196
+ },
197
+ "151667": {
198
+ "content": "<think>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": false
204
+ },
205
+ "151668": {
206
+ "content": "</think>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": false
212
+ }
213
+ },
214
+ "additional_special_tokens": [
215
+ "<|im_start|>",
216
+ "<|im_end|>",
217
+ "<|object_ref_start|>",
218
+ "<|object_ref_end|>",
219
+ "<|box_start|>",
220
+ "<|box_end|>",
221
+ "<|quad_start|>",
222
+ "<|quad_end|>",
223
+ "<|vision_start|>",
224
+ "<|vision_end|>",
225
+ "<|vision_pad|>",
226
+ "<|image_pad|>",
227
+ "<|video_pad|>"
228
+ ],
229
+ "bos_token": null,
230
+ "clean_up_tokenization_spaces": false,
231
+ "eos_token": "<|im_end|>",
232
+ "errors": "replace",
233
+ "extra_special_tokens": {},
234
+ "model_max_length": 131072,
235
+ "pad_token": "<|endoftext|>",
236
+ "split_special_tokens": false,
237
+ "tokenizer_class": "Qwen2Tokenizer",
238
+ "unk_token": null
239
+ }
inference/models/tokenizer/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
inference/requirements.txt ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiofiles==24.1.0
2
+ aiohappyeyeballs==2.6.1
3
+ aiohttp==3.13.3
4
+ aiosignal==1.4.0
5
+ annotated-doc==0.0.4
6
+ annotated-types==0.7.0
7
+ anthropic==0.75.0
8
+ anyio==4.12.1
9
+ async-timeout==5.0.1
10
+ asyncer==0.0.12
11
+ asyncpg==0.31.0
12
+ attrs==25.4.0
13
+ azure-core==1.38.0
14
+ azure-storage-blob==12.28.0
15
+ beautifulsoup4==4.14.3
16
+ bidict==0.23.1
17
+ blinker==1.9.0
18
+ boto3==1.42.27
19
+ botocore==1.42.27
20
+ certifi==2026.1.4
21
+ cffi==2.0.0
22
+ chainlit==2.9.5
23
+ charset-normalizer==3.4.4
24
+ chevron==0.14.0
25
+ click==8.3.1
26
+ colorama==0.4.6
27
+ cryptography==46.0.3
28
+ cuid==0.4
29
+ dataclasses-json==0.6.7
30
+ Deprecated==1.3.1
31
+ distro==1.9.0
32
+ docstring_parser==0.17.0
33
+ fastapi==0.128.0
34
+ filelock==3.20.3
35
+ filetype==1.2.0
36
+ Flask==3.1.2
37
+ frozenlist==1.8.0
38
+ fsspec==2026.1.0
39
+ google==3.0.0
40
+ google-ai-generativelanguage==0.6.15
41
+ google-api-core==2.29.0
42
+ google-api-python-client==2.188.0
43
+ google-auth==2.47.0
44
+ google-auth-httplib2==0.3.0
45
+ google-cloud-core==2.5.0
46
+ google-cloud-storage==3.8.0
47
+ google-crc32c==1.8.0
48
+ google-generativeai==0.8.6
49
+ google-resumable-media==2.8.0
50
+ googleapis-common-protos==1.72.0
51
+ grpcio==1.76.0
52
+ grpcio-status==1.71.2
53
+ h11==0.16.0
54
+ hf-xet==1.2.0
55
+ httpcore==1.0.9
56
+ httplib2==0.31.2
57
+ httpx==0.28.1
58
+ httpx-sse==0.4.3
59
+ huggingface-hub==0.36.0
60
+ idna==3.11
61
+ importlib_metadata==8.7.1
62
+ inflection==0.5.1
63
+ isodate==0.7.2
64
+ itsdangerous==2.2.0
65
+ Jinja2==3.1.6
66
+ jiter==0.12.0
67
+ jmespath==1.0.1
68
+ jsonschema==4.26.0
69
+ jsonschema-specifications==2025.9.1
70
+ Lazify==0.4.0
71
+ literalai==0.1.201
72
+ MarkupSafe==3.0.3
73
+ marshmallow==3.26.2
74
+ mcp==1.25.0
75
+ multidict==6.7.0
76
+ mypy_extensions==1.1.0
77
+ numpy==2.2.6
78
+ openai==2.15.0
79
+ opentelemetry-api==1.39.1
80
+ opentelemetry-exporter-otlp-proto-common==1.39.1
81
+ opentelemetry-exporter-otlp-proto-grpc==1.39.1
82
+ opentelemetry-exporter-otlp-proto-http==1.39.1
83
+ opentelemetry-instrumentation==0.60b1
84
+ opentelemetry-instrumentation-agno==0.50.1
85
+ opentelemetry-instrumentation-alephalpha==0.50.1
86
+ opentelemetry-instrumentation-anthropic==0.50.1
87
+ opentelemetry-instrumentation-bedrock==0.50.1
88
+ opentelemetry-instrumentation-chromadb==0.50.1
89
+ opentelemetry-instrumentation-cohere==0.50.1
90
+ opentelemetry-instrumentation-crewai==0.50.1
91
+ opentelemetry-instrumentation-google-generativeai==0.50.1
92
+ opentelemetry-instrumentation-groq==0.50.1
93
+ opentelemetry-instrumentation-haystack==0.50.1
94
+ opentelemetry-instrumentation-lancedb==0.50.1
95
+ opentelemetry-instrumentation-langchain==0.50.1
96
+ opentelemetry-instrumentation-llamaindex==0.50.1
97
+ opentelemetry-instrumentation-logging==0.60b1
98
+ opentelemetry-instrumentation-marqo==0.50.1
99
+ opentelemetry-instrumentation-mcp==0.50.1
100
+ opentelemetry-instrumentation-milvus==0.50.1
101
+ opentelemetry-instrumentation-mistralai==0.50.1
102
+ opentelemetry-instrumentation-ollama==0.50.1
103
+ opentelemetry-instrumentation-openai==0.50.1
104
+ opentelemetry-instrumentation-openai-agents==0.50.1
105
+ opentelemetry-instrumentation-pinecone==0.50.1
106
+ opentelemetry-instrumentation-qdrant==0.50.1
107
+ opentelemetry-instrumentation-redis==0.60b1
108
+ opentelemetry-instrumentation-replicate==0.50.1
109
+ opentelemetry-instrumentation-requests==0.60b1
110
+ opentelemetry-instrumentation-sagemaker==0.50.1
111
+ opentelemetry-instrumentation-sqlalchemy==0.60b1
112
+ opentelemetry-instrumentation-threading==0.60b1
113
+ opentelemetry-instrumentation-together==0.50.1
114
+ opentelemetry-instrumentation-transformers==0.50.1
115
+ opentelemetry-instrumentation-urllib3==0.60b1
116
+ opentelemetry-instrumentation-vertexai==0.50.1
117
+ opentelemetry-instrumentation-watsonx==0.50.1
118
+ opentelemetry-instrumentation-weaviate==0.50.1
119
+ opentelemetry-instrumentation-writer==0.50.1
120
+ opentelemetry-proto==1.39.1
121
+ opentelemetry-sdk==1.39.1
122
+ opentelemetry-semantic-conventions==0.60b1
123
+ opentelemetry-semantic-conventions-ai==0.4.13
124
+ opentelemetry-util-http==0.60b1
125
+ pandas==2.3.3
126
+ pillow==12.1.0
127
+ propcache==0.4.1
128
+ proto-plus==1.27.0
129
+ protobuf==5.29.5
130
+ pyasn1==0.6.1
131
+ pyasn1_modules==0.4.2
132
+ pycparser==2.23
133
+ pydantic==2.12.5
134
+ pydantic-settings==2.12.0
135
+ pydantic_core==2.41.5
136
+ PyJWT==2.10.1
137
+ pyparsing==3.3.2
138
+ python-dotenv==1.2.1
139
+ python-engineio==4.13.0
140
+ python-multipart==0.0.21
141
+ python-socketio==5.16.0
142
+ pytz==2025.2
143
+ PyYAML==6.0.3
144
+ referencing==0.37.0
145
+ regex==2025.11.3
146
+ requests==2.32.5
147
+ rpds-py==0.30.0
148
+ rsa==4.9.1
149
+ s3transfer==0.16.0
150
+ safetensors==0.7.0
151
+ shellingham==1.5.4
152
+ simple-websocket==1.1.0
153
+ sniffio==1.3.1
154
+ soupsieve==2.8.3
155
+ sse-starlette==3.1.2
156
+ starlette==0.50.0
157
+ syncer==2.0.3
158
+ tenacity==9.1.2
159
+ tokenizers==0.22.2
160
+ tomli==2.4.0
161
+ tqdm==4.67.1
162
+ traceloop-sdk==0.50.1
163
+ transformers==4.57.3
164
+ typer-slim==0.21.1
165
+ typing-inspect==0.9.0
166
+ typing-inspection==0.4.2
167
+ tzdata==2025.3
168
+ uritemplate==4.2.0
169
+ urllib3==2.6.3
170
+ uvicorn==0.40.0
171
+ watchfiles==1.1.1
172
+ Werkzeug==3.1.5
173
+ wrapt==1.17.3
174
+ wsproto==1.3.2
175
+ yarl==1.22.0
176
+ zipp==3.23.0
inference/run_batch_inference_demo.sh ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Batch-inference launcher (local vLLM endpoints).
# Prints the run parameters, then starts inference/run_batch_inference.py
# in the background via nohup with stdout/stderr appended to LOG_FILE.

# Optional: override settings from `utils.config` via JSON / environment variables.
# Example 1: point at a JSON config file (highest priority)
# export S1_DR_CONFIG_JSON="utils/config/config.example.json"
#
# Example 2: override directly with environment variables
# export CLIENT_TIMEOUT=7200
# export USE_NLP_FORMAT_RETURN=true
# export MY_NEW_FLAG="demo_value"

# System prompt parameter
SYSTEM_PROMPT="" # empty -> the standard 9-tool system prompt

LLM_CLIENT_URLS="http://url:port/v1/chat/completions http://url:port/v1/chat/completions"
LLM_CLIENT_MODELS="model_name1 model_name1"

TEST_DATA_FILE="test.jsonl"
OUTPUT_FILE="test_results.jsonl"

# OUTPUT_DIR only takes effect for Pass@K (multi-rollout) runs.
# When ROLLOUT_NUM=1, OUTPUT_DIR is ignored; the single-run result goes to the jsonl path in OUTPUT_FILE.
# When ROLLOUT_NUM!=1, OUTPUT_FILE is ignored; each rollout is saved under OUTPUT_DIR as xxx_01.jsonl, xxx_02.jsonl, ...
OUTPUT_DIR=""
# Enable all 9 tools
AVAILABLE_TOOLS="wide_search scholar_search file_wide_parse execute_code wide_visit ask_question_about_image ask_question_about_video image_search bash"
ROLLOUT_NUM=1
RESUME_FROM_FILE=""

# LOGGING_ROOT: log root path; a `logs` subdirectory is created under it.
# E.g. when set to "/app", the log directory is "/app/logs".
LOGGING_ROOT=""
# LOG_LABEL: log tag; run logs go to a `logs/YYYY_MM_DD_<LOG_LABEL>/` subdirectory
# (combined with LOGGING_ROOT, this sits under the logs path above).
LOG_LABEL="test"
LOG_FILE="run_logs/run.log"

# TASK_TYPE only needs adjusting for attachment-related tasks; other inference
# tasks can keep the default and need not modify this parameter.
TASK_TYPE="input_only"
MAX_ROUNDS=100
CONCURRENCY_WORKERS=16
SAVE_BATCH_SIZE=10
TEMPERATURE=0.7
TOP_P=0.95
# Extra payload parameters (JSON string) passed through to the model API; add/remove fields as needed.
# Use an empty JSON object when no extras are needed: EXTRA_PAYLOAD='{}'
EXTRA_PAYLOAD='{"presence_penalty": 0.0}'
TIMEOUT_FOR_ONE_QUERY=3600
LLM_API_RETRY_TIMES=2
# discard-all mode: "false" here means the mode is disabled
DISCARD_ALL_MODE="false"
MODEL_MAX_CONTEXT_TOKENS=128000
DISCARD_RATIO=0.8
TOKENIZER_PATH="models/tokenizer"

# Ensure the log directory exists before the first redirect below; otherwise
# `> "$LOG_FILE"` fails when run_logs/ has not been created yet.
mkdir -p "$(dirname "$LOG_FILE")"

PARAM_INFO=$(
cat <<EOF
========== Run Parameters ==========
Start Time: $(date)
LLM_CLIENT_URLS: $LLM_CLIENT_URLS
LLM_CLIENT_MODELS: $LLM_CLIENT_MODELS
TEST_DATA_FILE: $TEST_DATA_FILE
OUTPUT_FILE: $OUTPUT_FILE
OUTPUT_DIR: $OUTPUT_DIR
AVAILABLE_TOOLS: $AVAILABLE_TOOLS
CONCURRENCY_WORKERS: $CONCURRENCY_WORKERS
SAVE_BATCH_SIZE: $SAVE_BATCH_SIZE
ROLLOUT_NUM: $ROLLOUT_NUM
MAX_ROUNDS: $MAX_ROUNDS
TEMPERATURE: $TEMPERATURE
TOP_P: $TOP_P
EXTRA_PAYLOAD: $EXTRA_PAYLOAD
TIMEOUT_FOR_ONE_QUERY: $TIMEOUT_FOR_ONE_QUERY
LLM_API_RETRY_TIMES: $LLM_API_RETRY_TIMES
DISCARD_ALL_MODE: $DISCARD_ALL_MODE
MODEL_MAX_CONTEXT_TOKENS: $MODEL_MAX_CONTEXT_TOKENS
DISCARD_RATIO: $DISCARD_RATIO
TOKENIZER_PATH: $TOKENIZER_PATH
RESUME_FROM_FILE: $RESUME_FROM_FILE
LOG_LABEL: $LOG_LABEL
TASK_TYPE: $TASK_TYPE
LOGGING_ROOT: $LOGGING_ROOT
SYSTEM_PROMPT: $SYSTEM_PROMPT
Shell PID: $$
====================================
EOF
)
echo "$PARAM_INFO"
echo "$PARAM_INFO" > "$LOG_FILE"

# Start Python in the background with nohup: stdout and stderr are appended to
# LOG_FILE; the process PID is then printed to the terminal and appended to LOG_FILE.
# When TASK_TYPE is input_only, --clean_files_copy_dir must be added to the command line.
if [ "$TASK_TYPE" = "input_only" ]; then
nohup python inference/run_batch_inference.py \
    --llm_client_urls $LLM_CLIENT_URLS \
    --llm_client_models $LLM_CLIENT_MODELS \
    --test_data_file "$TEST_DATA_FILE" \
    --output_file "$OUTPUT_FILE" \
    --output_dir "$OUTPUT_DIR" \
    --available_tools $AVAILABLE_TOOLS \
    --concurrency_workers $CONCURRENCY_WORKERS \
    --save_batch_size $SAVE_BATCH_SIZE \
    --rollout_num $ROLLOUT_NUM \
    --max_rounds $MAX_ROUNDS \
    --temperature $TEMPERATURE \
    --top_p $TOP_P \
    --extra_payload "$EXTRA_PAYLOAD" \
    --timeout_for_one_query $TIMEOUT_FOR_ONE_QUERY \
    --llm_api_retry_times $LLM_API_RETRY_TIMES \
    --discard_all_mode "$DISCARD_ALL_MODE" \
    --model_max_context_tokens $MODEL_MAX_CONTEXT_TOKENS \
    --discard_ratio $DISCARD_RATIO \
    --tokenizer_path "$TOKENIZER_PATH" \
    --resume_from_file "$RESUME_FROM_FILE" \
    --log_label "$LOG_LABEL" \
    --logging_root "$LOGGING_ROOT" \
    --system_prompt "$SYSTEM_PROMPT" \
    --verbose \
    --clean_files_copy_dir \
    >> "$LOG_FILE" 2>&1 &
else
nohup python inference/run_batch_inference.py \
    --llm_client_urls $LLM_CLIENT_URLS \
    --llm_client_models $LLM_CLIENT_MODELS \
    --test_data_file "$TEST_DATA_FILE" \
    --output_file "$OUTPUT_FILE" \
    --output_dir "$OUTPUT_DIR" \
    --available_tools $AVAILABLE_TOOLS \
    --concurrency_workers $CONCURRENCY_WORKERS \
    --save_batch_size $SAVE_BATCH_SIZE \
    --rollout_num $ROLLOUT_NUM \
    --max_rounds $MAX_ROUNDS \
    --temperature $TEMPERATURE \
    --top_p $TOP_P \
    --extra_payload "$EXTRA_PAYLOAD" \
    --timeout_for_one_query $TIMEOUT_FOR_ONE_QUERY \
    --llm_api_retry_times $LLM_API_RETRY_TIMES \
    --discard_all_mode "$DISCARD_ALL_MODE" \
    --model_max_context_tokens $MODEL_MAX_CONTEXT_TOKENS \
    --discard_ratio $DISCARD_RATIO \
    --tokenizer_path "$TOKENIZER_PATH" \
    --resume_from_file "$RESUME_FROM_FILE" \
    --log_label "$LOG_LABEL" \
    --logging_root "$LOGGING_ROOT" \
    --system_prompt "$SYSTEM_PROMPT" \
    --verbose \
    >> "$LOG_FILE" 2>&1 &
fi

PY_PID=$!
echo "Python running as PID: $PY_PID"
echo "Python running as PID: $PY_PID" >> "$LOG_FILE"
inference/run_batch_inference_online_demo.sh ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Batch-inference launcher for online (hosted) model platforms.
# Adds the missing shebang present in the sibling demo script, selects a
# provider via LLM_CLIENT_URLS / SYSTEM_FORMAT, and launches
# inference/run_batch_inference.py in the background via nohup.

# Optional: override settings from `utils.config` via JSON / environment variables.
# Example 1: point at a JSON config file (highest priority)
# export S1_DR_CONFIG_JSON="utils/config/config.example.json"
#
# Example 2: override directly with environment variables
# export CLIENT_TIMEOUT=7200
# export USE_NLP_FORMAT_RETURN=true
# export MY_NEW_FLAG="demo_value"

# GLM5
LLM_CLIENT_URLS="https://aihubmix.com/v1/chat/completions"
LLM_CLIENT_MODELS="glm-5"
SYSTEM_FORMAT="aihubmix_glm"

# # Azure GPT family
# LLM_CLIENT_URLS="https://<your_special_id>.openai.azure.com/openai/v1/"
# LLM_CLIENT_MODELS="gpt-5"
# SYSTEM_FORMAT="azure"

# # aihubmix GPT family
# LLM_CLIENT_URLS="https://aihubmix.com/v1"
# LLM_CLIENT_MODELS="gpt-5"
# SYSTEM_FORMAT="aihubmix"

# # aihubmix Claude family
# LLM_CLIENT_URLS="https://aihubmix.com/v1"
# LLM_CLIENT_MODELS="claude-3.5-sonnet"
# SYSTEM_FORMAT="aihubmix_claude"

# # Volcano Engine (Ark)
# LLM_CLIENT_URLS="https://ark.cn-beijing.volces.com/api/v3"
# LLM_CLIENT_MODELS="ep-xxx"
# SYSTEM_FORMAT="volcano"

# # Aliyun Bailian (DashScope compatible-mode) endpoint
# LLM_CLIENT_URLS="https://dashscope.aliyuncs.com/compatible-mode/v1"
# LLM_CLIENT_MODELS="qwen-plus"
# SYSTEM_FORMAT="aliyun"


TEST_DATA_FILE="test_files/test_one_query.jsonl"
OUTPUT_FILE="test_files/test_one_query_results.jsonl"

# OUTPUT_DIR only takes effect for Pass@K (multi-rollout) runs.
# When ROLLOUT_NUM=1, OUTPUT_DIR is ignored; the single-run result goes to the jsonl path in OUTPUT_FILE.
# When ROLLOUT_NUM!=1, OUTPUT_FILE is ignored; each rollout is saved under OUTPUT_DIR as xxx_01.jsonl, xxx_02.jsonl, ...
OUTPUT_DIR="run_logs/GAIA_0126/rollouts"
# Enable all 9 tools
AVAILABLE_TOOLS="wide_search scholar_search file_wide_parse execute_code wide_visit ask_question_about_image ask_question_about_video image_search bash"
ROLLOUT_NUM=1
RESUME_FROM_FILE=""
LOG_LABEL="glm-5"
LOG_FILE="run_logs/run_batch_glm-5.log"

SYSTEM_PROMPT="You are a deep research assistant. Your core function is to conduct thorough, multi-source investigations into any topic. You must handle both broad, open-domain inquiries and queries within specialized academic fields. For every request, synthesize information from credible, diverse sources to deliver a comprehensive, accurate, and objective response. When you have gathered sufficient information and are ready to provide the definitive response, you must enclose the entire final answer within <answer></answer> tags.

# Note

## General Rules

- The current working directory (cwd) is \`.\`. Treat the cwd as the project root.
- You are authorized to read, edit, or create files within this directory. **You must use relative paths** for all operations; absolute paths are strictly forbidden.

## Citation & Reference Policy

- User instructions always override this policy.
- If the response does not use external sources, do not include citations or references.
- External sources include web searches, user-uploaded files, or explicitly cited webpages.
- If external sources are used:
- For lightweight factual or real-time information (e.g., weather, simple lookups), include in-text citation only.
- For research, analysis, or document-based tasks
(e.g., using multiple external sources or any user-uploaded file),
include both in-text citations and a reference list.
- Reference lists are for source traceability only; do not introduce new information.
- For citation-only cases, keep responses concise and avoid research-style structuring.

Current date: $(date +"%Y-%m-%d")"

# TASK_TYPE only needs adjusting for attachment-related tasks; other inference
# tasks can keep the default and need not modify this parameter.
TASK_TYPE="input_only"
MAX_ROUNDS=100
CONCURRENCY_WORKERS=16
SAVE_BATCH_SIZE=10
TEMPERATURE=0.85
TIMEOUT_FOR_ONE_QUERY=3600
LLM_API_RETRY_TIMES=2
# discard-all mode: "false" here means the mode is disabled
DISCARD_ALL_MODE="false"
MODEL_MAX_CONTEXT_TOKENS=131072
DISCARD_RATIO=0.8
TOKENIZER_PATH="models/tokenizer"

# Ensure the log directory exists before the first redirect below; otherwise
# `> "$LOG_FILE"` fails when run_logs/ has not been created yet.
mkdir -p "$(dirname "$LOG_FILE")"

PARAM_INFO=$(
cat <<EOF
========== Run Parameters ==========
Start Time: $(date)
LLM_CLIENT_URLS: $LLM_CLIENT_URLS
LLM_CLIENT_MODELS: $LLM_CLIENT_MODELS
TEST_DATA_FILE: $TEST_DATA_FILE
OUTPUT_FILE: $OUTPUT_FILE
OUTPUT_DIR: $OUTPUT_DIR
AVAILABLE_TOOLS: $AVAILABLE_TOOLS
CONCURRENCY_WORKERS: $CONCURRENCY_WORKERS
SAVE_BATCH_SIZE: $SAVE_BATCH_SIZE
ROLLOUT_NUM: $ROLLOUT_NUM
MAX_ROUNDS: $MAX_ROUNDS
TEMPERATURE: $TEMPERATURE
TIMEOUT_FOR_ONE_QUERY: $TIMEOUT_FOR_ONE_QUERY
LLM_API_RETRY_TIMES: $LLM_API_RETRY_TIMES
DISCARD_ALL_MODE: $DISCARD_ALL_MODE
MODEL_MAX_CONTEXT_TOKENS: $MODEL_MAX_CONTEXT_TOKENS
DISCARD_RATIO: $DISCARD_RATIO
TOKENIZER_PATH: $TOKENIZER_PATH
RESUME_FROM_FILE: $RESUME_FROM_FILE
TASK_TYPE: $TASK_TYPE
LOG_LABEL: $LOG_LABEL
SYSTEM_FORMAT: $SYSTEM_FORMAT
Shell PID: $$
====================================
EOF
)
echo "$PARAM_INFO"
echo "$PARAM_INFO" > "$LOG_FILE"

# Start Python in the background with nohup: stdout and stderr are appended to
# LOG_FILE; the process PID is then printed to the terminal and appended to LOG_FILE.
# When TASK_TYPE is input_only, --clean_files_copy_dir must be added to the command line.
if [ "$TASK_TYPE" = "input_only" ]; then
nohup python inference/run_batch_inference.py \
    --llm_client_urls $LLM_CLIENT_URLS \
    --llm_client_models $LLM_CLIENT_MODELS \
    --test_data_file "$TEST_DATA_FILE" \
    --output_file "$OUTPUT_FILE" \
    --output_dir "$OUTPUT_DIR" \
    --available_tools $AVAILABLE_TOOLS \
    --concurrency_workers $CONCURRENCY_WORKERS \
    --save_batch_size $SAVE_BATCH_SIZE \
    --rollout_num $ROLLOUT_NUM \
    --max_rounds $MAX_ROUNDS \
    --temperature $TEMPERATURE \
    --timeout_for_one_query $TIMEOUT_FOR_ONE_QUERY \
    --llm_api_retry_times $LLM_API_RETRY_TIMES \
    --discard_all_mode "$DISCARD_ALL_MODE" \
    --model_max_context_tokens $MODEL_MAX_CONTEXT_TOKENS \
    --discard_ratio $DISCARD_RATIO \
    --tokenizer_path "$TOKENIZER_PATH" \
    --resume_from_file "$RESUME_FROM_FILE" \
    --log_label "$LOG_LABEL" \
    --system_format "$SYSTEM_FORMAT" \
    --system_prompt "$SYSTEM_PROMPT" \
    --verbose \
    --clean_files_copy_dir \
    >> "$LOG_FILE" 2>&1 &
else
nohup python inference/run_batch_inference.py \
    --llm_client_urls $LLM_CLIENT_URLS \
    --llm_client_models $LLM_CLIENT_MODELS \
    --test_data_file "$TEST_DATA_FILE" \
    --output_file "$OUTPUT_FILE" \
    --output_dir "$OUTPUT_DIR" \
    --available_tools $AVAILABLE_TOOLS \
    --concurrency_workers $CONCURRENCY_WORKERS \
    --save_batch_size $SAVE_BATCH_SIZE \
    --rollout_num $ROLLOUT_NUM \
    --max_rounds $MAX_ROUNDS \
    --temperature $TEMPERATURE \
    --timeout_for_one_query $TIMEOUT_FOR_ONE_QUERY \
    --llm_api_retry_times $LLM_API_RETRY_TIMES \
    --discard_all_mode "$DISCARD_ALL_MODE" \
    --model_max_context_tokens $MODEL_MAX_CONTEXT_TOKENS \
    --discard_ratio $DISCARD_RATIO \
    --tokenizer_path "$TOKENIZER_PATH" \
    --resume_from_file "$RESUME_FROM_FILE" \
    --log_label "$LOG_LABEL" \
    --system_format "$SYSTEM_FORMAT" \
    --system_prompt "$SYSTEM_PROMPT" \
    --verbose \
    >> "$LOG_FILE" 2>&1 &
fi

PY_PID=$!
echo "Python running as PID: $PY_PID"
echo "Python running as PID: $PY_PID" >> "$LOG_FILE"
inference/server/llm_api.py ADDED
@@ -0,0 +1,665 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ from copy import deepcopy
3
+ import json
4
+ import copy
5
+ from typing import Awaitable, Callable, Dict, List
6
+ import requests
7
+ import aiohttp
8
+ from openai import AsyncAzureOpenAI, AsyncOpenAI, OpenAI
9
+ import random
10
+ from collections import defaultdict
11
+
12
+ from utils.common import reorder_keys
13
+ from utils.configs import AIHUBMIX_KEY, ALIYUN_KEY, AZURE_KEY, CLIENT_TIMEOUT, VOLCANO_KEY
14
+
15
+ class LLMClient:
16
+ """
17
+ 调用远端启动的 vllm 接口
18
+ """
19
+
20
+ def __init__(
21
+ self,
22
+ url: List,
23
+ model_names: List,
24
+ client_timeout: int | float | None = None,
25
+ api_keys: dict | None = None,
26
+ max_retries: int = 0,
27
+ ):
28
+ self.base_urls = url
29
+ self.model_names = model_names
30
+ self.client_timeout = client_timeout or CLIENT_TIMEOUT
31
+ self.max_retries = max(0, int(max_retries))
32
+ self.retry_backoff_seconds = 30.0
33
+ self.api_keys = api_keys or {
34
+ "aihubmix": AIHUBMIX_KEY,
35
+ "azure": AZURE_KEY,
36
+ "volcano": VOLCANO_KEY,
37
+ "aliyun": ALIYUN_KEY,
38
+ }
39
+ # 优化的路由分配结构:
40
+ # 用一个 dict 记录 query_id => url(一一绑定,方便直接查找 query_id 所属 url,而不用遍历所有列表)
41
+ self.queryid_to_url: Dict[str, str] = {}
42
+ # 统计每个 url 当前负载(每个 url 被分配了多少个 query_id),直接用 defaultdict(int)
43
+ self.url_load: Dict[str, int] = defaultdict(int)
44
+ for u in self.base_urls:
45
+ self.url_load[u] = 0
46
+
47
+ def pop_query_id(self, query_id: str):
48
+ """
49
+ 将 query 弹出 url 记录表
50
+ """
51
+ url = self.queryid_to_url.pop(query_id, None)
52
+ if url is not None:
53
+ if url in self.url_load and self.url_load[url] > 0:
54
+ self.url_load[url] -= 1
55
+
56
+ def allocate_url_by_query_id(self, query_id: str, logger = None) -> str:
57
+ # 已有绑定
58
+ if query_id in self.queryid_to_url:
59
+ return self.queryid_to_url[query_id]
60
+ # 分配给当前负载最小的 url
61
+
62
+ min_load_url = min(self.url_load.items(), key=lambda x: x[1])[0]
63
+ self.queryid_to_url[query_id] = min_load_url
64
+ self.url_load[min_load_url] += 1
65
+ if logger:
66
+ logger.info(f"[vllm allocate] {query_id} allocated to {min_load_url}, Running: {self.url_load[min_load_url]} reqs")
67
+ return min_load_url
68
+
69
+ async def _run_with_retry(
70
+ self,
71
+ request_name: str,
72
+ request_coro_factory: Callable[[], Awaitable[dict]],
73
+ logger = None,
74
+ query_id: str = "",
75
+ ) -> dict:
76
+ total_attempts = self.max_retries + 1
77
+ last_error: Exception | None = None
78
+ query_suffix = f", query_id={query_id}" if query_id else ""
79
+
80
+ for attempt in range(1, total_attempts + 1):
81
+ if logger is not None and attempt > 1:
82
+ logger.info(
83
+ "[llm retry] %s retry attempt %d/%d started%s",
84
+ request_name,
85
+ attempt,
86
+ total_attempts,
87
+ query_suffix,
88
+ )
89
+ try:
90
+ result = await request_coro_factory()
91
+ if isinstance(result, dict) and result.get("error"):
92
+ raise RuntimeError(str(result["error"]))
93
+ if logger is not None and attempt > 1:
94
+ logger.info(
95
+ "[llm retry] %s attempt %d/%d succeeded%s",
96
+ request_name,
97
+ attempt,
98
+ total_attempts,
99
+ query_suffix,
100
+ )
101
+ return result
102
+ except Exception as exc:
103
+ last_error = exc
104
+ if logger is not None:
105
+ logger.warning(
106
+ "[llm retry] %s attempt %d/%d failed%s: %s",
107
+ request_name,
108
+ attempt,
109
+ total_attempts,
110
+ query_suffix,
111
+ exc,
112
+ )
113
+ if attempt >= total_attempts:
114
+ break
115
+ retry_delay = self.retry_backoff_seconds * attempt
116
+ if logger is not None:
117
+ logger.info(
118
+ "[llm retry] %s will retry in %.1fs%s",
119
+ request_name,
120
+ retry_delay,
121
+ query_suffix,
122
+ )
123
+ await asyncio.sleep(retry_delay)
124
+
125
+ if last_error is None:
126
+ raise RuntimeError(f"{request_name} failed without an explicit error{query_suffix}")
127
+ raise last_error
128
+
129
    async def chat(self, messages: List[Dict[str, str]], tool_list = [], temperature=0.7, top_p=0.95, extra_payload: dict = {}, logger= None, query_id = "") -> dict:
        """
        Call a remote vllm-style /chat/completions endpoint asynchronously.

        aiohttp is used instead of requests so the coroutine does not block
        the event loop while waiting on the network (a synchronous
        ``requests.post`` inside ``async def`` would stall every sibling
        coroutine). ``extra_payload`` may carry any additional payload fields
        (e.g. ``presence_penalty``) and overrides the defaults above.

        Returns a dict with ``content``, ``usage`` and ``error`` — ``error``
        is an empty string on success; on failure ``content`` is empty and the
        usage token counts are -1 sentinels.
        """
        payload = {
            "messages": messages,
            "temperature": temperature,
            "top_p": top_p,
        }
        payload.update(extra_payload)
        if len(tool_list) > 0:
            payload['tools'] = tool_list

        # Choose a URL: sticky, load-balanced routing per query_id when one is
        # given, otherwise a random endpoint.
        if query_id:
            chosen_url = self.allocate_url_by_query_id(query_id, logger)
        else:
            chosen_url = random.choice(self.base_urls)

        chosen_idx = self.base_urls.index(chosen_url)
        choose_model = self.model_names[chosen_idx]

        payload['model'] = choose_model  # vllm compatibility: model name must travel in the payload

        resp_json = None

        async def _request_once() -> dict:
            # Single HTTP attempt; retried by _run_with_retry below.
            nonlocal resp_json
            async with aiohttp.ClientSession() as session:
                async with session.post(chosen_url, json=payload, timeout=self.client_timeout) as resp:
                    resp.raise_for_status()
                    resp_json = await resp.json()
                    return {
                        "content": resp_json['choices'][0]['message']['content'],
                        "usage": resp_json['usage'],
                        "error": ""
                    }

        try:
            return await self._run_with_retry(
                request_name=f"chat url={chosen_url} model={choose_model}",
                request_coro_factory=_request_once,
                logger=logger,
                query_id=query_id,
            )
        except Exception as e:
            # Best-effort: log whatever the last response body was, if any.
            try:
                if logger is not None:
                    logger.info("[vllm response] %s", resp_json)
            except:
                pass
            # Sentinel usage (-1 token counts) marks a failed call for callers.
            return {
                "content": "",
                "usage": {
                    'completion_tokens': -1,
                    'prompt_tokens': -1,
                    'prompt_tokens_details': None,
                    'total_tokens': -1
                },
                "error": str(e)
            }
193
+
194
    async def _call_openai_chat(self,
                            raw_messages: List[Dict[str, str]],
                            tool_list = [],
                            temperature=0.7,
                            top_p=0.95,
                            logger = None,
                            api_key=None,
                            query_id: str = "") -> dict:
        """
        Call an OpenAI Responses-API compatible endpoint and normalize the
        result into this project's message format.

        Reasoning summaries, assistant text and tool calls are flattened into
        a single assistant message wrapped in <think>...</think> and
        <tool_call>...</tool_call> markers. Returns a dict with
        next_messages / log_messages / meta_data / tool_call_ids / usage,
        plus an "error" field on failure.

        NOTE(review): temperature/top_p are accepted but never forwarded to
        client.responses.create — confirm whether that is intentional.
        """
        idx = random.randrange(len(self.base_urls))
        chosen_url = self.base_urls[idx]
        chosen_model = self.model_names[idx]

        if 'claude' in chosen_model or 'glm' in chosen_model:
            # Route to the raw-HTTP call path (Volcano-engine compatible);
            # idx is passed through so the same model/URL pair stays selected.
            return await self._call_request_chat(raw_messages, tool_list, temperature, top_p, logger, api_key, idx, query_id)

        client = OpenAI(
            base_url = chosen_url,
            api_key = api_key,
        )

        meta_data = {
            "role": "assistant",
            "content": ""
        }
        tool_call_ids = []
        response_json = None
        messages = copy.deepcopy(raw_messages)

        # The Responses API expects user text blocks typed as 'input_text'.
        for msg in messages:
            if isinstance(msg, dict) and msg.get('role') == 'user' and isinstance(msg.get('content'), list):
                for item in msg['content']:
                    if isinstance(item, dict) and item.get('type') == 'text':
                        item['type'] = 'input_text'

        async def _request_once() -> dict:
            nonlocal response_json, meta_data, tool_call_ids

            # Reset per-attempt state so a retry does not accumulate stale data.
            tool_call_ids = []
            meta_data = {
                "role": "assistant",
                "content": ""
            }
            loop = asyncio.get_event_loop()
            if chosen_model in ["gpt-4.1", "gpt-4o"]:
                func = lambda: client.responses.create(
                    input=messages,
                    model=chosen_model,
                    tools=tool_list
                )
            else:
                # Reasoning-capable models also request a detailed summary.
                func = lambda: client.responses.create(
                    input=messages,
                    model=chosen_model,
                    tools=tool_list,
                    reasoning={'effort': 'medium', 'summary': 'detailed'}
                )
            try:
                # The OpenAI SDK call is synchronous; run it in a worker thread
                # and bound it with the client timeout.
                response = await asyncio.wait_for(
                    loop.run_in_executor(None, func),
                    timeout=self.client_timeout
                )
            except Exception as run_executor_exc:
                print(f"[client error] {run_executor_exc}")
                raise

            response_json = response.model_dump()

            next_messages = messages + response.output

            summary_list = []
            answer_content_list = []

            tool_calls = ""
            for msg in response_json['output']:
                if msg['type'] == 'reasoning':
                    summary_items = msg.get("summary", [])
                    summary_list.extend(s for s in summary_items if s.get("type") == "summary_text")
                elif msg['type'] == 'function_call':
                    now_tool_call = {
                        "name": msg['name'],
                        "arguments": json.loads(msg['arguments'])
                    }
                    tool_call_ids.append(msg['call_id'])
                    tool_calls += "<tool_call>\n" + json.dumps(now_tool_call, ensure_ascii=False) + "\n</tool_call>\n"
                elif msg['type'] == 'message':
                    for block in msg.get("content", []):
                        if block.get("type") == "output_text":
                            answer_content_list.append(block.get("text", "").strip())

            reasoning_content = "\n".join([i.get('text', "") for i in summary_list if i.get("text", "")]).strip()
            content = "\n".join(answer_content_list).strip()
            tool_calls = tool_calls.strip()
            # Flatten reasoning + answer + tool calls into one tagged string.
            meta_data_content = ""
            meta_data_content += "<think>\n"
            meta_data_content += f"{reasoning_content}\n</think>" if reasoning_content else "</think>"
            meta_data_content += f"\n{content}"
            meta_data_content += f"\n" if content else ""
            meta_data_content += f"{tool_calls}" if tool_calls else ""

            meta_data['content'] = meta_data_content

            return {
                "next_messages": next_messages,
                "log_messages": [reorder_keys(rep) for rep in response_json['output']],
                "meta_data": meta_data,
                "tool_call_ids": tool_call_ids,
                "usage": response_json['usage'],
            }

        try:
            return await self._run_with_retry(
                request_name=f"openai_chat url={chosen_url} model={chosen_model}",
                request_coro_factory=_request_once,
                logger=logger,
                query_id=query_id,
            )
        except Exception as e:
            # Best-effort: log the last raw response before reporting failure.
            try:
                if logger is not None:
                    logger.info("[vllm response] %s", response_json)
            except:
                pass

            return {
                "next_messages": messages,
                "log_messages": [],
                "meta_data": meta_data,
                "tool_call_ids": tool_call_ids,
                "usage": response_json['usage'] if response_json is not None and 'usage' in response_json else None,
                "error": str(e)
            }
327
+
328
    async def _call_request_chat(self,
                            raw_messages: List[Dict[str, str]],
                            tool_list = [],
                            temperature=0.7,
                            top_p=0.95,
                            logger = None,
                            api_key=None,
                            idx = None,
                            query_id: str = "") -> dict:
        """
        Call a provider over raw HTTP (Claude / GLM / Doubao / Volcano-style)
        and normalize the result into this project's message format.

        When *idx* is given the caller has already chosen the model/URL pair
        (e.g. when routed here from _call_openai_chat); otherwise one is
        picked at random. Returns a dict with next_messages / log_messages /
        meta_data / tool_call_ids / usage, plus "error" on failure.

        NOTE(review): temperature/top_p are accepted but not placed in the
        request payload, and max_tokens/budget_tokens are hard-coded —
        confirm intended.
        """
        idx = random.randrange(len(self.base_urls)) if idx is None else idx
        chosen_url = self.base_urls[idx]
        chosen_model = self.model_names[idx]

        messages = copy.deepcopy(raw_messages)

        if "claude" in chosen_model:
            # NOTE(review): Anthropic's documented header is 'x-api-key: <key>'
            # with no 'Bearer ' prefix — verify this works against the gateway in use.
            headers={
                "X-Api-Key": f"Bearer {api_key}",
                "Content-Type": "application/json",
            }
            # Claude's tool schema differs from OpenAI's: each tool is typed
            # 'custom' and 'parameters' is renamed to 'input_schema'.
            # NOTE(review): this mutates the caller's tool_list dicts in place —
            # repeated calls keep the converted schema; confirm intended.
            for tool in tool_list:
                if isinstance(tool, dict):
                    tool['type'] = 'custom'
                    if 'parameters' in tool:
                        tool['input_schema'] = tool.pop('parameters')
        elif any(x in chosen_model for x in ["glm", "doubao"]):
            headers = {
                "Authorization": f"Bearer {api_key}",
                "Content-Type": "application/json",
            }
        else:
            # Volcano-engine style default: skip content moderation.
            headers = {
                'Authorization': f'Bearer {api_key}',
                'x-ark-moderation-scene': 'skip-ark-moderation'
            }

        # All providers here expect user text blocks typed as 'text'
        # (undo any Responses-API 'input_text' typing).
        for msg in messages:
            if isinstance(msg, dict) and msg.get('role') == 'user' and isinstance(msg.get('content'), list):
                for item in msg['content']:
                    if isinstance(item, dict) and item.get('type') == 'input_text':
                        item['type'] = 'text'

        data=json.dumps({
            "model": chosen_model,  # provider-side model id
            "messages": messages,
            "max_tokens": 128000,
            "thinking" :{
                "type": "enabled",
                "budget_tokens": 15000
            },
            "tools": tool_list,
        })

        response_json = {}


        tool_call_ids = []
        meta_data = {
            "role": "assistant",
            "content": ""
        }
        answer_content_list = []
        summary_list = []
        log_messages = []

        async def _request_once() -> dict:
            nonlocal response_json, tool_call_ids, meta_data, answer_content_list, summary_list, log_messages
            # Reset per-attempt state so a retry does not accumulate stale data.
            tool_call_ids = []
            meta_data = {
                "role": "assistant",
                "content": ""
            }
            answer_content_list = []
            summary_list = []
            log_messages = []

            timeout = aiohttp.ClientTimeout(total=self.client_timeout)
            connector = aiohttp.TCPConnector(ssl=False)
            # aiohttp's ClientSession.post does not accept a verify=False
            # argument; certificate checking is disabled on the TCPConnector,
            # which must be passed when the ClientSession is constructed.
            async with aiohttp.ClientSession(timeout=timeout, connector=connector) as session:
                async with session.post(chosen_url, data=data, headers=headers) as resp:
                    resp.raise_for_status()
                    response_json = await resp.json()

            tool_calls = ""

            if "content" in response_json:
                # Claude-shaped response: 'content' is a list of typed blocks
                # that can be appended back to the conversation directly.
                log_messages = [{"role": "assistant", "content": response_json['content']}]
                next_messages = messages + log_messages
                for msg in response_json['content']:
                    if msg['type'] == "tool_use":
                        tool_call_ids.append(msg['id'])
                        now_tool_call = {
                            "name": msg['name'],
                            "arguments": msg['input']
                        }
                        tool_calls += "<tool_call>\n" + json.dumps(now_tool_call, ensure_ascii=False) + "\n</tool_call>\n"
                    elif msg['type'] == "text":
                        answer_content_list.append(msg['text'])
                    elif msg['type'] == 'thinking':
                        summary_list.append(msg['thinking'])
            elif "choices" in response_json and len(response_json['choices']):
                # OpenAI-chat-shaped response: single assistant message with
                # optional reasoning_content and tool_calls fields.
                tmp_messages = response_json['choices'][0]['message']
                log_messages = [tmp_messages]
                next_messages = messages + [tmp_messages]
                msg = tmp_messages
                if "reasoning_content" in msg:
                    summary_list.append(msg['reasoning_content'])
                if "content" in msg:
                    answer_content_list.append(msg['content'])
                if "tool_calls" in msg and msg['tool_calls']:
                    for tool_call in msg['tool_calls']:
                        tool_call_ids.append(tool_call['id'])
                        now_tool_call = {
                            "name": tool_call['function']['name'],
                            "arguments": json.loads(tool_call['function']['arguments'])
                        }
                        # json.dumps guarantees double quotes in the serialized call
                        tool_calls += "<tool_call>\n" + json.dumps(now_tool_call, ensure_ascii=False) + "\n</tool_call>\n"
            else:
                raise RuntimeError(f"Unexpected response payload: {response_json}")

            reasoning_content = "\n".join(summary_list).strip()
            content = "\n".join(answer_content_list).strip()
            tool_calls = tool_calls.strip()
            # Flatten reasoning + answer + tool calls into one tagged string.
            meta_data_content = ""
            meta_data_content += "<think>\n"
            meta_data_content += f"{reasoning_content}\n</think>" if reasoning_content else "</think>"
            meta_data_content += f"\n{content}"
            meta_data_content += f"\n" if content else ""
            meta_data_content += f"{tool_calls}" if tool_calls else ""

            meta_data['content'] = meta_data_content

            return {
                "next_messages": next_messages,
                "log_messages": log_messages,
                "meta_data": meta_data,
                "tool_call_ids": tool_call_ids,
                "usage": response_json['usage'],
            }

        try:
            return await self._run_with_retry(
                request_name=f"request_chat url={chosen_url} model={chosen_model}",
                request_coro_factory=_request_once,
                logger=logger,
                query_id=query_id,
            )
        except Exception as e:
            # Best-effort: log the last raw response before reporting failure.
            try:
                if logger is not None:
                    logger.info("[vllm response] %s", response_json)
            except:
                pass

            return {
                "next_messages": messages,
                "log_messages": [],
                "meta_data": meta_data,
                "tool_call_ids": tool_call_ids,
                "usage": response_json['usage'] if response_json is not None and 'usage' in response_json else None,
                "error": str(e)
            }
496
+
497
    async def _call_aliyun_chat(self,
                            raw_messages: List[Dict[str, str]],
                            tool_list = [],
                            temperature=0.7,
                            top_p=0.95,
                            logger = None,
                            api_key=None,
                            query_id: str = "") -> dict:
        """
        Call an Aliyun (DashScope) OpenAI-compatible endpoint via the OpenAI
        SDK and normalize the result into this project's message format.

        Thinking mode is enabled through ``extra_body``. Returns a dict with
        next_messages / log_messages / meta_data / tool_call_ids / usage,
        plus "error" on failure.
        """
        idx = random.randrange(len(self.base_urls))
        chosen_url = self.base_urls[idx]
        chosen_model = self.model_names[idx]
        # The SDK appends /chat/completions itself; strip it if the configured
        # URL already carries the suffix.
        if chosen_url.rstrip("/").endswith("/chat/completions"):
            chosen_url = chosen_url.rstrip("/")[: -len("/chat/completions")]

        client = OpenAI(
            api_key=api_key,
            base_url=chosen_url,
        )

        messages = copy.deepcopy(raw_messages)
        response_json = None
        answer_content_list = []
        summary_list = []
        log_messages = []

        tool_call_ids = []
        tool_calls = ""

        meta_data = {
            "role": "assistant",
            "content": ""
        }

        async def _request_once() -> dict:
            nonlocal response_json, answer_content_list, summary_list, log_messages, tool_call_ids, tool_calls, meta_data
            # Reset per-attempt state so a retry does not accumulate stale data.
            response_json = None
            answer_content_list = []
            summary_list = []
            log_messages = []
            tool_call_ids = []
            tool_calls = ""
            meta_data = {
                "role": "assistant",
                "content": ""
            }

            loop = asyncio.get_event_loop()
            request_kwargs = {
                "model": chosen_model,
                "messages": messages,
                "temperature": temperature,
                "top_p": top_p,
                "extra_body": {"enable_thinking": True},
            }
            if tool_list:
                request_kwargs["tools"] = tool_list
            # The SDK call is synchronous; run it in a worker thread bounded
            # by the client timeout.
            func = lambda: client.chat.completions.create(**request_kwargs)
            completion = await asyncio.wait_for(
                loop.run_in_executor(None, func),
                timeout=self.client_timeout
            )
            response_json = completion.model_dump()
            tmp_messages = response_json['choices'][0]['message']
            log_messages = [tmp_messages]
            next_messages = messages + [tmp_messages]
            msg = tmp_messages
            if "reasoning_content" in msg:
                summary_list.append(msg['reasoning_content'])
            if "content" in msg:
                # NOTE(review): content may be None on pure tool-call turns,
                # which would make the join below raise — confirm upstream.
                answer_content_list.append(msg['content'])
            if "tool_calls" in msg and msg['tool_calls']:
                for tool_call in msg['tool_calls']:
                    tool_call_ids.append(tool_call['id'])
                    arguments_raw = tool_call['function']['arguments']
                    # Tolerate non-JSON argument strings by passing them through raw.
                    try:
                        arguments_obj = json.loads(arguments_raw)
                    except Exception:
                        arguments_obj = arguments_raw
                    now_tool_call = {
                        "name": tool_call['function']['name'],
                        "arguments": arguments_obj
                    }
                    tool_calls += "<tool_call>\n" + json.dumps(now_tool_call, ensure_ascii=False) + "\n</tool_call>\n"
            reasoning_content = "\n".join(summary_list).strip()
            content = "\n".join(answer_content_list).strip()
            tool_calls = tool_calls.strip()
            # Flatten reasoning + answer + tool calls into one tagged string.
            meta_data_content = ""
            meta_data_content += "<think>\n"
            meta_data_content += f"{reasoning_content}\n</think>" if reasoning_content else "</think>"
            meta_data_content += f"\n{content}"
            meta_data_content += f"\n" if content else ""
            meta_data_content += f"{tool_calls}" if tool_calls else ""

            meta_data['content'] = meta_data_content

            return {
                "next_messages": next_messages,
                "log_messages": log_messages,
                "meta_data": meta_data,
                "tool_call_ids": tool_call_ids,
                "usage": response_json['usage'],
            }

        try:
            return await self._run_with_retry(
                request_name=f"aliyun_chat url={chosen_url} model={chosen_model}",
                request_coro_factory=_request_once,
                logger=logger,
                query_id=query_id,
            )
        except Exception as e:
            # Best-effort: log the last raw response before reporting failure.
            try:
                if logger is not None:
                    logger.info("[aliyun response] %s", response_json)
            except:
                pass

            return {
                "next_messages": messages,
                "log_messages": [],
                "meta_data": meta_data,
                "tool_call_ids": tool_call_ids,
                "usage": response_json['usage'] if response_json is not None and 'usage' in response_json else None,
                "error": str(e)
            }
622
+
623
+ async def aihubmix_chat(self, raw_messages: List[Dict[str, str]], tool_list = [], temperature=0.7, top_p=0.95, logger = None, query_id: str = "") -> dict:
624
+ return await self._call_openai_chat(
625
+ raw_messages=raw_messages,
626
+ tool_list=tool_list,
627
+ temperature=temperature,
628
+ top_p=top_p,
629
+ logger=logger,
630
+ api_key=self.api_keys.get("aihubmix"),
631
+ query_id=query_id,
632
+ )
633
+
634
+ async def azure_chat(self, raw_messages: List[Dict[str, str]], tool_list = [], temperature=0.7, top_p=0.95, logger = None, query_id: str = "") -> dict:
635
+ return await self._call_openai_chat(
636
+ raw_messages=raw_messages,
637
+ tool_list=tool_list,
638
+ temperature=temperature,
639
+ top_p=top_p,
640
+ logger=logger,
641
+ api_key=self.api_keys.get("azure"),
642
+ query_id=query_id,
643
+ )
644
+
645
+ async def volcano_chat(self, raw_messages: List[Dict[str, str]], tool_list = [], temperature=0.7, top_p=0.95, logger = None, query_id: str = "") -> dict:
646
+ return await self._call_request_chat(
647
+ raw_messages=raw_messages,
648
+ tool_list=tool_list,
649
+ temperature=temperature,
650
+ top_p=top_p,
651
+ logger=logger,
652
+ api_key=self.api_keys.get("volcano"),
653
+ query_id=query_id,
654
+ )
655
+
656
+ async def aliyun_chat(self, raw_messages: List[Dict[str, str]], tool_list = [], temperature=0.7, top_p=0.95, logger = None, query_id: str = "") -> dict:
657
+ return await self._call_aliyun_chat(
658
+ raw_messages=raw_messages,
659
+ tool_list=tool_list,
660
+ temperature=temperature,
661
+ top_p=top_p,
662
+ logger=logger,
663
+ api_key=self.api_keys.get("aliyun"),
664
+ query_id=query_id,
665
+ )
inference/server/tool_api.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+
4
+ from tool_kits import (
5
+ AskQuestionAboutImageToolkit,
6
+ AskQuestionAboutVideoToolkit,
7
+ ExecuteCodeToolkit,
8
+ WideSearchToolkit,
9
+ ImageSearchToolkit,
10
+ ScholarSearchToolkit,
11
+ FileWideParseToolkit,
12
+ WideVisitToolkit,
13
+ BashToolkit,
14
+ )
15
+
16
+ from urllib.parse import urljoin
17
+ from typing import Callable, Dict, Any
18
+
19
# All docker-backed tools.
def return_all_tools(tool_urls=None, use_cache=None, use_tongyi_format=None):
    """
    Build the tool registry: public tool name -> metadata plus the executable
    callable, in the legacy entry layout the dispatcher expects.

    Only explicitly provided options are forwarded; they go exclusively to
    the toolkits that accept deep-research-format configuration.
    """
    shared_kwargs = {}
    for option, value in (
        ("server_url", tool_urls),
        ("use_cache", use_cache),
        ("is_tongyi_format", use_tongyi_format),
    ):
        if value is not None:
            shared_kwargs[option] = value

    # Keys are the public tool names used later for dispatch; only the
    # deep-research-format toolkits receive **shared_kwargs.
    toolkits = {
        "ask_question_about_image": AskQuestionAboutImageToolkit(),
        "ask_question_about_video": AskQuestionAboutVideoToolkit(),
        "execute_code": ExecuteCodeToolkit(),
        "wide_search": WideSearchToolkit(**shared_kwargs),
        "image_search": ImageSearchToolkit(**shared_kwargs),
        "scholar_search": ScholarSearchToolkit(**shared_kwargs),
        "file_wide_parse": FileWideParseToolkit(**shared_kwargs),
        "wide_visit": WideVisitToolkit(**shared_kwargs),
        "bash": BashToolkit(),
    }

    registry: Dict[str, Dict[str, Any]] = {}

    # Legacy-compatible entry layout expected by the old dispatcher code.
    for key, toolkit in toolkits.items():
        registry[key] = {
            "name": toolkit.name,
            "description": toolkit.description,
            "strict": True,
            "parameters": toolkit.params,
            "function": toolkit.forward,  # executable tool function
            "schema_json": toolkit.json
        }

    return registry
59
+
inference/server/tool_execution.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any, Tuple
2
+ import json
3
+ import asyncio
4
+ import os
5
+ import uuid
6
+ import importlib.util
7
+ from pathlib import Path
8
+ from utils.configs import USE_NLP_FORMAT_RETURN
9
+
10
+
11
+ async def execute_tool_call(name: Any, arguments: Any, all_tools, logger=None, USE_NLP_FORMAT_RETURN: bool | None = None, query_id = "") -> Tuple[str, str]:
12
+ """
13
+ Execute a single tool call locally. Returns (tool_name, tool_response_json_str).
14
+ """
15
+ if USE_NLP_FORMAT_RETURN is None:
16
+ USE_NLP_FORMAT_RETURN = USE_NLP_FORMAT_RETURN
17
+ if name == "parse_error_tool_call":
18
+ if USE_NLP_FORMAT_RETURN:
19
+ result = f"Error: Tool call is not a valid JSON. Tool call must contain a valid \"name\" and \"arguments\" field. Parse error: {arguments.get('parse_error', '')}"
20
+ else:
21
+ result = json.dumps({"error": f"Parse error: {arguments.get('parse_error', '')}", "raw": arguments.get('raw', '')}, ensure_ascii=False)
22
+ if logger:
23
+ logger.error(result)
24
+ return name or "parse_error_tool_call", result
25
+
26
+ tool = all_tools.get(name)
27
+ if tool is None:
28
+ result = json.dumps({"error": f"Unknown tool: {name}"}, ensure_ascii=False)
29
+ if logger:
30
+ logger.error(result)
31
+ return name or "unknown", result
32
+
33
+ # Ensure arguments is a dict
34
+ if not isinstance(arguments, dict):
35
+ arguments = {"_": arguments}
36
+
37
+ import functools
38
+ loop = asyncio.get_running_loop()
39
+
40
+ arguments["conversation_id"] = query_id
41
+
42
+ func = functools.partial(tool['function'], **arguments)
43
+ # 根据工具名称选择不同的超时时间
44
+ if name == "browse_url":
45
+ timeout = 5400 # browse 是 1.5 小时
46
+ else:
47
+ timeout = 1800 # 其他工具就是 30 分钟
48
+ try:
49
+ out = await asyncio.wait_for(loop.run_in_executor(None, func), timeout=timeout)
50
+ result = out if isinstance(out, str) else json.dumps(out, ensure_ascii=False) # 返回结果一定是字符串
51
+ except asyncio.TimeoutError:
52
+ if USE_NLP_FORMAT_RETURN:
53
+ result = f"The tool call timed out: execution exceeded {timeout} seconds for tool '{name}'."
54
+ else:
55
+ result = json.dumps({"error": f"Tool call timeout: exceeded {timeout}s", "tool": name, "arguments": arguments}, ensure_ascii=False)
56
+ if logger:
57
+ logger.error(result)
58
+ except TypeError as te:
59
+ if USE_NLP_FORMAT_RETURN:
60
+ result = f"Tool '{name}' failed due to argument mismatch: {str(te)}. Input arguments: {arguments}."
61
+ else:
62
+ result = json.dumps({"error": f"Argument mismatch for tool '{name}': {str(te)}", "received": arguments}, ensure_ascii=False)
63
+ if logger:
64
+ logger.error(result)
65
+ except Exception as e:
66
+ if USE_NLP_FORMAT_RETURN:
67
+ result = f"Tool '{name}' encountered an error: {str(e)}."
68
+ else:
69
+ result = json.dumps({"error": f"Tool '{name}' raised an exception: {str(e)}"}, ensure_ascii=False)
70
+ if logger:
71
+ logger.error(result)
72
+
73
+ return name, result
inference/test_all_tools.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from tool_kits import (
2
+ AskQuestionAboutImageToolkit,
3
+ AskQuestionAboutVideoToolkit,
4
+ ExecuteCodeToolkit,
5
+ WideSearchToolkit,
6
+ ImageSearchToolkit,
7
+ ScholarSearchToolkit,
8
+ FileWideParseToolkit,
9
+ WideVisitToolkit,
10
+ BashToolkit
11
+ )
12
+
13
+ from urllib.parse import urljoin
14
+ from typing import Callable, Dict, Any
15
+
16
# initialize all tools (alphabetically sorted)
# Keys are the public tool names used for dispatch; values are toolkit
# instances constructed with their default configuration.
tools = {
    "ask_question_about_image": AskQuestionAboutImageToolkit(),
    "ask_question_about_video": AskQuestionAboutVideoToolkit(),
    "execute_code": ExecuteCodeToolkit(),
    "wide_search": WideSearchToolkit(),
    "image_search": ImageSearchToolkit(),
    "scholar_search": ScholarSearchToolkit(),
    "file_wide_parse": FileWideParseToolkit(),
    "wide_visit": WideVisitToolkit(),
    "bash": BashToolkit(),
}

# register all tools

# Registry filled below (and by @register_tool): tool name -> metadata + callable.
ALL_TOOLS: Dict[str, Dict[str, Any]] = {}
32
+
33
def register_tool(name: str, description: str, parameters: Dict[str, Any]):
    """
    Decorator factory: register the wrapped callable in ALL_TOOLS under *name*.
    The function itself is returned unchanged so it stays directly callable.
    """
    def decorator(func: Callable):
        entry = dict(
            name=name,
            description=description,
            strict=True,
            parameters=parameters,
            function=func,
        )
        ALL_TOOLS[name] = entry
        return func
    return decorator
47
+
48
# For compatibility with the old code: mirror every toolkit into the legacy
# ALL_TOOLS entry layout.
for legacy_name, toolkit in tools.items():
    ALL_TOOLS[legacy_name] = {
        "name": toolkit.name,
        "description": toolkit.description,
        "strict": True,
        "parameters": toolkit.params,
        "function": toolkit.forward,  # executable tool function
        "schema_json": toolkit.json
    }
58
+
59
def test_tools():
    """
    Smoke-test each registered tool with one simple invocation.

    Prints each tool's metadata and outcome, and returns a dict mapping
    tool_name -> {"success": True, "result": ...} or
    {"success": False, "error": ...}.
    """
    import time

    results = {}
    # One test case per tool.
    # Bug fix: the original dict listed the "execute_code" key twice;
    # duplicate dict keys silently overwrite each other, so only a single
    # deterministic case is kept.
    test_cases = {
        "ask_question_about_image": {"image_path": [
            "http://img.daimg.com/uploads/allimg/240712/3-240G2112F6.jpg"
        ], "question": "What is in this image?"},
        "ask_question_about_video": {"video_path": "https://www.bilibili.com/video/BV11p81zFEJT/?spm_id_from=333.337.search-card.all.click", "question": "描述这个视频的内容和主要场景。"},
        "bash": {"command": "echo 'hello world'"},
        "execute_code": {"code": "print('Hello World')"},
        "wide_search": {"query": ['伊莎贝尔·于佩尔 包法利夫人 苦的砒霜', 'Isabelle Huppert insisted poison taste bitter']},
        "image_search": {"query": ["咖喱", "肉骨茶", "印尼九层塔"]},
        "scholar_search": {"query": ["spa", "烟花", "attention"]},
        "file_wide_parse": {
            "files": [
                "http://img.daimg.com/uploads/allimg/240712/3-240G2112F6.jpg"
            ],
        },
        "wide_visit": {"url": "https://www.sohu.com/a/960662276_163491", "goal": "疯狂动物城有哪些周边"},
    }

    for tool_name, test_case in test_cases.items():
        if tool_name not in ALL_TOOLS:
            print(f"Tool {tool_name} not found in registered tools.")
            continue
        tool_info = ALL_TOOLS[tool_name]
        print(f"\nTool: {tool_name}")
        print(f"Description: {tool_info['description']}")
        print(f"Parameters: {tool_info['parameters']}")
        params = test_case
        # Every tool expects a conversation id; derive one from the clock.
        params['conversation_id'] = f"test_{time.strftime('%Y%m%d%H%M%S', time.localtime())}"
        print(f"Testing with parameters: {params}")

        try:
            # browse_url is async; run it on a fresh event loop.
            if tool_name == "browse_url":
                import asyncio
                result = asyncio.run(tool_info["function"](**params))
            else:
                result = tool_info["function"](**params)
            print(f"\n✅ Test result: {str(result)}")
            results[tool_name] = {"success": True, "result": result}
        except Exception as e:
            print(f"\n❌ Test failed: {str(e)}")
            results[tool_name] = {"success": False, "error": str(e)}
        print("\n" + "🏃..🎈 " * 20 + "\n")

    print("\n" + "==" * 20 + "END" + "==" * 20 + "\n")

    # Bug fix: results were collected but never returned; expose them so
    # callers can inspect the outcomes programmatically.
    return results
112
+
113
+
114
if __name__ == "__main__":

    # Dump every registered tool schema, then run the smoke tests.
    for entry in ALL_TOOLS.values():
        print(entry['schema_json'])

    print("=" * 100)

    test_tools()
122
+
123
+
inference/tool_kits/__init__.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .ask_question_about_image_toolkit import AskQuestionAboutImageToolkit
2
+ from .ask_question_about_video_toolkit import AskQuestionAboutVideoToolkit
3
+ from .base import BaseToolkit
4
+ from .browser_toolkit import BrowserToolkit
5
+ from .execute_code_toolkit import ExecuteCodeToolkit
6
+ from .extract_doc_content_toolkit import ExtractDocumentContentToolkit
7
+ from .extract_csv_content_toolkit import ExtractCSVContentToolkit
8
+ from .extract_pdf_content_toolkit import ExtractPDFContentToolkit
9
+ from .fetch_web_page_toolkit import FetchWebPageToolkit
10
+ from .image_to_text_toolkit import ImageToTextToolkit
11
+ from .visit_toolkit import VisitToolkit
12
+ from .web_search_toolkit import WebSearchToolkit
13
+ from .write_to_file_toolkit import WriteToFileToolkit
14
+ # 新增
15
+ from .wide_search_toolkit import WideSearchToolkit
16
+ from .image_search_toolkit import ImageSearchToolkit
17
+ from .file_wide_parse_toolkit import FileWideParseToolkit
18
+ from .scholar_search_toolkit import ScholarSearchToolkit
19
+ from .wide_visit_toolkit import WideVisitToolkit
20
+ # 04.03 新增
21
+ from .bash_toolkit import BashToolkit
22
+
23
+
24
+
25
+
26
# Public API of the tool_kits package; keep in sync with the imports above.
# (Fix: uniform quoting -- the original mixed single and double quotes.)
__all__ = [
    'AskQuestionAboutImageToolkit',
    'AskQuestionAboutVideoToolkit',
    'BaseToolkit',
    'BrowserToolkit',
    'ExecuteCodeToolkit',
    'ExtractCSVContentToolkit',
    'ExtractDocumentContentToolkit',
    'ExtractPDFContentToolkit',
    'FetchWebPageToolkit',
    'ImageToTextToolkit',
    'VisitToolkit',
    'WebSearchToolkit',
    'WriteToFileToolkit',
    # newly added
    'WideSearchToolkit',
    'ImageSearchToolkit',
    'ScholarSearchToolkit',
    'FileWideParseToolkit',
    'WideVisitToolkit',
    # added 04.03
    'BashToolkit',
]
inference/tool_kits/ask_question_about_image_toolkit.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import yaml
2
+ import os
3
+ import sys
4
+ from urllib.parse import urljoin
5
+ from typing import Callable, Dict, Any
6
+ from utils.configs import TOOLS_SERVER_BASE_ENDPOINT_URL, WEB_BASED_TOOLS_USE_CACHE
7
+ from tool_kits.base import BaseToolkit
8
+
9
+
10
class AskQuestionAboutImageToolkit(BaseToolkit):
    """Answer natural-language questions about one or more images.

    Declarative configuration only: execution is delegated to
    ``BaseToolkit.forward()``, which POSTs to the remote tools server.
    """

    NAME = "ask_question_about_image"
    TOOLS_SERVER_BASE_ENDPOINT = TOOLS_SERVER_BASE_ENDPOINT_URL
    ENTRY_POINT = "ask_question_about_image"
    # Plain literal: the original f-string had no placeholders (needless f-prefix).
    DESCRIPTION = "Identify image content and answer questions about one or more images."
    TIMEOUT = 600  # seconds; image QA on large inputs can be slow
    TOOL_PARAMS = {
        "image_path": {
            "type": "array",
            "items": {
                "type": "string",
                "description": "Local path or URL to an image file.",
            },
            "minItems": 1,
            "description": "Array of local paths or URLs to image files.",
        },
        "question": {
            "type": "string",
            "description": "Query about the image content.",
        },
    }
    TOOL_PARAMS_REQUIRED = ["image_path", "question"]
    USE_CACHE = WEB_BASED_TOOLS_USE_CACHE
inference/tool_kits/ask_question_about_video_toolkit.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import yaml
2
+ import os
3
+ import sys
4
+ from urllib.parse import urljoin
5
+ from typing import Callable, Dict, Any
6
+ from utils.configs import TOOLS_SERVER_BASE_ENDPOINT_URL, WEB_BASED_TOOLS_USE_CACHE
7
+ from tool_kits.base import BaseToolkit
8
+
9
+
10
class AskQuestionAboutVideoToolkit(BaseToolkit):
    """Answer natural-language questions about one or more videos.

    Declarative configuration only: execution is delegated to
    ``BaseToolkit.forward()``, which POSTs to the remote tools server.
    """

    NAME = "ask_question_about_video"
    TOOLS_SERVER_BASE_ENDPOINT = TOOLS_SERVER_BASE_ENDPOINT_URL
    ENTRY_POINT = "ask_question_about_video"
    # Plain literal: the original f-string had no placeholders (needless f-prefix).
    DESCRIPTION = "Ask a question about one or more videos."
    TIMEOUT = 600  # seconds; video QA can be slow
    TOOL_PARAMS = {
        "video_path": {
            "type": "array",
            "items": {
                "type": "string",
                "description": "Local path or URL to the video file.",
            },
            "minItems": 1,
            "description": "Array of local paths or URLs to video files.",
        },
        "question": {
            "type": "string",
            "description": "The question to ask about the video.",
        },
    }
    TOOL_PARAMS_REQUIRED = ["video_path", "question"]
    USE_CACHE = WEB_BASED_TOOLS_USE_CACHE
inference/tool_kits/base.py ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import random
3
+ import json
4
+ import threading
5
+ import queue
6
+ import asyncio
7
+ from typing import Any
8
+ from urllib.parse import urljoin
9
+
10
+ import httpx
11
+
12
+ import time
13
+ import uuid
14
+
15
+ class BaseToolkit():
16
+ """A tool for assisting test tasks."""
17
+
18
+ NAME = "tool_base"
19
+ DESCRIPTION = f"The tool backbone of all real tools."
20
+ TIMEOUT = 60
21
+ TOOLS_SERVER_BASE_ENDPOINT = []
22
+ ENTRY_POINT = ""
23
+ TOOL_PARAMS = {}
24
+ TOOL_PARAMS_REQUIRED = []
25
+ USE_CACHE = False
26
+
27
+ def __init__(
28
+ self,
29
+ name: str = "",
30
+ description: str = "",
31
+ params: dict = {},
32
+ required_params: list[str] = [],
33
+ server_url: str | list[str] = [],
34
+ entry_point: str = "",
35
+ timeout: float | None = None,
36
+ request_id: str = "",
37
+ use_cache: bool | None = None,
38
+ is_tongyi_format: bool | None = None,
39
+ **kwargs,
40
+ ):
41
+ # 这行代码的意思是:获取当前实例(self)的类(即class),并把它赋值给变量cls。
42
+ # 这样可以在初始化方法中使用cls来访问类属性,比如cls.NAME等,无论是否通过继承生成子类。
43
+ cls = type(self)
44
+ self.name = name or getattr(cls, "NAME", "")
45
+ self.description = description or getattr(cls, "DESCRIPTION", "")
46
+ self.params = params or getattr(cls, "TOOL_PARAMS", {})
47
+ self.required_params = required_params or getattr(cls, "TOOL_PARAMS_REQUIRED", [])
48
+ self.server_url = server_url or getattr(cls, "TOOLS_SERVER_BASE_ENDPOINT", "")
49
+ self.entry_point = entry_point or getattr(cls, "ENTRY_POINT", "") or getattr(cls, "NAME", "")
50
+
51
+ if timeout is not None:
52
+ self.timeout = timeout
53
+ else:
54
+ self.timeout = getattr(cls, "TIMEOUT", 600)
55
+
56
+ if use_cache is not None:
57
+ self.use_cache = use_cache
58
+ else:
59
+ self.use_cache = getattr(cls, "USE_CACHE", False)
60
+
61
+ if is_tongyi_format is not None:
62
+ self.is_tongyi_format = is_tongyi_format
63
+ else:
64
+ self.is_tongyi_format = getattr(cls, "USE_TONGYI_FORMAT", None)
65
+
66
+ self.set_request_id(request_id=request_id)
67
+
68
+ self._init_client()
69
+
70
+ def _init_client(self):
71
+ """
72
+ Initialize the HTTP client for making requests.
73
+ """
74
+ # httpx 是一个用于发送 HTTP 请求的库,这里用它来创建一个客户端对象,方便后续发送 HTTP 请求到工具服务器。
75
+ self.client = httpx.Client()
76
+
77
+ @property
78
+ def json(self):
79
+ return {
80
+ "type": "function",
81
+ "function": {
82
+ "name": self.name,
83
+ "description": self.description,
84
+ "parameters": {
85
+ "type": "object",
86
+ "additionalProperties": False,
87
+ "properties": self.params,
88
+ "required": self.required_params,
89
+ },
90
+ },
91
+ }
92
+
93
+ def _post(self, pload: dict[str, Any]) -> Any:
94
+ """
95
+ Post request to the tool server and return the response.
96
+ """
97
+
98
+ # support multiple server urls for load balancing
99
+ server_url = random.choice(self.server_url) if isinstance(self.server_url, list) else self.server_url
100
+ tool_endpoint = urljoin(server_url, self.entry_point) # url + 访问接口
101
+
102
+ # with httpx.Client() as client:
103
+ try:
104
+ resp = self.client.post(tool_endpoint, json=pload, timeout=self.timeout)
105
+ if not resp.is_success:
106
+ return f"{resp.status_code} {resp.text}"
107
+ data = resp.json()
108
+ return data.get("result", "")
109
+
110
+ except Exception as e:
111
+ raise e
112
+
113
+ # **kwargs 是 Python 中的一种语法,用于将所有额外的关键字参数以字典形式收集起来
114
+ # 例如: forward(a=1, b=2) 时,kwargs={'a': 1, 'b': 2}
115
+ def forward(self, **kwargs):
116
+ """
117
+ Execute this tool.
118
+
119
+ Args:
120
+ keyword arguments: Arguments to be submitted to this tool.
121
+
122
+ Returns:
123
+ ToolOutput: An object containing either the tool's results or an error message.
124
+ """
125
+
126
+ # request_id, use_cache, and real params for the tool
127
+ try:
128
+ payload = {}
129
+
130
+ # Ensure request_id and use_cache is present
131
+ # timestamp = time.strftime("%Y%m%d%H%M%S", time.localtime())
132
+ timestamp = time.strftime("%Y%m%d%", time.localtime())
133
+ if self.request_id:
134
+ payload["request_id"] = f"{self.request_id}_{self.name}_{timestamp}"
135
+ else:
136
+ # payload["request_id"] = f"{self.name}_{timestamp}"
137
+ payload["request_id"] = self.name
138
+
139
+ payload["use_cache"] = self.use_cache
140
+ # Ensure is_tongyi_format is inside params
141
+ if self.is_tongyi_format is not None:
142
+ kwargs['is_tongyi_format'] = self.is_tongyi_format
143
+ payload['params'] = kwargs
144
+
145
+ conversation_id = kwargs.pop('conversation_id', None)
146
+ if conversation_id is not None:
147
+ payload['conversation_id'] = conversation_id
148
+
149
+ # print("payload:", payload)
150
+ raw = self._post(payload)
151
+ return raw
152
+
153
+ except Exception as e:
154
+ raise e
155
+
156
+
157
+
158
+ def set_request_id(self, request_id: str):
159
+ """
160
+ Set the request ID for this tool.
161
+ """
162
+ self.request_id = request_id
163
+
164
+ def set_use_cache(self, use_cache: bool):
165
+ """
166
+ Set whether to use cache for this tool.
167
+ """
168
+ self.use_cache = use_cache
169
+
170
+ def set_timeout(self, timeout: float):
171
+ """
172
+ Set the timeout for this tool.
173
+ """
174
+ self.timeout = timeout
175
+
176
+
177
+ def __del__(self):
178
+ # try:
179
+ # if getattr(self, "client", None):
180
+ # self.client.close()
181
+ # except Exception:
182
+ # pass
183
+ pass
inference/tool_kits/bash_toolkit.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import yaml
2
+ import os
3
+ import sys
4
+ from urllib.parse import urljoin
5
+ from typing import Callable, Dict, Any
6
+ from utils.configs import TOOLS_SERVER_BASE_ENDPOINT_URL, WEB_BASED_TOOLS_USE_CACHE
7
+ from tool_kits.base import BaseToolkit
8
+
9
+
10
class BashToolkit(BaseToolkit):
    """Run a shell script on the remote execution backend.

    Declarative configuration only; the actual request is issued by
    ``BaseToolkit.forward()``.
    """

    NAME = "bash"
    TOOLS_SERVER_BASE_ENDPOINT = TOOLS_SERVER_BASE_ENDPOINT_URL
    ENTRY_POINT = "bash"
    DESCRIPTION = "Execute a shell script in the current working directory. Use this tool to run one or more shell commands as a single script or execute script files (e.g. `python script.py`)."
    TIMEOUT = 900  # seconds; shell scripts may run long
    TOOL_PARAMS = {
        "command": {
            "type": "string",
            "description": "A shell script to execute. Multiple commands are allowed and will be executed sequentially in the same shell session. Use relative paths by default.",
        },
    }
    TOOL_PARAMS_REQUIRED = ["command"]
    USE_CACHE = WEB_BASED_TOOLS_USE_CACHE
inference/tool_kits/execute_code_toolkit.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import yaml
2
+ import os
3
+ import sys
4
+ from urllib.parse import urljoin
5
+ from typing import Callable, Dict, Any
6
+ from utils.configs import TOOLS_SERVER_BASE_ENDPOINT_URL, WEB_BASED_TOOLS_USE_CACHE
7
+ from tool_kits.base import BaseToolkit
8
+
9
+
10
class ExecuteCodeToolkit(BaseToolkit):
    """Run a code snippet on the remote code interpreter.

    Declarative configuration only: execution is delegated to
    ``BaseToolkit.forward()``, which POSTs to the remote tools server.
    """

    NAME = "execute_code"
    TOOLS_SERVER_BASE_ENDPOINT = TOOLS_SERVER_BASE_ENDPOINT_URL
    ENTRY_POINT = "execute_code"
    # Plain literal: the original f-string had no placeholders (needless f-prefix).
    DESCRIPTION = "Execute a given code snippet for data processing, model training, analysis, or workflow automation, including writing or modifying files as needed."
    TIMEOUT = 900  # seconds; code execution can be long-running
    TOOL_PARAMS = {
        "code": {
            "type": "string",
            "description": "The input code to the Code Interpreter tool call.",
        },
    }
    TOOL_PARAMS_REQUIRED = ["code"]
    USE_CACHE = WEB_BASED_TOOLS_USE_CACHE
inference/tool_kits/file_wide_parse_toolkit.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import yaml
2
+ import os
3
+ import sys
4
+ from urllib.parse import urljoin
5
+ from typing import Callable, Dict, Any
6
+ from utils.configs import TOOLS_SERVER_BASE_ENDPOINT_URL, USE_NLP_FORMAT_RETURN, WEB_BASED_TOOLS_USE_CACHE
7
+ from tool_kits.base import BaseToolkit
8
+
9
+
10
class FileWideParseToolkit(BaseToolkit):
    """Parse multiple local or online files (PDF, DOCX, PPTX, CSV, ZIP, ...).

    Declarative configuration only: execution is delegated to
    ``BaseToolkit.forward()``, which POSTs to the remote tools server.
    """

    NAME = "parse_file"
    TOOLS_SERVER_BASE_ENDPOINT = TOOLS_SERVER_BASE_ENDPOINT_URL
    ENTRY_POINT = "file_wide_parse"
    # Plain literal: the original f-string had no placeholders (needless f-prefix).
    DESCRIPTION = "This is a tool that can be used to parse multiple user uploaded local files or online files such as PDF, DOCX, PPTX, TXT, CSV, XLSX, DOC, ZIP, MP4, MP3."
    TIMEOUT = 600  # seconds; parsing large files can be slow
    TOOL_PARAMS = {
        "files": {
            "type": "array",
            "items": {
                "type": "string"
            },
            "description": "The online file's URLs or the user uploaded local file paths to be parsed."
        },
    }
    TOOL_PARAMS_REQUIRED = ["files"]
    USE_CACHE = WEB_BASED_TOOLS_USE_CACHE
    USE_TONGYI_FORMAT = USE_NLP_FORMAT_RETURN
inference/tool_kits/image_search_toolkit.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import yaml
2
+ import os
3
+ import sys
4
+ from urllib.parse import urljoin
5
+ from typing import Callable, Dict, Any
6
+ from utils.configs import TOOLS_SERVER_BASE_ENDPOINT_URL, USE_NLP_FORMAT_RETURN, WEB_BASED_TOOLS_USE_CACHE
7
+ from tool_kits.base import BaseToolkit
8
+
9
+
10
class ImageSearchToolkit(BaseToolkit):
    """Search for images by one or more text queries.

    Declarative configuration only: execution is delegated to
    ``BaseToolkit.forward()``, which POSTs to the remote tools server.
    """

    NAME = "image_search"
    TOOLS_SERVER_BASE_ENDPOINT = TOOLS_SERVER_BASE_ENDPOINT_URL
    ENTRY_POINT = "image_search"
    # Plain literal: the original f-string had no placeholders (needless f-prefix).
    DESCRIPTION = "Search images by query and return a list of related images. Accepts multiple complementary search queries in a single call."
    TIMEOUT = 600  # seconds
    TOOL_PARAMS = {
        "query": {
            "type": "array",
            "items": {
                "type": "string",
                "description": "A single image search query string.",
            },
            "minItems": 1,
            "description": "Array of query strings. Multiple complementary search queries can be provided in one request for image search.",
        },
    }
    TOOL_PARAMS_REQUIRED = ["query"]
    USE_CACHE = WEB_BASED_TOOLS_USE_CACHE
    USE_TONGYI_FORMAT = USE_NLP_FORMAT_RETURN
30
+
inference/tool_kits/scholar_search_toolkit.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import yaml
2
+ import os
3
+ import sys
4
+ from urllib.parse import urljoin
5
+ from typing import Callable, Dict, Any
6
+ from utils.configs import TOOLS_SERVER_BASE_ENDPOINT_URL, USE_NLP_FORMAT_RETURN, WEB_BASED_TOOLS_USE_CACHE
7
+ from tool_kits.base import BaseToolkit
8
+
9
+
10
class ScholarSearchToolkit(BaseToolkit):
    """Query Google Scholar (plus regular Google results) for publications.

    Declarative configuration only: execution is delegated to
    ``BaseToolkit.forward()``, which POSTs to the remote tools server.
    """

    NAME = "google_scholar"
    TOOLS_SERVER_BASE_ENDPOINT = TOOLS_SERVER_BASE_ENDPOINT_URL
    ENTRY_POINT = "scholar_search"
    # Plain literal: the original f-string had no placeholders (needless f-prefix).
    DESCRIPTION = "Leverage Google Scholar to retrieve relevant information from academic publications. Accepts multiple queries. This tool will also return results from google search"
    TIMEOUT = 600  # seconds
    TOOL_PARAMS = {
        "query": {
            "type": "array",
            "items": {
                "type": "string",
                "description": "The search query.",
            },
            "minItems": 1,
            "description": "The list of search queries for Google Scholar.",
        },
    }
    TOOL_PARAMS_REQUIRED = ["query"]
    USE_CACHE = WEB_BASED_TOOLS_USE_CACHE
    USE_TONGYI_FORMAT = USE_NLP_FORMAT_RETURN
inference/tool_kits/wide_search_toolkit.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import yaml
2
+ import os
3
+ import sys
4
+ from urllib.parse import urljoin
5
+ from typing import Callable, Dict, Any
6
+ from utils.configs import TOOLS_SERVER_BASE_ENDPOINT_URL, USE_NLP_FORMAT_RETURN, WEB_BASED_TOOLS_USE_CACHE
7
+ from tool_kits.base import BaseToolkit
8
+
9
+
10
class WideSearchToolkit(BaseToolkit):
    """Run one or more Google web searches and return the top results.

    Declarative configuration only: execution is delegated to
    ``BaseToolkit.forward()``, which POSTs to the remote tools server.
    """

    NAME = "search"
    TOOLS_SERVER_BASE_ENDPOINT = TOOLS_SERVER_BASE_ENDPOINT_URL
    ENTRY_POINT = "wide_search"
    # Plain literal: the original f-string had no placeholders (needless f-prefix).
    DESCRIPTION = "Perform Google web searches then returns a string of the top search results. Accepts multiple queries."
    TIMEOUT = 600  # seconds
    TOOL_PARAMS = {
        "query": {
            "type": "array",
            "items": {
                "type": "string",
                "description": "The search query.",
            },
            "minItems": 1,
            "description": "The list of search queries.",
        },
    }
    TOOL_PARAMS_REQUIRED = ["query"]
    USE_CACHE = WEB_BASED_TOOLS_USE_CACHE
    USE_TONGYI_FORMAT = USE_NLP_FORMAT_RETURN
inference/tool_kits/wide_visit_toolkit.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import yaml
2
+ import os
3
+ import sys
4
+ from urllib.parse import urljoin
5
+ from typing import Callable, Dict, Any
6
+ from utils.configs import TOOLS_SERVER_BASE_ENDPOINT_URL, USE_NLP_FORMAT_RETURN, WEB_BASED_TOOLS_USE_CACHE
7
+ from tool_kits.base import BaseToolkit
8
+
9
+
10
class WideVisitToolkit(BaseToolkit):
    """Visit one or more webpages and summarize them toward a stated goal.

    Declarative configuration only; the actual request is issued by
    ``BaseToolkit.forward()``.
    """

    NAME = "visit"
    TOOLS_SERVER_BASE_ENDPOINT = TOOLS_SERVER_BASE_ENDPOINT_URL
    ENTRY_POINT = "wide_visit"
    DESCRIPTION = "Visit webpage(s) and return the summary of the content."
    TIMEOUT = 600  # seconds
    TOOL_PARAMS = {
        "url": {
            "type": "array",
            "items": {"type": "string"},
            "minItems": 1,
            "description": "The URL(s) of the webpage(s) to visit. Can be a single URL or an array of URLs.",
        },
        "goal": {
            "type": "string",
            "description": "The specific information goal for visiting webpage(s).",
        },
    }
    TOOL_PARAMS_REQUIRED = ["url", "goal"]
    USE_CACHE = WEB_BASED_TOOLS_USE_CACHE
    USE_TONGYI_FORMAT = USE_NLP_FORMAT_RETURN