Chaplain0908 commited on
Commit
d3ac0e9
·
verified ·
1 Parent(s): 3c11317

Upload 14 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ utils/Data_Profile.png filter=lfs diff=lfs merge=lfs -text
37
+ utils/Overall_Pipeline.png filter=lfs diff=lfs merge=lfs -text
38
+ utils/RAG_Paradigms.png filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,679 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import base64
3
+ import pandas as pd
4
+ import os
5
+
6
# Absolute directory that contains this script, computed from __file__ so it
# does not depend on the process working directory.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
# The original built this path and threw the result away (a no-op expression
# statement). Bind it to a name so the absolute icon location is available.
TITLE_ICON_PATH = os.path.join(BASE_DIR, "utils", "title_icon.png")
8
+
9
def get_image_base64(image_path):
    """Read the file at ``image_path`` and return it as base64 text.

    The file is opened in binary mode and read in full; the encoded bytes
    are decoded to ``str`` before being returned.
    """
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode()
13
+
14
def style_dataframe(df):
    """Return a pandas Styler for ``df`` with zebra rows and bold headers.

    Row backgrounds alternate on the parity of each row's index label, so
    the index labels must support ``% 2`` (for example a default integer
    index).
    """
    # Generic header cells.
    header_props = [
        ('background-color', '#f0f0f0'),
        ('color', '#000000'),
        ('font-weight', 'bold'),
        ('text-align', 'left'),
        ('padding', '8px'),
    ]
    # Body cells.
    cell_props = [
        ('text-align', 'left'),
        ('padding', '8px'),
    ]
    # Column-heading cells.
    column_heading_props = [
        ('background-color', '#f0f0f0'),
        ('color', '#000000'),
        ('font-weight', 'bold'),
    ]
    table_styles = [
        {'selector': 'th', 'props': header_props},
        {'selector': 'td', 'props': cell_props},
        {'selector': 'th.col_heading', 'props': column_heading_props},
    ]

    def zebra(row):
        # Even index labels get the light-grey stripe; odd ones stay white.
        shade = '#f9f9f9' if row.name % 2 == 0 else '#ffffff'
        return [f'background-color: {shade}'] * len(row)

    styler = df.style.set_table_styles(table_styles)
    return styler.apply(zebra, axis=1)
42
+
43
def df_to_html_table(df, height=400):
    """Render ``df`` as a scrollable, zebra-striped HTML table.

    Args:
        df: DataFrame to render. The index is not shown.
        height: Maximum height of the scroll container, in pixels.

    Returns:
        An HTML string with a sticky header row and one ``<tr>`` per
        DataFrame row. Column labels and cell values are HTML-escaped.
    """
    # Imported locally so the block does not rely on a file-level import.
    from html import escape

    parts = [
        f'<div style="max-height: {height}px; overflow-y: auto; border: 1px solid #d0d0d0; border-radius: 8px;">',
        '<table style="width: 100%; border-collapse: collapse; font-size: 14px;">',
        # Header row: normal font weight, compact vertical padding.
        '<thead><tr style="background-color: #e8e8e8; position: sticky; top: 0; z-index: 1;">',
    ]
    parts.extend(
        f'<th style="padding: 6px 14px; text-align: left; font-weight: normal; font-size: 15px; color: #000; border-bottom: 2px solid #ccc;">{escape(str(col))}</th>'
        for col in df.columns
    )
    parts.append('</tr></thead><tbody>')

    # itertuples keeps each column's own dtype; iterrows would upcast an
    # all-numeric row to float and show integer cells as "1.0".
    for i, row in enumerate(df.itertuples(index=False, name=None)):
        bg = '#f5f5f5' if i % 2 == 0 else '#ffffff'
        parts.append(f'<tr style="background-color: {bg};">')
        # Escape values so "<", ">" and "&" in the data cannot break the markup.
        parts.extend(
            f'<td style="padding: 4px 14px; text-align: left; border-bottom: 1px solid #eee;">{escape(str(val))}</td>'
            for val in row
        )
        parts.append('</tr>')

    parts.append('</tbody></table></div>')
    return ''.join(parts)
63
+
64
+ st.set_page_config(
65
+ page_title="RAGRouter-Bench: A Dataset and Benchmark for Adaptive RAG Routing",
66
+ layout="wide",
67
+ initial_sidebar_state="expanded",
68
+ )
69
+
70
+ #背景颜色
71
+ st.markdown("""
72
+ <style>
73
+ /* 隐藏顶部深色栏 */
74
+ header[data-testid="stHeader"] {
75
+ background-color: #ffffff;
76
+ }
77
+
78
+ /* 左边侧边栏 - 灰色背景 */
79
+ [data-testid="stSidebar"] {
80
+ display: none;
81
+ }
82
+
83
+ /* 右边主内容区 - 白色背景 */
84
+ [data-testid="stMain"] {
85
+ background-color: #ffffff;
86
+ }
87
+
88
+ /* 隐藏顶部 header 的高度 */
89
+ header[data-testid="stHeader"] {
90
+ height: 0 !important;
91
+ min-height: 0 !important;
92
+ padding: 0 !important;
93
+ }
94
+
95
+ /* 减少顶部间距 - 调整这个值 */
96
+ .block-container {
97
+ padding-top: 0 !important; /* 移除顶部留白 */
98
+ max-width: 1200px; /* 最大宽度 */
99
+ padding-left: 2rem; /* 左边距 */
100
+ padding-right: 2rem; /* 右边距 */
101
+ }
102
+
103
+ /* 标签页字体大小和样式 */
104
+ .stTabs [data-baseweb="tab"] p {
105
+ font-size: 18px !important; /* 字体大小 */
106
+ font-weight: bold; /* 加粗 */
107
+ padding: 10px 20px; /* 内边距 */
108
+ color: #333333; /* 字体颜色 */
109
+ }
110
+
111
+ /* 给所有 tabs 区域加边框 */
112
+ [data-testid="stTabs"] {
113
+ border: 2px solid #e0e0e0;
114
+ border-radius: 15px;
115
+ padding: 5px 5px 35px 20px;
116
+ background-color: #fafafa;
117
+ margin-bottom: 5px;
118
+ }
119
+
120
+ /* tabs 固定高度和滚动 (Leaderboard 700px) */
121
+ .stTabs [data-baseweb="tab-panel"] {
122
+ max-height: 600px !important;
123
+ overflow-y: auto !important;
124
+ }
125
+
126
+ /* 表格内容左对齐 - glide-data-grid */
127
+ [data-testid="stDataFrame"] .dvn-scroller,
128
+ [data-testid="stDataFrame"] [class*="cell"],
129
+ [data-testid="stDataFrame"] div[style*="justify-content"] {
130
+ text-align: left !important;
131
+ justify-content: flex-start !important;
132
+ }
133
+
134
+ /* glide data editor 单元格 */
135
+ .gdg-cell {
136
+ justify-content: flex-start !important;
137
+ }
138
+
139
+ code {
140
+ background-color: transparent !important;
141
+ color: #333 !important;
142
+ }
143
+
144
+ pre {
145
+ background-color: #f5f5f5 !important;
146
+ color: #333 !important;
147
+ }
148
+
149
+ pre code {
150
+ background-color: transparent !important;
151
+ color: #333 !important;
152
+ }
153
+ </style>
154
+ """, unsafe_allow_html=True)
155
+
156
+ #标题
157
+ title_icon = get_image_base64("utils/title_icon.png")
158
+ st.markdown(f"""
159
+ <div style="background-color: #f0f0f0;
160
+ padding: 20px 20px;
161
+ margin: 0 -30rem 20px -30rem;">
162
+ <h1 style="text-align: center;
163
+ font-size: 36px;
164
+ background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
165
+ -webkit-background-clip: text;
166
+ -webkit-text-fill-color: transparent;
167
+ padding: 5px;
168
+ margin: 0;">
169
+ <img src="data:image/png;base64,{title_icon}" width="45" style="vertical-align: middle; margin-right: 1px;">
170
+ RAGRouter-Bench:<br> A Dataset and Benchmark for Adaptive RAG Routing
171
+ </h1>
172
+ </div>
173
+ """, unsafe_allow_html=True)
174
+
175
# Statistics banner. The third item counts queries (7.7K), not query types;
# the benchmark text elsewhere on the page lists three query types.
st.markdown("""
<div style="
background-color: #e8f4fc;
border: 2px solid #b8d4e8;
border-radius: 15px;
margin: 0 auto 30px auto;
max-width: 100%;
text-align: center;
font-size: 18px;
color: #333;
">
<span style="margin: 0 5px;"><strong>📚 4 Corpus Domains</strong></span>|
<span style="margin: 0 5px;"><strong>📄 21K Documents</strong></span>|
<span style="margin: 0 5px;"><strong>❓ 7.7K Queries</strong></span>|
<span style="margin: 0 5px;"><strong>📊 3 Dimension Evaluations</strong></span>|
<span style="margin: 0 5px;"><strong>🔄 5 RAG Paradigms</strong></span>|
<span style="margin: 0 5px;"><strong>🤖 2 LLMs Tested</strong></span>
</div>
""", unsafe_allow_html=True)
195
+
196
+ # 主内容 - 添加锚点ID
197
+ # About 部分
198
+ with st.container():
199
+ about_icon = get_image_base64("utils/about_icon.png")
200
+
201
+ st.markdown(f"""
202
+ <h2 id="about" style="color: #333333;
203
+ padding-bottom: 10px;
204
+ font-family: 'Ubuntu Mono', monospace;
205
+ font-size: 30px;">
206
+ <img src="data:image/png;base64,{about_icon}" width="30" style="vertical-align: middle; margin-right: 1px;">
207
+ About
208
+ </h2>
209
+ """, unsafe_allow_html=True)
210
+
211
+ # About 内的标签页
212
+ about_tab1, about_tab2, about_tab3 = st.tabs(["📋 Overview", "⭐ Key Features", "🚀 Get Started"])
213
+
214
+ with about_tab1:
215
+ pipeline_img = get_image_base64("utils/Overall_Pipeline.png")
216
+
217
+ st.markdown(f"""
218
+ <div style="padding-right: 10px;">
219
+ <div style="text-align: center; margin: 1px 0;">
220
+ <img src="data:image/png;base64,{pipeline_img}" width="50%" style="border-radius: 10px;">
221
+ <p style="color: #666; font-size: 16px;">Overall Pipeline</p>
222
+ </div>
223
+ <div style="font-size: 16px; line-height: 1.4; color: #333; text-align: justify;">
224
+ <p>Retrieval-Augmented Generation (RAG) has become a core paradigm for grounding large language models with external knowledge.
225
+ Despite extensive efforts exploring diverse retrieval strategies, <strong>existing studies predominantly focus on query-side complexity or isolated method improvements, lacking a systematic understanding of how RAG paradigms behave across different query–corpus contexts and effectiveness–efficiency trade-offs</strong>.
226
+ In this work, we introduce RAGRouter-Bench, the first dataset and benchmark designed for adaptive RAG routing.
227
+ RAGRouter-Bench revisits retrieval from a query–corpus compatibility perspective and standardizes five representative RAG paradigms for systematic evaluation across 7,727 queries and 21,460 documents spanning diverse domains.
228
+ The benchmark incorporates three canonical query types together with fine-grained semantic and structural corpus metrics, as well as a unified evaluation for both generation quality and resource consumption.
229
+ Experiments with DeepSeek-V3 and LLaMA-3.1-8B demonstrate that <strong>no single RAG paradigm is universally optimal, that paradigm applicability is strongly shaped by query–corpus interactions, and that increased advanced mechanism does not necessarily yield better effectiveness–efficiency trade-offs</strong>.
230
+ These findings underscore the necessity of routing-aware evaluation and establish a foundation for adaptive, interpretable, and generalizable next-generation RAG systems.</p>
231
+ </div>
232
+ </div>
233
+ """, unsafe_allow_html=True)
234
+
235
+ with about_tab2:
236
+ bench_img = get_image_base64("utils/Data_Profile.png")
237
+
238
# Key Features panel: benchmark figure followed by the feature lists.
# Fixes relative to the original markup: the paradigm list item is now
# closed with </li>, a space follows its colon, and the section heading is
# pluralized ("Five RAG Paradigms").
st.markdown(f"""
<div style="padding-right: 10px;">
<div style="text-align: center; margin: 1px 0;">
<img src="data:image/png;base64,{bench_img}" width="70%">
<p style="color: #666; font-size: 16px;">Benchmark Features</p>
</div>
<div style="font-size: 16px; line-height: 1.4; color: #333; text-align: left;">
<p style="margin-top: 1px; margin-bottom: 1px;"><strong>🌐 Multi-Domain Corpora</strong></p>
<ul>
<li><strong>Wikipedia (MuSiQue)</strong>: Encyclopedic knowledge with explicit entity relations (5,427 documents)</li>
<li><strong>Literature (QuALITY)</strong>: Long-form narratives with implicit semantic structures (2,523 documents)</li>
<li><strong>Legal (UltraDomain)</strong>: Professional domain with dense terminology (6,510 documents)</li>
<li><strong>Medical (GraphRAG-Bench)</strong>: Specialized knowledge requiring precise reasoning (7,000 documents)</li>
</ul>
<p style="margin-top: 1px; margin-bottom: 1px;"><strong>❓ Three Query Types</strong></p>
<ul>
<li><strong>Factual Queries</strong>: Single-hop lookup requiring direct fact retrieval</li>
<li><strong>Reasoning Queries</strong>: Multi-hop inference across chained evidence (2-4 hops)</li>
<li><strong>Summary Queries</strong>: Global aggregation over dispersed information</li>
</ul>
<p style="margin-top: 1px; margin-bottom: 1px;"><strong>🔄 Five RAG Paradigms</strong></p>
<ul>
<li><strong>RAG Paradigm</strong>: LLM-only, NaiveRAG, GraphRAG, HybridRAG, IterativeRAG</li>
</ul>
<p style="margin-top: 1px; margin-bottom: 1px;"><strong>📊 Dual-View Corpus Evaluation</strong></p>
<ul>
<li><strong>Structural Metrics</strong>: Connectivity (LCC Ratio, Relation Types), Density (Avg Degree, Max Centrality), Clustering Coefficient</li>
<li><strong>Semantic Metrics</strong>: Intrinsic Dimension, Dispersion (Avg/Min/Std Distance), Hubness</li>
<li><strong>Quality Assurance</strong>: LLM-based query augmentation with Verify-then-Filter validation</li>
</ul>
<p style="margin-top: 1px; margin-bottom: 1px;"><strong>⚖️ Effectiveness-Efficiency Evaluation</strong></p>
<ul>
<li><strong>Effectiveness</strong>: LLM-as-a-Judge accuracy across three dimensions (Information Coverage, Semantic Accuracy, Logical Consistency)</li>
<li><strong>Efficiency</strong>: Token consumption decomposed into Retrieval Cost and Generation Cost</li>
</ul>
</div>
</div>
""", unsafe_allow_html=True)
276
+
277
+ with about_tab3:
278
+ paradigms_img = get_image_base64("utils/RAG_Paradigms.png")
279
+
280
+ st.markdown(f"""
281
+ <div style="padding-right: 10px; font-size: 16px; line-height: 1.4; color: #333;">
282
+ <div style="text-align: center; margin: 1px 0;">
283
+ <img src="data:image/png;base64,{paradigms_img}" width="60%">
284
+ <p style="color: #666; font-size: 16px;">RAG Paradigm</p>
285
+ </div>
286
+
287
+ <a href="https://your-dataset-link.com" style="color: #667eea;" target="_blank">📥 Download RAGRouter-Bench Dataset</a>
288
+
289
+ <p style="margin-top: 1px; margin-bottom: 5px;"><strong>💻 Installation</strong></p>
290
+ <pre style="background-color: #f0f7ff !important; padding: 10px; border-radius: 5px; overflow-x: auto; border: 1px solid #cce0ff;">
291
+ <code style="background-color: transparent !important; color: #333 !important; font-family: 'Courier New', monospace !important;">git clone https://github.com/your-repo/RAGRouter-Bench
292
+ cd RAGRouter-Bench
293
+ conda env create -f environment.yml
294
+ conda activate ragBench</code></pre>
295
+
296
+ <p style="margin-top: 1px; margin-bottom: 5px;"><strong>⚙️ Configuration</strong></p>
297
+ <ul>
298
+ <li>Set your API key in <code>Config/LLMConfig.py</code> (<code>DEEPSEEK_API_KEY</code> or <code>OPENAI_API_KEY</code>)</li>
299
+ </ul>
300
+
301
+ <p style="margin-top: 1px; margin-bottom: 5px;"><strong>🚀 Quick Start</strong></p>
302
+ <table style="width: 100%; border-collapse: collapse; margin: 10px 0;">
303
+ <tr style="background-color: #f0f0f0;">
304
+ <th style="border: 1px solid #ddd; padding: 8px; text-align: left;">Step</th>
305
+ <th style="border: 1px solid #ddd; padding: 8px; text-align: left;">Command</th>
306
+ <th style="border: 1px solid #ddd; padding: 8px; text-align: left;">Description</th>
307
+ </tr>
308
+ <tr>
309
+ <td style="border: 1px solid #ddd; padding: 8px;">1. Process</td>
310
+ <td style="border: 1px solid #ddd; padding: 8px;"><code>python main.py process all --dataset musique</code></td>
311
+ <td style="border: 1px solid #ddd; padding: 8px;">Chunking, embedding, graph building</td>
312
+ </tr>
313
+ <tr>
314
+ <td style="border: 1px solid #ddd; padding: 8px;">2. Retrieve</td>
315
+ <td style="border: 1px solid #ddd; padding: 8px;"><code>python main.py retrieve graph --dataset musique</code></td>
316
+ <td style="border: 1px solid #ddd; padding: 8px;">Run RAG retrieval</td>
317
+ </tr>
318
+ <tr>
319
+ <td style="border: 1px solid #ddd; padding: 8px;">3. Evaluate</td>
320
+ <td style="border: 1px solid #ddd; padding: 8px;"><code>python main.py evaluate result --dataset musique --method graph_rag</code></td>
321
+ <td style="border: 1px solid #ddd; padding: 8px;">Evaluate results</td>
322
+ </tr>
323
+ <tr>
324
+ <td style="border: 1px solid #ddd; padding: 8px;">Full Pipeline</td>
325
+ <td style="border: 1px solid #ddd; padding: 8px;"><code>python main.py pipeline --dataset musique --method graph</code></td>
326
+ <td style="border: 1px solid #ddd; padding: 8px;">Run all steps</td>
327
+ </tr>
328
+ </table>
329
+
330
+ <p style="margin-top: 15px; margin-bottom: 5px;"><strong>Available Datasets</strong></p>
331
+ <ul>
332
+ <li><code>musique</code> - Wikipedia (Encyclopedic)</li>
333
+ <li><code>quality</code> - Literature (Narrative)</li>
334
+ <li><code>ultraDomain_legal</code> - Legal (Professional)</li>
335
+ <li><code>graphragBench_medical</code> - Medical (Professional)</li>
336
+ </ul>
337
+
338
+ <p style="margin-top: 1px; margin-bottom: 5px;"><strong>Available RAG Paradigms</strong></p>
339
+ <ul>
340
+ <li><code>naive</code> - NaiveRAG (vector retrieval)</li>
341
+ <li><code>graph</code> - GraphRAG (graph traversal)</li>
342
+ <li><code>hybrid</code> - HybridRAG (naive + graph fusion)</li>
343
+ <li><code>iterative</code> - IterativeRAG (multi-round retrieval)</li>
344
+ <li><code>llm_direct</code> - LLM-only (no retrieval)</li>
345
+ </ul>
346
+
347
+ <p style="margin-top: 1px; margin-bottom: 5px;"><strong>Data Format</strong></p>
348
+ <p>Your data should be placed in <code>Dataset/Rawutils/{{dataset_name}}/</code> with:</p>
349
+ <ul>
350
+ <li><code>Corpus.json</code> - Document collection with <code>doc_id</code>, <code>title</code>, <code>text</code></li>
351
+ <li><code>Question.json</code> - Queries with <code>question_id</code>, <code>question</code>, <code>answer</code>, <code>query_type</code>, <code>supporting_facts</code></li>
352
+ </ul>
353
+ </div>
354
+ """, unsafe_allow_html=True)
355
+
356
+
357
+
358
+ # Leaderboard 部分
359
+ leaderboard_icon = get_image_base64("utils/leaderboard_icon.png")
360
+
361
+ st.markdown(f"""
362
+ <h2 id="leaderboard" style="color: #333333;
363
+ padding-bottom: 10px;
364
+ margin-top: 10px;
365
+ font-family: 'Ubuntu Mono', monospace;
366
+ font-size: 30px;">
367
+ <img src="data:image/png;base64,{leaderboard_icon}" width="30" style="vertical-align: middle; margin-right: 1px;">
368
+ Leaderboard
369
+ </h2>
370
+ """, unsafe_allow_html=True)
371
+
372
+ # Leaderboard 内的标签页
373
+ lb_tab1, lb_tab2, lb_tab3, lb_tab4 = st.tabs(["🏆 Full Leaderboard", "📁 Corpus Metrics", "📈 Effectiveness Metrics", "⚡ Efficiency Metrics"])
374
+
375
+ with lb_tab1:
376
+ # Full Leaderboard Explanation
377
+ st.markdown("""
378
+ <div style="font-size: 15px; line-height: 1.5; color: #333; margin-bottom: 20px;">
379
+ <p style="font-weight: bold; margin-bottom: 10px;">📋 Columns Explained:</p>
380
+ <ul style="margin: 0; padding-left: 20px;">
381
+ <li><strong>Dataset</strong>: Corpus domain (MuSiQue-Wikipedia, QuALITY-Literature, Legal, Medical).</li>
382
+ <li><strong>Method</strong>: RAG paradigm (NaiveRAG, GraphRAG, HybridRAG, IterativeRAG).</li>
383
+ <li><strong>Factual</strong>: LLM-as-a-Judge accuracy (%) on factual queries (single-hop fact retrieval). <em>Higher is better</em>.</li>
384
+ <li><strong>Reasoning</strong>: LLM-as-a-Judge accuracy (%) on reasoning queries (multi-hop inference, 2-4 hops). <em>Higher is better</em>.</li>
385
+ <li><strong>Summary</strong>: LLM-as-a-Judge accuracy (%) on summary queries (global information aggregation). <em>Higher is better</em>.</li>
386
+ <li><strong>Avg Acc</strong>: Average accuracy (%) across all three query types. <em>Higher is better</em>.</li>
387
+ <li><strong>Token</strong>: Average token consumption per query. <em>Lower is more efficient</em>.</li>
388
+ </ul>
389
+ </div>
390
+ """, unsafe_allow_html=True)
391
+
392
+ df_full = pd.read_csv("utils/full_lb.csv")
393
+
394
+ col1_f, col2_f, col3_f, col4_f = st.columns([2, 2, 2, 3])
395
+
396
+ with col1_f:
397
+ model_select_f = st.selectbox(
398
+ "Model",
399
+ options=["All"] + df_full["Model"].unique().tolist(),
400
+ index=0,
401
+ key="model_full"
402
+ )
403
+
404
+ with col2_f:
405
+ sort_by_f = st.selectbox(
406
+ "Sort by",
407
+ options=df_full.columns.tolist(),
408
+ index=df_full.columns.tolist().index("Avg Acc"),
409
+ key="sort_full"
410
+ )
411
+
412
+ with col3_f:
413
+ order_f = st.radio(
414
+ "Order",
415
+ options=["Descending", "Ascending"],
416
+ horizontal=True,
417
+ key="order_full"
418
+ )
419
+
420
+ with col4_f:
421
+ search_f = st.text_input("Search", placeholder="Search in all columns...", key="search_full")
422
+
423
# Filter, sort and render the full leaderboard from a copy of the loaded frame.
df_display_f = df_full.copy()

# Restrict to one model unless the "All" option is selected.
if model_select_f != "All":
    df_display_f = df_display_f[df_display_f["Model"] == model_select_f]

if search_f:
    # Match the query as literal text. With the default regex=True, input
    # such as "(" or "[" is an invalid pattern and raises re.error.
    mask_f = df_display_f.apply(
        lambda row: row.astype(str).str.contains(search_f, case=False, regex=False).any(),
        axis=1,
    )
    df_display_f = df_display_f[mask_f]

ascending_f = order_f == "Ascending"
df_display_f = df_display_f.sort_values(by=sort_by_f, ascending=ascending_f).reset_index(drop=True)

st.markdown(df_to_html_table(df_display_f), unsafe_allow_html=True)
436
+
437
+ with lb_tab2:
438
+ # Structure Metrics Explanation
439
# Explanation of the structural-topology columns. The "Nodes" label is now
# wrapped in <strong> like every other item in the list.
st.markdown("""
<div style="font-size: 15px; line-height: 1.5; color: #333; margin-bottom: 20px;">
<p style="font-weight: bold; margin-bottom: 10px;">🔗 Structural Topology Metrics:</p>
<ul style="margin: 0; padding-left: 20px;">
<li><strong>Nodes</strong>: Number of nodes in the knowledge graph.</li>
<li><strong>Edges</strong>: Number of edges in the knowledge graph.</li>
<li><strong>Density</strong>: Edge saturation level. <em>Excessive sparsity limits relational bridges</em>.</li>
<li><strong>Rel_Types (Relation Type Diversity)</strong>: Semantic richness of edges for precise graph traversal.</li>
<li><strong>Avg_Deg (Average Degree)</strong>: Average connections per node, reflecting connection intensity.</li>
<li><strong>Comp (Connected Components)</strong>: Number of independent subgraphs.</li>
<li><strong>LCC_Ratio (Largest Connected Component Ratio)</strong>: Proportion of nodes in the largest subgraph. <em>Low values indicate graph fragmentation that breaks multi-hop paths</em>.</li>
<li><strong>Cluster_Coeff (Clustering Coefficient)</strong>: Local cohesiveness. <em>High values indicate tight communities that facilitate evidence aggregation</em>.</li>
</ul>
</div>
""", unsafe_allow_html=True)
454
+
455
+ df_structure = pd.read_csv("utils/corpus_structure.csv")
456
+ col1_s, col2_s, col3_s = st.columns([2, 2, 3])
457
+ with col1_s:
458
+ sort_by_s = st.selectbox(
459
+ "Sort by",
460
+ options=df_structure.columns.tolist(),
461
+ index=0,
462
+ key="sort_structure"
463
+ )
464
+
465
+ with col2_s:
466
+ order_s = st.radio(
467
+ "Order",
468
+ options=["Descending", "Ascending"],
469
+ horizontal=True,
470
+ key="order_structure"
471
+ )
472
+
473
+ with col3_s:
474
+ search_s = st.text_input("Search", placeholder="Search in all columns...", key="search_structure")
475
+
476
# Filter, sort and render the structural-metrics table from a copy of the loaded frame.
df_display_s = df_structure.copy()
if search_s:
    # Match the query as literal text. With the default regex=True, input
    # such as "(" or "[" is an invalid pattern and raises re.error.
    mask_s = df_display_s.apply(
        lambda row: row.astype(str).str.contains(search_s, case=False, regex=False).any(),
        axis=1,
    )
    df_display_s = df_display_s[mask_s]
ascending_s = order_s == "Ascending"
df_display_s = df_display_s.sort_values(by=sort_by_s, ascending=ascending_s).reset_index(drop=True)
st.markdown(df_to_html_table(df_display_s, height=200), unsafe_allow_html=True)
483
+
484
+ # Semantic Metrics Explanation
485
+ st.markdown("""
486
+ <div style="font-size: 15px; line-height: 1.5; color: #333; margin-top: 30px; margin-bottom: 20px;">
487
+ <p style="font-weight: bold; margin-bottom: 10px;">🧠 Semantic Space Metrics:</p>
488
+ <ul style="margin: 0; padding-left: 20px;">
489
+ <li><strong>Chunks</strong>: Number of text chunks in the corpus.</li>
490
+ <li><strong>Int_Dim (Intrinsic Dimension)</strong>: Effective degrees of freedom estimated via TwoNN. <em>High dimensionality exacerbates the curse of dimensionality, diminishing distance-based similarity</em>.</li>
491
+ <li><strong>Hubness</strong>: Skewness of k-occurrence distribution, measuring retrieval interference. <em>High values indicate hub vectors that dominate nearest-neighbor lists, causing bias toward frequently retrieved but potentially irrelevant passages</em>.</li>
492
+ <li><strong>Avg_Dist (Average Distance)</strong>: Average distance to centroid, reflecting overall distribution spread.</li>
493
+ <li><strong>Std_Dist (Standard Deviation)</strong>: Distance standard deviation, revealing distributional imbalance. <em>High values indicate uneven distribution</em>.</li>
494
+ <li><strong>Min_Dist (Minimum Distance)</strong>: Distance of closest cluster pair, identifying most confusable semantic regions. <em>Low dispersion causes semantic crowding that hinders hard-negative discrimination</em>.</li>
495
+ <li><strong>Max_Dist (Maximum Distance)</strong>: Distance of farthest cluster pair, reflecting maximum semantic space span.</li>
496
+ </ul>
497
+ </div>
498
+ """, unsafe_allow_html=True)
499
+
500
+ df_semantic = pd.read_csv("utils/corpus_semantic.csv")
501
+ col1_m, col2_m, col3_m = st.columns([2, 2, 3])
502
+
503
+ with col1_m:
504
+ sort_by_m = st.selectbox(
505
+ "Sort by",
506
+ options=df_semantic.columns.tolist(),
507
+ index=0,
508
+ key="sort_semantic"
509
+ )
510
+
511
+ with col2_m:
512
+ order_m = st.radio(
513
+ "Order",
514
+ options=["Descending", "Ascending"],
515
+ horizontal=True,
516
+ key="order_semantic"
517
+ )
518
+
519
+ with col3_m:
520
+ search_m = st.text_input("Search", placeholder="Search in all columns...", key="search_semantic")
521
+
522
# Filter, sort and render the semantic-metrics table from a copy of the loaded frame.
df_display_m = df_semantic.copy()
if search_m:
    # Match the query as literal text. With the default regex=True, input
    # such as "(" or "[" is an invalid pattern and raises re.error.
    mask_m = df_display_m.apply(
        lambda row: row.astype(str).str.contains(search_m, case=False, regex=False).any(),
        axis=1,
    )
    df_display_m = df_display_m[mask_m]
ascending_m = order_m == "Ascending"
df_display_m = df_display_m.sort_values(by=sort_by_m, ascending=ascending_m).reset_index(drop=True)
st.markdown(df_to_html_table(df_display_m, height=200), unsafe_allow_html=True)
529
+
530
+ with lb_tab3:
531
+ # Metrics Explanation
532
+ st.markdown("""
533
+ <div style="font-size: 15px; line-height: 1.5; color: #333; margin-bottom: 20px;">
534
+ <p style="font-weight: bold; margin-bottom: 10px;">📊 Metrics Explained:</p>
535
+ <ul style="margin: 0; padding-left: 20px;">
536
+ <li><strong>Sem_F1 (Semantic F1)</strong>: Token-level semantic similarity between generated and reference answers using BERTScore. Range: 0-1, <em>higher is better</em>.</li>
537
+ <li><strong>COV (Coverage)</strong>: Extent to which the answer covers key information using sentence embeddings. Range: 0-1, <em>higher is better</em>.</li>
538
+ <li><strong>Faith_H (Faithfulness Hard)</strong>: Strict support relationship between answer and retrieved content. Range: 0-1, <em>higher is better</em>.</li>
539
+ <li><strong>Faith_S (Faithfulness Soft)</strong>: Relaxed support relationship between answer and retrieved content. Range: 0-1, <em>higher is better</em>.</li>
540
+ <li><strong>LLM_Cor_Pct (LLM-as-a-Judge)</strong>: Correctness rate via LLM ternary classification, aligned with human judgment. Range: 0-100%, <em>higher is better</em>.</li>
541
+ </ul>
542
+ </div>
543
+ """, unsafe_allow_html=True)
544
+
545
+ # Model files mapping
546
+ model_files = {
547
+ "DeepSeek-V3": "utils/effect_deepseek.csv",
548
+ "Llama-3-8B": "utils/effect_llama.csv"
549
+ }
550
+
551
+ # Controls
552
+ col1_e, col2_e, col3_e, col4_e = st.columns([2, 2, 2, 3])
553
+
554
+ with col1_e:
555
+ model_select = st.selectbox(
556
+ "Model",
557
+ options=list(model_files.keys()),
558
+ index=0,
559
+ key="model_effect"
560
+ )
561
+
562
+ df_effect = pd.read_csv(model_files[model_select])
563
+
564
+ with col2_e:
565
+ sort_by_e = st.selectbox(
566
+ "Sort by",
567
+ options=df_effect.columns.tolist(),
568
+ index=0,
569
+ key="sort_effect"
570
+ )
571
+
572
+ with col3_e:
573
+ order_e = st.radio(
574
+ "Order",
575
+ options=["Descending", "Ascending"],
576
+ horizontal=True,
577
+ key="order_effect"
578
+ )
579
+
580
+ with col4_e:
581
+ search_e = st.text_input("Search", placeholder="Search in all columns...", key="search_effect")
582
+
583
# Filter, sort and render the effectiveness table from a copy of the loaded frame.
df_display_e = df_effect.copy()

if search_e:
    # Match the query as literal text. With the default regex=True, input
    # such as "(" (e.g. typing "Iterative (") is an invalid pattern and
    # raises re.error.
    mask_e = df_display_e.apply(
        lambda row: row.astype(str).str.contains(search_e, case=False, regex=False).any(),
        axis=1,
    )
    df_display_e = df_display_e[mask_e]

ascending_e = order_e == "Ascending"
df_display_e = df_display_e.sort_values(by=sort_by_e, ascending=ascending_e).reset_index(drop=True)

st.markdown(df_to_html_table(df_display_e), unsafe_allow_html=True)
593
+
594
+ with lb_tab4:
595
+ # Cost Explanation
596
+ st.markdown("""
597
+ <div style="font-size: 15px; line-height: 1.5; color: #333; margin-bottom: 20px;">
598
+ <p style="font-weight: bold; margin-bottom: 10px;">💰 Cost Explained:</p>
599
+ <ul style="margin: 0; padding-left: 20px;">
600
+ <li><strong>Total_Tokens</strong>: Total token consumption = Retrieval_Total + Generation_Total.</li>
601
+ <li><strong>Retrieval_Total</strong>: Total tokens in retrieval phase = Retrieval_Input + Retrieval_Output. Includes entity extraction, multi-turn queries. For GraphRAG/HybridRAG, includes amortized one-time graph construction cost.</li>
602
+ <li><strong>Generation_Total</strong>: Total tokens in generation phase = Generation_Input + Generation_Output. Primarily determined by context length.</li>
603
+ <li><strong>Avg_Context_Tokens</strong>: Average retrieved context length per query. <em>Higher means more retrieved content but also higher cost</em>.</li>
604
+ <li><strong>Num_Questions</strong>: Number of queries in the dataset.</li>
605
+ </ul>
606
+ </div>
607
+ """, unsafe_allow_html=True)
608
+
609
+ # Read data
610
+ df_efficiency = pd.read_csv("utils/retrieval_generation_cost.csv")
611
+
612
+ # Controls
613
+ col1, col2, col3 = st.columns([2, 2, 3])
614
+
615
+ with col1:
616
+ sort_by = st.selectbox(
617
+ "Sort by",
618
+ options=df_efficiency.columns.tolist(),
619
+ index=df_efficiency.columns.tolist().index("Total_Tokens") # 默认按 total_tokens 排序
620
+ )
621
+
622
+ with col2:
623
+ order = st.radio(
624
+ "Order",
625
+ options=["Descending", "Ascending"],
626
+ horizontal=True
627
+ )
628
+
629
+ with col3:
630
+ search = st.text_input("Search", placeholder="Search in all columns...")
631
+
632
# Filter, sort and render the efficiency table from a copy of the loaded frame.
df_display = df_efficiency.copy()

if search:
    # Match the query as literal text. With the default regex=True, input
    # such as "(" or "[" is an invalid pattern and raises re.error.
    mask = df_display.apply(
        lambda row: row.astype(str).str.contains(search, case=False, regex=False).any(),
        axis=1,
    )
    df_display = df_display[mask]

ascending = order == "Ascending"
df_display = df_display.sort_values(by=sort_by, ascending=ascending).reset_index(drop=True)
st.markdown(df_to_html_table(df_display), unsafe_allow_html=True)
641
+
642
+
643
+ # Questions & Contact 部分
644
+ contact_icon = get_image_base64("utils/contact_icon.png")
645
+
646
+ st.markdown(f"""
647
+ <h2 id="contact" style="color: #333333;
648
+ padding-bottom: 10px;
649
+ margin-top: 10px;
650
+ font-family: 'Ubuntu Mono', monospace;
651
+ font-size: 30px;">
652
+ <img src="data:image/png;base64,{contact_icon}" width="30" style="vertical-align: middle; margin-right: 1px;">
653
+ Questions & Contact
654
+ </h2>
655
+ """, unsafe_allow_html=True)
656
+
657
+ st.markdown("""
658
+ <div style="
659
+ border: 2px solid #e0e0e0;
660
+ border-radius: 15px;
661
+ padding: 25px 30px;
662
+ background-color: #fafafa;
663
+ margin-bottom: 10px;
664
+ font-size: 16px;
665
+ line-height: 1.6;
666
+ color: #333;
667
+ ">
668
+ <p style="margin-bottom: 5px;">
669
+ If you have any questions about RAGRouter-Bench, please feel free to reach out to us:
670
+ </p>
671
+ <ul style="margin: 0; padding-left: 20px;">
672
+ <li><strong>Email</strong>: <a href="mailto:RAGRouterBench@example.com" style="color: #667eea;">RAGRouterBench@example.com</a></li>
673
+ <li><strong>GitHub</strong>: <a href="https://github.com/your-repo/RAGRouter-Bench" style="color: #667eea;" target="_blank">github.com/your-repo/RAGRouter-Bench</a></li>
674
+ </ul>
675
+ <p style="margin-top: 5px; margin-bottom: 0;">
676
+ For bug reports or feature requests, please open an issue on our GitHub repository.
677
+ </p>
678
+ </div>
679
+ """, unsafe_allow_html=True)
utils/Data_Profile.png ADDED

Git LFS Details

  • SHA256: 0e2579615dfc74caa53e3e0890b88247b9c0d632e4b565a54726bafdf76dc0d9
  • Pointer size: 132 Bytes
  • Size of remote file: 1.07 MB
utils/Overall_Pipeline.png ADDED

Git LFS Details

  • SHA256: 25f59fc73bd2ba2b4b2502de3e3f5436060c34ec7d6b37444e462020607968a2
  • Pointer size: 131 Bytes
  • Size of remote file: 771 kB
utils/RAG_Paradigms.png ADDED

Git LFS Details

  • SHA256: 3d2d28357183e220448a6bd9e75916875f4993bbad981c01391449c1aab4765c
  • Pointer size: 131 Bytes
  • Size of remote file: 559 kB
utils/about_icon.png ADDED
utils/contact_icon.png ADDED
utils/corpus_semantic.csv ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ Rank,Dataset,Chunks,Int_Dim,Hubness,Avg_Dist,Std_Dist,Min_Dist,Max_Dist
2
+ 1,MuSiQue,21153,8.17,1.27,0.708,0.049,0.552,0.924
3
+ 2,QuALITY,3822,10.75,1.26,0.345,0.119,0.186,0.805
4
+ 3,Legal,11632,7.56,1.46,0.300,0.071,0.147,0.792
5
+ 4,Medical,538,8.39,0.86,0.312,0.063,0.196,0.700
utils/corpus_structure.csv ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ Rank,Dataset,Nodes,Edges,Density,Rel_Types,Avg_Deg,Comp,LCC_Ratio,Cluster_Coeff
2
+ 1,MuSiQue,206738,276898,6.00e-06,44766,2.68,7722,0.882,0.0213
3
+ 2,QuALITY,90088,120611,1.50e-05,23828,2.68,3997,0.883,0.0177
4
+ 3,Legal,135231,261207,1.40e-05,28799,3.86,3204,0.933,0.0701
5
+ 4,Medical,14712,21480,9.90e-05,4169,2.92,741,0.861,0.0357
utils/effect_deepseek.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Rank,Dataset,Method,Sem_F1,COV,Faith_H,Faith_S,LLM_Cor_Pct
2
+ 1,MuSiQue,NaiveRAG,0.503,0.362,0.143,0.486,26.4
3
+ 2,MuSiQue,GraphRAG,0.510,0.386,0.114,0.439,30.3
4
+ 3,MuSiQue,HybridRAG,0.613,0.472,-,-,38.6
5
+ 4,MuSiQue,Iterative (Naive),0.469,0.320,-,-,20.4
6
+ 5,MuSiQue,Iterative (Graph),0.512,0.400,-,-,29.2
7
+ 6,QuALITY,NaiveRAG,0.858,0.627,0.009,0.404,48.7
8
+ 7,QuALITY,GraphRAG,0.794,0.546,0.009,0.377,39.3
9
+ 8,QuALITY,HybridRAG,0.738,0.553,0.438,0.146,41.7
10
+ 9,QuALITY,Iterative (Naive),0.724,0.506,-,-,35.8
11
+ 10,QuALITY,Iterative (Graph),0.657,0.446,-,-,28.7
12
+ 11,Legal,NaiveRAG,0.568,0.469,0.145,0.537,32.2
13
+ 12,Legal,GraphRAG,0.530,0.443,0.094,0.510,29.3
14
+ 13,Legal,HybridRAG,0.617,0.520,0.589,0.326,36.1
15
+ 14,Legal,Iterative (Naive),0.572,0.466,-,-,28.5
16
+ 15,Legal,Iterative (Graph),0.534,0.439,-,-,26.0
17
+ 16,Medical,NaiveRAG,0.770,0.599,0.207,0.583,61.1
18
+ 17,Medical,GraphRAG,0.691,0.541,0.250,0.588,53.5
19
+ 18,Medical,HybridRAG,0.792,0.620,0.767,0.358,64.7
20
+ 19,Medical,Iterative (Naive),0.826,0.595,-,-,62.7
21
+ 20,Medical,Iterative (Graph),0.801,0.575,-,-,59.8
utils/effect_llama.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Rank,Dataset,Method,Sem_F1,COV,Faith_H,Faith_S,LLM_Cor_Pct
2
+ 1,MuSiQue,NaiveRAG,0.249,0.161,0.194,0.540,8.6
3
+ 2,MuSiQue,GraphRAG,0.374,0.262,0.185,0.494,18.7
4
+ 3,MuSiQue,HybridRAG,0.406,0.284,0.582,0.178,20.3
5
+ 4,MuSiQue,Iterative (Naive),0.289,0.169,-,-,6.8
6
+ 5,MuSiQue,Iterative (Graph),0.032,0.271,-,-,16.1
7
+ 6,QuALITY,NaiveRAG,0.620,0.475,0.011,0.414,30.7
8
+ 7,QuALITY,GraphRAG,0.501,0.376,0.014,0.385,21.2
9
+ 8,QuALITY,HybridRAG,0.643,0.493,0.283,0.201,33.2
10
+ 9,QuALITY,Iterative (Naive),0.583,0.435,-,-,28.4
11
+ 10,QuALITY,Iterative (Graph),0.528,0.378,-,-,20.4
12
+ 11,Legal,NaiveRAG,0.587,0.473,0.164,0.531,25.8
13
+ 12,Legal,GraphRAG,0.536,0.445,0.146,0.508,25.0
14
+ 13,Legal,HybridRAG,0.622,0.512,0.627,0.349,31.1
15
+ 14,Legal,Iterative (Naive),0.577,0.453,-,-,22.0
16
+ 15,Legal,Iterative (Graph),0.013,0.444,-,-,21.0
17
+ 16,Medical,NaiveRAG,0.732,0.574,0.175,0.575,44.9
18
+ 17,Medical,GraphRAG,0.673,0.532,0.214,0.572,41.7
19
+ 18,Medical,HybridRAG,0.759,0.600,0.813,0.374,48.2
20
+ 19,Medical,Iterative (Naive),0.802,0.582,-,-,43.6
21
+ 20,Medical,Iterative (Graph),0.818,0.595,-,-,43.6
utils/full_lb.csv ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Rank,Model,Dataset,Method,Factual,Reasoning,Summary,Avg Acc,Token
2
+ 1,DeepSeek-V3,MuSiQue,NaiveRAG,11.1,28.3,29.4,26.4,13k
3
+ 2,DeepSeek-V3,MuSiQue,GraphRAG,90.2,22.5,20.6,30.3,9k
4
+ 3,DeepSeek-V3,MuSiQue,HybridRAG,83.7,32.8,30.2,38.6,22k
5
+ 4,DeepSeek-V3,MuSiQue,IterativeRAG,10.3,21.8,21.2,20.4,20k
6
+ 5,DeepSeek-V3,QuALITY,NaiveRAG,83.7,33.8,17.0,48.8,50k
7
+ 6,DeepSeek-V3,QuALITY,GraphRAG,70.7,20.4,19.8,39.3,50k
8
+ 7,DeepSeek-V3,QuALITY,HybridRAG,80.0,20.4,14.8,41.6,99k
9
+ 8,DeepSeek-V3,QuALITY,IterativeRAG,67.0,17.1,16.2,35.8,21k
10
+ 9,DeepSeek-V3,Legal,NaiveRAG,54.9,11.2,39.1,32.2,46k
11
+ 10,DeepSeek-V3,Legal,GraphRAG,61.6,6.7,29.1,29.3,184k
12
+ 11,DeepSeek-V3,Legal,HybridRAG,72.2,11.8,34.6,36.1,230k
13
+ 12,DeepSeek-V3,Legal,IterativeRAG,49.5,12.4,30.4,28.5,20k
14
+ 13,DeepSeek-V3,Medical,NaiveRAG,63.1,63.5,49.1,61.1,51k
15
+ 14,DeepSeek-V3,Medical,GraphRAG,55.0,56.0,43.6,53.5,38k
16
+ 15,DeepSeek-V3,Medical,HybridRAG,67.8,64.0,54.0,64.7,74k
17
+ 16,DeepSeek-V3,Medical,IterativeRAG,62.1,67.8,56.1,62.7,7k
18
+ 17,LLaMA-3.1-8B,MuSiQue,NaiveRAG,10.3,7.9,12.0,8.6,13k
19
+ 18,LLaMA-3.1-8B,MuSiQue,GraphRAG,84.4,9.7,11.4,18.7,9k
20
+ 19,LLaMA-3.1-8B,MuSiQue,HybridRAG,79.9,12.1,13.9,20.3,22k
21
+ 20,LLaMA-3.1-8B,MuSiQue,IterativeRAG,12.3,6.0,6.8,6.8,20k
22
+ 21,LLaMA-3.1-8B,QuALITY,NaiveRAG,69.2,10.9,1.4,30.7,50k
23
+ 22,LLaMA-3.1-8B,QuALITY,GraphRAG,44.7,9.5,2.5,21.2,50k
24
+ 23,LLaMA-3.1-8B,QuALITY,HybridRAG,70.3,15.6,2.5,33.2,99k
25
+ 24,LLaMA-3.1-8B,QuALITY,IterativeRAG,62.3,11.5,1.4,28.4,21k
26
+ 25,LLaMA-3.1-8B,Legal,NaiveRAG,50.3,10.7,22.8,25.8,46k
27
+ 26,LLaMA-3.1-8B,Legal,GraphRAG,55.4,7.4,19.7,25.0,184k
28
+ 27,LLaMA-3.1-8B,Legal,HybridRAG,63.8,13.1,24.2,31.1,230k
29
+ 28,LLaMA-3.1-8B,Legal,IterativeRAG,48.7,10.1,12.6,22.0,20k
30
+ 29,LLaMA-3.1-8B,Medical,NaiveRAG,52.1,37.9,30.1,44.9,51k
31
+ 30,LLaMA-3.1-8B,Medical,GraphRAG,48.1,33.6,31.8,41.7,38k
32
+ 31,LLaMA-3.1-8B,Medical,HybridRAG,55.7,39.9,33.9,48.2,74k
33
+ 32,LLaMA-3.1-8B,Medical,IterativeRAG,47.5,44.6,27.3,43.6,7k
utils/leaderboard_icon.png ADDED
utils/retrieval_generation_cost.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Rank,Dataset,Method,Num_Questions,Avg_Context_Tokens,Avg_Query_Tokens,Retrieval_Input_Tokens,Retrieval_Output_Tokens,Retrieval_Total,Generation_Input_Tokens,Generation_Output_Tokens,Generation_Total,Total_Tokens
2
+ 1,Medical,llm_direct,1896,-,15.5,-,-,-,29338,57176,86514,86514
3
+ 2,Medical,naive,1896,50503.9,15.5,-,-,-,95784821,77714,95862535,95862535
4
+ 3,Medical,graph,1896,37513.3,15.5,250833,153175,404008,71154518,67161,71221679,71625687
5
+ 4,Medical,hybrid,1896,73628.3,15.5,250833,153175,404008,139628589,80880,139709469,140113477
6
+ 5,Medical,iterative,1897,2230.6,15.5,8671831,257213,8929044,4260848,77289,4338137,13267181
7
+ 6,MuSiQue,llm_direct,3356,-,22.3,-,-,-,74925,25238,100163,100163
8
+ 7,MuSiQue,naive,3356,13139.3,22.3,-,-,-,44170465,53233,44223698,44223698
9
+ 8,MuSiQue,graph,3356,8471.7,22.3,2350938,244016,2594954,28505878,42917,28548795,31143749
10
+ 9,MuSiQue,hybrid,3356,21602.4,22.3,2350938,244016,2594954,72572546,55375,72627921,75222875
11
+ 10,MuSiQue,iterative,3357,6364.2,22.3,43242177,723972,43966149,21439701,50772,21490473,65456622
12
+ 11,QuALITY,llm_direct,1198,-,29.0,-,-,-,34716,4038,38754,38754
13
+ 12,QuALITY,naive,1198,49444.3,29.0,-,-,-,59268989,39140,59308129,59308129
14
+ 13,QuALITY,graph,1198,48501.6,29.0,1556111,71840,1627951,58139642,39180,58178822,59806773
15
+ 14,QuALITY,hybrid,1198,97789.1,29.0,1556111,71840,1627951,117186129,34985,117221114,118849065
16
+ 15,QuALITY,iterative,1198,6870.5,29.0,16685428,251443,16936871,8265571,34999,8300570,25237441
17
+ 16,Legal,llm_direct,1277,-,37.0,-,-,-,47290,7891,55181,55181
18
+ 17,Legal,naive,1277,46272.7,37.0,-,-,-,59137514,49921,59187435,59187435
19
+ 18,Legal,graph,1277,179727.7,37.0,4821083,125070,4946153,229559514,43677,229603191,234549344
20
+ 19,Legal,hybrid,1277,225571.4,37.0,4821083,125070,4946153,288101932,50340,288152272,293098425
21
+ 20,Legal,iterative,1278,6460.2,37.1,16810781,291294,17102075,8303522,47288,8350810,25452885
utils/title_icon.png ADDED