Chaplain0908 committed on
Commit
708c90b
·
verified ·
1 Parent(s): d3ac0e9

Delete app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -679
app.py DELETED
@@ -1,679 +0,0 @@
1
- import streamlit as st
2
- import base64
3
- import pandas as pd
4
- import os
5
-
6
# Absolute directory that contains this script; bundled assets live in its
# "utils" sub-folder.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
# Absolute path of the title icon. This path used to be built as a bare
# expression whose result was discarded; binding it to a name makes it usable.
TITLE_ICON_PATH = os.path.join(BASE_DIR, "utils", "title_icon.png")
8
-
9
# Read an image file and convert it to base64.
def get_image_base64(image_path):
    """Return the bytes of ``image_path`` as a base64-encoded ASCII string.

    Args:
        image_path: Path of the file to read. Absolute paths, and relative
            paths that exist relative to the current working directory, are
            used unchanged. A relative path that does not exist there is
            retried relative to the directory containing this script, so the
            app still finds its bundled assets when launched from elsewhere.

    Returns:
        The file contents encoded with standard base64, as ``str``.

    Raises:
        OSError: If the file cannot be opened at any of the locations tried.
    """
    if not os.path.isabs(image_path) and not os.path.exists(image_path):
        script_dir = os.path.dirname(os.path.abspath(__file__))
        candidate = os.path.join(script_dir, image_path)
        # Only switch when the fallback really exists; otherwise keep the
        # caller's path so the error message names what was asked for.
        if os.path.exists(candidate):
            image_path = candidate

    with open(image_path, "rb") as f:
        # base64 output is pure ASCII, so decode explicitly as such.
        return base64.b64encode(f.read()).decode("ascii")
13
-
14
- # 设置 dataframe 样式:斑马纹 + 表头黑色加粗
15
- def style_dataframe(df):
16
- def row_style(row):
17
- if row.name % 2 == 0:
18
- return ['background-color: #f9f9f9'] * len(row)
19
- return ['background-color: #ffffff'] * len(row)
20
-
21
- return df.style.set_table_styles([
22
- # 表头样式
23
- {'selector': 'th', 'props': [
24
- ('background-color', '#f0f0f0'),
25
- ('color', '#000000'),
26
- ('font-weight', 'bold'),
27
- ('text-align', 'left'),
28
- ('padding', '8px')
29
- ]},
30
- # 单元格样式
31
- {'selector': 'td', 'props': [
32
- ('text-align', 'left'),
33
- ('padding', '8px')
34
- ]},
35
- # 表头文字样式
36
- {'selector': 'th.col_heading', 'props': [
37
- ('background-color', '#f0f0f0'),
38
- ('color', '#000000'),
39
- ('font-weight', 'bold')
40
- ]}
41
- ]).apply(row_style, axis=1)
42
-
43
def df_to_html_table(df, height=400):
    """Render ``df`` as a scrollable, zebra-striped HTML table.

    Column names and cell values are converted with ``str`` and HTML-escaped
    before being inserted, so data containing ``<``, ``>``, ``&`` or quotes
    cannot break or inject markup when the result is rendered as raw HTML.
    Rows are read column-wise via ``itertuples`` so integer columns are not
    upcast to floats the way ``iterrows`` does for all-numeric frames.

    Args:
        df: The DataFrame to render. The index is not rendered.
        height: Maximum height of the scroll container, in pixels.

    Returns:
        A single HTML string: a bordered ``<div>`` wrapping a ``<table>``
        with a sticky header row.
    """
    # Local import: the module name ``html`` is not imported at file level.
    from html import escape

    parts = [
        f'<div style="max-height: {height}px; overflow-y: auto; border: 1px solid #d0d0d0; border-radius: 8px;">',
        '<table style="width: 100%; border-collapse: collapse; font-size: 14px;">',
        # Header row: normal font weight, reduced vertical padding.
        '<thead><tr style="background-color: #e8e8e8; position: sticky; top: 0; z-index: 1;">',
    ]
    parts.extend(
        f'<th style="padding: 6px 14px; text-align: left; font-weight: normal; font-size: 15px; color: #000; border-bottom: 2px solid #ccc;">{escape(str(col))}</th>'
        for col in df.columns
    )
    parts.append('</tr></thead><tbody>')

    # Body rows: reduced vertical padding, alternating background by position.
    for i, row in enumerate(df.itertuples(index=False, name=None)):
        bg = '#f5f5f5' if i % 2 == 0 else '#ffffff'
        parts.append(f'<tr style="background-color: {bg};">')
        parts.extend(
            f'<td style="padding: 4px 14px; text-align: left; border-bottom: 1px solid #eee;">{escape(str(val))}</td>'
            for val in row
        )
        parts.append('</tr>')

    parts.append('</tbody></table></div>')
    return ''.join(parts)
63
-
64
- st.set_page_config(
65
- page_title="RAGRouter-Bench: A Dataset and Benchmark for Adaptive RAG Routing",
66
- layout="wide",
67
- initial_sidebar_state="expanded",
68
- )
69
-
70
- #背景颜色
71
- st.markdown("""
72
- <style>
73
- /* 隐藏顶部深色栏 */
74
- header[data-testid="stHeader"] {
75
- background-color: #ffffff;
76
- }
77
-
78
- /* 左边侧边栏 - 灰色背景 */
79
- [data-testid="stSidebar"] {
80
- display: none;
81
- }
82
-
83
- /* 右边主内容区 - 白色背景 */
84
- [data-testid="stMain"] {
85
- background-color: #ffffff;
86
- }
87
-
88
- /* 隐藏顶部 header 的高度 */
89
- header[data-testid="stHeader"] {
90
- height: 0 !important;
91
- min-height: 0 !important;
92
- padding: 0 !important;
93
- }
94
-
95
- /* 减少顶部间距 - 调整这个值 */
96
- .block-container {
97
- padding-top: 0 !important; /* 移除顶部留白 */
98
- max-width: 1200px; /* 最大宽度 */
99
- padding-left: 2rem; /* 左边距 */
100
- padding-right: 2rem; /* 右边距 */
101
- }
102
-
103
- /* 标签页字体大小和样式 */
104
- .stTabs [data-baseweb="tab"] p {
105
- font-size: 18px !important; /* 字体大小 */
106
- font-weight: bold; /* 加粗 */
107
- padding: 10px 20px; /* 内边距 */
108
- color: #333333; /* 字体颜色 */
109
- }
110
-
111
- /* 给所有 tabs 区域加边框 */
112
- [data-testid="stTabs"] {
113
- border: 2px solid #e0e0e0;
114
- border-radius: 15px;
115
- padding: 5px 5px 35px 20px;
116
- background-color: #fafafa;
117
- margin-bottom: 5px;
118
- }
119
-
120
- /* tabs 固定高度和滚动 (Leaderboard 700px) */
121
- .stTabs [data-baseweb="tab-panel"] {
122
- max-height: 600px !important;
123
- overflow-y: auto !important;
124
- }
125
-
126
- /* 表格内容左对齐 - glide-data-grid */
127
- [data-testid="stDataFrame"] .dvn-scroller,
128
- [data-testid="stDataFrame"] [class*="cell"],
129
- [data-testid="stDataFrame"] div[style*="justify-content"] {
130
- text-align: left !important;
131
- justify-content: flex-start !important;
132
- }
133
-
134
- /* glide data editor 单元格 */
135
- .gdg-cell {
136
- justify-content: flex-start !important;
137
- }
138
-
139
- code {
140
- background-color: transparent !important;
141
- color: #333 !important;
142
- }
143
-
144
- pre {
145
- background-color: #f5f5f5 !important;
146
- color: #333 !important;
147
- }
148
-
149
- pre code {
150
- background-color: transparent !important;
151
- color: #333 !important;
152
- }
153
- </style>
154
- """, unsafe_allow_html=True)
155
-
156
- #标题
157
- title_icon = get_image_base64("utils/title_icon.png")
158
- st.markdown(f"""
159
- <div style="background-color: #f0f0f0;
160
- padding: 20px 20px;
161
- margin: 0 -30rem 20px -30rem;">
162
- <h1 style="text-align: center;
163
- font-size: 36px;
164
- background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
165
- -webkit-background-clip: text;
166
- -webkit-text-fill-color: transparent;
167
- padding: 5px;
168
- margin: 0;">
169
- <img src="data:image/png;base64,{title_icon}" width="45" style="vertical-align: middle; margin-right: 1px;">
170
- RAGRouter-Bench:<br> A Dataset and Benchmark for Adaptive RAG Routing
171
- </h1>
172
- </div>
173
- """, unsafe_allow_html=True)
174
-
175
- # 统计横幅
176
- st.markdown("""
177
- <div style="
178
- background-color: #e8f4fc;
179
- border: 2px solid #b8d4e8;
180
- border-radius: 15px;
181
- margin: 0 auto 30px auto;
182
- max-width: 100%;
183
- text-align: center;
184
- font-size: 18px;
185
- color: #333;
186
- ">
187
- <span style="margin: 0 5px;"><strong>📚 4 Corpus Domains</strong></span>|
188
- <span style="margin: 0 5px;"><strong>📄 21K Documents</strong></span>|
189
- <span style="margin: 0 5px;"><strong>❓ 7.7K Query Types</strong></span>|
190
- <span style="margin: 0 5px;"><strong>📊 3 Dimension Evaluations</strong></span>|
191
- <span style="margin: 0 5px;"><strong>🔄 5 RAG Paradigms</strong></span>|
192
- <span style="margin: 0 5px;"><strong>🤖 2 LLMs Tested</strong></span>
193
- </div>
194
- """, unsafe_allow_html=True)
195
-
196
- # 主内容 - 添加锚点ID
197
- # About 部分
198
- with st.container():
199
- about_icon = get_image_base64("utils/about_icon.png")
200
-
201
- st.markdown(f"""
202
- <h2 id="about" style="color: #333333;
203
- padding-bottom: 10px;
204
- font-family: 'Ubuntu Mono', monospace;
205
- font-size: 30px;">
206
- <img src="data:image/png;base64,{about_icon}" width="30" style="vertical-align: middle; margin-right: 1px;">
207
- About
208
- </h2>
209
- """, unsafe_allow_html=True)
210
-
211
- # About 内的标签页
212
- about_tab1, about_tab2, about_tab3 = st.tabs(["📋 Overview", "⭐ Key Features", "🚀 Get Started"])
213
-
214
- with about_tab1:
215
- pipeline_img = get_image_base64("utils/Overall_Pipeline.png")
216
-
217
- st.markdown(f"""
218
- <div style="padding-right: 10px;">
219
- <div style="text-align: center; margin: 1px 0;">
220
- <img src="data:image/png;base64,{pipeline_img}" width="50%" style="border-radius: 10px;">
221
- <p style="color: #666; font-size: 16px;">Overall Pipeline</p>
222
- </div>
223
- <div style="font-size: 16px; line-height: 1.4; color: #333; text-align: justify;">
224
- <p>Retrieval-Augmented Generation (RAG) has become a core paradigm for grounding large language models with external knowledge.
225
- Despite extensive efforts exploring diverse retrieval strategies, <strong>existing studies predominantly focus on query-side complexity or isolated method improvements, lacking a systematic understanding of how RAG paradigms behave across different query–corpus contexts and effectiveness–efficiency trade-offs</strong>.
226
- In this work, we introduce RAGRouter-Bench, the first dataset and benchmark designed for adaptive RAG routing.
227
- RAGRouter-Bench revisits retrieval from a query–corpus compatibility perspective and standardizes five representative RAG paradigms for systematic evaluation across 7,727 queries and 21,460 documents spanning diverse domains.
228
- The benchmark incorporates three canonical query types together with fine-grained semantic and structural corpus metrics, as well as a unified evaluation for both generation quality and resource consumption.
229
- Experiments with DeepSeek-V3 and LLaMA-3.1-8B demonstrate that <strong>no single RAG paradigm is universally optimal, that paradigm applicability is strongly shaped by query–corpus interactions, and that increased advanced mechanism does not necessarily yield better effectiveness–efficiency trade-offs</strong>.
230
- These findings underscore the necessity of routing-aware evaluation and establish a foundation for adaptive, interpretable, and generalizable next-generation RAG systems.</p>
231
- </div>
232
- </div>
233
- """, unsafe_allow_html=True)
234
-
235
- with about_tab2:
236
- bench_img = get_image_base64("utils/Data_Profile.png")
237
-
238
- st.markdown(f"""
239
- <div style="padding-right: 10px;">
240
- <div style="text-align: center; margin: 1px 0;">
241
- <img src="data:image/png;base64,{bench_img}" width="70%">
242
- <p style="color: #666; font-size: 16px;">Benchmark Features</p>
243
- </div>
244
- <div style="font-size: 16px; line-height: 1.4; color: #333; text-align: left;">
245
- <p style="margin-top: 1px; margin-bottom: 1px;"><strong>🌐 Multi-Domain Corpora</strong></p>
246
- <ul>
247
- <li><strong>Wikipedia (MuSiQue)</strong>: Encyclopedic knowledge with explicit entity relations (5,427 documents)</li>
248
- <li><strong>Literature (QuALITY)</strong>: Long-form narratives with implicit semantic structures (2,523 documents)</li>
249
- <li><strong>Legal (UltraDomain)</strong>: Professional domain with dense terminology (6,510 documents)</li>
250
- <li><strong>Medical (GraphRAG-Bench)</strong>: Specialized knowledge requiring precise reasoning (7,000 documents)</li>
251
- </ul>
252
- <p style="margin-top: 1px; margin-bottom: 1px;"><strong>❓ Three Query Types</strong></p>
253
- <ul>
254
- <li><strong>Factual Queries</strong>: Single-hop lookup requiring direct fact retrieval</li>
255
- <li><strong>Reasoning Queries</strong>: Multi-hop inference across chained evidence (2-4 hops)</li>
256
- <li><strong>Summary Queries</strong>: Global aggregation over dispersed information</li>
257
- </ul>
258
- <p style="margin-top: 1px; margin-bottom: 1px;"><strong>🔄 Five RAG Paradigm</strong></p>
259
- <ul>
260
- <li><strong>RAG Paradigm</strong>:LLM-only, NaiveRAG, GraphRAG, HybridRAG, IterativeRAG
261
- </ul>
262
- <p style="margin-top: 1px; margin-bottom: 1px;"><strong>📊 Dual-View Corpus Evaluation</strong></p>
263
- <ul>
264
- <li><strong>Structural Metrics</strong>: Connectivity (LCC Ratio, Relation Types), Density (Avg Degree, Max Centrality), Clustering Coefficient</li>
265
- <li><strong>Semantic Metrics</strong>: Intrinsic Dimension, Dispersion (Avg/Min/Std Distance), Hubness</li>
266
- <li><strong>Quality Assurance</strong>: LLM-based query augmentation with Verify-then-Filter validation</li>
267
- </ul>
268
- <p style="margin-top: 1px; margin-bottom: 1px;"><strong>⚖️ Effectiveness-Efficiency Evaluation</strong></p>
269
- <ul>
270
- <li><strong>Effectiveness</strong>: LLM-as-a-Judge accuracy across three dimensions (Information Coverage, Semantic Accuracy, Logical Consistency)</li>
271
- <li><strong>Efficiency</strong>: Token consumption decomposed into Retrieval Cost and Generation Cost</li>
272
- </ul>
273
- </div>
274
- </div>
275
- """, unsafe_allow_html=True)
276
-
277
- with about_tab3:
278
- paradigms_img = get_image_base64("utils/RAG_Paradigms.png")
279
-
280
- st.markdown(f"""
281
- <div style="padding-right: 10px; font-size: 16px; line-height: 1.4; color: #333;">
282
- <div style="text-align: center; margin: 1px 0;">
283
- <img src="data:image/png;base64,{paradigms_img}" width="60%">
284
- <p style="color: #666; font-size: 16px;">RAG Paradigm</p>
285
- </div>
286
-
287
- <a href="https://your-dataset-link.com" style="color: #667eea;" target="_blank">📥 Download RAGRouter-Bench Dataset</a>
288
-
289
- <p style="margin-top: 1px; margin-bottom: 5px;"><strong>💻 Installation</strong></p>
290
- <pre style="background-color: #f0f7ff !important; padding: 10px; border-radius: 5px; overflow-x: auto; border: 1px solid #cce0ff;">
291
- <code style="background-color: transparent !important; color: #333 !important; font-family: 'Courier New', monospace !important;">git clone https://github.com/your-repo/RAGRouter-Bench
292
- cd RAGRouter-Bench
293
- conda env create -f environment.yml
294
- conda activate ragBench</code></pre>
295
-
296
- <p style="margin-top: 1px; margin-bottom: 5px;"><strong>⚙️ Configuration</strong></p>
297
- <ul>
298
- <li>Set your API key in <code>Config/LLMConfig.py</code> (<code>DEEPSEEK_API_KEY</code> or <code>OPENAI_API_KEY</code>)</li>
299
- </ul>
300
-
301
- <p style="margin-top: 1px; margin-bottom: 5px;"><strong>🚀 Quick Start</strong></p>
302
- <table style="width: 100%; border-collapse: collapse; margin: 10px 0;">
303
- <tr style="background-color: #f0f0f0;">
304
- <th style="border: 1px solid #ddd; padding: 8px; text-align: left;">Step</th>
305
- <th style="border: 1px solid #ddd; padding: 8px; text-align: left;">Command</th>
306
- <th style="border: 1px solid #ddd; padding: 8px; text-align: left;">Description</th>
307
- </tr>
308
- <tr>
309
- <td style="border: 1px solid #ddd; padding: 8px;">1. Process</td>
310
- <td style="border: 1px solid #ddd; padding: 8px;"><code>python main.py process all --dataset musique</code></td>
311
- <td style="border: 1px solid #ddd; padding: 8px;">Chunking, embedding, graph building</td>
312
- </tr>
313
- <tr>
314
- <td style="border: 1px solid #ddd; padding: 8px;">2. Retrieve</td>
315
- <td style="border: 1px solid #ddd; padding: 8px;"><code>python main.py retrieve graph --dataset musique</code></td>
316
- <td style="border: 1px solid #ddd; padding: 8px;">Run RAG retrieval</td>
317
- </tr>
318
- <tr>
319
- <td style="border: 1px solid #ddd; padding: 8px;">3. Evaluate</td>
320
- <td style="border: 1px solid #ddd; padding: 8px;"><code>python main.py evaluate result --dataset musique --method graph_rag</code></td>
321
- <td style="border: 1px solid #ddd; padding: 8px;">Evaluate results</td>
322
- </tr>
323
- <tr>
324
- <td style="border: 1px solid #ddd; padding: 8px;">Full Pipeline</td>
325
- <td style="border: 1px solid #ddd; padding: 8px;"><code>python main.py pipeline --dataset musique --method graph</code></td>
326
- <td style="border: 1px solid #ddd; padding: 8px;">Run all steps</td>
327
- </tr>
328
- </table>
329
-
330
- <p style="margin-top: 15px; margin-bottom: 5px;"><strong>Available Datasets</strong></p>
331
- <ul>
332
- <li><code>musique</code> - Wikipedia (Encyclopedic)</li>
333
- <li><code>quality</code> - Literature (Narrative)</li>
334
- <li><code>ultraDomain_legal</code> - Legal (Professional)</li>
335
- <li><code>graphragBench_medical</code> - Medical (Professional)</li>
336
- </ul>
337
-
338
- <p style="margin-top: 1px; margin-bottom: 5px;"><strong>Available RAG Paradigms</strong></p>
339
- <ul>
340
- <li><code>naive</code> - NaiveRAG (vector retrieval)</li>
341
- <li><code>graph</code> - GraphRAG (graph traversal)</li>
342
- <li><code>hybrid</code> - HybridRAG (naive + graph fusion)</li>
343
- <li><code>iterative</code> - IterativeRAG (multi-round retrieval)</li>
344
- <li><code>llm_direct</code> - LLM-only (no retrieval)</li>
345
- </ul>
346
-
347
- <p style="margin-top: 1px; margin-bottom: 5px;"><strong>Data Format</strong></p>
348
- <p>Your data should be placed in <code>Dataset/Rawutils/{{dataset_name}}/</code> with:</p>
349
- <ul>
350
- <li><code>Corpus.json</code> - Document collection with <code>doc_id</code>, <code>title</code>, <code>text</code></li>
351
- <li><code>Question.json</code> - Queries with <code>question_id</code>, <code>question</code>, <code>answer</code>, <code>query_type</code>, <code>supporting_facts</code></li>
352
- </ul>
353
- </div>
354
- """, unsafe_allow_html=True)
355
-
356
-
357
-
358
- # Leaderboard 部分
359
- leaderboard_icon = get_image_base64("utils/leaderboard_icon.png")
360
-
361
- st.markdown(f"""
362
- <h2 id="leaderboard" style="color: #333333;
363
- padding-bottom: 10px;
364
- margin-top: 10px;
365
- font-family: 'Ubuntu Mono', monospace;
366
- font-size: 30px;">
367
- <img src="data:image/png;base64,{leaderboard_icon}" width="30" style="vertical-align: middle; margin-right: 1px;">
368
- Leaderboard
369
- </h2>
370
- """, unsafe_allow_html=True)
371
-
372
- # Leaderboard 内的标签页
373
- lb_tab1, lb_tab2, lb_tab3, lb_tab4 = st.tabs(["🏆 Full Leaderboard", "📁 Corpus Metrics", "📈 Effectiveness Metrics", "⚡ Efficiency Metrics"])
374
-
375
- with lb_tab1:
376
- # Full Leaderboard Explanation
377
- st.markdown("""
378
- <div style="font-size: 15px; line-height: 1.5; color: #333; margin-bottom: 20px;">
379
- <p style="font-weight: bold; margin-bottom: 10px;">📋 Columns Explained:</p>
380
- <ul style="margin: 0; padding-left: 20px;">
381
- <li><strong>Dataset</strong>: Corpus domain (MuSiQue-Wikipedia, QuALITY-Literature, Legal, Medical).</li>
382
- <li><strong>Method</strong>: RAG paradigm (NaiveRAG, GraphRAG, HybridRAG, IterativeRAG).</li>
383
- <li><strong>Factual</strong>: LLM-as-a-Judge accuracy (%) on factual queries (single-hop fact retrieval). <em>Higher is better</em>.</li>
384
- <li><strong>Reasoning</strong>: LLM-as-a-Judge accuracy (%) on reasoning queries (multi-hop inference, 2-4 hops). <em>Higher is better</em>.</li>
385
- <li><strong>Summary</strong>: LLM-as-a-Judge accuracy (%) on summary queries (global information aggregation). <em>Higher is better</em>.</li>
386
- <li><strong>Avg Acc</strong>: Average accuracy (%) across all three query types. <em>Higher is better</em>.</li>
387
- <li><strong>Token</strong>: Average token consumption per query. <em>Lower is more efficient</em>.</li>
388
- </ul>
389
- </div>
390
- """, unsafe_allow_html=True)
391
-
392
- df_full = pd.read_csv("utils/full_lb.csv")
393
-
394
- col1_f, col2_f, col3_f, col4_f = st.columns([2, 2, 2, 3])
395
-
396
- with col1_f:
397
- model_select_f = st.selectbox(
398
- "Model",
399
- options=["All"] + df_full["Model"].unique().tolist(),
400
- index=0,
401
- key="model_full"
402
- )
403
-
404
- with col2_f:
405
- sort_by_f = st.selectbox(
406
- "Sort by",
407
- options=df_full.columns.tolist(),
408
- index=df_full.columns.tolist().index("Avg Acc"),
409
- key="sort_full"
410
- )
411
-
412
- with col3_f:
413
- order_f = st.radio(
414
- "Order",
415
- options=["Descending", "Ascending"],
416
- horizontal=True,
417
- key="order_full"
418
- )
419
-
420
- with col4_f:
421
- search_f = st.text_input("Search", placeholder="Search in all columns...", key="search_full")
422
-
423
# Work on a copy so the loaded leaderboard frame is never modified.
df_display_f = df_full.copy()

if model_select_f != "All":
    df_display_f = df_display_f[df_display_f["Model"] == model_select_f]

if search_f:
    # Case-insensitive *literal* substring match across every column.
    # regex=False keeps user input such as "(" or "[" from raising re.error
    # and stops "." / "+" from being interpreted as pattern syntax.
    mask_f = df_display_f.apply(
        lambda row: row.astype(str).str.contains(search_f, case=False, regex=False).any(),
        axis=1,
    )
    df_display_f = df_display_f[mask_f]

ascending_f = order_f == "Ascending"
df_display_f = df_display_f.sort_values(by=sort_by_f, ascending=ascending_f).reset_index(drop=True)

st.markdown(df_to_html_table(df_display_f), unsafe_allow_html=True)
436
-
437
- with lb_tab2:
438
- # Structure Metrics Explanation
439
- st.markdown("""
440
- <div style="font-size: 15px; line-height: 1.5; color: #333; margin-bottom: 20px;">
441
- <p style="font-weight: bold; margin-bottom: 10px;">🔗 Structural Topology Metrics:</p>
442
- <ul style="margin: 0; padding-left: 20px;">
443
- <li>Nodes: Number of nodes in the knowledge graph.</li>
444
- <li><strong>Edges</strong>: Number of edges in the knowledge graph.</li>
445
- <li><strong>Density</strong>: Edge saturation level. <em>Excessive sparsity limits relational bridges</em>.</li>
446
- <li><strong>Rel_Types (Relation Type Diversity)</strong>: Semantic richness of edges for precise graph traversal.</li>
447
- <li><strong>Avg_Deg (Average Degree)</strong>: Average connections per node, reflecting connection intensity.</li>
448
- <li><strong>Comp (Connected Components)</strong>: Number of independent subgraphs.</li>
449
- <li><strong>LCC_Ratio (Largest Connected Component Ratio)</strong>: Proportion of nodes in the largest subgraph. <em>Low values indicate graph fragmentation that breaks multi-hop paths</em>.</li>
450
- <li><strong>Cluster_Coeff (Clustering Coefficient)</strong>: Local cohesiveness. <em>High values indicate tight communities that facilitate evidence aggregation</em>.</li>
451
- </ul>
452
- </div>
453
- """, unsafe_allow_html=True)
454
-
455
- df_structure = pd.read_csv("utils/corpus_structure.csv")
456
- col1_s, col2_s, col3_s = st.columns([2, 2, 3])
457
- with col1_s:
458
- sort_by_s = st.selectbox(
459
- "Sort by",
460
- options=df_structure.columns.tolist(),
461
- index=0,
462
- key="sort_structure"
463
- )
464
-
465
- with col2_s:
466
- order_s = st.radio(
467
- "Order",
468
- options=["Descending", "Ascending"],
469
- horizontal=True,
470
- key="order_structure"
471
- )
472
-
473
- with col3_s:
474
- search_s = st.text_input("Search", placeholder="Search in all columns...", key="search_structure")
475
-
476
# Work on a copy so the loaded structural-metrics frame is never modified.
df_display_s = df_structure.copy()
if search_s:
    # Case-insensitive *literal* substring match across every column.
    # regex=False keeps user input such as "(" or "[" from raising re.error
    # and stops "." / "+" from being interpreted as pattern syntax.
    mask_s = df_display_s.apply(
        lambda row: row.astype(str).str.contains(search_s, case=False, regex=False).any(),
        axis=1,
    )
    df_display_s = df_display_s[mask_s]
ascending_s = order_s == "Ascending"
df_display_s = df_display_s.sort_values(by=sort_by_s, ascending=ascending_s).reset_index(drop=True)
st.markdown(df_to_html_table(df_display_s, height=200), unsafe_allow_html=True)
483
-
484
- # Semantic Metrics Explanation
485
- st.markdown("""
486
- <div style="font-size: 15px; line-height: 1.5; color: #333; margin-top: 30px; margin-bottom: 20px;">
487
- <p style="font-weight: bold; margin-bottom: 10px;">🧠 Semantic Space Metrics:</p>
488
- <ul style="margin: 0; padding-left: 20px;">
489
- <li><strong>Chunks</strong>: Number of text chunks in the corpus.</li>
490
- <li><strong>Int_Dim (Intrinsic Dimension)</strong>: Effective degrees of freedom estimated via TwoNN. <em>High dimensionality exacerbates the curse of dimensionality, diminishing distance-based similarity</em>.</li>
491
- <li><strong>Hubness</strong>: Skewness of k-occurrence distribution, measuring retrieval interference. <em>High values indicate hub vectors that dominate nearest-neighbor lists, causing bias toward frequently retrieved but potentially irrelevant passages</em>.</li>
492
- <li><strong>Avg_Dist (Average Distance)</strong>: Average distance to centroid, reflecting overall distribution spread.</li>
493
- <li><strong>Std_Dist (Standard Deviation)</strong>: Distance standard deviation, revealing distributional imbalance. <em>High values indicate uneven distribution</em>.</li>
494
- <li><strong>Min_Dist (Minimum Distance)</strong>: Distance of closest cluster pair, identifying most confusable semantic regions. <em>Low dispersion causes semantic crowding that hinders hard-negative discrimination</em>.</li>
495
- <li><strong>Max_Dist (Maximum Distance)</strong>: Distance of farthest cluster pair, reflecting maximum semantic space span.</li>
496
- </ul>
497
- </div>
498
- """, unsafe_allow_html=True)
499
-
500
- df_semantic = pd.read_csv("utils/corpus_semantic.csv")
501
- col1_m, col2_m, col3_m = st.columns([2, 2, 3])
502
-
503
- with col1_m:
504
- sort_by_m = st.selectbox(
505
- "Sort by",
506
- options=df_semantic.columns.tolist(),
507
- index=0,
508
- key="sort_semantic"
509
- )
510
-
511
- with col2_m:
512
- order_m = st.radio(
513
- "Order",
514
- options=["Descending", "Ascending"],
515
- horizontal=True,
516
- key="order_semantic"
517
- )
518
-
519
- with col3_m:
520
- search_m = st.text_input("Search", placeholder="Search in all columns...", key="search_semantic")
521
-
522
# Work on a copy so the loaded semantic-metrics frame is never modified.
df_display_m = df_semantic.copy()
if search_m:
    # Case-insensitive *literal* substring match across every column.
    # regex=False keeps user input such as "(" or "[" from raising re.error
    # and stops "." / "+" from being interpreted as pattern syntax.
    mask_m = df_display_m.apply(
        lambda row: row.astype(str).str.contains(search_m, case=False, regex=False).any(),
        axis=1,
    )
    df_display_m = df_display_m[mask_m]
ascending_m = order_m == "Ascending"
df_display_m = df_display_m.sort_values(by=sort_by_m, ascending=ascending_m).reset_index(drop=True)
st.markdown(df_to_html_table(df_display_m, height=200), unsafe_allow_html=True)
529
-
530
- with lb_tab3:
531
- # Metrics Explanation
532
- st.markdown("""
533
- <div style="font-size: 15px; line-height: 1.5; color: #333; margin-bottom: 20px;">
534
- <p style="font-weight: bold; margin-bottom: 10px;">📊 Metrics Explained:</p>
535
- <ul style="margin: 0; padding-left: 20px;">
536
- <li><strong>Sem_F1 (Semantic F1)</strong>: Token-level semantic similarity between generated and reference answers using BERTScore. Range: 0-1, <em>higher is better</em>.</li>
537
- <li><strong>COV (Coverage)</strong>: Extent to which the answer covers key information using sentence embeddings. Range: 0-1, <em>higher is better</em>.</li>
538
- <li><strong>Faith_H (Faithfulness Hard)</strong>: Strict support relationship between answer and retrieved content. Range: 0-1, <em>higher is better</em>.</li>
539
- <li><strong>Faith_S (Faithfulness Soft)</strong>: Relaxed support relationship between answer and retrieved content. Range: 0-1, <em>higher is better</em>.</li>
540
- <li><strong>LLM_Cor_Pct (LLM-as-a-Judge)</strong>: Correctness rate via LLM ternary classification, aligned with human judgment. Range: 0-100%, <em>higher is better</em>.</li>
541
- </ul>
542
- </div>
543
- """, unsafe_allow_html=True)
544
-
545
- # Model files mapping
546
- model_files = {
547
- "DeepSeek-V3": "utils/effect_deepseek.csv",
548
- "Llama-3-8B": "utils/effect_llama.csv"
549
- }
550
-
551
- # Controls
552
- col1_e, col2_e, col3_e, col4_e = st.columns([2, 2, 2, 3])
553
-
554
- with col1_e:
555
- model_select = st.selectbox(
556
- "Model",
557
- options=list(model_files.keys()),
558
- index=0,
559
- key="model_effect"
560
- )
561
-
562
- df_effect = pd.read_csv(model_files[model_select])
563
-
564
- with col2_e:
565
- sort_by_e = st.selectbox(
566
- "Sort by",
567
- options=df_effect.columns.tolist(),
568
- index=0,
569
- key="sort_effect"
570
- )
571
-
572
- with col3_e:
573
- order_e = st.radio(
574
- "Order",
575
- options=["Descending", "Ascending"],
576
- horizontal=True,
577
- key="order_effect"
578
- )
579
-
580
- with col4_e:
581
- search_e = st.text_input("Search", placeholder="Search in all columns...", key="search_effect")
582
-
583
# Work on a copy so the loaded effectiveness frame is never modified.
df_display_e = df_effect.copy()

if search_e:
    # Case-insensitive *literal* substring match across every column.
    # regex=False keeps user input such as "(" or "[" from raising re.error
    # and stops "." / "+" from being interpreted as pattern syntax.
    mask_e = df_display_e.apply(
        lambda row: row.astype(str).str.contains(search_e, case=False, regex=False).any(),
        axis=1,
    )
    df_display_e = df_display_e[mask_e]

ascending_e = order_e == "Ascending"
df_display_e = df_display_e.sort_values(by=sort_by_e, ascending=ascending_e).reset_index(drop=True)

st.markdown(df_to_html_table(df_display_e), unsafe_allow_html=True)
593
-
594
- with lb_tab4:
595
- # Cost Explanation
596
- st.markdown("""
597
- <div style="font-size: 15px; line-height: 1.5; color: #333; margin-bottom: 20px;">
598
- <p style="font-weight: bold; margin-bottom: 10px;">💰 Cost Explained:</p>
599
- <ul style="margin: 0; padding-left: 20px;">
600
- <li><strong>Total_Tokens</strong>: Total token consumption = Retrieval_Total + Generation_Total.</li>
601
- <li><strong>Retrieval_Total</strong>: Total tokens in retrieval phase = Retrieval_Input + Retrieval_Output. Includes entity extraction, multi-turn queries. For GraphRAG/HybridRAG, includes amortized one-time graph construction cost.</li>
602
- <li><strong>Generation_Total</strong>: Total tokens in generation phase = Generation_Input + Generation_Output. Primarily determined by context length.</li>
603
- <li><strong>Avg_Context_Tokens</strong>: Average retrieved context length per query. <em>Higher means more retrieved content but also higher cost</em>.</li>
604
- <li><strong>Num_Questions</strong>: Number of queries in the dataset.</li>
605
- </ul>
606
- </div>
607
- """, unsafe_allow_html=True)
608
-
609
- # Read data
610
- df_efficiency = pd.read_csv("utils/retrieval_generation_cost.csv")
611
-
612
- # Controls
613
- col1, col2, col3 = st.columns([2, 2, 3])
614
-
615
- with col1:
616
- sort_by = st.selectbox(
617
- "Sort by",
618
- options=df_efficiency.columns.tolist(),
619
- index=df_efficiency.columns.tolist().index("Total_Tokens") # 默认按 total_tokens 排序
620
- )
621
-
622
- with col2:
623
- order = st.radio(
624
- "Order",
625
- options=["Descending", "Ascending"],
626
- horizontal=True
627
- )
628
-
629
- with col3:
630
- search = st.text_input("Search", placeholder="Search in all columns...")
631
-
632
# Work on a copy so the loaded cost frame is never modified.
df_display = df_efficiency.copy()

if search:
    # Case-insensitive *literal* substring match across every column.
    # regex=False keeps user input such as "(" or "[" from raising re.error
    # and stops "." / "+" from being interpreted as pattern syntax.
    mask = df_display.apply(
        lambda row: row.astype(str).str.contains(search, case=False, regex=False).any(),
        axis=1,
    )
    df_display = df_display[mask]

ascending = order == "Ascending"
df_display = df_display.sort_values(by=sort_by, ascending=ascending).reset_index(drop=True)
st.markdown(df_to_html_table(df_display), unsafe_allow_html=True)
641
-
642
-
643
- # Questions & Contact 部分
644
- contact_icon = get_image_base64("utils/contact_icon.png")
645
-
646
- st.markdown(f"""
647
- <h2 id="contact" style="color: #333333;
648
- padding-bottom: 10px;
649
- margin-top: 10px;
650
- font-family: 'Ubuntu Mono', monospace;
651
- font-size: 30px;">
652
- <img src="data:image/png;base64,{contact_icon}" width="30" style="vertical-align: middle; margin-right: 1px;">
653
- Questions & Contact
654
- </h2>
655
- """, unsafe_allow_html=True)
656
-
657
- st.markdown("""
658
- <div style="
659
- border: 2px solid #e0e0e0;
660
- border-radius: 15px;
661
- padding: 25px 30px;
662
- background-color: #fafafa;
663
- margin-bottom: 10px;
664
- font-size: 16px;
665
- line-height: 1.6;
666
- color: #333;
667
- ">
668
- <p style="margin-bottom: 5px;">
669
- If you have any questions about RAGRouter-Bench, please feel free to reach out to us:
670
- </p>
671
- <ul style="margin: 0; padding-left: 20px;">
672
- <li><strong>Email</strong>: <a href="mailto:RAGRouterBench@example.com" style="color: #667eea;">RAGRouterBench@example.com</a></li>
673
- <li><strong>GitHub</strong>: <a href="https://github.com/your-repo/RAGRouter-Bench" style="color: #667eea;" target="_blank">github.com/your-repo/RAGRouter-Bench</a></li>
674
- </ul>
675
- <p style="margin-top: 5px; margin-bottom: 0;">
676
- For bug reports or feature requests, please open an issue on our GitHub repository.
677
- </p>
678
- </div>
679
- """, unsafe_allow_html=True)