Spaces:
Runtime error
Runtime error
Upload 14 files
Browse files- .gitattributes +3 -0
- app.py +679 -0
- utils/Data_Profile.png +3 -0
- utils/Overall_Pipeline.png +3 -0
- utils/RAG_Paradigms.png +3 -0
- utils/about_icon.png +0 -0
- utils/contact_icon.png +0 -0
- utils/corpus_semantic.csv +5 -0
- utils/corpus_structure.csv +5 -0
- utils/effect_deepseek.csv +21 -0
- utils/effect_llama.csv +21 -0
- utils/full_lb.csv +33 -0
- utils/leaderboard_icon.png +0 -0
- utils/retrieval_generation_cost.csv +21 -0
- utils/title_icon.png +0 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
utils/Data_Profile.png filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
utils/Overall_Pipeline.png filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
utils/RAG_Paradigms.png filter=lfs diff=lfs merge=lfs -text
|
app.py
ADDED
|
@@ -0,0 +1,679 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import base64
|
| 3 |
+
import pandas as pd
|
| 4 |
+
import os
|
| 5 |
+
|
| 6 |
+
# Absolute directory that contains this script; lets asset paths be built
# without depending on the process's current working directory.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
# This path used to be computed and immediately discarded (a bare expression
# statement). Bind it to a name so the value is actually usable.
TITLE_ICON_PATH = os.path.join(BASE_DIR, "utils", "title_icon.png")
|
| 8 |
+
|
| 9 |
+
# 读取图片并转为 base64
|
| 10 |
+
def get_image_base64(image_path):
    """Read a file and return its contents as a base64-encoded string.

    Args:
        image_path: Path of the file to read. A relative path is first tried
            as given (relative to the current working directory); if nothing
            exists there, it is tried relative to the directory containing
            this script.

    Returns:
        The file's bytes encoded as base64 text, suitable for embedding in a
        ``data:`` URI.

    Raises:
        FileNotFoundError: If the file cannot be found at either location.
    """
    resolved = image_path
    # Fall back to the script's own directory so the app still finds its
    # assets when it is launched from a different working directory.
    if not os.path.isabs(resolved) and not os.path.exists(resolved):
        script_dir = os.path.dirname(os.path.abspath(__file__))
        beside_script = os.path.join(script_dir, resolved)
        if os.path.exists(beside_script):
            resolved = beside_script

    with open(resolved, "rb") as f:
        return base64.b64encode(f.read()).decode()
|
| 13 |
+
|
| 14 |
+
# 设置 dataframe 样式:斑马纹 + 表头黑色加粗
|
| 15 |
+
def style_dataframe(df):
    """Return a pandas Styler for ``df`` with zebra rows and a bold header.

    Rows alternate between two background colours based on their *position*
    in the frame, so striping works for any index (string labels, or integer
    labels left over after filtering), not only a default 0..n-1 index.

    Args:
        df: The DataFrame to style.

    Returns:
        The ``df.style`` Styler with table-level header/cell styles set and
        the per-row background colours applied.
    """
    # Even positions get the light grey stripe, odd positions stay white.
    stripe_css = ('background-color: #f9f9f9', 'background-color: #ffffff')

    def zebra(data):
        """Build a frame of CSS strings shaped like ``data``, striped by row position."""
        n_rows, n_cols = data.shape
        grid = [[stripe_css[pos % 2]] * n_cols for pos in range(n_rows)]
        return pd.DataFrame(grid, index=data.index, columns=data.columns)

    return df.style.set_table_styles([
        # Header cells
        {'selector': 'th', 'props': [
            ('background-color', '#f0f0f0'),
            ('color', '#000000'),
            ('font-weight', 'bold'),
            ('text-align', 'left'),
            ('padding', '8px')
        ]},
        # Body cells
        {'selector': 'td', 'props': [
            ('text-align', 'left'),
            ('padding', '8px')
        ]},
        # Column-heading text
        {'selector': 'th.col_heading', 'props': [
            ('background-color', '#f0f0f0'),
            ('color', '#000000'),
            ('font-weight', 'bold')
        ]}
    ]).apply(zebra, axis=None)
|
| 42 |
+
|
| 43 |
+
def df_to_html_table(df, height=400, escape=True):
    """Render ``df`` as a scrollable, zebra-striped HTML table string.

    The table sits inside a ``<div>`` with a maximum height and vertical
    scrolling; the header row is sticky. The DataFrame index is not rendered.

    Args:
        df: The DataFrame to render; one ``<th>`` per column and one ``<tr>``
            per row.
        height: Maximum height of the scroll container, in pixels.
        escape: When true (the default), column names and cell values are
            HTML-escaped so characters such as ``<`` and ``&`` display as text
            instead of being parsed as markup. Pass ``False`` only when the
            frame intentionally contains trusted HTML.

    Returns:
        The HTML markup as a single string.
    """
    # Local import: the module does not import ``html`` at file level.
    from html import escape as _escape_html

    def as_text(value):
        """Format a value like an f-string would, escaping it if requested."""
        text = f"{value}"
        return _escape_html(text) if escape else text

    parts = [
        f'<div style="max-height: {height}px; overflow-y: auto; border: 1px solid #d0d0d0; border-radius: 8px;">',
        '<table style="width: 100%; border-collapse: collapse; font-size: 14px;">',
        # Header row: normal font weight, slightly reduced vertical padding.
        '<thead><tr style="background-color: #e8e8e8; position: sticky; top: 0; z-index: 1;">',
    ]
    parts.extend(
        f'<th style="padding: 6px 14px; text-align: left; font-weight: normal; font-size: 15px; color: #000; border-bottom: 2px solid #ccc;">{as_text(col)}</th>'
        for col in df.columns
    )
    parts.append('</tr></thead><tbody>')

    # Body rows: stripe by position; iterrows() keeps the original per-row
    # value formatting.
    for i, (_, row) in enumerate(df.iterrows()):
        bg = '#f5f5f5' if i % 2 == 0 else '#ffffff'
        parts.append(f'<tr style="background-color: {bg};">')
        parts.extend(
            f'<td style="padding: 4px 14px; text-align: left; border-bottom: 1px solid #eee;">{as_text(val)}</td>'
            for val in row
        )
        parts.append('</tr>')

    parts.append('</tbody></table></div>')
    return "".join(parts)
|
| 63 |
+
|
| 64 |
+
st.set_page_config(
|
| 65 |
+
page_title="RAGRouter-Bench: A Dataset and Benchmark for Adaptive RAG Routing",
|
| 66 |
+
layout="wide",
|
| 67 |
+
initial_sidebar_state="expanded",
|
| 68 |
+
)
|
| 69 |
+
|
| 70 |
+
#背景颜色
|
| 71 |
+
st.markdown("""
|
| 72 |
+
<style>
|
| 73 |
+
/* 隐藏顶部深色栏 */
|
| 74 |
+
header[data-testid="stHeader"] {
|
| 75 |
+
background-color: #ffffff;
|
| 76 |
+
}
|
| 77 |
+
|
| 78 |
+
/* 左边侧边栏 - 灰色背景 */
|
| 79 |
+
[data-testid="stSidebar"] {
|
| 80 |
+
display: none;
|
| 81 |
+
}
|
| 82 |
+
|
| 83 |
+
/* 右边主内容区 - 白色背景 */
|
| 84 |
+
[data-testid="stMain"] {
|
| 85 |
+
background-color: #ffffff;
|
| 86 |
+
}
|
| 87 |
+
|
| 88 |
+
/* 隐藏顶部 header 的高度 */
|
| 89 |
+
header[data-testid="stHeader"] {
|
| 90 |
+
height: 0 !important;
|
| 91 |
+
min-height: 0 !important;
|
| 92 |
+
padding: 0 !important;
|
| 93 |
+
}
|
| 94 |
+
|
| 95 |
+
/* 减少顶部间距 - 调整这个值 */
|
| 96 |
+
.block-container {
|
| 97 |
+
padding-top: 0 !important; /* 移除顶部留白 */
|
| 98 |
+
max-width: 1200px; /* 最大宽度 */
|
| 99 |
+
padding-left: 2rem; /* 左边距 */
|
| 100 |
+
padding-right: 2rem; /* 右边距 */
|
| 101 |
+
}
|
| 102 |
+
|
| 103 |
+
/* 标签页字体大小和样式 */
|
| 104 |
+
.stTabs [data-baseweb="tab"] p {
|
| 105 |
+
font-size: 18px !important; /* 字体大小 */
|
| 106 |
+
font-weight: bold; /* 加粗 */
|
| 107 |
+
padding: 10px 20px; /* 内边距 */
|
| 108 |
+
color: #333333; /* 字体颜色 */
|
| 109 |
+
}
|
| 110 |
+
|
| 111 |
+
/* 给所有 tabs 区域加边框 */
|
| 112 |
+
[data-testid="stTabs"] {
|
| 113 |
+
border: 2px solid #e0e0e0;
|
| 114 |
+
border-radius: 15px;
|
| 115 |
+
padding: 5px 5px 35px 20px;
|
| 116 |
+
background-color: #fafafa;
|
| 117 |
+
margin-bottom: 5px;
|
| 118 |
+
}
|
| 119 |
+
|
| 120 |
+
/* tabs 固定高度和滚动 (Leaderboard 700px) */
|
| 121 |
+
.stTabs [data-baseweb="tab-panel"] {
|
| 122 |
+
max-height: 600px !important;
|
| 123 |
+
overflow-y: auto !important;
|
| 124 |
+
}
|
| 125 |
+
|
| 126 |
+
/* 表格内容左对齐 - glide-data-grid */
|
| 127 |
+
[data-testid="stDataFrame"] .dvn-scroller,
|
| 128 |
+
[data-testid="stDataFrame"] [class*="cell"],
|
| 129 |
+
[data-testid="stDataFrame"] div[style*="justify-content"] {
|
| 130 |
+
text-align: left !important;
|
| 131 |
+
justify-content: flex-start !important;
|
| 132 |
+
}
|
| 133 |
+
|
| 134 |
+
/* glide data editor 单元格 */
|
| 135 |
+
.gdg-cell {
|
| 136 |
+
justify-content: flex-start !important;
|
| 137 |
+
}
|
| 138 |
+
|
| 139 |
+
code {
|
| 140 |
+
background-color: transparent !important;
|
| 141 |
+
color: #333 !important;
|
| 142 |
+
}
|
| 143 |
+
|
| 144 |
+
pre {
|
| 145 |
+
background-color: #f5f5f5 !important;
|
| 146 |
+
color: #333 !important;
|
| 147 |
+
}
|
| 148 |
+
|
| 149 |
+
pre code {
|
| 150 |
+
background-color: transparent !important;
|
| 151 |
+
color: #333 !important;
|
| 152 |
+
}
|
| 153 |
+
</style>
|
| 154 |
+
""", unsafe_allow_html=True)
|
| 155 |
+
|
| 156 |
+
#标题
|
| 157 |
+
title_icon = get_image_base64("utils/title_icon.png")
|
| 158 |
+
st.markdown(f"""
|
| 159 |
+
<div style="background-color: #f0f0f0;
|
| 160 |
+
padding: 20px 20px;
|
| 161 |
+
margin: 0 -30rem 20px -30rem;">
|
| 162 |
+
<h1 style="text-align: center;
|
| 163 |
+
font-size: 36px;
|
| 164 |
+
background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
|
| 165 |
+
-webkit-background-clip: text;
|
| 166 |
+
-webkit-text-fill-color: transparent;
|
| 167 |
+
padding: 5px;
|
| 168 |
+
margin: 0;">
|
| 169 |
+
<img src="data:image/png;base64,{title_icon}" width="45" style="vertical-align: middle; margin-right: 1px;">
|
| 170 |
+
RAGRouter-Bench:<br> A Dataset and Benchmark for Adaptive RAG Routing
|
| 171 |
+
</h1>
|
| 172 |
+
</div>
|
| 173 |
+
""", unsafe_allow_html=True)
|
| 174 |
+
|
| 175 |
+
# 统计横幅
|
| 176 |
+
# Statistics banner: a one-line summary of the benchmark's scale.
# The query badge reads "Queries": 7.7K is the number of queries (7,727 per
# the Overview text on this page), while the number of query types is three.
st.markdown("""
<div style="
background-color: #e8f4fc;
border: 2px solid #b8d4e8;
border-radius: 15px;
margin: 0 auto 30px auto;
max-width: 100%;
text-align: center;
font-size: 18px;
color: #333;
">
<span style="margin: 0 5px;"><strong>📚 4 Corpus Domains</strong></span>|
<span style="margin: 0 5px;"><strong>📄 21K Documents</strong></span>|
<span style="margin: 0 5px;"><strong>❓ 7.7K Queries</strong></span>|
<span style="margin: 0 5px;"><strong>📊 3 Dimension Evaluations</strong></span>|
<span style="margin: 0 5px;"><strong>🔄 5 RAG Paradigms</strong></span>|
<span style="margin: 0 5px;"><strong>🤖 2 LLMs Tested</strong></span>
</div>
""", unsafe_allow_html=True)
|
| 195 |
+
|
| 196 |
+
# 主内容 - 添加锚点ID
|
| 197 |
+
# About 部分
|
| 198 |
+
with st.container():
|
| 199 |
+
about_icon = get_image_base64("utils/about_icon.png")
|
| 200 |
+
|
| 201 |
+
st.markdown(f"""
|
| 202 |
+
<h2 id="about" style="color: #333333;
|
| 203 |
+
padding-bottom: 10px;
|
| 204 |
+
font-family: 'Ubuntu Mono', monospace;
|
| 205 |
+
font-size: 30px;">
|
| 206 |
+
<img src="data:image/png;base64,{about_icon}" width="30" style="vertical-align: middle; margin-right: 1px;">
|
| 207 |
+
About
|
| 208 |
+
</h2>
|
| 209 |
+
""", unsafe_allow_html=True)
|
| 210 |
+
|
| 211 |
+
# About 内的标签页
|
| 212 |
+
about_tab1, about_tab2, about_tab3 = st.tabs(["📋 Overview", "⭐ Key Features", "🚀 Get Started"])
|
| 213 |
+
|
| 214 |
+
with about_tab1:
|
| 215 |
+
pipeline_img = get_image_base64("utils/Overall_Pipeline.png")
|
| 216 |
+
|
| 217 |
+
st.markdown(f"""
|
| 218 |
+
<div style="padding-right: 10px;">
|
| 219 |
+
<div style="text-align: center; margin: 1px 0;">
|
| 220 |
+
<img src="data:image/png;base64,{pipeline_img}" width="50%" style="border-radius: 10px;">
|
| 221 |
+
<p style="color: #666; font-size: 16px;">Overall Pipeline</p>
|
| 222 |
+
</div>
|
| 223 |
+
<div style="font-size: 16px; line-height: 1.4; color: #333; text-align: justify;">
|
| 224 |
+
<p>Retrieval-Augmented Generation (RAG) has become a core paradigm for grounding large language models with external knowledge.
|
| 225 |
+
Despite extensive efforts exploring diverse retrieval strategies, <strong>existing studies predominantly focus on query-side complexity or isolated method improvements, lacking a systematic understanding of how RAG paradigms behave across different query–corpus contexts and effectiveness–efficiency trade-offs</strong>.
|
| 226 |
+
In this work, we introduce RAGRouter-Bench, the first dataset and benchmark designed for adaptive RAG routing.
|
| 227 |
+
RAGRouter-Bench revisits retrieval from a query–corpus compatibility perspective and standardizes five representative RAG paradigms for systematic evaluation across 7,727 queries and 21,460 documents spanning diverse domains.
|
| 228 |
+
The benchmark incorporates three canonical query types together with fine-grained semantic and structural corpus metrics, as well as a unified evaluation for both generation quality and resource consumption.
|
| 229 |
+
Experiments with DeepSeek-V3 and LLaMA-3.1-8B demonstrate that <strong>no single RAG paradigm is universally optimal, that paradigm applicability is strongly shaped by query–corpus interactions, and that increased advanced mechanism does not necessarily yield better effectiveness–efficiency trade-offs</strong>.
|
| 230 |
+
These findings underscore the necessity of routing-aware evaluation and establish a foundation for adaptive, interpretable, and generalizable next-generation RAG systems.</p>
|
| 231 |
+
</div>
|
| 232 |
+
</div>
|
| 233 |
+
""", unsafe_allow_html=True)
|
| 234 |
+
|
| 235 |
+
with about_tab2:
|
| 236 |
+
bench_img = get_image_base64("utils/Data_Profile.png")
|
| 237 |
+
|
| 238 |
+
st.markdown(f"""
|
| 239 |
+
<div style="padding-right: 10px;">
|
| 240 |
+
<div style="text-align: center; margin: 1px 0;">
|
| 241 |
+
<img src="data:image/png;base64,{bench_img}" width="70%">
|
| 242 |
+
<p style="color: #666; font-size: 16px;">Benchmark Features</p>
|
| 243 |
+
</div>
|
| 244 |
+
<div style="font-size: 16px; line-height: 1.4; color: #333; text-align: left;">
|
| 245 |
+
<p style="margin-top: 1px; margin-bottom: 1px;"><strong>🌐 Multi-Domain Corpora</strong></p>
|
| 246 |
+
<ul>
|
| 247 |
+
<li><strong>Wikipedia (MuSiQue)</strong>: Encyclopedic knowledge with explicit entity relations (5,427 documents)</li>
|
| 248 |
+
<li><strong>Literature (QuALITY)</strong>: Long-form narratives with implicit semantic structures (2,523 documents)</li>
|
| 249 |
+
<li><strong>Legal (UltraDomain)</strong>: Professional domain with dense terminology (6,510 documents)</li>
|
| 250 |
+
<li><strong>Medical (GraphRAG-Bench)</strong>: Specialized knowledge requiring precise reasoning (7,000 documents)</li>
|
| 251 |
+
</ul>
|
| 252 |
+
<p style="margin-top: 1px; margin-bottom: 1px;"><strong>❓ Three Query Types</strong></p>
|
| 253 |
+
<ul>
|
| 254 |
+
<li><strong>Factual Queries</strong>: Single-hop lookup requiring direct fact retrieval</li>
|
| 255 |
+
<li><strong>Reasoning Queries</strong>: Multi-hop inference across chained evidence (2-4 hops)</li>
|
| 256 |
+
<li><strong>Summary Queries</strong>: Global aggregation over dispersed information</li>
|
| 257 |
+
</ul>
|
| 258 |
+
<p style="margin-top: 1px; margin-bottom: 1px;"><strong>🔄 Five RAG Paradigm</strong></p>
|
| 259 |
+
<ul>
|
| 260 |
+
<li><strong>RAG Paradigm</strong>:LLM-only, NaiveRAG, GraphRAG, HybridRAG, IterativeRAG
|
| 261 |
+
</ul>
|
| 262 |
+
<p style="margin-top: 1px; margin-bottom: 1px;"><strong>📊 Dual-View Corpus Evaluation</strong></p>
|
| 263 |
+
<ul>
|
| 264 |
+
<li><strong>Structural Metrics</strong>: Connectivity (LCC Ratio, Relation Types), Density (Avg Degree, Max Centrality), Clustering Coefficient</li>
|
| 265 |
+
<li><strong>Semantic Metrics</strong>: Intrinsic Dimension, Dispersion (Avg/Min/Std Distance), Hubness</li>
|
| 266 |
+
<li><strong>Quality Assurance</strong>: LLM-based query augmentation with Verify-then-Filter validation</li>
|
| 267 |
+
</ul>
|
| 268 |
+
<p style="margin-top: 1px; margin-bottom: 1px;"><strong>⚖️ Effectiveness-Efficiency Evaluation</strong></p>
|
| 269 |
+
<ul>
|
| 270 |
+
<li><strong>Effectiveness</strong>: LLM-as-a-Judge accuracy across three dimensions (Information Coverage, Semantic Accuracy, Logical Consistency)</li>
|
| 271 |
+
<li><strong>Efficiency</strong>: Token consumption decomposed into Retrieval Cost and Generation Cost</li>
|
| 272 |
+
</ul>
|
| 273 |
+
</div>
|
| 274 |
+
</div>
|
| 275 |
+
""", unsafe_allow_html=True)
|
| 276 |
+
|
| 277 |
+
with about_tab3:
|
| 278 |
+
paradigms_img = get_image_base64("utils/RAG_Paradigms.png")
|
| 279 |
+
|
| 280 |
+
st.markdown(f"""
|
| 281 |
+
<div style="padding-right: 10px; font-size: 16px; line-height: 1.4; color: #333;">
|
| 282 |
+
<div style="text-align: center; margin: 1px 0;">
|
| 283 |
+
<img src="data:image/png;base64,{paradigms_img}" width="60%">
|
| 284 |
+
<p style="color: #666; font-size: 16px;">RAG Paradigm</p>
|
| 285 |
+
</div>
|
| 286 |
+
|
| 287 |
+
<a href="https://your-dataset-link.com" style="color: #667eea;" target="_blank">📥 Download RAGRouter-Bench Dataset</a>
|
| 288 |
+
|
| 289 |
+
<p style="margin-top: 1px; margin-bottom: 5px;"><strong>💻 Installation</strong></p>
|
| 290 |
+
<pre style="background-color: #f0f7ff !important; padding: 10px; border-radius: 5px; overflow-x: auto; border: 1px solid #cce0ff;">
|
| 291 |
+
<code style="background-color: transparent !important; color: #333 !important; font-family: 'Courier New', monospace !important;">git clone https://github.com/your-repo/RAGRouter-Bench
|
| 292 |
+
cd RAGRouter-Bench
|
| 293 |
+
conda env create -f environment.yml
|
| 294 |
+
conda activate ragBench</code></pre>
|
| 295 |
+
|
| 296 |
+
<p style="margin-top: 1px; margin-bottom: 5px;"><strong>⚙️ Configuration</strong></p>
|
| 297 |
+
<ul>
|
| 298 |
+
<li>Set your API key in <code>Config/LLMConfig.py</code> (<code>DEEPSEEK_API_KEY</code> or <code>OPENAI_API_KEY</code>)</li>
|
| 299 |
+
</ul>
|
| 300 |
+
|
| 301 |
+
<p style="margin-top: 1px; margin-bottom: 5px;"><strong>🚀 Quick Start</strong></p>
|
| 302 |
+
<table style="width: 100%; border-collapse: collapse; margin: 10px 0;">
|
| 303 |
+
<tr style="background-color: #f0f0f0;">
|
| 304 |
+
<th style="border: 1px solid #ddd; padding: 8px; text-align: left;">Step</th>
|
| 305 |
+
<th style="border: 1px solid #ddd; padding: 8px; text-align: left;">Command</th>
|
| 306 |
+
<th style="border: 1px solid #ddd; padding: 8px; text-align: left;">Description</th>
|
| 307 |
+
</tr>
|
| 308 |
+
<tr>
|
| 309 |
+
<td style="border: 1px solid #ddd; padding: 8px;">1. Process</td>
|
| 310 |
+
<td style="border: 1px solid #ddd; padding: 8px;"><code>python main.py process all --dataset musique</code></td>
|
| 311 |
+
<td style="border: 1px solid #ddd; padding: 8px;">Chunking, embedding, graph building</td>
|
| 312 |
+
</tr>
|
| 313 |
+
<tr>
|
| 314 |
+
<td style="border: 1px solid #ddd; padding: 8px;">2. Retrieve</td>
|
| 315 |
+
<td style="border: 1px solid #ddd; padding: 8px;"><code>python main.py retrieve graph --dataset musique</code></td>
|
| 316 |
+
<td style="border: 1px solid #ddd; padding: 8px;">Run RAG retrieval</td>
|
| 317 |
+
</tr>
|
| 318 |
+
<tr>
|
| 319 |
+
<td style="border: 1px solid #ddd; padding: 8px;">3. Evaluate</td>
|
| 320 |
+
<td style="border: 1px solid #ddd; padding: 8px;"><code>python main.py evaluate result --dataset musique --method graph_rag</code></td>
|
| 321 |
+
<td style="border: 1px solid #ddd; padding: 8px;">Evaluate results</td>
|
| 322 |
+
</tr>
|
| 323 |
+
<tr>
|
| 324 |
+
<td style="border: 1px solid #ddd; padding: 8px;">Full Pipeline</td>
|
| 325 |
+
<td style="border: 1px solid #ddd; padding: 8px;"><code>python main.py pipeline --dataset musique --method graph</code></td>
|
| 326 |
+
<td style="border: 1px solid #ddd; padding: 8px;">Run all steps</td>
|
| 327 |
+
</tr>
|
| 328 |
+
</table>
|
| 329 |
+
|
| 330 |
+
<p style="margin-top: 15px; margin-bottom: 5px;"><strong>Available Datasets</strong></p>
|
| 331 |
+
<ul>
|
| 332 |
+
<li><code>musique</code> - Wikipedia (Encyclopedic)</li>
|
| 333 |
+
<li><code>quality</code> - Literature (Narrative)</li>
|
| 334 |
+
<li><code>ultraDomain_legal</code> - Legal (Professional)</li>
|
| 335 |
+
<li><code>graphragBench_medical</code> - Medical (Professional)</li>
|
| 336 |
+
</ul>
|
| 337 |
+
|
| 338 |
+
<p style="margin-top: 1px; margin-bottom: 5px;"><strong>Available RAG Paradigms</strong></p>
|
| 339 |
+
<ul>
|
| 340 |
+
<li><code>naive</code> - NaiveRAG (vector retrieval)</li>
|
| 341 |
+
<li><code>graph</code> - GraphRAG (graph traversal)</li>
|
| 342 |
+
<li><code>hybrid</code> - HybridRAG (naive + graph fusion)</li>
|
| 343 |
+
<li><code>iterative</code> - IterativeRAG (multi-round retrieval)</li>
|
| 344 |
+
<li><code>llm_direct</code> - LLM-only (no retrieval)</li>
|
| 345 |
+
</ul>
|
| 346 |
+
|
| 347 |
+
<p style="margin-top: 1px; margin-bottom: 5px;"><strong>Data Format</strong></p>
|
| 348 |
+
<p>Your data should be placed in <code>Dataset/Rawutils/{{dataset_name}}/</code> with:</p>
|
| 349 |
+
<ul>
|
| 350 |
+
<li><code>Corpus.json</code> - Document collection with <code>doc_id</code>, <code>title</code>, <code>text</code></li>
|
| 351 |
+
<li><code>Question.json</code> - Queries with <code>question_id</code>, <code>question</code>, <code>answer</code>, <code>query_type</code>, <code>supporting_facts</code></li>
|
| 352 |
+
</ul>
|
| 353 |
+
</div>
|
| 354 |
+
""", unsafe_allow_html=True)
|
| 355 |
+
|
| 356 |
+
|
| 357 |
+
|
| 358 |
+
# Leaderboard 部分
|
| 359 |
+
leaderboard_icon = get_image_base64("utils/leaderboard_icon.png")
|
| 360 |
+
|
| 361 |
+
st.markdown(f"""
|
| 362 |
+
<h2 id="leaderboard" style="color: #333333;
|
| 363 |
+
padding-bottom: 10px;
|
| 364 |
+
margin-top: 10px;
|
| 365 |
+
font-family: 'Ubuntu Mono', monospace;
|
| 366 |
+
font-size: 30px;">
|
| 367 |
+
<img src="data:image/png;base64,{leaderboard_icon}" width="30" style="vertical-align: middle; margin-right: 1px;">
|
| 368 |
+
Leaderboard
|
| 369 |
+
</h2>
|
| 370 |
+
""", unsafe_allow_html=True)
|
| 371 |
+
|
| 372 |
+
# Leaderboard 内的标签页
|
| 373 |
+
lb_tab1, lb_tab2, lb_tab3, lb_tab4 = st.tabs(["🏆 Full Leaderboard", "📁 Corpus Metrics", "📈 Effectiveness Metrics", "⚡ Efficiency Metrics"])
|
| 374 |
+
|
| 375 |
+
with lb_tab1:
|
| 376 |
+
# Full Leaderboard Explanation
|
| 377 |
+
st.markdown("""
|
| 378 |
+
<div style="font-size: 15px; line-height: 1.5; color: #333; margin-bottom: 20px;">
|
| 379 |
+
<p style="font-weight: bold; margin-bottom: 10px;">📋 Columns Explained:</p>
|
| 380 |
+
<ul style="margin: 0; padding-left: 20px;">
|
| 381 |
+
<li><strong>Dataset</strong>: Corpus domain (MuSiQue-Wikipedia, QuALITY-Literature, Legal, Medical).</li>
|
| 382 |
+
<li><strong>Method</strong>: RAG paradigm (NaiveRAG, GraphRAG, HybridRAG, IterativeRAG).</li>
|
| 383 |
+
<li><strong>Factual</strong>: LLM-as-a-Judge accuracy (%) on factual queries (single-hop fact retrieval). <em>Higher is better</em>.</li>
|
| 384 |
+
<li><strong>Reasoning</strong>: LLM-as-a-Judge accuracy (%) on reasoning queries (multi-hop inference, 2-4 hops). <em>Higher is better</em>.</li>
|
| 385 |
+
<li><strong>Summary</strong>: LLM-as-a-Judge accuracy (%) on summary queries (global information aggregation). <em>Higher is better</em>.</li>
|
| 386 |
+
<li><strong>Avg Acc</strong>: Average accuracy (%) across all three query types. <em>Higher is better</em>.</li>
|
| 387 |
+
<li><strong>Token</strong>: Average token consumption per query. <em>Lower is more efficient</em>.</li>
|
| 388 |
+
</ul>
|
| 389 |
+
</div>
|
| 390 |
+
""", unsafe_allow_html=True)
|
| 391 |
+
|
| 392 |
+
df_full = pd.read_csv("utils/full_lb.csv")
|
| 393 |
+
|
| 394 |
+
col1_f, col2_f, col3_f, col4_f = st.columns([2, 2, 2, 3])
|
| 395 |
+
|
| 396 |
+
with col1_f:
|
| 397 |
+
model_select_f = st.selectbox(
|
| 398 |
+
"Model",
|
| 399 |
+
options=["All"] + df_full["Model"].unique().tolist(),
|
| 400 |
+
index=0,
|
| 401 |
+
key="model_full"
|
| 402 |
+
)
|
| 403 |
+
|
| 404 |
+
with col2_f:
|
| 405 |
+
sort_by_f = st.selectbox(
|
| 406 |
+
"Sort by",
|
| 407 |
+
options=df_full.columns.tolist(),
|
| 408 |
+
index=df_full.columns.tolist().index("Avg Acc"),
|
| 409 |
+
key="sort_full"
|
| 410 |
+
)
|
| 411 |
+
|
| 412 |
+
with col3_f:
|
| 413 |
+
order_f = st.radio(
|
| 414 |
+
"Order",
|
| 415 |
+
options=["Descending", "Ascending"],
|
| 416 |
+
horizontal=True,
|
| 417 |
+
key="order_full"
|
| 418 |
+
)
|
| 419 |
+
|
| 420 |
+
with col4_f:
|
| 421 |
+
search_f = st.text_input("Search", placeholder="Search in all columns...", key="search_full")
|
| 422 |
+
|
| 423 |
+
df_display_f = df_full.copy()
|
| 424 |
+
|
| 425 |
+
if model_select_f != "All":
|
| 426 |
+
df_display_f = df_display_f[df_display_f["Model"] == model_select_f]
|
| 427 |
+
|
| 428 |
+
if search_f:
    # Match the search text literally (regex=False): the value comes straight
    # from a text box, and input such as "(" or "[" would otherwise be parsed
    # as a regular expression and raise re.error.
    mask_f = df_display_f.apply(
        lambda row: row.astype(str).str.contains(search_f, case=False, regex=False).any(),
        axis=1,
    )
    df_display_f = df_display_f[mask_f]

# Sort by the selected column, in the direction chosen with the radio button.
ascending_f = order_f == "Ascending"
df_display_f = df_display_f.sort_values(by=sort_by_f, ascending=ascending_f).reset_index(drop=True)
|
| 434 |
+
|
| 435 |
+
st.markdown(df_to_html_table(df_display_f), unsafe_allow_html=True)
|
| 436 |
+
|
| 437 |
+
with lb_tab2:
|
| 438 |
+
# Structure Metrics Explanation
|
| 439 |
+
st.markdown("""
|
| 440 |
+
<div style="font-size: 15px; line-height: 1.5; color: #333; margin-bottom: 20px;">
|
| 441 |
+
<p style="font-weight: bold; margin-bottom: 10px;">🔗 Structural Topology Metrics:</p>
|
| 442 |
+
<ul style="margin: 0; padding-left: 20px;">
|
| 443 |
+
<li>Nodes: Number of nodes in the knowledge graph.</li>
|
| 444 |
+
<li><strong>Edges</strong>: Number of edges in the knowledge graph.</li>
|
| 445 |
+
<li><strong>Density</strong>: Edge saturation level. <em>Excessive sparsity limits relational bridges</em>.</li>
|
| 446 |
+
<li><strong>Rel_Types (Relation Type Diversity)</strong>: Semantic richness of edges for precise graph traversal.</li>
|
| 447 |
+
<li><strong>Avg_Deg (Average Degree)</strong>: Average connections per node, reflecting connection intensity.</li>
|
| 448 |
+
<li><strong>Comp (Connected Components)</strong>: Number of independent subgraphs.</li>
|
| 449 |
+
<li><strong>LCC_Ratio (Largest Connected Component Ratio)</strong>: Proportion of nodes in the largest subgraph. <em>Low values indicate graph fragmentation that breaks multi-hop paths</em>.</li>
|
| 450 |
+
<li><strong>Cluster_Coeff (Clustering Coefficient)</strong>: Local cohesiveness. <em>High values indicate tight communities that facilitate evidence aggregation</em>.</li>
|
| 451 |
+
</ul>
|
| 452 |
+
</div>
|
| 453 |
+
""", unsafe_allow_html=True)
|
| 454 |
+
|
| 455 |
+
df_structure = pd.read_csv("utils/corpus_structure.csv")
|
| 456 |
+
col1_s, col2_s, col3_s = st.columns([2, 2, 3])
|
| 457 |
+
with col1_s:
|
| 458 |
+
sort_by_s = st.selectbox(
|
| 459 |
+
"Sort by",
|
| 460 |
+
options=df_structure.columns.tolist(),
|
| 461 |
+
index=0,
|
| 462 |
+
key="sort_structure"
|
| 463 |
+
)
|
| 464 |
+
|
| 465 |
+
with col2_s:
|
| 466 |
+
order_s = st.radio(
|
| 467 |
+
"Order",
|
| 468 |
+
options=["Descending", "Ascending"],
|
| 469 |
+
horizontal=True,
|
| 470 |
+
key="order_structure"
|
| 471 |
+
)
|
| 472 |
+
|
| 473 |
+
with col3_s:
|
| 474 |
+
search_s = st.text_input("Search", placeholder="Search in all columns...", key="search_structure")
|
| 475 |
+
|
| 476 |
+
df_display_s = df_structure.copy()
|
| 477 |
+
if search_s:
    # Match the search text literally (regex=False): the value comes straight
    # from a text box, and input such as "(" or "[" would otherwise be parsed
    # as a regular expression and raise re.error.
    mask_s = df_display_s.apply(
        lambda row: row.astype(str).str.contains(search_s, case=False, regex=False).any(),
        axis=1,
    )
    df_display_s = df_display_s[mask_s]
# Sort by the selected column, in the direction chosen with the radio button.
ascending_s = order_s == "Ascending"
df_display_s = df_display_s.sort_values(by=sort_by_s, ascending=ascending_s).reset_index(drop=True)
|
| 482 |
+
st.markdown(df_to_html_table(df_display_s, height=200), unsafe_allow_html=True)
|
| 483 |
+
|
| 484 |
+
# Semantic Metrics Explanation
|
| 485 |
+
st.markdown("""
|
| 486 |
+
<div style="font-size: 15px; line-height: 1.5; color: #333; margin-top: 30px; margin-bottom: 20px;">
|
| 487 |
+
<p style="font-weight: bold; margin-bottom: 10px;">🧠 Semantic Space Metrics:</p>
|
| 488 |
+
<ul style="margin: 0; padding-left: 20px;">
|
| 489 |
+
<li><strong>Chunks</strong>: Number of text chunks in the corpus.</li>
|
| 490 |
+
<li><strong>Int_Dim (Intrinsic Dimension)</strong>: Effective degrees of freedom estimated via TwoNN. <em>High dimensionality exacerbates the curse of dimensionality, diminishing distance-based similarity</em>.</li>
|
| 491 |
+
<li><strong>Hubness</strong>: Skewness of k-occurrence distribution, measuring retrieval interference. <em>High values indicate hub vectors that dominate nearest-neighbor lists, causing bias toward frequently retrieved but potentially irrelevant passages</em>.</li>
|
| 492 |
+
<li><strong>Avg_Dist (Average Distance)</strong>: Average distance to centroid, reflecting overall distribution spread.</li>
|
| 493 |
+
<li><strong>Std_Dist (Standard Deviation)</strong>: Distance standard deviation, revealing distributional imbalance. <em>High values indicate uneven distribution</em>.</li>
|
| 494 |
+
<li><strong>Min_Dist (Minimum Distance)</strong>: Distance of closest cluster pair, identifying most confusable semantic regions. <em>Low dispersion causes semantic crowding that hinders hard-negative discrimination</em>.</li>
|
| 495 |
+
<li><strong>Max_Dist (Maximum Distance)</strong>: Distance of farthest cluster pair, reflecting maximum semantic space span.</li>
|
| 496 |
+
</ul>
|
| 497 |
+
</div>
|
| 498 |
+
""", unsafe_allow_html=True)
|
| 499 |
+
|
| 500 |
+
df_semantic = pd.read_csv("utils/corpus_semantic.csv")
|
| 501 |
+
col1_m, col2_m, col3_m = st.columns([2, 2, 3])
|
| 502 |
+
|
| 503 |
+
with col1_m:
|
| 504 |
+
sort_by_m = st.selectbox(
|
| 505 |
+
"Sort by",
|
| 506 |
+
options=df_semantic.columns.tolist(),
|
| 507 |
+
index=0,
|
| 508 |
+
key="sort_semantic"
|
| 509 |
+
)
|
| 510 |
+
|
| 511 |
+
with col2_m:
|
| 512 |
+
order_m = st.radio(
|
| 513 |
+
"Order",
|
| 514 |
+
options=["Descending", "Ascending"],
|
| 515 |
+
horizontal=True,
|
| 516 |
+
key="order_semantic"
|
| 517 |
+
)
|
| 518 |
+
|
| 519 |
+
with col3_m:
|
| 520 |
+
search_m = st.text_input("Search", placeholder="Search in all columns...", key="search_semantic")
|
| 521 |
+
|
| 522 |
+
df_display_m = df_semantic.copy()
|
| 523 |
+
if search_m:
|
| 524 |
+
# Match the search text literally (regex=False): input such as "(" or "Iterative (Naive)" must not be parsed as a regular expression.
mask_m = df_display_m.apply(lambda row: row.astype(str).str.contains(search_m, case=False, regex=False).any(), axis=1)
|
| 525 |
+
df_display_m = df_display_m[mask_m]
|
| 526 |
+
# The radio selection maps directly onto the boolean expected by sort_values.
ascending_m = order_m == "Ascending"
|
| 527 |
+
df_display_m = df_display_m.sort_values(by=sort_by_m, ascending=ascending_m).reset_index(drop=True)
|
| 528 |
+
st.markdown(df_to_html_table(df_display_m, height=200), unsafe_allow_html=True)
|
| 529 |
+
|
| 530 |
+
with lb_tab3:
|
| 531 |
+
# Metrics Explanation
|
| 532 |
+
st.markdown("""
|
| 533 |
+
<div style="font-size: 15px; line-height: 1.5; color: #333; margin-bottom: 20px;">
|
| 534 |
+
<p style="font-weight: bold; margin-bottom: 10px;">📊 Metrics Explained:</p>
|
| 535 |
+
<ul style="margin: 0; padding-left: 20px;">
|
| 536 |
+
<li><strong>Sem_F1 (Semantic F1)</strong>: Token-level semantic similarity between generated and reference answers using BERTScore. Range: 0-1, <em>higher is better</em>.</li>
|
| 537 |
+
<li><strong>COV (Coverage)</strong>: Extent to which the answer covers key information using sentence embeddings. Range: 0-1, <em>higher is better</em>.</li>
|
| 538 |
+
<li><strong>Faith_H (Faithfulness Hard)</strong>: Strict support relationship between answer and retrieved content. Range: 0-1, <em>higher is better</em>.</li>
|
| 539 |
+
<li><strong>Faith_S (Faithfulness Soft)</strong>: Relaxed support relationship between answer and retrieved content. Range: 0-1, <em>higher is better</em>.</li>
|
| 540 |
+
<li><strong>LLM_Cor_Pct (LLM-as-a-Judge)</strong>: Correctness rate via LLM ternary classification, aligned with human judgment. Range: 0-100%, <em>higher is better</em>.</li>
|
| 541 |
+
</ul>
|
| 542 |
+
</div>
|
| 543 |
+
""", unsafe_allow_html=True)
|
| 544 |
+
|
| 545 |
+
# Model files mapping
|
| 546 |
+
model_files = {
|
| 547 |
+
"DeepSeek-V3": "utils/effect_deepseek.csv",
|
| 548 |
+
"Llama-3-8B": "utils/effect_llama.csv"
|
| 549 |
+
}
|
| 550 |
+
|
| 551 |
+
# Controls
|
| 552 |
+
col1_e, col2_e, col3_e, col4_e = st.columns([2, 2, 2, 3])
|
| 553 |
+
|
| 554 |
+
with col1_e:
|
| 555 |
+
model_select = st.selectbox(
|
| 556 |
+
"Model",
|
| 557 |
+
options=list(model_files.keys()),
|
| 558 |
+
index=0,
|
| 559 |
+
key="model_effect"
|
| 560 |
+
)
|
| 561 |
+
|
| 562 |
+
df_effect = pd.read_csv(model_files[model_select])
|
| 563 |
+
|
| 564 |
+
with col2_e:
|
| 565 |
+
sort_by_e = st.selectbox(
|
| 566 |
+
"Sort by",
|
| 567 |
+
options=df_effect.columns.tolist(),
|
| 568 |
+
index=0,
|
| 569 |
+
key="sort_effect"
|
| 570 |
+
)
|
| 571 |
+
|
| 572 |
+
with col3_e:
|
| 573 |
+
order_e = st.radio(
|
| 574 |
+
"Order",
|
| 575 |
+
options=["Descending", "Ascending"],
|
| 576 |
+
horizontal=True,
|
| 577 |
+
key="order_effect"
|
| 578 |
+
)
|
| 579 |
+
|
| 580 |
+
with col4_e:
|
| 581 |
+
search_e = st.text_input("Search", placeholder="Search in all columns...", key="search_effect")
|
| 582 |
+
|
| 583 |
+
df_display_e = df_effect.copy()
|
| 584 |
+
|
| 585 |
+
if search_e:
|
| 586 |
+
# Match the search text literally (regex=False): method names like "Iterative (Naive)" contain parentheses that a regex would treat as a group.
mask_e = df_display_e.apply(lambda row: row.astype(str).str.contains(search_e, case=False, regex=False).any(), axis=1)
|
| 587 |
+
df_display_e = df_display_e[mask_e]
|
| 588 |
+
|
| 589 |
+
# The radio selection maps directly onto the boolean expected by sort_values.
ascending_e = order_e == "Ascending"
|
| 590 |
+
df_display_e = df_display_e.sort_values(by=sort_by_e, ascending=ascending_e).reset_index(drop=True)
|
| 591 |
+
|
| 592 |
+
st.markdown(df_to_html_table(df_display_e), unsafe_allow_html=True)
|
| 593 |
+
|
| 594 |
+
with lb_tab4:
|
| 595 |
+
# Cost Explanation
|
| 596 |
+
st.markdown("""
|
| 597 |
+
<div style="font-size: 15px; line-height: 1.5; color: #333; margin-bottom: 20px;">
|
| 598 |
+
<p style="font-weight: bold; margin-bottom: 10px;">💰 Cost Explained:</p>
|
| 599 |
+
<ul style="margin: 0; padding-left: 20px;">
|
| 600 |
+
<li><strong>Total_Tokens</strong>: Total token consumption = Retrieval_Total + Generation_Total.</li>
|
| 601 |
+
<li><strong>Retrieval_Total</strong>: Total tokens in retrieval phase = Retrieval_Input + Retrieval_Output. Includes entity extraction, multi-turn queries. For GraphRAG/HybridRAG, includes amortized one-time graph construction cost.</li>
|
| 602 |
+
<li><strong>Generation_Total</strong>: Total tokens in generation phase = Generation_Input + Generation_Output. Primarily determined by context length.</li>
|
| 603 |
+
<li><strong>Avg_Context_Tokens</strong>: Average retrieved context length per query. <em>Higher means more retrieved content but also higher cost</em>.</li>
|
| 604 |
+
<li><strong>Num_Questions</strong>: Number of queries in the dataset.</li>
|
| 605 |
+
</ul>
|
| 606 |
+
</div>
|
| 607 |
+
""", unsafe_allow_html=True)
|
| 608 |
+
|
| 609 |
+
# Read data
|
| 610 |
+
df_efficiency = pd.read_csv("utils/retrieval_generation_cost.csv")
|
| 611 |
+
|
| 612 |
+
# Controls
|
| 613 |
+
col1, col2, col3 = st.columns([2, 2, 3])
|
| 614 |
+
|
| 615 |
+
with col1:
|
| 616 |
+
sort_by = st.selectbox(
|
| 617 |
+
"Sort by",
|
| 618 |
+
options=df_efficiency.columns.tolist(),
|
| 619 |
+
index=df_efficiency.columns.tolist().index("Total_Tokens") # default to sorting by Total_Tokens
|
| 620 |
+
)
|
| 621 |
+
|
| 622 |
+
with col2:
|
| 623 |
+
order = st.radio(
|
| 624 |
+
"Order",
|
| 625 |
+
options=["Descending", "Ascending"],
|
| 626 |
+
horizontal=True
|
| 627 |
+
)
|
| 628 |
+
|
| 629 |
+
with col3:
|
| 630 |
+
search = st.text_input("Search", placeholder="Search in all columns...")
|
| 631 |
+
|
| 632 |
+
df_display = df_efficiency.copy()
|
| 633 |
+
|
| 634 |
+
if search:
|
| 635 |
+
# Match the search text literally (regex=False) so special characters in user input neither raise re.error nor change what is matched.
mask = df_display.apply(lambda row: row.astype(str).str.contains(search, case=False, regex=False).any(), axis=1)
|
| 636 |
+
df_display = df_display[mask]
|
| 637 |
+
|
| 638 |
+
# The radio selection maps directly onto the boolean expected by sort_values.
ascending = order == "Ascending"
|
| 639 |
+
# Columns holding "-" placeholders are loaded as strings, which would sort lexicographically ("13267181" before "404008").
# Compare such columns numerically (placeholders become NaN and go last); purely textual columns keep their string order.
df_display = df_display.sort_values(by=sort_by, ascending=ascending, key=lambda col: pd.to_numeric(col, errors="coerce") if pd.to_numeric(col, errors="coerce").notna().any() else col).reset_index(drop=True)
|
| 640 |
+
st.markdown(df_to_html_table(df_display), unsafe_allow_html=True)
|
| 641 |
+
|
| 642 |
+
|
| 643 |
+
# Questions & Contact section
|
| 644 |
+
contact_icon = get_image_base64("utils/contact_icon.png")
|
| 645 |
+
|
| 646 |
+
st.markdown(f"""
|
| 647 |
+
<h2 id="contact" style="color: #333333;
|
| 648 |
+
padding-bottom: 10px;
|
| 649 |
+
margin-top: 10px;
|
| 650 |
+
font-family: 'Ubuntu Mono', monospace;
|
| 651 |
+
font-size: 30px;">
|
| 652 |
+
<img src="data:image/png;base64,{contact_icon}" width="30" style="vertical-align: middle; margin-right: 1px;">
|
| 653 |
+
Questions & Contact
|
| 654 |
+
</h2>
|
| 655 |
+
""", unsafe_allow_html=True)
|
| 656 |
+
|
| 657 |
+
st.markdown("""
|
| 658 |
+
<div style="
|
| 659 |
+
border: 2px solid #e0e0e0;
|
| 660 |
+
border-radius: 15px;
|
| 661 |
+
padding: 25px 30px;
|
| 662 |
+
background-color: #fafafa;
|
| 663 |
+
margin-bottom: 10px;
|
| 664 |
+
font-size: 16px;
|
| 665 |
+
line-height: 1.6;
|
| 666 |
+
color: #333;
|
| 667 |
+
">
|
| 668 |
+
<p style="margin-bottom: 5px;">
|
| 669 |
+
If you have any questions about RAGRouter-Bench, please feel free to reach out to us:
|
| 670 |
+
</p>
|
| 671 |
+
<ul style="margin: 0; padding-left: 20px;">
|
| 672 |
+
<li><strong>Email</strong>: <a href="mailto:RAGRouterBench@example.com" style="color: #667eea;">RAGRouterBench@example.com</a></li>
|
| 673 |
+
<li><strong>GitHub</strong>: <a href="https://github.com/your-repo/RAGRouter-Bench" style="color: #667eea;" target="_blank">github.com/your-repo/RAGRouter-Bench</a></li>
|
| 674 |
+
</ul>
|
| 675 |
+
<p style="margin-top: 5px; margin-bottom: 0;">
|
| 676 |
+
For bug reports or feature requests, please open an issue on our GitHub repository.
|
| 677 |
+
</p>
|
| 678 |
+
</div>
|
| 679 |
+
""", unsafe_allow_html=True)
|
utils/Data_Profile.png
ADDED
|
Git LFS Details
|
utils/Overall_Pipeline.png
ADDED
|
Git LFS Details
|
utils/RAG_Paradigms.png
ADDED
|
Git LFS Details
|
utils/about_icon.png
ADDED
|
|
utils/contact_icon.png
ADDED
|
|
utils/corpus_semantic.csv
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Rank,Dataset,Chunks,Int_Dim,Hubness,Avg_Dist,Std_Dist,Min_Dist,Max_Dist
|
| 2 |
+
1,MuSiQue,21153,8.17,1.27,0.708,0.049,0.552,0.924
|
| 3 |
+
2,QuALITY,3822,10.75,1.26,0.345,0.119,0.186,0.805
|
| 4 |
+
3,Legal,11632,7.56,1.46,0.300,0.071,0.147,0.792
|
| 5 |
+
4,Medical,538,8.39,0.86,0.312,0.063,0.196,0.700
|
utils/corpus_structure.csv
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Rank,Dataset,Nodes,Edges,Density,Rel_Types,Avg_Deg,Comp,LCC_Ratio,Cluster_Coeff
|
| 2 |
+
1,MuSiQue,206738,276898,6.00e-06,44766,2.68,7722,0.882,0.0213
|
| 3 |
+
2,QuALITY,90088,120611,1.50e-05,23828,2.68,3997,0.883,0.0177
|
| 4 |
+
3,Legal,135231,261207,1.40e-05,28799,3.86,3204,0.933,0.0701
|
| 5 |
+
4,Medical,14712,21480,9.90e-05,4169,2.92,741,0.861,0.0357
|
utils/effect_deepseek.csv
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Rank,Dataset,Method,Sem_F1,COV,Faith_H,Faith_S,LLM_Cor_Pct
|
| 2 |
+
1,MuSiQue,NaiveRAG,0.503,0.362,0.143,0.486,26.4
|
| 3 |
+
2,MuSiQue,GraphRAG,0.510,0.386,0.114,0.439,30.3
|
| 4 |
+
3,MuSiQue,HybridRAG,0.613,0.472,-,-,38.6
|
| 5 |
+
4,MuSiQue,Iterative (Naive),0.469,0.320,-,-,20.4
|
| 6 |
+
5,MuSiQue,Iterative (Graph),0.512,0.400,-,-,29.2
|
| 7 |
+
6,QuALITY,NaiveRAG,0.858,0.627,0.009,0.404,48.7
|
| 8 |
+
7,QuALITY,GraphRAG,0.794,0.546,0.009,0.377,39.3
|
| 9 |
+
8,QuALITY,HybridRAG,0.738,0.553,0.438,0.146,41.7
|
| 10 |
+
9,QuALITY,Iterative (Naive),0.724,0.506,-,-,35.8
|
| 11 |
+
10,QuALITY,Iterative (Graph),0.657,0.446,-,-,28.7
|
| 12 |
+
11,Legal,NaiveRAG,0.568,0.469,0.145,0.537,32.2
|
| 13 |
+
12,Legal,GraphRAG,0.530,0.443,0.094,0.510,29.3
|
| 14 |
+
13,Legal,HybridRAG,0.617,0.520,0.589,0.326,36.1
|
| 15 |
+
14,Legal,Iterative (Naive),0.572,0.466,-,-,28.5
|
| 16 |
+
15,Legal,Iterative (Graph),0.534,0.439,-,-,26.0
|
| 17 |
+
16,Medical,NaiveRAG,0.770,0.599,0.207,0.583,61.1
|
| 18 |
+
17,Medical,GraphRAG,0.691,0.541,0.250,0.588,53.5
|
| 19 |
+
18,Medical,HybridRAG,0.792,0.620,0.767,0.358,64.7
|
| 20 |
+
19,Medical,Iterative (Naive),0.826,0.595,-,-,62.7
|
| 21 |
+
20,Medical,Iterative (Graph),0.801,0.575,-,-,59.8
|
utils/effect_llama.csv
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Rank,Dataset,Method,Sem_F1,COV,Faith_H,Faith_S,LLM_Cor_Pct
|
| 2 |
+
1,MuSiQue,NaiveRAG,0.249,0.161,0.194,0.540,8.6
|
| 3 |
+
2,MuSiQue,GraphRAG,0.374,0.262,0.185,0.494,18.7
|
| 4 |
+
3,MuSiQue,HybridRAG,0.406,0.284,0.582,0.178,20.3
|
| 5 |
+
4,MuSiQue,Iterative (Naive),0.289,0.169,-,-,6.8
|
| 6 |
+
5,MuSiQue,Iterative (Graph),0.032,0.271,-,-,16.1
|
| 7 |
+
6,QuALITY,NaiveRAG,0.620,0.475,0.011,0.414,30.7
|
| 8 |
+
7,QuALITY,GraphRAG,0.501,0.376,0.014,0.385,21.2
|
| 9 |
+
8,QuALITY,HybridRAG,0.643,0.493,0.283,0.201,33.2
|
| 10 |
+
9,QuALITY,Iterative (Naive),0.583,0.435,-,-,28.4
|
| 11 |
+
10,QuALITY,Iterative (Graph),0.528,0.378,-,-,20.4
|
| 12 |
+
11,Legal,NaiveRAG,0.587,0.473,0.164,0.531,25.8
|
| 13 |
+
12,Legal,GraphRAG,0.536,0.445,0.146,0.508,25.0
|
| 14 |
+
13,Legal,HybridRAG,0.622,0.512,0.627,0.349,31.1
|
| 15 |
+
14,Legal,Iterative (Naive),0.577,0.453,-,-,22.0
|
| 16 |
+
15,Legal,Iterative (Graph),0.013,0.444,-,-,21.0
|
| 17 |
+
16,Medical,NaiveRAG,0.732,0.574,0.175,0.575,44.9
|
| 18 |
+
17,Medical,GraphRAG,0.673,0.532,0.214,0.572,41.7
|
| 19 |
+
18,Medical,HybridRAG,0.759,0.600,0.813,0.374,48.2
|
| 20 |
+
19,Medical,Iterative (Naive),0.802,0.582,-,-,43.6
|
| 21 |
+
20,Medical,Iterative (Graph),0.818,0.595,-,-,43.6
|
utils/full_lb.csv
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Rank,Model,Dataset,Method,Factual,Reasoning,Summary,Avg Acc,Token
|
| 2 |
+
1,DeepSeek-V3,MuSiQue,NaiveRAG,11.1,28.3,29.4,26.4,13k
|
| 3 |
+
2,DeepSeek-V3,MuSiQue,GraphRAG,90.2,22.5,20.6,30.3,9k
|
| 4 |
+
3,DeepSeek-V3,MuSiQue,HybridRAG,83.7,32.8,30.2,38.6,22k
|
| 5 |
+
4,DeepSeek-V3,MuSiQue,IterativeRAG,10.3,21.8,21.2,20.4,20k
|
| 6 |
+
5,DeepSeek-V3,QuALITY,NaiveRAG,83.7,33.8,17.0,48.8,50k
|
| 7 |
+
6,DeepSeek-V3,QuALITY,GraphRAG,70.7,20.4,19.8,39.3,50k
|
| 8 |
+
7,DeepSeek-V3,QuALITY,HybridRAG,80.0,20.4,14.8,41.6,99k
|
| 9 |
+
8,DeepSeek-V3,QuALITY,IterativeRAG,67.0,17.1,16.2,35.8,21k
|
| 10 |
+
9,DeepSeek-V3,Legal,NaiveRAG,54.9,11.2,39.1,32.2,46k
|
| 11 |
+
10,DeepSeek-V3,Legal,GraphRAG,61.6,6.7,29.1,29.3,184k
|
| 12 |
+
11,DeepSeek-V3,Legal,HybridRAG,72.2,11.8,34.6,36.1,230k
|
| 13 |
+
12,DeepSeek-V3,Legal,IterativeRAG,49.5,12.4,30.4,28.5,20k
|
| 14 |
+
13,DeepSeek-V3,Medical,NaiveRAG,63.1,63.5,49.1,61.1,51k
|
| 15 |
+
14,DeepSeek-V3,Medical,GraphRAG,55.0,56.0,43.6,53.5,38k
|
| 16 |
+
15,DeepSeek-V3,Medical,HybridRAG,67.8,64.0,54.0,64.7,74k
|
| 17 |
+
16,DeepSeek-V3,Medical,IterativeRAG,62.1,67.8,56.1,62.7,7k
|
| 18 |
+
17,LLaMA-3.1-8B,MuSiQue,NaiveRAG,10.3,7.9,12.0,8.6,13k
|
| 19 |
+
18,LLaMA-3.1-8B,MuSiQue,GraphRAG,84.4,9.7,11.4,18.7,9k
|
| 20 |
+
19,LLaMA-3.1-8B,MuSiQue,HybridRAG,79.9,12.1,13.9,20.3,22k
|
| 21 |
+
20,LLaMA-3.1-8B,MuSiQue,IterativeRAG,12.3,6.0,6.8,6.8,20k
|
| 22 |
+
21,LLaMA-3.1-8B,QuALITY,NaiveRAG,69.2,10.9,1.4,30.7,50k
|
| 23 |
+
22,LLaMA-3.1-8B,QuALITY,GraphRAG,44.7,9.5,2.5,21.2,50k
|
| 24 |
+
23,LLaMA-3.1-8B,QuALITY,HybridRAG,70.3,15.6,2.5,33.2,99k
|
| 25 |
+
24,LLaMA-3.1-8B,QuALITY,IterativeRAG,62.3,11.5,1.4,28.4,21k
|
| 26 |
+
25,LLaMA-3.1-8B,Legal,NaiveRAG,50.3,10.7,22.8,25.8,46k
|
| 27 |
+
26,LLaMA-3.1-8B,Legal,GraphRAG,55.4,7.4,19.7,25.0,184k
|
| 28 |
+
27,LLaMA-3.1-8B,Legal,HybridRAG,63.8,13.1,24.2,31.1,230k
|
| 29 |
+
28,LLaMA-3.1-8B,Legal,IterativeRAG,48.7,10.1,12.6,22.0,20k
|
| 30 |
+
29,LLaMA-3.1-8B,Medical,NaiveRAG,52.1,37.9,30.1,44.9,51k
|
| 31 |
+
30,LLaMA-3.1-8B,Medical,GraphRAG,48.1,33.6,31.8,41.7,38k
|
| 32 |
+
31,LLaMA-3.1-8B,Medical,HybridRAG,55.7,39.9,33.9,48.2,74k
|
| 33 |
+
32,LLaMA-3.1-8B,Medical,IterativeRAG,47.5,44.6,27.3,43.6,7k
|
utils/leaderboard_icon.png
ADDED
|
|
utils/retrieval_generation_cost.csv
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Rank,Dataset,Method,Num_Questions,Avg_Context_Tokens,Avg_Query_Tokens,Retrieval_Input_Tokens,Retrieval_Output_Tokens,Retrieval_Total,Generation_Input_Tokens,Generation_Output_Tokens,Generation_Total,Total_Tokens
|
| 2 |
+
1,Medical,llm_direct,1896,-,15.5,-,-,-,29338,57176,86514,86514
|
| 3 |
+
2,Medical,naive,1896,50503.9,15.5,-,-,-,95784821,77714,95862535,95862535
|
| 4 |
+
3,Medical,graph,1896,37513.3,15.5,250833,153175,404008,71154518,67161,71221679,71625687
|
| 5 |
+
4,Medical,hybrid,1896,73628.3,15.5,250833,153175,404008,139628589,80880,139709469,140113477
|
| 6 |
+
5,Medical,iterative,1897,2230.6,15.5,8671831,257213,8929044,4260848,77289,4338137,13267181
|
| 7 |
+
6,MuSiQue,llm_direct,3356,-,22.3,-,-,-,74925,25238,100163,100163
|
| 8 |
+
7,MuSiQue,naive,3356,13139.3,22.3,-,-,-,44170465,53233,44223698,44223698
|
| 9 |
+
8,MuSiQue,graph,3356,8471.7,22.3,2350938,244016,2594954,28505878,42917,28548795,31143749
|
| 10 |
+
9,MuSiQue,hybrid,3356,21602.4,22.3,2350938,244016,2594954,72572546,55375,72627921,75222875
|
| 11 |
+
10,MuSiQue,iterative,3357,6364.2,22.3,43242177,723972,43966149,21439701,50772,21490473,65456622
|
| 12 |
+
11,QuALITY,llm_direct,1198,-,29.0,-,-,-,34716,4038,38754,38754
|
| 13 |
+
12,QuALITY,naive,1198,49444.3,29.0,-,-,-,59268989,39140,59308129,59308129
|
| 14 |
+
13,QuALITY,graph,1198,48501.6,29.0,1556111,71840,1627951,58139642,39180,58178822,59806773
|
| 15 |
+
14,QuALITY,hybrid,1198,97789.1,29.0,1556111,71840,1627951,117186129,34985,117221114,118849065
|
| 16 |
+
15,QuALITY,iterative,1198,6870.5,29.0,16685428,251443,16936871,8265571,34999,8300570,25237441
|
| 17 |
+
16,Legal,llm_direct,1277,-,37.0,-,-,-,47290,7891,55181,55181
|
| 18 |
+
17,Legal,naive,1277,46272.7,37.0,-,-,-,59137514,49921,59187435,59187435
|
| 19 |
+
18,Legal,graph,1277,179727.7,37.0,4821083,125070,4946153,229559514,43677,229603191,234549344
|
| 20 |
+
19,Legal,hybrid,1277,225571.4,37.0,4821083,125070,4946153,288101932,50340,288152272,293098425
|
| 21 |
+
20,Legal,iterative,1278,6460.2,37.1,16810781,291294,17102075,8303522,47288,8350810,25452885
|
utils/title_icon.png
ADDED
|
|