Spaces:
Runtime error
Runtime error
| import streamlit as st | |
| import base64 | |
| import pandas as pd | |
| import os | |
# Absolute directory containing this script; lets asset paths be resolved
# independently of the process's current working directory.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
# Absolute path of the title icon. The original computed this value and then
# discarded it (a bare expression statement); binding it makes it usable.
TITLE_ICON_PATH = os.path.join(BASE_DIR, "utils", "title_icon.png")
# Read an image file and convert it to base64
def get_image_base64(image_path):
    """Return the contents of *image_path* as a base64-encoded string.

    A relative path is first tried as given (relative to the current working
    directory, as before). If nothing exists there, the same relative path is
    tried next to this script, so the app still finds its assets when it is
    launched from a different working directory.

    Args:
        image_path: Path of the file to read (absolute or relative).

    Returns:
        The file's bytes, base64-encoded and decoded to ``str``.

    Raises:
        OSError: If the file cannot be opened (e.g. it exists in neither
            location).
    """
    if not os.path.isabs(image_path) and not os.path.exists(image_path):
        script_dir = os.path.dirname(os.path.abspath(__file__))
        candidate = os.path.join(script_dir, image_path)
        # Only switch when the fallback really exists, so the error message
        # for a genuinely missing file still names the path the caller passed.
        if os.path.exists(candidate):
            image_path = candidate
    with open(image_path, "rb") as f:
        return base64.b64encode(f.read()).decode()
| # 设置 dataframe 样式:斑马纹 + 表头黑色加粗 | |
| def style_dataframe(df): | |
| def row_style(row): | |
| if row.name % 2 == 0: | |
| return ['background-color: #f9f9f9'] * len(row) | |
| return ['background-color: #ffffff'] * len(row) | |
| return df.style.set_table_styles([ | |
| # 表头样式 | |
| {'selector': 'th', 'props': [ | |
| ('background-color', '#f0f0f0'), | |
| ('color', '#000000'), | |
| ('font-weight', 'bold'), | |
| ('text-align', 'left'), | |
| ('padding', '8px') | |
| ]}, | |
| # 单元格样式 | |
| {'selector': 'td', 'props': [ | |
| ('text-align', 'left'), | |
| ('padding', '8px') | |
| ]}, | |
| # 表头文字样式 | |
| {'selector': 'th.col_heading', 'props': [ | |
| ('background-color', '#f0f0f0'), | |
| ('color', '#000000'), | |
| ('font-weight', 'bold') | |
| ]} | |
| ]).apply(row_style, axis=1) | |
def df_to_html_table(df, height=400, escape=True):
    """Render *df* as a scrollable, zebra-striped HTML table string.

    The table has a sticky header row built from ``df.columns`` and one
    ``<tr>`` per DataFrame row; the index is not rendered.

    Args:
        df: DataFrame to render.
        height: Maximum height of the scroll container, in pixels.
        escape: When true (default), header and cell text is HTML-escaped so
            values containing ``<``, ``>`` or ``&`` are shown literally
            instead of being interpreted as markup. Pass ``False`` to get the
            previous raw interpolation.

    Returns:
        A single HTML string (``<div><table>…</table></div>``).
    """
    from html import escape as html_escape

    def cell_text(value):
        text = str(value)
        return html_escape(text, quote=False) if escape else text

    parts = [
        f'<div style="max-height: {height}px; overflow-y: auto; border: 1px solid #d0d0d0; border-radius: 8px;">',
        '<table style="width: 100%; border-collapse: collapse; font-size: 14px;">',
        # Header: normal font weight, reduced vertical padding
        '<thead><tr style="background-color: #e8e8e8; position: sticky; top: 0; z-index: 1;">',
    ]
    for col in df.columns:
        parts.append(
            f'<th style="padding: 6px 14px; text-align: left; font-weight: normal; font-size: 15px; color: #000; border-bottom: 2px solid #ccc;">{cell_text(col)}</th>'
        )
    parts.append('</tr></thead><tbody>')
    # itertuples keeps each column's dtype; iterrows would upcast integers to
    # floats in all-numeric frames (rendering 1 as "1.0").
    for i, row in enumerate(df.itertuples(index=False, name=None)):
        bg = '#f5f5f5' if i % 2 == 0 else '#ffffff'
        parts.append(f'<tr style="background-color: {bg};">')
        # Cells: reduced vertical padding
        for val in row:
            parts.append(
                f'<td style="padding: 4px 14px; text-align: left; border-bottom: 1px solid #eee; color: #333;">{cell_text(val)}</td>'
            )
        parts.append('</tr>')
    parts.append('</tbody></table></div>')
    # Join once instead of growing a string with += in nested loops.
    return ''.join(parts)
# Page-level Streamlit settings: browser tab title, wide layout, and the
# initial sidebar state.
page_settings = {
    "page_title": "RAGRouter-Bench: A Dataset and Benchmark for Adaptive RAG Routing",
    "layout": "wide",
    "initial_sidebar_state": "expanded",
}
st.set_page_config(**page_settings)
# Background colors and global styling.
# Injects one <style> element that: forces a light color scheme on the app
# containers and form widgets, hides the sidebar, collapses the Streamlit
# header, constrains the main block width, frames every tabs group with a
# border, caps tab-panel height with inner scrolling, left-aligns data-grid
# cells, and restyles <code>/<pre>.
# NOTE(review): `.stApp` is styled twice below with different overflow/height
# values; the later rule overrides the properties it repeats — confirm that
# is intended.
# NOTE(review): the in-CSS comment for the tab panel mentions 700px while the
# rule sets max-height to 600px.
st.markdown("""
<style>
/* 强制浅色模式 */
:root, html, body {
color-scheme: light only !important;
background-color: #ffffff !important;
color: #333333 !important;
}
/* 全局强制浅色背景 */
*, *::before, *::after {
color-scheme: light only !important;
}
[data-testid="stAppViewContainer"],
[data-testid="stMain"],
[data-testid="stVerticalBlock"],
.stApp {
background-color: #ffffff !important;
color: #333333 !important;
}
/* 下拉框和输入框的标签文字 */
.stSelectbox label,
.stTextInput label,
.stRadio label,
.stSelectbox label p,
.stTextInput label p,
[data-testid="stWidgetLabel"],
[data-testid="stWidgetLabel"] p {
color: #333333 !important;
}
/* 下拉框容器 */
[data-baseweb="select"],
[data-baseweb="input"],
.stSelectbox [data-baseweb="select"],
.stTextInput input {
background-color: #ffffff !important;
color: #333333 !important;
border-color: #ccc !important;
}
/* 下拉框内部 */
[data-baseweb="select"] > div,
[data-baseweb="select"] > div > div {
background-color: #ffffff !important;
color: #333333 !important;
}
/* 下拉菜单弹出层 */
[data-baseweb="popover"],
[data-baseweb="menu"],
[data-baseweb="list"],
[role="listbox"],
ul[role="listbox"],
[data-baseweb="popover"] > div {
background-color: #ffffff !important;
color: #333333 !important;
}
/* 下拉菜单选项 */
[role="option"],
[data-baseweb="menu"] li,
li[role="option"] {
background-color: #ffffff !important;
color: #333333 !important;
}
[role="option"]:hover,
li[role="option"]:hover {
background-color: #f0f0f0 !important;
}
/* 文本输入框 */
.stTextInput input,
input[type="text"] {
background-color: #ffffff !important;
color: #333333 !important;
border-color: #ccc !important;
}
/* 单选按钮 */
.stRadio label,
.stRadio [data-baseweb="radio"] {
color: #333333 !important;
}
.stRadio p {
color: #333333 !important;
}
/* 隐藏外层滚动条但保留滚动功能 */
html, body {
overflow: hidden !important;
height: 100% !important;
margin: 0 !important;
}
.stApp {
overflow-y: auto !important;
overflow-x: hidden !important;
height: 100vh !important;
/* 隐藏滚动条 */
scrollbar-width: none !important; /* Firefox */
-ms-overflow-style: none !important; /* IE/Edge */
}
.stApp {
overflow: hidden !important;
height: auto !important;
}
[data-testid="stAppViewContainer"] {
overflow: visible !important;
}
/* 隐藏顶部深色栏 */
header[data-testid="stHeader"] {
background-color: #ffffff !important;
}
/* 左边侧边栏 - 灰色背景 */
[data-testid="stSidebar"] {
display: none;
}
/* 右边主内容区 - 白色背景 */
[data-testid="stMain"] {
background-color: #ffffff;
}
/* 隐藏顶部 header 的高度 */
header[data-testid="stHeader"] {
height: 0 !important;
min-height: 0 !important;
padding: 0 !important;
}
/* 减少顶部间距 - 调整这个值 */
.block-container {
padding-top: 0 !important; /* 移除顶部留白 */
max-width: 1200px; /* 最大宽度 */
padding-left: 2rem; /* 左边距 */
padding-right: 2rem; /* 右边距 */
}
/* 标签页字体大小和样式 */
.stTabs [data-baseweb="tab"] p {
font-size: 18px !important; /* 字体大小 */
font-weight: bold; /* 加粗 */
padding: 10px 20px; /* 内边距 */
color: #333333; /* 字体颜色 */
}
/* 给所有 tabs 区域加边框 */
[data-testid="stTabs"] {
border: 2px solid #e0e0e0;
border-radius: 15px;
padding: 5px 5px 35px 20px;
background-color: #fafafa;
margin-bottom: 5px;
}
/* tabs 固定高度和滚动 (Leaderboard 700px) */
.stTabs [data-baseweb="tab-panel"] {
max-height: 600px !important;
overflow-y: auto !important;
background-color: #fafafa !important;
}
/* tabs 内部所有内容背景 */
.stTabs [data-baseweb="tab-panel"] > div,
.stTabs [data-testid="stVerticalBlock"] {
background-color: #fafafa !important;
}
/* 表格内容左对齐 - glide-data-grid */
[data-testid="stDataFrame"] .dvn-scroller,
[data-testid="stDataFrame"] [class*="cell"],
[data-testid="stDataFrame"] div[style*="justify-content"] {
text-align: left !important;
justify-content: flex-start !important;
}
/* glide data editor 单元格 */
.gdg-cell {
justify-content: flex-start !important;
}
code {
background-color: transparent !important;
color: #333 !important;
}
pre {
background-color: #f5f5f5 !important;
color: #333 !important;
}
pre code {
background-color: transparent !important;
color: #333 !important;
}
</style>
""", unsafe_allow_html=True)
# Title banner: grey full-bleed strip with the gradient-filled project title
# and the title icon embedded inline as a base64 data URI.
title_icon = get_image_base64("utils/title_icon.png")
title_banner_html = f"""
<div style="background-color: #f0f0f0;
padding: 20px 20px;
margin: 0 -30rem 20px -30rem;">
<h1 style="text-align: center;
font-size: 36px;
background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
padding: 5px;
margin: 0;">
<img src="data:image/png;base64,{title_icon}" width="45" style="vertical-align: middle; margin-right: 1px;">
RAGRouter-Bench:<br> A Dataset and Benchmark for Adaptive RAG Routing
</h1>
</div>
"""
st.markdown(title_banner_html, unsafe_allow_html=True)
# Statistics banner: a single rounded, light-blue strip listing the headline
# numbers of the benchmark, separated by pipe characters.
stats_banner_html = """
<div style="
background-color: #e8f4fc;
border: 2px solid #b8d4e8;
border-radius: 15px;
margin: 0 auto 30px auto;
max-width: 100%;
text-align: center;
font-size: 18px;
color: #333;
">
<span style="margin: 0 5px;"><strong>📚 4 Corpus Domains</strong></span>|
<span style="margin: 0 5px;"><strong>📄 21K Documents</strong></span>|
<span style="margin: 0 5px;"><strong>❓ 7.7K Query Types</strong></span>|
<span style="margin: 0 5px;"><strong>📊 3 Dimension Evaluations</strong></span>|
<span style="margin: 0 5px;"><strong>🔄 5 RAG Paradigms</strong></span>|
<span style="margin: 0 5px;"><strong>🤖 2 LLMs Tested</strong></span>
</div>
"""
st.markdown(stats_banner_html, unsafe_allow_html=True)
# Main content - section headings carry anchor IDs
# About section: heading with an inline base64 icon, followed by three tabs
# (Overview, Key Features, Get Started). Each tab embeds one image from
# utils/ as a base64 data URI and renders static HTML.
with st.container():
    about_icon = get_image_base64("utils/about_icon.png")
    st.markdown(f"""
<h2 id="about" style="color: #333333;
padding-bottom: 10px;
font-family: 'Ubuntu Mono', monospace;
font-size: 30px;">
<img src="data:image/png;base64,{about_icon}" width="30" style="vertical-align: middle; margin-right: 1px;">
About
</h2>
""", unsafe_allow_html=True)
    # Tabs inside the About section
    about_tab1, about_tab2, about_tab3 = st.tabs(["📋 Overview", "⭐ Key Features", "🚀 Get Started"])
    # Overview tab: pipeline figure plus the abstract text.
    with about_tab1:
        pipeline_img = get_image_base64("utils/Overall_Pipeline.png")
        st.markdown(f"""
<div style="padding-right: 10px;">
<div style="text-align: center; margin: 1px 0;">
<img src="data:image/png;base64,{pipeline_img}" width="50%" style="border-radius: 10px;">
<p style="color: #666; font-size: 16px;">Overall Pipeline</p>
</div>
<div style="font-size: 16px; line-height: 1.4; color: #333; text-align: justify;">
<p>Retrieval-Augmented Generation (RAG) has become a core paradigm for grounding large language models with external knowledge.
Despite extensive efforts exploring diverse retrieval strategies, <strong>existing studies predominantly focus on query-side complexity or isolated method improvements, lacking a systematic understanding of how RAG paradigms behave across different query–corpus contexts and effectiveness–efficiency trade-offs</strong>.
In this work, we introduce RAGRouter-Bench, the first dataset and benchmark designed for adaptive RAG routing.
RAGRouter-Bench revisits retrieval from a query–corpus compatibility perspective and standardizes five representative RAG paradigms for systematic evaluation across 7,727 queries and 21,460 documents spanning diverse domains.
The benchmark incorporates three canonical query types together with fine-grained semantic and structural corpus metrics, as well as a unified evaluation for both generation quality and resource consumption.
Experiments with DeepSeek-V3 and LLaMA-3.1-8B demonstrate that <strong>no single RAG paradigm is universally optimal, that paradigm applicability is strongly shaped by query–corpus interactions, and that increased advanced mechanism does not necessarily yield better effectiveness–efficiency trade-offs</strong>.
These findings underscore the necessity of routing-aware evaluation and establish a foundation for adaptive, interpretable, and generalizable next-generation RAG systems.</p>
</div>
</div>
""", unsafe_allow_html=True)
    # Key Features tab: data-profile figure plus bullet lists of corpora,
    # query types, paradigms and evaluation dimensions.
    with about_tab2:
        bench_img = get_image_base64("utils/Data_Profile.png")
        st.markdown(f"""
<div style="padding-right: 10px;">
<div style="text-align: center; margin: 1px 0;">
<img src="data:image/png;base64,{bench_img}" width="70%">
<p style="color: #666; font-size: 16px;">Benchmark Features</p>
</div>
<div style="font-size: 16px; line-height: 1.4; color: #333; text-align: left;">
<p style="margin-top: 1px; margin-bottom: 1px;"><strong>🌐 Multi-Domain Corpora</strong></p>
<ul>
<li><strong>Wikipedia (MuSiQue)</strong>: Encyclopedic knowledge with explicit entity relations (5,427 documents)</li>
<li><strong>Literature (QuALITY)</strong>: Long-form narratives with implicit semantic structures (2,523 documents)</li>
<li><strong>Legal (UltraDomain)</strong>: Professional domain with dense terminology (6,510 documents)</li>
<li><strong>Medical (GraphRAG-Bench)</strong>: Specialized knowledge requiring precise reasoning (7,000 documents)</li>
</ul>
<p style="margin-top: 1px; margin-bottom: 1px;"><strong>❓ Three Query Types</strong></p>
<ul>
<li><strong>Factual Queries</strong>: Single-hop lookup requiring direct fact retrieval</li>
<li><strong>Reasoning Queries</strong>: Multi-hop inference across chained evidence (2-4 hops)</li>
<li><strong>Summary Queries</strong>: Global aggregation over dispersed information</li>
</ul>
<p style="margin-top: 1px; margin-bottom: 1px;"><strong>🔄 Five RAG Paradigm</strong></p>
<ul>
<li><strong>RAG Paradigm</strong>:LLM-only, NaiveRAG, GraphRAG, HybridRAG, IterativeRAG
</ul>
<p style="margin-top: 1px; margin-bottom: 1px;"><strong>📊 Dual-View Corpus Evaluation</strong></p>
<ul>
<li><strong>Structural Metrics</strong>: Connectivity (LCC Ratio, Relation Types), Density (Avg Degree, Max Centrality), Clustering Coefficient</li>
<li><strong>Semantic Metrics</strong>: Intrinsic Dimension, Dispersion (Avg/Min/Std Distance), Hubness</li>
<li><strong>Quality Assurance</strong>: LLM-based query augmentation with Verify-then-Filter validation</li>
</ul>
<p style="margin-top: 1px; margin-bottom: 1px;"><strong>⚖️ Effectiveness-Efficiency Evaluation</strong></p>
<ul>
<li><strong>Effectiveness</strong>: LLM-as-a-Judge accuracy across three dimensions (Information Coverage, Semantic Accuracy, Logical Consistency)</li>
<li><strong>Efficiency</strong>: Token consumption decomposed into Retrieval Cost and Generation Cost</li>
</ul>
</div>
</div>
""", unsafe_allow_html=True)
    # Get Started tab: paradigms figure, dataset link, installation and usage
    # instructions. `{{dataset_name}}` is doubled so the f-string emits
    # literal braces.
    with about_tab3:
        paradigms_img = get_image_base64("utils/RAG_Paradigms.png")
        st.markdown(f"""
<div style="padding-right: 10px; font-size: 16px; line-height: 1.4; color: #333;">
<div style="text-align: center; margin: 1px 0;">
<img src="data:image/png;base64,{paradigms_img}" width="60%">
<p style="color: #666; font-size: 16px;">RAG Paradigm</p>
</div>
<a href="https://huggingface.co/datasets/Chaplain0908/RAGRouter" style="color: #667eea;" target="_blank">📥 Download RAGRouter-Bench Dataset</a>
<p style="margin-top: 1px; margin-bottom: 5px;"><strong>💻 Installation</strong></p>
<pre style="background-color: #f0f7ff !important; padding: 10px; border-radius: 5px; overflow-x: auto; border: 1px solid #cce0ff;">
<code style="background-color: transparent !important; color: #333 !important; font-family: 'Courier New', monospace !important;">git clone https://github.com/ziqiwang0908/RAGRouter-Bench
cd RAGRouter-Bench
conda env create -f environment.yml
conda activate ragBench</code></pre>
<p style="margin-top: 1px; margin-bottom: 5px;"><strong>⚙️ Configuration</strong></p>
<ul>
<li>Set your API key in <code>Config/LLMConfig.py</code> (<code>DEEPSEEK_API_KEY</code> or <code>OPENAI_API_KEY</code>)</li>
</ul>
<p style="margin-top: 1px; margin-bottom: 5px;"><strong>🚀 Quick Start</strong></p>
<table style="width: 100%; border-collapse: collapse; margin: 10px 0;">
<tr style="background-color: #f0f0f0;">
<th style="border: 1px solid #ddd; padding: 8px; text-align: left;">Step</th>
<th style="border: 1px solid #ddd; padding: 8px; text-align: left;">Command</th>
<th style="border: 1px solid #ddd; padding: 8px; text-align: left;">Description</th>
</tr>
<tr>
<td style="border: 1px solid #ddd; padding: 8px;">1. Process</td>
<td style="border: 1px solid #ddd; padding: 8px;"><code>python main.py process all --dataset musique</code></td>
<td style="border: 1px solid #ddd; padding: 8px;">Chunking, embedding, graph building</td>
</tr>
<tr>
<td style="border: 1px solid #ddd; padding: 8px;">2. Retrieve</td>
<td style="border: 1px solid #ddd; padding: 8px;"><code>python main.py retrieve graph --dataset musique</code></td>
<td style="border: 1px solid #ddd; padding: 8px;">Run RAG retrieval</td>
</tr>
<tr>
<td style="border: 1px solid #ddd; padding: 8px;">3. Evaluate</td>
<td style="border: 1px solid #ddd; padding: 8px;"><code>python main.py evaluate result --dataset musique --method graph_rag</code></td>
<td style="border: 1px solid #ddd; padding: 8px;">Evaluate results</td>
</tr>
<tr>
<td style="border: 1px solid #ddd; padding: 8px;">Full Pipeline</td>
<td style="border: 1px solid #ddd; padding: 8px;"><code>python main.py pipeline --dataset musique --method graph</code></td>
<td style="border: 1px solid #ddd; padding: 8px;">Run all steps</td>
</tr>
</table>
<p style="margin-top: 15px; margin-bottom: 5px;"><strong>Available Datasets</strong></p>
<ul>
<li><code>musique</code> - Wikipedia (Encyclopedic)</li>
<li><code>quality</code> - Literature (Narrative)</li>
<li><code>ultraDomain_legal</code> - Legal (Professional)</li>
<li><code>graphragBench_medical</code> - Medical (Professional)</li>
</ul>
<p style="margin-top: 1px; margin-bottom: 5px;"><strong>Available RAG Paradigms</strong></p>
<ul>
<li><code>naive</code> - NaiveRAG (vector retrieval)</li>
<li><code>graph</code> - GraphRAG (graph traversal)</li>
<li><code>hybrid</code> - HybridRAG (naive + graph fusion)</li>
<li><code>iterative</code> - IterativeRAG (multi-round retrieval)</li>
<li><code>llm_direct</code> - LLM-only (no retrieval)</li>
</ul>
<p style="margin-top: 1px; margin-bottom: 5px;"><strong>Data Format</strong></p>
<p>Your data should be placed in <code>Dataset/Rawutils/{{dataset_name}}/</code> with:</p>
<ul>
<li><code>Corpus.json</code> - Document collection with <code>doc_id</code>, <code>title</code>, <code>text</code></li>
<li><code>Question.json</code> - Queries with <code>question_id</code>, <code>question</code>, <code>answer</code>, <code>query_type</code>, <code>supporting_facts</code></li>
</ul>
</div>
""", unsafe_allow_html=True)
# Leaderboard section: heading with an inline base64 icon, then the four
# leaderboard tabs that the following `with lb_tabN:` blocks fill in.
leaderboard_icon = get_image_base64("utils/leaderboard_icon.png")
leaderboard_heading_html = f"""
<h2 id="leaderboard" style="color: #333333;
padding-bottom: 10px;
margin-top: 10px;
font-family: 'Ubuntu Mono', monospace;
font-size: 30px;">
<img src="data:image/png;base64,{leaderboard_icon}" width="30" style="vertical-align: middle; margin-right: 1px;">
Leaderboard
</h2>
"""
st.markdown(leaderboard_heading_html, unsafe_allow_html=True)
# Tabs inside the Leaderboard section
leaderboard_tab_labels = [
    "🏆 Full Leaderboard",
    "📁 Corpus Metrics",
    "📈 Effectiveness Metrics",
    "⚡ Efficiency Metrics",
]
lb_tab1, lb_tab2, lb_tab3, lb_tab4 = st.tabs(leaderboard_tab_labels)
# Full Leaderboard tab: column legend, then model / sort / order / search
# controls over utils/full_lb.csv, rendered as an HTML table.
with lb_tab1:
    # Full Leaderboard Explanation
    st.markdown("""
<div style="font-size: 15px; line-height: 1.5; color: #333; margin-bottom: 20px;">
<p style="font-weight: bold; margin-bottom: 10px;">📋 Columns Explained:</p>
<ul style="margin: 0; padding-left: 20px;">
<li><strong>Dataset</strong>: Corpus domain (MuSiQue-Wikipedia, QuALITY-Literature, Legal, Medical).</li>
<li><strong>Method</strong>: RAG paradigm (NaiveRAG, GraphRAG, HybridRAG, IterativeRAG).</li>
<li><strong>Factual</strong>: LLM-as-a-Judge accuracy (%) on factual queries (single-hop fact retrieval). <em>Higher is better</em>.</li>
<li><strong>Reasoning</strong>: LLM-as-a-Judge accuracy (%) on reasoning queries (multi-hop inference, 2-4 hops). <em>Higher is better</em>.</li>
<li><strong>Summary</strong>: LLM-as-a-Judge accuracy (%) on summary queries (global information aggregation). <em>Higher is better</em>.</li>
<li><strong>Avg Acc</strong>: Average accuracy (%) across all three query types. <em>Higher is better</em>.</li>
<li><strong>Token</strong>: Average token consumption per query. <em>Lower is more efficient</em>.</li>
</ul>
</div>
""", unsafe_allow_html=True)
    df_full = pd.read_csv("utils/full_lb.csv")
    col1_f, col2_f, col3_f, col4_f = st.columns([2, 2, 2, 3])
    with col1_f:
        model_select_f = st.selectbox(
            "Model",
            options=["All"] + df_full["Model"].unique().tolist(),
            index=0,
            key="model_full"
        )
    with col2_f:
        sort_by_f = st.selectbox(
            "Sort by",
            options=df_full.columns.tolist(),
            index=df_full.columns.tolist().index("Avg Acc"),
            key="sort_full"
        )
    with col3_f:
        order_f = st.radio(
            "Order",
            options=["Descending", "Ascending"],
            horizontal=True,
            key="order_full"
        )
    with col4_f:
        search_f = st.text_input("Search", placeholder="Search in all columns...", key="search_full")
    df_display_f = df_full.copy()
    if model_select_f != "All":
        df_display_f = df_display_f[df_display_f["Model"] == model_select_f]
    if search_f:
        # regex=False: match the typed text literally. With the default
        # regex=True, input such as "(" or "[" raised re.error and crashed
        # the page.
        mask_f = df_display_f.apply(
            lambda row: row.astype(str).str.contains(search_f, case=False, regex=False).any(),
            axis=1,
        )
        df_display_f = df_display_f[mask_f]
    ascending_f = order_f == "Ascending"
    df_display_f = df_display_f.sort_values(by=sort_by_f, ascending=ascending_f).reset_index(drop=True)
    st.markdown(df_to_html_table(df_display_f), unsafe_allow_html=True)
# Corpus Metrics tab: two independent tables (structural topology metrics and
# semantic space metrics), each with its own legend and sort / order / search
# controls.
with lb_tab2:
    # Structure Metrics Explanation
    st.markdown("""
<div style="font-size: 15px; line-height: 1.5; color: #333; margin-bottom: 20px;">
<p style="font-weight: bold; margin-bottom: 10px;">🔗 Structural Topology Metrics:</p>
<ul style="margin: 0; padding-left: 20px;">
<li>Nodes: Number of nodes in the knowledge graph.</li>
<li><strong>Edges</strong>: Number of edges in the knowledge graph.</li>
<li><strong>Density</strong>: Edge saturation level. <em>Excessive sparsity limits relational bridges</em>.</li>
<li><strong>Rel_Types (Relation Type Diversity)</strong>: Semantic richness of edges for precise graph traversal.</li>
<li><strong>Avg_Deg (Average Degree)</strong>: Average connections per node, reflecting connection intensity.</li>
<li><strong>Comp (Connected Components)</strong>: Number of independent subgraphs.</li>
<li><strong>LCC_Ratio (Largest Connected Component Ratio)</strong>: Proportion of nodes in the largest subgraph. <em>Low values indicate graph fragmentation that breaks multi-hop paths</em>.</li>
<li><strong>Cluster_Coeff (Clustering Coefficient)</strong>: Local cohesiveness. <em>High values indicate tight communities that facilitate evidence aggregation</em>.</li>
</ul>
</div>
""", unsafe_allow_html=True)
    df_structure = pd.read_csv("utils/corpus_structure.csv")
    col1_s, col2_s, col3_s = st.columns([2, 2, 3])
    with col1_s:
        sort_by_s = st.selectbox(
            "Sort by",
            options=df_structure.columns.tolist(),
            index=0,
            key="sort_structure"
        )
    with col2_s:
        order_s = st.radio(
            "Order",
            options=["Descending", "Ascending"],
            horizontal=True,
            key="order_structure"
        )
    with col3_s:
        search_s = st.text_input("Search", placeholder="Search in all columns...", key="search_structure")
    df_display_s = df_structure.copy()
    if search_s:
        # regex=False: match the typed text literally. With the default
        # regex=True, input such as "(" or "[" raised re.error and crashed
        # the page.
        mask_s = df_display_s.apply(
            lambda row: row.astype(str).str.contains(search_s, case=False, regex=False).any(),
            axis=1,
        )
        df_display_s = df_display_s[mask_s]
    ascending_s = order_s == "Ascending"
    df_display_s = df_display_s.sort_values(by=sort_by_s, ascending=ascending_s).reset_index(drop=True)
    st.markdown(df_to_html_table(df_display_s, height=200), unsafe_allow_html=True)
    # Semantic Metrics Explanation
    st.markdown("""
<div style="font-size: 15px; line-height: 1.5; color: #333; margin-top: 30px; margin-bottom: 20px;">
<p style="font-weight: bold; margin-bottom: 10px;">🧠 Semantic Space Metrics:</p>
<ul style="margin: 0; padding-left: 20px;">
<li><strong>Chunks</strong>: Number of text chunks in the corpus.</li>
<li><strong>Int_Dim (Intrinsic Dimension)</strong>: Effective degrees of freedom estimated via TwoNN. <em>High dimensionality exacerbates the curse of dimensionality, diminishing distance-based similarity</em>.</li>
<li><strong>Hubness</strong>: Skewness of k-occurrence distribution, measuring retrieval interference. <em>High values indicate hub vectors that dominate nearest-neighbor lists, causing bias toward frequently retrieved but potentially irrelevant passages</em>.</li>
<li><strong>Avg_Dist (Average Distance)</strong>: Average distance to centroid, reflecting overall distribution spread.</li>
<li><strong>Std_Dist (Standard Deviation)</strong>: Distance standard deviation, revealing distributional imbalance. <em>High values indicate uneven distribution</em>.</li>
<li><strong>Min_Dist (Minimum Distance)</strong>: Distance of closest cluster pair, identifying most confusable semantic regions. <em>Low dispersion causes semantic crowding that hinders hard-negative discrimination</em>.</li>
<li><strong>Max_Dist (Maximum Distance)</strong>: Distance of farthest cluster pair, reflecting maximum semantic space span.</li>
</ul>
</div>
""", unsafe_allow_html=True)
    df_semantic = pd.read_csv("utils/corpus_semantic.csv")
    col1_m, col2_m, col3_m = st.columns([2, 2, 3])
    with col1_m:
        sort_by_m = st.selectbox(
            "Sort by",
            options=df_semantic.columns.tolist(),
            index=0,
            key="sort_semantic"
        )
    with col2_m:
        order_m = st.radio(
            "Order",
            options=["Descending", "Ascending"],
            horizontal=True,
            key="order_semantic"
        )
    with col3_m:
        search_m = st.text_input("Search", placeholder="Search in all columns...", key="search_semantic")
    df_display_m = df_semantic.copy()
    if search_m:
        # Literal substring match, for the same reason as the table above.
        mask_m = df_display_m.apply(
            lambda row: row.astype(str).str.contains(search_m, case=False, regex=False).any(),
            axis=1,
        )
        df_display_m = df_display_m[mask_m]
    ascending_m = order_m == "Ascending"
    df_display_m = df_display_m.sort_values(by=sort_by_m, ascending=ascending_m).reset_index(drop=True)
    st.markdown(df_to_html_table(df_display_m, height=200), unsafe_allow_html=True)
# Effectiveness Metrics tab: metric legend, then a per-model CSV (chosen via
# the "Model" select box) with sort / order / search controls.
with lb_tab3:
    # Metrics Explanation
    st.markdown("""
<div style="font-size: 15px; line-height: 1.5; color: #333; margin-bottom: 20px;">
<p style="font-weight: bold; margin-bottom: 10px;">📊 Metrics Explained:</p>
<ul style="margin: 0; padding-left: 20px;">
<li><strong>Sem_F1 (Semantic F1)</strong>: Token-level semantic similarity between generated and reference answers using BERTScore. Range: 0-1, <em>higher is better</em>.</li>
<li><strong>COV (Coverage)</strong>: Extent to which the answer covers key information using sentence embeddings. Range: 0-1, <em>higher is better</em>.</li>
<li><strong>Faith_H (Faithfulness Hard)</strong>: Strict support relationship between answer and retrieved content. Range: 0-1, <em>higher is better</em>.</li>
<li><strong>Faith_S (Faithfulness Soft)</strong>: Relaxed support relationship between answer and retrieved content. Range: 0-1, <em>higher is better</em>.</li>
<li><strong>LLM_Cor_Pct (LLM-as-a-Judge)</strong>: Correctness rate via LLM ternary classification, aligned with human judgment. Range: 0-100%, <em>higher is better</em>.</li>
</ul>
</div>
""", unsafe_allow_html=True)
    # Model files mapping
    model_files = {
        "DeepSeek-V3": "utils/effect_deepseek.csv",
        "Llama-3-8B": "utils/effect_llama.csv"
    }
    # Controls
    col1_e, col2_e, col3_e, col4_e = st.columns([2, 2, 2, 3])
    with col1_e:
        model_select = st.selectbox(
            "Model",
            options=list(model_files.keys()),
            index=0,
            key="model_effect"
        )
    # Load the CSV for the selected model before building the sort options,
    # which are derived from its columns.
    df_effect = pd.read_csv(model_files[model_select])
    with col2_e:
        sort_by_e = st.selectbox(
            "Sort by",
            options=df_effect.columns.tolist(),
            index=0,
            key="sort_effect"
        )
    with col3_e:
        order_e = st.radio(
            "Order",
            options=["Descending", "Ascending"],
            horizontal=True,
            key="order_effect"
        )
    with col4_e:
        search_e = st.text_input("Search", placeholder="Search in all columns...", key="search_effect")
    df_display_e = df_effect.copy()
    if search_e:
        # regex=False: match the typed text literally. With the default
        # regex=True, input such as "(" or "[" raised re.error and crashed
        # the page.
        mask_e = df_display_e.apply(
            lambda row: row.astype(str).str.contains(search_e, case=False, regex=False).any(),
            axis=1,
        )
        df_display_e = df_display_e[mask_e]
    ascending_e = order_e == "Ascending"
    df_display_e = df_display_e.sort_values(by=sort_by_e, ascending=ascending_e).reset_index(drop=True)
    st.markdown(df_to_html_table(df_display_e), unsafe_allow_html=True)
# Efficiency Metrics tab: cost legend, then sort / order / search controls
# over utils/retrieval_generation_cost.csv. The widgets here carry no explicit
# keys, unlike the other tabs; that is left unchanged.
with lb_tab4:
    # Cost Explanation
    st.markdown("""
<div style="font-size: 15px; line-height: 1.5; color: #333; margin-bottom: 20px;">
<p style="font-weight: bold; margin-bottom: 10px;">💰 Cost Explained:</p>
<ul style="margin: 0; padding-left: 20px;">
<li><strong>Total_Tokens</strong>: Total token consumption = Retrieval_Total + Generation_Total.</li>
<li><strong>Retrieval_Total</strong>: Total tokens in retrieval phase = Retrieval_Input + Retrieval_Output. Includes entity extraction, multi-turn queries. For GraphRAG/HybridRAG, includes amortized one-time graph construction cost.</li>
<li><strong>Generation_Total</strong>: Total tokens in generation phase = Generation_Input + Generation_Output. Primarily determined by context length.</li>
<li><strong>Avg_Context_Tokens</strong>: Average retrieved context length per query. <em>Higher means more retrieved content but also higher cost</em>.</li>
<li><strong>Num_Questions</strong>: Number of queries in the dataset.</li>
</ul>
</div>
""", unsafe_allow_html=True)
    # Read data
    df_efficiency = pd.read_csv("utils/retrieval_generation_cost.csv")
    # Controls
    col1, col2, col3 = st.columns([2, 2, 3])
    with col1:
        sort_by = st.selectbox(
            "Sort by",
            options=df_efficiency.columns.tolist(),
            index=df_efficiency.columns.tolist().index("Total_Tokens")  # sort by Total_Tokens by default
        )
    with col2:
        order = st.radio(
            "Order",
            options=["Descending", "Ascending"],
            horizontal=True
        )
    with col3:
        search = st.text_input("Search", placeholder="Search in all columns...")
    df_display = df_efficiency.copy()
    if search:
        # regex=False: match the typed text literally. With the default
        # regex=True, input such as "(" or "[" raised re.error and crashed
        # the page.
        mask = df_display.apply(
            lambda row: row.astype(str).str.contains(search, case=False, regex=False).any(),
            axis=1,
        )
        df_display = df_display[mask]
    ascending = order == "Ascending"
    df_display = df_display.sort_values(by=sort_by, ascending=ascending).reset_index(drop=True)
    st.markdown(df_to_html_table(df_display), unsafe_allow_html=True)
# Questions & Contact section: heading with an inline base64 icon, followed
# by a bordered card with the contact e-mail and the GitHub repository link.
contact_icon = get_image_base64("utils/contact_icon.png")
contact_heading_html = f"""
<h2 id="contact" style="color: #333333;
padding-bottom: 10px;
margin-top: 10px;
font-family: 'Ubuntu Mono', monospace;
font-size: 30px;">
<img src="data:image/png;base64,{contact_icon}" width="30" style="vertical-align: middle; margin-right: 1px;">
Questions & Contact
</h2>
"""
st.markdown(contact_heading_html, unsafe_allow_html=True)
contact_card_html = """
<div style="
border: 2px solid #e0e0e0;
border-radius: 15px;
padding: 25px 30px;
background-color: #fafafa;
margin-bottom: 10px;
font-size: 16px;
line-height: 1.6;
color: #333;
">
<p style="margin-bottom: 5px;">
If you have any questions about RAGRouter-Bench, please feel free to reach out to us:
</p>
<ul style="margin: 0; padding-left: 20px;">
<li><strong>Email</strong>: <a href="mailto:ziqi.wang0908@rutgers.edu" style="color: #667eea;">ziqi.wang0908@rutgers.edu</a></li>
<li><strong>GitHub</strong>: <a href="https://github.com/ziqiwang0908/RAGRouter-Bench" style="color: #667eea;" target="_blank">https://github.com/ziqiwang0908/RAGRouter-Bench</a></li>
</ul>
<p style="margin-top: 5px; margin-bottom: 0;">
For bug reports or feature requests, please open an issue on our GitHub repository.
</p>
</div>
"""
st.markdown(contact_card_html, unsafe_allow_html=True)