Spaces:
Build error
Build error
博闻 committed on
Commit ·
4b8f4b4
1
Parent(s): 66d686a
update leaderboard
Browse files- .gitattributes +1 -1
- app.py +44 -144
- chinatravel/data/load_datasets.py +18 -10
- chinatravel/ui/__init__.py +0 -0
- chinatravel/ui/content.py +24 -0
- chinatravel/ui/eval_runner.py +125 -0
- chinatravel/ui/leaderboard.py +44 -0
- eval_exp.py +1 -1
- leaderboard_data/easy.csv +23 -0
- leaderboard_data/human.csv +21 -0
- leaderboard_data/human1000.csv +7 -0
- leaderboard_data/lb_all/easy.csv +23 -0
- leaderboard_data/lb_all/human.csv +21 -0
- leaderboard_data/lb_all/human1000.csv +7 -0
.gitattributes
CHANGED
|
@@ -32,4 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 32 |
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 32 |
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
app.py
CHANGED
|
@@ -1,165 +1,65 @@
|
|
| 1 |
-
import os
|
| 2 |
-
import time
|
| 3 |
-
import json
|
| 4 |
-
import shutil
|
| 5 |
-
import zipfile
|
| 6 |
import gradio as gr
|
| 7 |
-
from eval_exp import evaluate
|
| 8 |
-
from datetime import datetime
|
| 9 |
-
from apscheduler.schedulers.background import BackgroundScheduler
|
| 10 |
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
-
def load_splits():
|
| 13 |
-
splits_dir = "chinatravel/evaluation/default_splits"
|
| 14 |
-
splits = []
|
| 15 |
-
for filename in os.listdir(splits_dir):
|
| 16 |
-
if filename.endswith(".txt"):
|
| 17 |
-
splits.append(filename.replace(".txt", ""))
|
| 18 |
-
return splits
|
| 19 |
|
|
|
|
|
|
|
| 20 |
|
| 21 |
-
SPLITS_LIST = load_splits()
|
| 22 |
-
# SUBMIT_DIR = "./submissions"
|
| 23 |
-
# OUTPUT_DIR = "./outputs"
|
| 24 |
-
SUBMIT_DIR = os.path.abspath("submissions")
|
| 25 |
-
OUTPUT_DIR = os.path.abspath("outputs")
|
| 26 |
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
print(f"Submission directory: {SUBMIT_DIR}")
|
| 32 |
-
print(f"Output directory: {OUTPUT_DIR}")
|
| 33 |
-
# clear directories if they already exist
|
| 34 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
|
| 36 |
-
|
| 37 |
-
def clean_old_outputs(folder, keep_hours=24):
|
| 38 |
-
now = time.time()
|
| 39 |
-
for fname in os.listdir(folder):
|
| 40 |
-
fpath = os.path.join(folder, fname)
|
| 41 |
-
if os.path.isfile(fpath) and now - os.path.getmtime(fpath) > keep_hours * 3600:
|
| 42 |
-
os.remove(fpath)
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
scheduler = BackgroundScheduler()
|
| 46 |
-
scheduler.add_job(lambda: clean_old_outputs(OUTPUT_DIR), "interval", hours=6)
|
| 47 |
-
scheduler.start()
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
class Arguments:
|
| 51 |
-
def __init__(self, splits, result_dir):
|
| 52 |
-
self.splits = splits
|
| 53 |
-
self.result_dir = result_dir
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
def handle_submission(zip_file, dataset_choice):
|
| 57 |
-
if zip_file is None:
|
| 58 |
-
# yield "❌ 请上传 zip 文件!", 0, 0, 0, None
|
| 59 |
-
yield "❌ Please upload a zip file!", 0, 0, 0, None
|
| 60 |
-
return
|
| 61 |
-
|
| 62 |
-
shutil.rmtree(SUBMIT_DIR, ignore_errors=True)
|
| 63 |
-
os.makedirs(SUBMIT_DIR, exist_ok=True)
|
| 64 |
-
|
| 65 |
-
# 解压操作
|
| 66 |
-
with zipfile.ZipFile(zip_file, "r") as zip_ref:
|
| 67 |
-
print(f"Extracting {zip_file} to {SUBMIT_DIR}...")
|
| 68 |
-
zip_ref.extractall(SUBMIT_DIR)
|
| 69 |
-
|
| 70 |
-
# 获取当前时间戳
|
| 71 |
-
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
| 72 |
-
print(f"Submission dir: {SUBMIT_DIR}")
|
| 73 |
-
|
| 74 |
-
# 如果 SUBMIT_DIR 下只有一个子目录,就使用这个子目录,否则使用 SUBMIT_DIR 本身
|
| 75 |
-
subdirs = [d for d in os.listdir(SUBMIT_DIR) if os.path.isdir(os.path.join(SUBMIT_DIR, d))]
|
| 76 |
-
if len(subdirs) == 1:
|
| 77 |
-
unzipped_dir = os.path.join(SUBMIT_DIR, subdirs[0])
|
| 78 |
-
else:
|
| 79 |
-
unzipped_dir = SUBMIT_DIR # 没有子目录或有多个子目录时使用 SUBMIT_DIR
|
| 80 |
-
print(f"Unzipped directory: {unzipped_dir}")
|
| 81 |
-
output_path = os.path.join(OUTPUT_DIR, f"result_main_{timestamp}.json")
|
| 82 |
-
args = Arguments(splits=dataset_choice, result_dir=unzipped_dir)
|
| 83 |
-
|
| 84 |
-
try:
|
| 85 |
-
# yield "🚀 开始测评...", 0, 0, 0, None
|
| 86 |
-
yield "🚀 Starting evaluation...", 0, 0, 0, None
|
| 87 |
-
|
| 88 |
-
result = {}
|
| 89 |
-
for progress in evaluate(args, result):
|
| 90 |
-
stage = progress.get("stage", "")
|
| 91 |
-
progress_value = progress.get("progress", 0)
|
| 92 |
-
|
| 93 |
-
if stage == "schema":
|
| 94 |
-
# yield "Schema 阶段测评中...", progress_value, 0, 0, None
|
| 95 |
-
yield "Schema evaluation in progress...", 100, progress_value, 0, None
|
| 96 |
-
elif stage == "commonsense":
|
| 97 |
-
# yield "Commonsense 阶段测评中...", 100, progress_value, 0, None
|
| 98 |
-
yield "Commonsense evaluation in progress...", 100, 100, progress_value, None
|
| 99 |
-
elif stage == "logic":
|
| 100 |
-
# yield "Logic 阶段测评中...", 100, 100, progress_value, None
|
| 101 |
-
yield "Logic evaluation in progress...", 100, 100, 100, None
|
| 102 |
-
elif stage == "final":
|
| 103 |
-
result.update(progress.get("result", {}))
|
| 104 |
-
# yield "测评完成,正在保存结果...", 100, 100, 100, None
|
| 105 |
-
yield "Evaluation completed, saving results...", 100, 100, 100, None
|
| 106 |
-
|
| 107 |
-
# 保存结果到文件
|
| 108 |
-
with open(output_path, "w", encoding="utf-8") as f:
|
| 109 |
-
json.dump(result, f, ensure_ascii=False, indent=4)
|
| 110 |
-
|
| 111 |
-
# 在测评完成后更新结果文件的值和可见性
|
| 112 |
-
result_file.value = output_path
|
| 113 |
-
result_file.visible = True
|
| 114 |
-
# yield "✅ 测评完成!", 100, 100, 100, output_path
|
| 115 |
-
yield "✅ Evaluation completed!", 100, 100, 100, output_path
|
| 116 |
-
|
| 117 |
-
except Exception as e:
|
| 118 |
-
import traceback
|
| 119 |
-
|
| 120 |
-
traceback.print_exc()
|
| 121 |
-
# yield f"❌ 测评异常:{e}", 0, 0, 0, None
|
| 122 |
-
yield f"❌ Evaluation error: {e}", 0, 0, 0, None
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
with gr.Blocks() as demo:
|
| 126 |
-
# gr.Markdown("# 📊 ChinaTravel 模型测评")
|
| 127 |
-
gr.Markdown(
|
| 128 |
-
"# 📊 ChinaTravel Benchmark Evaluation"
|
| 129 |
-
)
|
| 130 |
-
|
| 131 |
-
# with gr.Row():
|
| 132 |
-
# zip_input = gr.File(label="上传模型预测 zip 文件", file_types=[".zip"])
|
| 133 |
-
# dataset_choice = gr.Radio(
|
| 134 |
-
# SPLITS_LIST, label="选择评估数据集", value="validation"
|
| 135 |
-
# )
|
| 136 |
-
zip_input = gr.File(label="Upload zip file of results", file_types=[".zip"])
|
| 137 |
dataset_choice = gr.Radio(
|
| 138 |
-
SPLITS_LIST,
|
|
|
|
|
|
|
|
|
|
| 139 |
)
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
|
|
|
| 147 |
)
|
| 148 |
-
commonsense_progress = gr.
|
| 149 |
-
label="Commonsense
|
|
|
|
|
|
|
| 150 |
)
|
| 151 |
-
logic_progress = gr.
|
| 152 |
-
label="Logic
|
|
|
|
|
|
|
| 153 |
)
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
|
| 158 |
submit_btn.click(
|
| 159 |
handle_submission,
|
| 160 |
inputs=[zip_input, dataset_choice],
|
| 161 |
outputs=[
|
| 162 |
-
|
| 163 |
schema_progress,
|
| 164 |
commonsense_progress,
|
| 165 |
logic_progress,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import gradio as gr
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
+
from chinatravel.ui import content
|
| 4 |
+
from chinatravel.ui.eval_runner import DEFAULT_SPLIT, SPLITS_LIST, handle_submission
|
| 5 |
+
from chinatravel.ui.leaderboard import build_placeholder_frames
|
| 6 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
+
default_split_value = DEFAULT_SPLIT or (SPLITS_LIST[0] if SPLITS_LIST else None)
|
| 9 |
+
leaderboard_frames = build_placeholder_frames(SPLITS_LIST)
|
| 10 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
+
with gr.Blocks(title="ChinaTravel Benchmark Evaluation") as demo:
|
| 13 |
+
gr.HTML(content.TITLE_HTML)
|
| 14 |
+
gr.Markdown(content.INTRO_MARKDOWN)
|
| 15 |
+
gr.Markdown(content.SUBMISSION_GUIDE)
|
|
|
|
|
|
|
|
|
|
| 16 |
|
| 17 |
+
gr.Markdown("### Leaderboard")
|
| 18 |
+
# Raw string: "\*" is not a valid Python escape sequence, so a plain literal
# triggers a SyntaxWarning on recent interpreters. The runtime value is
# unchanged (backslash + asterisk, i.e. a Markdown-escaped literal "*").
gr.Markdown(r"Methods marked with \* leverage Oracle DSL or an Oracle Verifier.")
|
| 19 |
+
if SPLITS_LIST:
|
| 20 |
+
with gr.Tabs():
|
| 21 |
+
for split in SPLITS_LIST:
|
| 22 |
+
with gr.Tab(split):
|
| 23 |
+
gr.Dataframe(
|
| 24 |
+
value=leaderboard_frames.get(split),
|
| 25 |
+
interactive=False,
|
| 26 |
+
wrap=True,
|
| 27 |
+
)
|
| 28 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
dataset_choice = gr.Radio(
|
| 30 |
+
SPLITS_LIST,
|
| 31 |
+
label="Select evaluation split",
|
| 32 |
+
value=default_split_value,
|
| 33 |
+
interactive=True,
|
| 34 |
)
|
| 35 |
+
zip_input = gr.File(label="Upload result archive (.zip)", file_types=[".zip"])
|
| 36 |
+
submit_btn = gr.Button("Run evaluation", variant="primary")
|
| 37 |
+
|
| 38 |
+
status = gr.Markdown("Ready to evaluate.")
|
| 39 |
+
schema_progress = gr.Textbox(
|
| 40 |
+
label="Schema progress",
|
| 41 |
+
value="0%",
|
| 42 |
+
interactive=False,
|
| 43 |
)
|
| 44 |
+
commonsense_progress = gr.Textbox(
|
| 45 |
+
label="Commonsense progress",
|
| 46 |
+
value="0%",
|
| 47 |
+
interactive=False,
|
| 48 |
)
|
| 49 |
+
logic_progress = gr.Textbox(
|
| 50 |
+
label="Logic progress",
|
| 51 |
+
value="0%",
|
| 52 |
+
interactive=False,
|
| 53 |
)
|
| 54 |
+
result_file = gr.File(label="Download evaluation report", interactive=False)
|
| 55 |
+
|
| 56 |
+
gr.Markdown(content.CONTACT)
|
| 57 |
|
| 58 |
submit_btn.click(
|
| 59 |
handle_submission,
|
| 60 |
inputs=[zip_input, dataset_choice],
|
| 61 |
outputs=[
|
| 62 |
+
status,
|
| 63 |
schema_progress,
|
| 64 |
commonsense_progress,
|
| 65 |
logic_progress,
|
chinatravel/data/load_datasets.py
CHANGED
|
@@ -84,13 +84,21 @@ def save_json_file(json_data, file_path):
|
|
| 84 |
json.dump(json_data, dump_f, ensure_ascii=False, indent=4, cls=NpEncoder)
|
| 85 |
|
| 86 |
|
| 87 |
-
|
| 88 |
def load_query(args):
|
| 89 |
-
|
| 90 |
-
if not args.splits in [
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
return load_query_local(args)
|
| 95 |
config_name = "synthetic"
|
| 96 |
if args.splits in ["preference0_base50", "preference1_base50", "preference2_base50",
|
|
@@ -100,13 +108,14 @@ def load_query(args):
|
|
| 100 |
config_name = "validation"
|
| 101 |
elif args.splits in ["human1000"]:
|
| 102 |
config_name = "test"
|
| 103 |
-
query_data = hg_load_dataset("LAMDA-NeSy/chinatravel_test", name=config_name)[
|
| 104 |
-
|
|
|
|
| 105 |
|
| 106 |
for data_i in query_data:
|
| 107 |
if "hard_logic_py" in data_i:
|
| 108 |
data_i["hard_logic_py"] = ast.literal_eval(data_i["hard_logic_py"])
|
| 109 |
-
|
| 110 |
query_id_list = [data_i["uid"] for data_i in query_data]
|
| 111 |
data_dict = {}
|
| 112 |
for data_i in query_data:
|
|
@@ -140,4 +149,3 @@ if __name__ == "__main__":
|
|
| 140 |
print(uid, query_data[uid])
|
| 141 |
else:
|
| 142 |
raise ValueError(f"{uid} not in query_data")
|
| 143 |
-
|
|
|
|
| 84 |
json.dump(json_data, dump_f, ensure_ascii=False, indent=4, cls=NpEncoder)
|
| 85 |
|
| 86 |
|
|
|
|
| 87 |
def load_query(args):
|
| 88 |
+
|
| 89 |
+
if not args.splits in [
|
| 90 |
+
"easy",
|
| 91 |
+
"medium",
|
| 92 |
+
"human",
|
| 93 |
+
"human1000",
|
| 94 |
+
"preference_base50",
|
| 95 |
+
"preference0_base50",
|
| 96 |
+
"preference1_base50",
|
| 97 |
+
"preference2_base50",
|
| 98 |
+
"preference3_base50",
|
| 99 |
+
"preference4_base50",
|
| 100 |
+
"preference5_base50",
|
| 101 |
+
]:
|
| 102 |
return load_query_local(args)
|
| 103 |
config_name = "synthetic"
|
| 104 |
if args.splits in ["preference0_base50", "preference1_base50", "preference2_base50",
|
|
|
|
| 108 |
config_name = "validation"
|
| 109 |
elif args.splits in ["human1000"]:
|
| 110 |
config_name = "test"
|
| 111 |
+
query_data = hg_load_dataset("LAMDA-NeSy/chinatravel_test", name=config_name)[
|
| 112 |
+
args.splits
|
| 113 |
+
].to_list()
|
| 114 |
|
| 115 |
for data_i in query_data:
|
| 116 |
if "hard_logic_py" in data_i:
|
| 117 |
data_i["hard_logic_py"] = ast.literal_eval(data_i["hard_logic_py"])
|
| 118 |
+
|
| 119 |
query_id_list = [data_i["uid"] for data_i in query_data]
|
| 120 |
data_dict = {}
|
| 121 |
for data_i in query_data:
|
|
|
|
| 149 |
print(uid, query_data[uid])
|
| 150 |
else:
|
| 151 |
raise ValueError(f"{uid} not in query_data")
|
|
|
chinatravel/ui/__init__.py
ADDED
|
File without changes
|
chinatravel/ui/content.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Static text fragments rendered by the Gradio front end.

# Centered page heading, rendered as raw HTML.
TITLE_HTML = """
<h1 style="text-align:center; margin-bottom: 0.25rem;">ChinaTravel Benchmark Evaluation</h1>
"""

# One-paragraph description of the benchmark with a link to the paper.
INTRO_MARKDOWN = """
ChinaTravel is an open-ended travel planning benchmark with compositional constraint validation for language agents. (See our [paper](https://arxiv.org/abs/2412.13682) for more details.)
"""

# Submission instructions, reported metrics and contact details.
# The e-mail link targets carry an explicit ``mailto:`` scheme; a bare address
# as a Markdown link destination is resolved as a relative URL and is broken.
SUBMISSION_GUIDE = """
**How to submit**
- Pick a split. The split determines which query UIDs are expected.
- Upload a `.zip` that contains one JSON file per UID. The evaluator recursively scans subfolders, so any directory layout is acceptable.
- Each JSON must follow the target schema: see [chinatravel/evaluation/output_schema.json](chinatravel/evaluation/output_schema.json).
- You can dry-run locally via `python eval_exp.py --splits <split> --method <your_method>` to mirror the hosted evaluation.

**Output**
- We compute DR (schema pass rate), EPR_micro/EPR_macro (commonsense), LPR_micro/LPR_macro/C-LPR (logic), and FPR (all-pass rate).
- A detailed JSON report is produced for download after evaluation.

**Contact**
- If you are interested in showing your results on our leaderboard or have any questions, please contact [Jie-Jing Shao](mailto:shaojj@lamda.nju.edu.cn), [Bo-Wen Zhang](mailto:221900200@smail.nju.edu.cn), [Xiao-Wen Yang](mailto:yangxw@lamda.nju.edu.cn)
"""

# Short contact line shown at the bottom of the page.
CONTACT = "Contact: zbw@smail.nju.edu.cn, shaojj@lamda.nju.edu.cn"
|
chinatravel/ui/eval_runner.py
ADDED
|
@@ -0,0 +1,125 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import time
|
| 3 |
+
import json
|
| 4 |
+
import shutil
|
| 5 |
+
import zipfile
|
| 6 |
+
from datetime import datetime
|
| 7 |
+
from apscheduler.schedulers.background import BackgroundScheduler
|
| 8 |
+
from eval_exp import evaluate
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
# Retention policy for generated reports: the clean-up job fires every
# _CLEAN_INTERVAL_HOURS hours and deletes files older than _KEEP_HOURS hours.
_KEEP_HOURS = 24
_CLEAN_INTERVAL_HOURS = 6

# Working directories, resolved to absolute paths once at import time.
OUTPUT_DIR = os.path.abspath("outputs")  # evaluation reports are written here
SUBMIT_DIR = os.path.abspath("submissions")  # uploaded archives are unpacked here
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def load_splits():
    """Return the sorted names of the evaluation splits found on disk.

    A split is any ``*.txt`` file directly inside
    ``chinatravel/evaluation/default_splits`` (relative to the current working
    directory); its name is the file name without the trailing ``.txt``.

    Returns:
        A sorted list of split names, or an empty list when the directory
        does not exist.
    """
    splits_dir = os.path.join("chinatravel", "evaluation", "default_splits")
    if not os.path.isdir(splits_dir):
        return []

    suffix = ".txt"
    # Slice off only the trailing suffix: str.replace() would also delete any
    # ".txt" occurring in the middle of a file name.
    return sorted(
        filename[: -len(suffix)]
        for filename in os.listdir(splits_dir)
        if filename.endswith(suffix)
    )
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
# Splits discovered at import time.
SPLITS_LIST = load_splits()

# Split pre-selected in the UI: "easy" when present, otherwise the first
# discovered split, or None when no split exists at all.
if "easy" in SPLITS_LIST:
    DEFAULT_SPLIT = "easy"
elif SPLITS_LIST:
    DEFAULT_SPLIT = SPLITS_LIST[0]
else:
    DEFAULT_SPLIT = None
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
class Arguments:
    """Plain attribute bag handed to ``evaluate`` as its ``args`` argument.

    Attributes:
        splits: name of the evaluation split selected by the user.
        result_dir: directory holding the unpacked prediction files.
    """

    def __init__(self, splits, result_dir):
        self.splits, self.result_dir = splits, result_dir
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def _reset_workdirs():
    """Delete and recreate both working directories, leaving them empty."""
    workdirs = (SUBMIT_DIR, OUTPUT_DIR)
    # Remove first (errors ignored, e.g. when a directory is absent) ...
    for workdir in workdirs:
        shutil.rmtree(workdir, ignore_errors=True)
    # ... then recreate each one.
    for workdir in workdirs:
        os.makedirs(workdir, exist_ok=True)
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def clean_old_outputs(folder, keep_hours=_KEEP_HOURS):
    """Delete regular files directly inside ``folder`` older than ``keep_hours``.

    Age is measured from each file's modification time. Sub-directories are
    left untouched.

    Args:
        folder: directory to clean.
        keep_hours: files modified more than this many hours ago are removed.
    """
    cutoff = time.time() - keep_hours * 3600

    # The folder is deleted and recreated by _reset_workdirs(), so it (or any
    # file in it) can vanish while this job runs; treat that as "nothing to do"
    # instead of letting the scheduled job fail.
    try:
        names = os.listdir(folder)
    except FileNotFoundError:
        return

    for fname in names:
        fpath = os.path.join(folder, fname)
        try:
            if os.path.isfile(fpath) and os.path.getmtime(fpath) < cutoff:
                os.remove(fpath)
        except FileNotFoundError:
            # Removed by someone else between the listing and this check.
            continue
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
# Import-time side effects: start from empty working directories, then run a
# daemon background job that periodically purges expired reports.
_reset_workdirs()

_scheduler = BackgroundScheduler(daemon=True)
_scheduler.add_job(
    clean_old_outputs,
    "interval",
    args=(OUTPUT_DIR,),
    hours=_CLEAN_INTERVAL_HOURS,
)
_scheduler.start()
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
def _extract_submission(zip_file):
    """Unpack the uploaded archive into ``SUBMIT_DIR``.

    Args:
        zip_file: either a path or an object exposing the path as ``.name``
            (presumably what the Gradio file component passes in; confirm
            against the installed Gradio version).
    """
    archive_path = zip_file.name if hasattr(zip_file, "name") else zip_file
    # NOTE(review): the archive comes from an end user; extractall() places
    # no limit on the unpacked size.
    with zipfile.ZipFile(archive_path, "r") as archive:
        archive.extractall(SUBMIT_DIR)
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def _locate_unzipped_root():
    """Return the directory that holds the extracted prediction files.

    When ``SUBMIT_DIR`` contains exactly one sub-directory, that sub-directory
    is returned; with zero or several sub-directories ``SUBMIT_DIR`` itself is
    returned.
    """
    candidates = []
    for entry in os.listdir(SUBMIT_DIR):
        full_path = os.path.join(SUBMIT_DIR, entry)
        if os.path.isdir(full_path):
            candidates.append(full_path)

    if len(candidates) != 1:
        return SUBMIT_DIR
    return candidates[0]
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
def handle_submission(zip_file, dataset_choice):
    """Evaluate an uploaded prediction archive, streaming progress updates.

    This is a generator. Every yielded item is a 5-tuple
    ``(status, schema, commonsense, logic, report_path)``: a status message,
    three progress strings of the form ``"N%"``, and the path of the JSON
    report (``None`` until the evaluation has finished successfully).

    Args:
        zip_file: uploaded archive (path or object with a ``.name`` path), or
            ``None`` when nothing was uploaded.
        dataset_choice: name of the split to evaluate against.

    Any exception raised while extracting or evaluating is printed with its
    traceback and reported through a final ``"❌ Evaluation error: ..."`` item
    rather than propagated.
    """
    # Progress/report values for "nothing has run". Always strings, so every
    # yield from this function has the same value types.
    idle = ("0%", "0%", "0%", None)

    if zip_file is None:
        yield ("❌ Please upload a zip file with your predictions.", *idle)
        return
    if not dataset_choice:
        yield ("❌ Please choose an evaluation split.", *idle)
        return

    # Reset only the submission area. Reports in OUTPUT_DIR are left in place:
    # the scheduled clean-up job removes them once they are older than the
    # retention period, so wiping them per request would delete reports early.
    shutil.rmtree(SUBMIT_DIR, ignore_errors=True)
    os.makedirs(SUBMIT_DIR, exist_ok=True)
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    try:
        yield ("🚀 Starting evaluation...", *idle)

        _extract_submission(zip_file)
        unzipped_dir = _locate_unzipped_root()
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_path = os.path.join(OUTPUT_DIR, f"result_main_{timestamp}.json")
        args = Arguments(splits=dataset_choice, result_dir=unzipped_dir)

        schema_progress = commonsense_progress = logic_progress = 0
        result = {}
        # evaluate() yields dicts with a "stage" name and a "progress" number;
        # the "final" stage additionally carries the metrics under "result".
        for progress in evaluate(args, result):
            stage = progress.get("stage", "")
            value = int(round(progress.get("progress", 0)))

            if stage == "schema":
                schema_progress = value
                yield (
                    "Schema evaluation in progress...",
                    f"{schema_progress}%",
                    f"{commonsense_progress}%",
                    f"{logic_progress}%",
                    None,
                )
            elif stage == "commonsense":
                commonsense_progress = value
                yield (
                    "Commonsense evaluation in progress...",
                    "100%",
                    f"{commonsense_progress}%",
                    f"{logic_progress}%",
                    None,
                )
            elif stage == "logic":
                logic_progress = value
                yield (
                    "Logic evaluation in progress...",
                    "100%",
                    "100%",
                    f"{logic_progress}%",
                    None,
                )
            elif stage == "final":
                result.update(progress.get("result", {}))
                yield ("Saving results...", "100%", "100%", "100%", None)

        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(result, f, ensure_ascii=False, indent=4)

        filename = os.path.basename(output_path)
        yield (
            f"✅ Evaluation completed. Report saved to {filename}.",
            "100%",
            "100%",
            "100%",
            output_path,
        )

    except Exception as exc:  # pragma: no cover - surfaced to UI
        import traceback

        traceback.print_exc()
        yield (f"❌ Evaluation error: {exc}", *idle)
|
chinatravel/ui/leaderboard.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
# Column layout of a leaderboard table (matches the CSV header row).
PLACEHOLDER_COLUMNS = [
    "Organization",
    "Method",
    "Model",
    "DR",
    "EPR(Micro/Macro)",
    "LPR(Micro/Macro)",
    "C-LPR",
    "FPR",
]

# Composite "micro / macro" text columns mapped to the temporary numeric
# columns derived from them for sorting.
_PAIR_COLUMNS = {
    "EPR(Micro/Macro)": ("EPR_Micro", "EPR_Macro"),
    "LPR(Micro/Macro)": ("LPR_Micro", "LPR_Macro"),
}

# Ranking priority, highest first; every key is sorted in descending order.
_SORT_KEYS = [
    "FPR",
    "C-LPR",
    "EPR_Macro",
    "LPR_Macro",
    "EPR_Micro",
    "LPR_Micro",
    "DR",
]


def _empty_leaderboard():
    """Return an empty table shaped like a loaded leaderboard CSV."""
    return pd.DataFrame(columns=PLACEHOLDER_COLUMNS).set_index(PLACEHOLDER_COLUMNS[0])


def _rank_leaderboard(frame):
    """Return ``frame`` sorted best-first according to ``_SORT_KEYS``."""
    if frame.empty:
        return frame

    helper_columns = []
    for source, targets in _PAIR_COLUMNS.items():
        # The composite cells look like "49.9 / 0": split on "/" and strip the
        # surrounding blanks before converting each half to a number.
        parts = frame[source].str.split("/", expand=True)
        for position, target in enumerate(targets):
            # to_numpy() assigns by position, so the repeated index labels
            # (the organization names) never take part in any alignment.
            frame[target] = pd.to_numeric(parts[position].str.strip()).to_numpy()
            helper_columns.append(target)

    ranked = frame.sort_values(by=_SORT_KEYS, ascending=False)
    return ranked.drop(columns=helper_columns)


def build_placeholder_frames(splits, data_dir="leaderboard_data"):
    """Load one ranked leaderboard table per split.

    Each split is read from ``<data_dir>/<split>.csv`` and sorted in
    descending order by FPR, C-LPR, EPR (macro), LPR (macro), EPR (micro),
    LPR (micro) and DR. A split without a CSV file gets an empty table with
    the standard columns instead of raising, so one missing file does not
    take down the whole page.

    Args:
        splits: iterable of split names.
        data_dir: directory containing the per-split CSV files.

    Returns:
        dict mapping each split name to a ``pandas.DataFrame``.
    """
    frames = {}
    for split in splits:
        try:
            # NOTE(review): index_col=0 turns the first CSV column
            # ("Organization") into the index; kept as in the original --
            # confirm this is the intended presentation.
            frame = pd.read_csv(f"{data_dir}/{split}.csv", index_col=0)
        except FileNotFoundError:
            frames[split] = _empty_leaderboard()
            continue
        frames[split] = _rank_leaderboard(frame)
    return frames
|
eval_exp.py
CHANGED
|
@@ -15,7 +15,7 @@ from chinatravel.symbol_verification.commonsense_constraint import (
|
|
| 15 |
)
|
| 16 |
from chinatravel.symbol_verification.hard_constraint import evaluate_constraints_py
|
| 17 |
|
| 18 |
-
os.environ["HF_DATASETS_OFFLINE"] = "1"
|
| 19 |
|
| 20 |
|
| 21 |
def load_result(result_dir, query_index):
|
|
|
|
| 15 |
)
|
| 16 |
from chinatravel.symbol_verification.hard_constraint import evaluate_constraints_py
|
| 17 |
|
| 18 |
+
# os.environ["HF_DATASETS_OFFLINE"] = "1"
|
| 19 |
|
| 20 |
|
| 21 |
def load_result(result_dir, query_index):
|
leaderboard_data/easy.csv
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Organization,Method,Model,DR,EPR(Micro/Macro),LPR(Micro/Macro),C-LPR,FPR
|
| 2 |
+
NJU,Act,DeepSeek,70.4,49.9 / 0,64.6 / 30.6,0,0
|
| 3 |
+
NJU,Act,GPT,97.5,70.8 / 0,86.8 / 68.6,0,0
|
| 4 |
+
NJU,ReAct (zero-shot),DeepSeek,43.3,40.8 / 0,41.9 / 19.6,0,0
|
| 5 |
+
NJU,ReAct (zero-shot),GPT,95.4,48.2 / 0,71.3 / 33.0,0,0
|
| 6 |
+
NJU,ReAct (one-shot),DeepSeek,77.5,68.3 / 6.00,74.1 / 52.3,5.77,5.33
|
| 7 |
+
NJU,ReAct (one-shot),GPT,94.2,68.1 / 0,89.4 / 70.6,0,0
|
| 8 |
+
NJU,NeSy Planning,DeepSeek,75.3,75.3 / 75.3,70.4 / 52.6,70.4,52.6
|
| 9 |
+
NJU,NeSy Planning,GPT,75.0,73.6 / 64.0,73.5 / 63.3,61.7,60.6
|
| 10 |
+
NJU,NeSy Planning,Qwen3-8B,72.3,67.0 / 34.0,70.4 / 49.6,32.6,28.3
|
| 11 |
+
NJU,NeSy Planning,Llama3.1-8B,32.0,31.9 / 31.3,29.1 / 21.0,28.3,21.0
|
| 12 |
+
NJU,NeSy Planning,Mistral-7B,30.3,30.3 / 30.3,27.6 / 19.6,27.6,19.6
|
| 13 |
+
NJU,TTG (oracle),DeepSeek,18.3,21.5 / 8.66,17.2 / 15.0,8.23,8.66
|
| 14 |
+
NJU,LLM-Modulo*,DeepSeek,48.3,94.5 / 4.33,58.4 / 43.6,4.11,4.33
|
| 15 |
+
NJU,LLM-Modulo*,GPT,91.6,88.2 / 7.66,95.5 / 84.6,7.66,7.00
|
| 16 |
+
NJU,LLM-Modulo*,Qwen3-8B,30.0,80.5 / 0.0,62.7 / 25.0,0.0,0.0
|
| 17 |
+
NJU,LLM-Modulo*,Llama3.1-8B,28.6,69.4 / 0.0,55.2 / 8.33,0.0,0.0
|
| 18 |
+
NJU,LLM-Modulo*,Mistral-7B,10.3,90.5 / 0.0,39.1 / 9.0,0.0,0.0
|
| 19 |
+
NJU,NeSy Planning*,DeepSeek,82.6,81.7 / 75.0,82.2 / 75.3,75.0,74.0
|
| 20 |
+
NJU,NeSy Planning*,GPT,66.6,66.7 / 66.0,64.6 / 63.6,64.6,62.6
|
| 21 |
+
NJU,NeSy Planning*,Qwen3-8B,69.3,69.3 / 59.3,70.2 / 59.6,59.3,57.9
|
| 22 |
+
NJU,NeSy Planning*,Mistral-7B,52.6,52.6 / 52.6,50.4 / 45.3,50.4,45.6
|
| 23 |
+
NJU,NeSy Planning*,Llama3.1-8B,33.3,33.2 / 32.6,32.1 / 32.0,31.4,32.3
|
leaderboard_data/human.csv
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Organization,Method,Model,DR,EPR(Micro/Macro),LPR(Micro/Macro),C-LPR,FPR
|
| 2 |
+
NJU,ReAct (zero-shot),DeepSeek,36.4,29.5 / 0.65,35.2 / 16.2,0.38,0
|
| 3 |
+
NJU,ReAct (zero-shot),GPT,96.1,50.5 / 0,72.4 / 32.5,0,0
|
| 4 |
+
NJU,ReAct (one-shot),DeepSeek,55.2,57.3 / 2.59,64.6 / 44.2,1.71,2.59
|
| 5 |
+
NJU,ReAct (one-shot),GPT,69.5,46.3 / 0,63.6 / 46.8,0,0
|
| 6 |
+
NJU,NeSy Planning,DeepSeek,51.9,53.2 / 52.5,47.0 / 37.6,46.5,37.0
|
| 7 |
+
NJU,NeSy Planning,GPT,45.4,50.1 / 45.4,40.9 / 29.8,38.5,27.9
|
| 8 |
+
NJU,NeSy Planning,Qwen3-8B,42.8,47.4 / 42.2,36.2 / 27.2,34.4,25.3
|
| 9 |
+
NJU,NeSy Planning,Llama3.1-8B,25.9,25.8 / 24.0,22.3 / 12.3,20.5,11.0
|
| 10 |
+
NJU,NeSy Planning,Mistral-7B,37.6,38.2 / 37.6,32.7 / 18.8,32.2,18.8
|
| 11 |
+
NJU,TTG (oracle),DeepSeek,9.09,12.8 / 2.59,7.65 / 5.19,2.39,1.29
|
| 12 |
+
NJU,LLM-Modulo*,DeepSeek,61.6,90.2 / 2.59,75.9 / 51.2,2.75,2.59
|
| 13 |
+
NJU,LLM-Modulo*,GPT,91.5,87.2 / 3.24,92.9 / 66.2,2.87,3.24
|
| 14 |
+
NJU,LLM-Modulo*,Qwen3-8B,35.0,75.3 / 0.0,61.6 / 19.4,0.0,0.0
|
| 15 |
+
NJU,LLM-Modulo*,Llama3.1-8B,19.4,74.1 / 0.0,43.4 / 5.19,0.0,0.0
|
| 16 |
+
NJU,LLM-Modulo*,Mistral-7B,3.24,92.2 / 0.0,31.4 / 4.54,0.0,0.0
|
| 17 |
+
NJU,NeSy Planning*,DeepSeek,58.4,59.6 / 57.7,53.8 / 46.1,52.0,45.4
|
| 18 |
+
NJU,NeSy Planning*,GPT,52.6,46.9 / 42.9,47.6 / 40.9,43.9,40.9
|
| 19 |
+
NJU,NeSy Planning*,Qwen3-8B,53.2,55.1 / 54.5,48.0 / 42.8,47.6,40.9
|
| 20 |
+
NJU,NeSy Planning*,Mistral-7B,40.9,42.8 / 42.8,37.7 / 28.5,37.7,27.9
|
| 21 |
+
NJU,NeSy Planning*,Llama3.1-8B,29.2,29.1 / 26.6,25.4 / 20.1,23.4,19.4
|
leaderboard_data/human1000.csv
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Organization,Method,Model,DR,EPR(Micro/Macro),LPR(Micro/Macro),C-LPR,FPR
|
| 2 |
+
NJU,NeSy Planning,DeepSeek,44.6,44.5 / 42.6,38.7 / 23.3,37.6,23.3
|
| 3 |
+
NJU,NeSy Planning,GPT,37.3,37.2 / 35.0,30.7 / 11.3,29.2,11.3
|
| 4 |
+
NJU,NeSy Planning,Qwen3-8B,36.6,36.5 / 34.6,29.6 / 6.43,28.5,6.43
|
| 5 |
+
NJU,NeSy Planning*,DeepSeek,60.6,60.3 / 59.0,53.6 / 32.0,52.5,31.6
|
| 6 |
+
NJU,NeSy Planning*,GPT,27.8,27.8 / 27.1,24.8 / 12.8,24.4,12.8
|
| 7 |
+
NJU,NeSy Planning*,Qwen3-8B,41.1,41.1 / 40.6,34.6 / 13.8,34.2,13.8
|
leaderboard_data/lb_all/easy.csv
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Organization,Method,Model,DR,EPR(Micro/Macro),LPR(Micro/Macro),C-LPR,FPR
|
| 2 |
+
NJU,Act,DeepSeek,70.4,49.9 / 0,64.6 / 30.6,0,0
|
| 3 |
+
NJU,Act,GPT,97.5,70.8 / 0,86.8 / 68.6,0,0
|
| 4 |
+
NJU,ReAct (zero-shot),DeepSeek,43.3,40.8 / 0,41.9 / 19.6,0,0
|
| 5 |
+
NJU,ReAct (zero-shot),GPT,95.4,48.2 / 0,71.3 / 33.0,0,0
|
| 6 |
+
NJU,ReAct (one-shot),DeepSeek,77.5,68.3 / 6.00,74.1 / 52.3,5.77,5.33
|
| 7 |
+
NJU,ReAct (one-shot),GPT,94.2,68.1 / 0,89.4 / 70.6,0,0
|
| 8 |
+
NJU,NeSy Planning,DeepSeek,75.3,75.3 / 75.3,70.4 / 52.6,70.4,52.6
|
| 9 |
+
NJU,NeSy Planning,GPT,75.0,73.6 / 64.0,73.5 / 63.3,61.7,60.6
|
| 10 |
+
NJU,NeSy Planning,Qwen3-8B,72.3,67.0 / 34.0,70.4 / 49.6,32.6,28.3
|
| 11 |
+
NJU,NeSy Planning,Llama3.1-8B,32.0,31.9 / 31.3,29.1 / 21.0,28.3,21.0
|
| 12 |
+
NJU,NeSy Planning,Mistral-7B,30.3,30.3 / 30.3,27.6 / 19.6,27.6,19.6
|
| 13 |
+
NJU,TTG (oracle),DeepSeek,18.3,21.5 / 8.66,17.2 / 15.0,8.23,8.66
|
| 14 |
+
NJU,LLM-Modulo*,DeepSeek,48.3,94.5 / 4.33,58.4 / 43.6,4.11,4.33
|
| 15 |
+
NJU,LLM-Modulo*,GPT,91.6,88.2 / 7.66,95.5 / 84.6,7.66,7.00
|
| 16 |
+
NJU,LLM-Modulo*,Qwen3-8B,30.0,80.5 / 0.0,62.7 / 25.0,0.0,0.0
|
| 17 |
+
NJU,LLM-Modulo*,Llama3.1-8B,28.6,69.4 / 0.0,55.2 / 8.33,0.0,0.0
|
| 18 |
+
NJU,LLM-Modulo*,Mistral-7B,10.3,90.5 / 0.0,39.1 / 9.0,0.0,0.0
|
| 19 |
+
NJU,NeSy Planning*,DeepSeek,82.6,81.7 / 75.0,82.2 / 75.3,75.0,74.0
|
| 20 |
+
NJU,NeSy Planning*,GPT,66.6,66.7 / 66.0,64.6 / 63.6,64.6,62.6
|
| 21 |
+
NJU,NeSy Planning*,Qwen3-8B,69.3,69.3 / 59.3,70.2 / 59.6,59.3,57.9
|
| 22 |
+
NJU,NeSy Planning*,Mistral-7B,52.6,52.6 / 52.6,50.4 / 45.3,50.4,45.6
|
| 23 |
+
NJU,NeSy Planning*,Llama3.1-8B,33.3,33.2 / 32.6,32.1 / 32.0,31.4,32.3
|
leaderboard_data/lb_all/human.csv
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Organization,Method,Model,DR,EPR(Micro/Macro),LPR(Micro/Macro),C-LPR,FPR
|
| 2 |
+
NJU,ReAct (zero-shot),DeepSeek,36.4,29.5 / 0.65,35.2 / 16.2,0.38,0
|
| 3 |
+
NJU,ReAct (zero-shot),GPT,96.1,50.5 / 0,72.4 / 32.5,0,0
|
| 4 |
+
NJU,ReAct (one-shot),DeepSeek,55.2,57.3 / 2.59,64.6 / 44.2,1.71,2.59
|
| 5 |
+
NJU,ReAct (one-shot),GPT,69.5,46.3 / 0,63.6 / 46.8,0,0
|
| 6 |
+
NJU,NeSy Planning,DeepSeek,51.9,53.2 / 52.5,47.0 / 37.6,46.5,37.0
|
| 7 |
+
NJU,NeSy Planning,GPT,45.4,50.1 / 45.4,40.9 / 29.8,38.5,27.9
|
| 8 |
+
NJU,NeSy Planning,Qwen3-8B,42.8,47.4 / 42.2,36.2 / 27.2,34.4,25.3
|
| 9 |
+
NJU,NeSy Planning,Llama3.1-8B,25.9,25.8 / 24.0,22.3 / 12.3,20.5,11.0
|
| 10 |
+
NJU,NeSy Planning,Mistral-7B,37.6,38.2 / 37.6,32.7 / 18.8,32.2,18.8
|
| 11 |
+
NJU,TTG (oracle),DeepSeek,9.09,12.8 / 2.59,7.65 / 5.19,2.39,1.29
|
| 12 |
+
NJU,LLM-Modulo*,DeepSeek,61.6,90.2 / 2.59,75.9 / 51.2,2.75,2.59
|
| 13 |
+
NJU,LLM-Modulo*,GPT,91.5,87.2 / 3.24,92.9 / 66.2,2.87,3.24
|
| 14 |
+
NJU,LLM-Modulo*,Qwen3-8B,35.0,75.3 / 0.0,61.6 / 19.4,0.0,0.0
|
| 15 |
+
NJU,LLM-Modulo*,Llama3.1-8B,19.4,74.1 / 0.0,43.4 / 5.19,0.0,0.0
|
| 16 |
+
NJU,LLM-Modulo*,Mistral-7B,3.24,92.2 / 0.0,31.4 / 4.54,0.0,0.0
|
| 17 |
+
NJU,NeSy Planning*,DeepSeek,58.4,59.6 / 57.7,53.8 / 46.1,52.0,45.4
|
| 18 |
+
NJU,NeSy Planning*,GPT,52.6,46.9 / 42.9,47.6 / 40.9,43.9,40.9
|
| 19 |
+
NJU,NeSy Planning*,Qwen3-8B,53.2,55.1 / 54.5,48.0 / 42.8,47.6,40.9
|
| 20 |
+
NJU,NeSy Planning*,Mistral-7B,40.9,42.8 / 42.8,37.7 / 28.5,37.7,27.9
|
| 21 |
+
NJU,NeSy Planning*,Llama3.1-8B,29.2,29.1 / 26.6,25.4 / 20.1,23.4,19.4
|
leaderboard_data/lb_all/human1000.csv
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Organization,Method,Model,DR,EPR(Micro/Macro),LPR(Micro/Macro),C-LPR,FPR
|
| 2 |
+
NJU,NeSy Planning,DeepSeek,44.6,44.5 / 42.6,38.7 / 23.3,37.6,23.3
|
| 3 |
+
NJU,NeSy Planning,GPT,37.3,37.2 / 35.0,30.7 / 11.3,29.2,11.3
|
| 4 |
+
NJU,NeSy Planning,Qwen3-8B,36.6,36.5 / 34.6,29.6 / 6.43,28.5,6.43
|
| 5 |
+
NJU,NeSy Planning*,DeepSeek,60.6,60.3 / 59.0,53.6 / 32.0,52.5,31.6
|
| 6 |
+
NJU,NeSy Planning*,GPT,27.8,27.8 / 27.1,24.8 / 12.8,24.4,12.8
|
| 7 |
+
NJU,NeSy Planning*,Qwen3-8B,41.1,41.1 / 40.6,34.6 / 13.8,34.2,13.8
|