Spaces:

LAMDA-NeSy
/

ChinaTravel

Build error

App Files Files Community

博闻 committed on Feb 4

Commit

4b8f4b4

1 Parent(s): 66d686a

update leaderboard

Browse files

Files changed (14) hide show

.gitattributes +1 -1
app.py +44 -144
chinatravel/data/load_datasets.py +18 -10
chinatravel/ui/__init__.py +0 -0
chinatravel/ui/content.py +24 -0
chinatravel/ui/eval_runner.py +125 -0
chinatravel/ui/leaderboard.py +44 -0
eval_exp.py +1 -1
leaderboard_data/easy.csv +23 -0
leaderboard_data/human.csv +21 -0
leaderboard_data/human1000.csv +7 -0
leaderboard_data/lb_all/easy.csv +23 -0
leaderboard_data/lb_all/human.csv +21 -0
leaderboard_data/lb_all/human1000.csv +7 -0

.gitattributes CHANGED Viewed

@@ -32,4 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.xz filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text

 *.xz filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

app.py CHANGED Viewed

@@ -1,165 +1,65 @@
-import os
-import time
-import json
-import shutil
-import zipfile
 import gradio as gr
-from eval_exp import evaluate
-from datetime import datetime
-from apscheduler.schedulers.background import BackgroundScheduler
-def load_splits():
-    splits_dir = "chinatravel/evaluation/default_splits"
-    splits = []
-    for filename in os.listdir(splits_dir):
-        if filename.endswith(".txt"):
-            splits.append(filename.replace(".txt", ""))
-    return splits
-SPLITS_LIST = load_splits()
-# SUBMIT_DIR = "./submissions"
-# OUTPUT_DIR = "./outputs"
-SUBMIT_DIR = os.path.abspath("submissions")
-OUTPUT_DIR = os.path.abspath("outputs")
-shutil.rmtree(SUBMIT_DIR, ignore_errors=True)
-shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
-os.makedirs(SUBMIT_DIR, exist_ok=True)
-os.makedirs(OUTPUT_DIR, exist_ok=True)
-print(f"Submission directory: {SUBMIT_DIR}")
-print(f"Output directory: {OUTPUT_DIR}")
-# clear directories if they already exist
-def clean_old_outputs(folder, keep_hours=24):
-    now = time.time()
-    for fname in os.listdir(folder):
-        fpath = os.path.join(folder, fname)
-        if os.path.isfile(fpath) and now - os.path.getmtime(fpath) > keep_hours * 3600:
-            os.remove(fpath)
-scheduler = BackgroundScheduler()
-scheduler.add_job(lambda: clean_old_outputs(OUTPUT_DIR), "interval", hours=6)
-scheduler.start()
-class Arguments:
-    def __init__(self, splits, result_dir):
-        self.splits = splits
-        self.result_dir = result_dir
-def handle_submission(zip_file, dataset_choice):
-    if zip_file is None:
-        # yield "❌ 请上传 zip 文件！", 0, 0, 0, None
-        yield "❌ Please upload a zip file!", 0, 0, 0, None
-        return
-    shutil.rmtree(SUBMIT_DIR, ignore_errors=True)
-    os.makedirs(SUBMIT_DIR, exist_ok=True)
-    # 解压操作
-    with zipfile.ZipFile(zip_file, "r") as zip_ref:
-        print(f"Extracting {zip_file} to {SUBMIT_DIR}...")
-        zip_ref.extractall(SUBMIT_DIR)
-    # 获取当前时间戳
-    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-    print(f"Submission dir: {SUBMIT_DIR}")
-    # 如果 SUBMIT_DIR 下只有一个子目录，就使用这个子目录，否则使用 SUBMIT_DIR 本身
-    subdirs = [d for d in os.listdir(SUBMIT_DIR) if os.path.isdir(os.path.join(SUBMIT_DIR, d))]
-    if len(subdirs) == 1:
-        unzipped_dir = os.path.join(SUBMIT_DIR, subdirs[0])
-    else:
-        unzipped_dir = SUBMIT_DIR  # 没有子目录或有多个子目录时使用 SUBMIT_DIR
-    print(f"Unzipped directory: {unzipped_dir}")
-    output_path = os.path.join(OUTPUT_DIR, f"result_main_{timestamp}.json")
-    args = Arguments(splits=dataset_choice, result_dir=unzipped_dir)
-    try:
-        # yield "🚀 开始测评...", 0, 0, 0, None
-        yield "🚀 Starting evaluation...", 0, 0, 0, None
-        result = {}
-        for progress in evaluate(args, result):
-            stage = progress.get("stage", "")
-            progress_value = progress.get("progress", 0)
-            if stage == "schema":
-                # yield "Schema 阶段测评中...", progress_value, 0, 0, None
-                yield "Schema evaluation in progress...", 100, progress_value, 0, None
-            elif stage == "commonsense":
-                # yield "Commonsense 阶段测评中...", 100, progress_value, 0, None
-                yield "Commonsense evaluation in progress...", 100, 100, progress_value, None
-            elif stage == "logic":
-                # yield "Logic 阶段测评中...", 100, 100, progress_value, None
-                yield "Logic evaluation in progress...", 100, 100, 100, None
-            elif stage == "final":
-                result.update(progress.get("result", {}))
-                # yield "测评完成，正在保存结果...", 100, 100, 100, None
-                yield "Evaluation completed, saving results...", 100, 100, 100, None
-        # 保存结果到文件
-        with open(output_path, "w", encoding="utf-8") as f:
-            json.dump(result, f, ensure_ascii=False, indent=4)
-        # 在测评完成后更新结果文件的值和可见性
-        result_file.value = output_path
-        result_file.visible = True
-        # yield "✅ 测评完成！", 100, 100, 100, output_path
-        yield "✅ Evaluation completed!", 100, 100, 100, output_path
-    except Exception as e:
-        import traceback
-        traceback.print_exc()
-        # yield f"❌ 测评异常：{e}", 0, 0, 0, None
-        yield f"❌ Evaluation error: {e}", 0, 0, 0, None
-with gr.Blocks() as demo:
-    # gr.Markdown("# 📊 ChinaTravel 模型测评")
-    gr.Markdown(
-        "# 📊 ChinaTravel Benchmark Evaluation"
-    )
-    # with gr.Row():
-        # zip_input = gr.File(label="上传模型预测 zip 文件", file_types=[".zip"])
-        # dataset_choice = gr.Radio(
-        #     SPLITS_LIST, label="选择评估数据集", value="validation"
-        # )
-    zip_input = gr.File(label="Upload zip file of results", file_types=[".zip"])
     dataset_choice = gr.Radio(
-        SPLITS_LIST, label="Select evaluation dataset", value="validation"
     )
-    # submit_btn = gr.Button("开始测评")
-    submit_btn = gr.Button("Start Evaluation")
-    schema_progress = gr.Slider(
-        label="Schema Stage Progress", minimum=0, maximum=100, value=0, interactive=False
     )
-    commonsense_progress = gr.Slider(
-        label="Commonsense Stage Progress", minimum=0, maximum=100, value=0, interactive=False
     )
-    logic_progress = gr.Slider(
-        label="Logic Stage Progress", minimum=0, maximum=100, value=0, interactive=False
     )
-    output_msg = gr.Markdown()
-    # result_file = gr.File(label="结果文件下载")  # , visible=False)
-    result_file = gr.File(label="Result File Download")
     submit_btn.click(
         handle_submission,
         inputs=[zip_input, dataset_choice],
         outputs=[
-            output_msg,
             schema_progress,
             commonsense_progress,
             logic_progress,

 import gradio as gr
+from chinatravel.ui import content
+from chinatravel.ui.eval_runner import DEFAULT_SPLIT, SPLITS_LIST, handle_submission
+from chinatravel.ui.leaderboard import build_placeholder_frames
+default_split_value = DEFAULT_SPLIT or (SPLITS_LIST[0] if SPLITS_LIST else None)
+leaderboard_frames = build_placeholder_frames(SPLITS_LIST)
+with gr.Blocks(title="ChinaTravel Benchmark Evaluation") as demo:
+    gr.HTML(content.TITLE_HTML)
+    gr.Markdown(content.INTRO_MARKDOWN)
+    gr.Markdown(content.SUBMISSION_GUIDE)
+    gr.Markdown("### Leaderboard")
+    gr.Markdown("Methods marked with \* leverage Oracle DSL or an Oracle Verifier.")
+    if SPLITS_LIST:
+        with gr.Tabs():
+            for split in SPLITS_LIST:
+                with gr.Tab(split):
+                    gr.Dataframe(
+                        value=leaderboard_frames.get(split),
+                        interactive=False,
+                        wrap=True,
+                    )
     dataset_choice = gr.Radio(
+        SPLITS_LIST,
+        label="Select evaluation split",
+        value=default_split_value,
+        interactive=True,
     )
+    zip_input = gr.File(label="Upload result archive (.zip)", file_types=[".zip"])
+    submit_btn = gr.Button("Run evaluation", variant="primary")
+    status = gr.Markdown("Ready to evaluate.")
+    schema_progress = gr.Textbox(
+        label="Schema progress",
+        value="0%",
+        interactive=False,
     )
+    commonsense_progress = gr.Textbox(
+        label="Commonsense progress",
+        value="0%",
+        interactive=False,
     )
+    logic_progress = gr.Textbox(
+        label="Logic progress",
+        value="0%",
+        interactive=False,
     )
+    result_file = gr.File(label="Download evaluation report", interactive=False)
+    gr.Markdown(content.CONTACT)
     submit_btn.click(
         handle_submission,
         inputs=[zip_input, dataset_choice],
         outputs=[
+            status,
             schema_progress,
             commonsense_progress,
             logic_progress,

chinatravel/data/load_datasets.py CHANGED Viewed

@@ -84,13 +84,21 @@ def save_json_file(json_data, file_path):
         json.dump(json_data, dump_f, ensure_ascii=False, indent=4, cls=NpEncoder)
 def load_query(args):
-    if not args.splits in ["easy", "medium", "human", "preference_base50",
-                           "preference0_base50", "preference1_base50", "preference2_base50",
-                           "preference3_base50", "preference4_base50", "preference5_base50",
-                           "human1000"]:
         return load_query_local(args)
     config_name = "synthetic"
     if args.splits in ["preference0_base50", "preference1_base50", "preference2_base50",
@@ -100,13 +108,14 @@ def load_query(args):
         config_name = "validation"
     elif args.splits in ["human1000"]:
         config_name = "test"
-    query_data = hg_load_dataset("LAMDA-NeSy/chinatravel_test", name=config_name)[args.splits].to_list()
     for data_i in query_data:
         if "hard_logic_py" in data_i:
             data_i["hard_logic_py"] = ast.literal_eval(data_i["hard_logic_py"])
     query_id_list = [data_i["uid"] for data_i in query_data]
     data_dict = {}
     for data_i in query_data:
@@ -140,4 +149,3 @@ if __name__ == "__main__":
             print(uid, query_data[uid])
         else:
             raise ValueError(f"{uid} not in query_data")

         json.dump(json_data, dump_f, ensure_ascii=False, indent=4, cls=NpEncoder)
 def load_query(args):
+    if not args.splits in [
+        "easy",
+        "medium",
+        "human",
+        "human1000",
+        "preference_base50",
+        "preference0_base50",
+        "preference1_base50",
+        "preference2_base50",
+        "preference3_base50",
+        "preference4_base50",
+        "preference5_base50",
+    ]:
         return load_query_local(args)
     config_name = "synthetic"
     if args.splits in ["preference0_base50", "preference1_base50", "preference2_base50",
         config_name = "validation"
     elif args.splits in ["human1000"]:
         config_name = "test"
+    query_data = hg_load_dataset("LAMDA-NeSy/chinatravel_test", name=config_name)[
+        args.splits
+    ].to_list()
     for data_i in query_data:
         if "hard_logic_py" in data_i:
             data_i["hard_logic_py"] = ast.literal_eval(data_i["hard_logic_py"])
     query_id_list = [data_i["uid"] for data_i in query_data]
     data_dict = {}
     for data_i in query_data:
             print(uid, query_data[uid])
         else:
             raise ValueError(f"{uid} not in query_data")

chinatravel/ui/__init__.py ADDED Viewed

File without changes

chinatravel/ui/content.py ADDED Viewed

	@@ -0,0 +1,24 @@

# Static text for the evaluation page. app.py renders TITLE_HTML through
# gr.HTML and the remaining constants through gr.Markdown.

# Centered page heading (raw HTML).
TITLE_HTML = """
<h1 style=\"text-align:center; margin-bottom: 0.25rem;\">ChinaTravel Benchmark Evaluation</h1>
"""

# One-paragraph description of the benchmark with a link to the paper.
INTRO_MARKDOWN = """
ChinaTravel is an open-ended travel planning benchmark with compositional constraint validation for language agents. (See our [paper](https://arxiv.org/abs/2412.13682) for more details.)
"""

# Submission instructions. A blank line separates every bold heading from the
# list before it; without it Markdown treats the heading as a continuation of
# the last bullet. E-mail links carry the "mailto:" scheme, because a bare
# address used as a link target is resolved as a relative URL.
SUBMISSION_GUIDE = """
**How to submit**

- Pick a split. The split determines which query UIDs are expected.
- Upload a `.zip` that contains one JSON file per UID. The evaluator recursively scans subfolders, so any directory layout is acceptable.
- Each JSON must follow the target schema: see [chinatravel/evaluation/output_schema.json](chinatravel/evaluation/output_schema.json).
- You can dry-run locally via `python eval_exp.py --splits <split> --method <your_method>` to mirror the hosted evaluation.

**Output**

- We compute DR (schema pass rate), EPR_micro/EPR_macro (commonsense), LPR_micro/LPR_macro/C-LPR (logic), and FPR (all-pass rate).
- A detailed JSON report is produced for download after evaluation.

**Contact**

- If you are interested in showing your results on our leaderboard or have any questions, please contact [Jie-Jing Shao](mailto:shaojj@lamda.nju.edu.cn), [Bo-Wen Zhang](mailto:221900200@smail.nju.edu.cn), [Xiao-Wen Yang](mailto:yangxw@lamda.nju.edu.cn).
"""

# Short contact line shown at the bottom of the page.
# NOTE(review): the first address differs from the one listed for Bo-Wen Zhang
# in SUBMISSION_GUIDE -- confirm which one is current.
CONTACT = "Contact: zbw@smail.nju.edu.cn, shaojj@lamda.nju.edu.cn"

chinatravel/ui/eval_runner.py ADDED Viewed

	@@ -0,0 +1,125 @@

+import os
+import time
+import json
+import shutil
+import zipfile
+from datetime import datetime
+from apscheduler.schedulers.background import BackgroundScheduler
+from eval_exp import evaluate
+SUBMIT_DIR = os.path.abspath("submissions")
+OUTPUT_DIR = os.path.abspath("outputs")
+_CLEAN_INTERVAL_HOURS = 6
+_KEEP_HOURS = 24
def load_splits():
    """Return the sorted names of the evaluation splits found on disk.

    A split is any ``*.txt`` file directly inside
    ``chinatravel/evaluation/default_splits`` (resolved against the current
    working directory). Its name is the file name without the trailing
    ``.txt``.

    Returns:
        list[str]: Split names in ascending order, or an empty list when the
        directory does not exist.
    """
    splits_dir = os.path.join("chinatravel", "evaluation", "default_splits")
    if not os.path.isdir(splits_dir):
        return []
    suffix = ".txt"
    # Slice the suffix off rather than using str.replace(), which would also
    # delete a ".txt" that appears in the middle of a file name.
    return sorted(
        filename[: -len(suffix)]
        for filename in os.listdir(splits_dir)
        if filename.endswith(suffix)
    )
+SPLITS_LIST = load_splits()
+DEFAULT_SPLIT = (
+    "easy" if "easy" in SPLITS_LIST else (SPLITS_LIST[0] if SPLITS_LIST else None)
+)
+class Arguments:
+    def __init__(self, splits, result_dir):
+        self.splits = splits
+        self.result_dir = result_dir
+def _reset_workdirs():
+    shutil.rmtree(SUBMIT_DIR, ignore_errors=True)
+    shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
+    os.makedirs(SUBMIT_DIR, exist_ok=True)
+    os.makedirs(OUTPUT_DIR, exist_ok=True)
+def clean_old_outputs(folder, keep_hours=_KEEP_HOURS):
+    now = time.time()
+    for fname in os.listdir(folder):
+        fpath = os.path.join(folder, fname)
+        if os.path.isfile(fpath) and now - os.path.getmtime(fpath) > keep_hours * 3600:
+            os.remove(fpath)
+_reset_workdirs()
+_scheduler = BackgroundScheduler(daemon=True)
+_scheduler.add_job(
+    lambda: clean_old_outputs(OUTPUT_DIR), "interval", hours=_CLEAN_INTERVAL_HOURS
+)
+_scheduler.start()
+def _extract_submission(zip_file):
+    zip_path = getattr(zip_file, "name", zip_file)
+    with zipfile.ZipFile(zip_path, "r") as zip_ref:
+        zip_ref.extractall(SUBMIT_DIR)
+def _locate_unzipped_root():
+    subdirs = [
+        d for d in os.listdir(SUBMIT_DIR) if os.path.isdir(os.path.join(SUBMIT_DIR, d))
+    ]
+    if len(subdirs) == 1:
+        return os.path.join(SUBMIT_DIR, subdirs[0])
+    return SUBMIT_DIR
def handle_submission(zip_file, dataset_choice):
    """Evaluate an uploaded prediction archive, streaming progress to the UI.

    This is a generator. Every yielded item is a 5-tuple
    ``(status, schema, commonsense, logic, report)``: a status message, three
    progress strings formatted as ``"<int>%"``, and the path of the JSON
    report (``None`` until the evaluation has finished successfully).

    Args:
        zip_file: The uploaded archive, either a path or an object exposing
            the path as ``.name``; ``None`` when nothing was uploaded.
        dataset_choice: Name of the split to evaluate against.

    Yields:
        tuple: Progress updates as described above. Validation failures and
        evaluation errors are reported through the status message instead of
        being raised.
    """
    # The progress widgets receive "N%" strings everywhere else, so the
    # validation errors report "0%" too rather than a bare integer.
    if zip_file is None:
        yield "❌ Please upload a zip file with your predictions.", "0%", "0%", "0%", None
        return
    if not dataset_choice:
        yield "❌ Please choose an evaluation split.", "0%", "0%", "0%", None
        return

    # Only the extraction directory is reset per submission. Earlier reports
    # in OUTPUT_DIR are left for the scheduled clean_old_outputs job, so a
    # report produced by a previous run stays downloadable.
    shutil.rmtree(SUBMIT_DIR, ignore_errors=True)
    os.makedirs(SUBMIT_DIR, exist_ok=True)
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    try:
        yield "🚀 Starting evaluation...", "0%", "0%", "0%", None

        _extract_submission(zip_file)
        unzipped_dir = _locate_unzipped_root()

        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_path = os.path.join(OUTPUT_DIR, f"result_main_{timestamp}.json")

        args = Arguments(splits=dataset_choice, result_dir=unzipped_dir)
        schema_progress = commonsense_progress = logic_progress = 0
        result = {}

        # evaluate() reports dicts carrying a "stage" name and, depending on
        # the stage, a "progress" number or the final "result" mapping.
        for progress in evaluate(args, result):
            stage = progress.get("stage", "")
            value = int(round(progress.get("progress", 0)))

            if stage == "schema":
                schema_progress = value
                yield "Schema evaluation in progress...", f"{schema_progress}%", f"{commonsense_progress}%", f"{logic_progress}%", None
            elif stage == "commonsense":
                commonsense_progress = value
                yield "Commonsense evaluation in progress...", "100%", f"{commonsense_progress}%", f"{logic_progress}%", None
            elif stage == "logic":
                logic_progress = value
                yield "Logic evaluation in progress...", "100%", "100%", f"{logic_progress}%", None
            elif stage == "final":
                result.update(progress.get("result", {}))
                yield "Saving results...", "100%", "100%", "100%", None

        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(result, f, ensure_ascii=False, indent=4)

        filename = os.path.basename(output_path)
        yield f"✅ Evaluation completed. Report saved to {filename}.", "100%", "100%", "100%", output_path
    except Exception as exc:  # pragma: no cover - surfaced to UI
        # Top-level UI boundary: log the traceback server-side and show the
        # message to the user instead of crashing the event handler.
        import traceback

        traceback.print_exc()
        yield f"❌ Evaluation error: {exc}", "0%", "0%", "0%", None

chinatravel/ui/leaderboard.py ADDED Viewed

	@@ -0,0 +1,44 @@

+import pandas as pd
# Column layout of a leaderboard CSV. The first column is read as the index.
PLACEHOLDER_COLUMNS = [
    "Organization",
    "Method",
    "Model",
    "DR",
    "EPR(Micro/Macro)",
    "LPR(Micro/Macro)",
    "C-LPR",
    "FPR",
]

# Ranking order, most significant key first; all keys sort descending.
_RANKING_KEYS = [
    "FPR",
    "C-LPR",
    "EPR_Macro",
    "LPR_Macro",
    "EPR_Micro",
    "LPR_Micro",
    "DR",
]

# Numeric columns that exist only while sorting and are dropped afterwards.
_HELPER_COLUMNS = ["EPR_Micro", "EPR_Macro", "LPR_Micro", "LPR_Macro"]


def build_placeholder_frames(splits):
    """Load and rank the leaderboard table of every split.

    For each split, ``leaderboard_data/<split>.csv`` (relative to the current
    working directory) is read with its first column as the index and sorted
    in descending order by FPR, C-LPR, EPR macro, LPR macro, EPR micro,
    LPR micro and DR.

    Args:
        splits: Iterable of split names.

    Returns:
        dict: Maps each split name to its ``pandas.DataFrame``. A split
        without a CSV file maps to an empty frame with the same column
        layout, so a missing leaderboard does not raise.
    """
    frames = {}
    for split in splits:
        try:
            frame = pd.read_csv(f"leaderboard_data/{split}.csv", index_col=0)
        except FileNotFoundError:
            # No published results for this split yet: show an empty table
            # shaped like a loaded one (first column as index).
            frames[split] = pd.DataFrame(columns=PLACEHOLDER_COLUMNS).set_index(
                PLACEHOLDER_COLUMNS[0]
            )
            continue

        # EPR and LPR are stored as "micro / macro" strings, so they are
        # split into numeric helper columns before they can be ranked.
        frame[["EPR_Micro", "EPR_Macro"]] = (
            frame["EPR(Micro/Macro)"].str.split("/", expand=True).astype(float)
        )
        frame[["LPR_Micro", "LPR_Macro"]] = (
            frame["LPR(Micro/Macro)"].str.split("/", expand=True).astype(float)
        )

        frame = frame.sort_values(by=_RANKING_KEYS, ascending=False)
        frames[split] = frame.drop(columns=_HELPER_COLUMNS)
    return frames

eval_exp.py CHANGED Viewed

@@ -15,7 +15,7 @@ from chinatravel.symbol_verification.commonsense_constraint import (
 )
 from chinatravel.symbol_verification.hard_constraint import evaluate_constraints_py
-os.environ["HF_DATASETS_OFFLINE"] = "1"
 def load_result(result_dir, query_index):

 )
 from chinatravel.symbol_verification.hard_constraint import evaluate_constraints_py
+# os.environ["HF_DATASETS_OFFLINE"] = "1"
 def load_result(result_dir, query_index):

leaderboard_data/easy.csv ADDED Viewed

	@@ -0,0 +1,23 @@

+Organization,Method,Model,DR,EPR(Micro/Macro),LPR(Micro/Macro),C-LPR,FPR
+NJU,Act,DeepSeek,70.4,49.9 / 0,64.6 / 30.6,0,0
+NJU,Act,GPT,97.5,70.8 / 0,86.8 / 68.6,0,0
+NJU,ReAct (zero-shot),DeepSeek,43.3,40.8 / 0,41.9 / 19.6,0,0
+NJU,ReAct (zero-shot),GPT,95.4,48.2 / 0,71.3 / 33.0,0,0
+NJU,ReAct (one-shot),DeepSeek,77.5,68.3 / 6.00,74.1 / 52.3,5.77,5.33
+NJU,ReAct (one-shot),GPT,94.2,68.1 / 0,89.4 / 70.6,0,0
+NJU,NeSy Planning,DeepSeek,75.3,75.3 / 75.3,70.4 / 52.6,70.4,52.6
+NJU,NeSy Planning,GPT,75.0,73.6 / 64.0,73.5 / 63.3,61.7,60.6
+NJU,NeSy Planning,Qwen3-8B,72.3,67.0 / 34.0,70.4 / 49.6,32.6,28.3
+NJU,NeSy Planning,Llama3.1-8B,32.0,31.9 / 31.3,29.1 / 21.0,28.3,21.0
+NJU,NeSy Planning,Mistral-7B,30.3,30.3 / 30.3,27.6 / 19.6,27.6,19.6
+NJU,TTG (oracle),DeepSeek,18.3,21.5 / 8.66,17.2 / 15.0,8.23,8.66
+NJU,LLM-Modulo*,DeepSeek,48.3,94.5 / 4.33,58.4 / 43.6,4.11,4.33
+NJU,LLM-Modulo*,GPT,91.6,88.2 / 7.66,95.5 / 84.6,7.66,7.00
+NJU,LLM-Modulo*,Qwen3-8B,30.0,80.5 / 0.0,62.7 / 25.0,0.0,0.0
+NJU,LLM-Modulo*,Llama3.1-8B,28.6,69.4 / 0.0,55.2 / 8.33,0.0,0.0
+NJU,LLM-Modulo*,Mistral-7B,10.3,90.5 / 0.0,39.1 / 9.0,0.0,0.0
+NJU,NeSy Planning*,DeepSeek,82.6,81.7 / 75.0,82.2 / 75.3,75.0,74.0
+NJU,NeSy Planning*,GPT,66.6,66.7 / 66.0,64.6 / 63.6,64.6,62.6
+NJU,NeSy Planning*,Qwen3-8B,69.3,69.3 / 59.3,70.2 / 59.6,59.3,57.9
+NJU,NeSy Planning*,Mistral-7B,52.6,52.6 / 52.6,50.4 / 45.3,50.4,45.6
+NJU,NeSy Planning*,Llama3.1-8B,33.3,33.2 / 32.6,32.1 / 32.0,31.4,32.3

leaderboard_data/human.csv ADDED Viewed

	@@ -0,0 +1,21 @@

+Organization,Method,Model,DR,EPR(Micro/Macro),LPR(Micro/Macro),C-LPR,FPR
+NJU,ReAct (zero-shot),DeepSeek,36.4,29.5 / 0.65,35.2 / 16.2,0.38,0
+NJU,ReAct (zero-shot),GPT,96.1,50.5 / 0,72.4 / 32.5,0,0
+NJU,ReAct (one-shot),DeepSeek,55.2,57.3 / 2.59,64.6 / 44.2,1.71,2.59
+NJU,ReAct (one-shot),GPT,69.5,46.3 / 0,63.6 / 46.8,0,0
+NJU,NeSy Planning,DeepSeek,51.9,53.2 / 52.5,47.0 / 37.6,46.5,37.0
+NJU,NeSy Planning,GPT,45.4,50.1 / 45.4,40.9 / 29.8,38.5,27.9
+NJU,NeSy Planning,Qwen3-8B,42.8,47.4 / 42.2,36.2 / 27.2,34.4,25.3
+NJU,NeSy Planning,Llama3.1-8B,25.9,25.8 / 24.0,22.3 / 12.3,20.5,11.0
+NJU,NeSy Planning,Mistral-7B,37.6,38.2 / 37.6,32.7 / 18.8,32.2,18.8
+NJU,TTG (oracle),DeepSeek,9.09,12.8 / 2.59,7.65 / 5.19,2.39,1.29
+NJU,LLM-Modulo*,DeepSeek,61.6,90.2 / 2.59,75.9 / 51.2,2.75,2.59
+NJU,LLM-Modulo*,GPT,91.5,87.2 / 3.24,92.9 / 66.2,2.87,3.24
+NJU,LLM-Modulo*,Qwen3-8B,35.0,75.3 / 0.0,61.6 / 19.4,0.0,0.0
+NJU,LLM-Modulo*,Llama3.1-8B,19.4,74.1 / 0.0,43.4 / 5.19,0.0,0.0
+NJU,LLM-Modulo*,Mistral-7B,3.24,92.2 / 0.0,31.4 / 4.54,0.0,0.0
+NJU,NeSy Planning*,DeepSeek,58.4,59.6 / 57.7,53.8 / 46.1,52.0,45.4
+NJU,NeSy Planning*,GPT,52.6,46.9 / 42.9,47.6 / 40.9,43.9,40.9
+NJU,NeSy Planning*,Qwen3-8B,53.2,55.1 / 54.5,48.0 / 42.8,47.6,40.9
+NJU,NeSy Planning*,Mistral-7B,40.9,42.8 / 42.8,37.7 / 28.5,37.7,27.9
+NJU,NeSy Planning*,Llama3.1-8B,29.2,29.1 / 26.6,25.4 / 20.1,23.4,19.4

leaderboard_data/human1000.csv ADDED Viewed

	@@ -0,0 +1,7 @@

+Organization,Method,Model,DR,EPR(Micro/Macro),LPR(Micro/Macro),C-LPR,FPR
+NJU,NeSy Planning,DeepSeek,44.6,44.5 / 42.6,38.7 / 23.3,37.6,23.3
+NJU,NeSy Planning,GPT,37.3,37.2 / 35.0,30.7 / 11.3,29.2,11.3
+NJU,NeSy Planning,Qwen3-8B,36.6,36.5 / 34.6,29.6 / 6.43,28.5,6.43
+NJU,NeSy Planning*,DeepSeek,60.6,60.3 / 59.0,53.6 / 32.0,52.5,31.6
+NJU,NeSy Planning*,GPT,27.8,27.8 / 27.1,24.8 / 12.8,24.4,12.8
+NJU,NeSy Planning*,Qwen3-8B,41.1,41.1 / 40.6,34.6 / 13.8,34.2,13.8

leaderboard_data/lb_all/easy.csv ADDED Viewed

	@@ -0,0 +1,23 @@

+Organization,Method,Model,DR,EPR(Micro/Macro),LPR(Micro/Macro),C-LPR,FPR
+NJU,Act,DeepSeek,70.4,49.9 / 0,64.6 / 30.6,0,0
+NJU,Act,GPT,97.5,70.8 / 0,86.8 / 68.6,0,0
+NJU,ReAct (zero-shot),DeepSeek,43.3,40.8 / 0,41.9 / 19.6,0,0
+NJU,ReAct (zero-shot),GPT,95.4,48.2 / 0,71.3 / 33.0,0,0
+NJU,ReAct (one-shot),DeepSeek,77.5,68.3 / 6.00,74.1 / 52.3,5.77,5.33
+NJU,ReAct (one-shot),GPT,94.2,68.1 / 0,89.4 / 70.6,0,0
+NJU,NeSy Planning,DeepSeek,75.3,75.3 / 75.3,70.4 / 52.6,70.4,52.6
+NJU,NeSy Planning,GPT,75.0,73.6 / 64.0,73.5 / 63.3,61.7,60.6
+NJU,NeSy Planning,Qwen3-8B,72.3,67.0 / 34.0,70.4 / 49.6,32.6,28.3
+NJU,NeSy Planning,Llama3.1-8B,32.0,31.9 / 31.3,29.1 / 21.0,28.3,21.0
+NJU,NeSy Planning,Mistral-7B,30.3,30.3 / 30.3,27.6 / 19.6,27.6,19.6
+NJU,TTG (oracle),DeepSeek,18.3,21.5 / 8.66,17.2 / 15.0,8.23,8.66
+NJU,LLM-Modulo*,DeepSeek,48.3,94.5 / 4.33,58.4 / 43.6,4.11,4.33
+NJU,LLM-Modulo*,GPT,91.6,88.2 / 7.66,95.5 / 84.6,7.66,7.00
+NJU,LLM-Modulo*,Qwen3-8B,30.0,80.5 / 0.0,62.7 / 25.0,0.0,0.0
+NJU,LLM-Modulo*,Llama3.1-8B,28.6,69.4 / 0.0,55.2 / 8.33,0.0,0.0
+NJU,LLM-Modulo*,Mistral-7B,10.3,90.5 / 0.0,39.1 / 9.0,0.0,0.0
+NJU,NeSy Planning*,DeepSeek,82.6,81.7 / 75.0,82.2 / 75.3,75.0,74.0
+NJU,NeSy Planning*,GPT,66.6,66.7 / 66.0,64.6 / 63.6,64.6,62.6
+NJU,NeSy Planning*,Qwen3-8B,69.3,69.3 / 59.3,70.2 / 59.6,59.3,57.9
+NJU,NeSy Planning*,Mistral-7B,52.6,52.6 / 52.6,50.4 / 45.3,50.4,45.6
+NJU,NeSy Planning*,Llama3.1-8B,33.3,33.2 / 32.6,32.1 / 32.0,31.4,32.3

leaderboard_data/lb_all/human.csv ADDED Viewed

	@@ -0,0 +1,21 @@

+Organization,Method,Model,DR,EPR(Micro/Macro),LPR(Micro/Macro),C-LPR,FPR
+NJU,ReAct (zero-shot),DeepSeek,36.4,29.5 / 0.65,35.2 / 16.2,0.38,0
+NJU,ReAct (zero-shot),GPT,96.1,50.5 / 0,72.4 / 32.5,0,0
+NJU,ReAct (one-shot),DeepSeek,55.2,57.3 / 2.59,64.6 / 44.2,1.71,2.59
+NJU,ReAct (one-shot),GPT,69.5,46.3 / 0,63.6 / 46.8,0,0
+NJU,NeSy Planning,DeepSeek,51.9,53.2 / 52.5,47.0 / 37.6,46.5,37.0
+NJU,NeSy Planning,GPT,45.4,50.1 / 45.4,40.9 / 29.8,38.5,27.9
+NJU,NeSy Planning,Qwen3-8B,42.8,47.4 / 42.2,36.2 / 27.2,34.4,25.3
+NJU,NeSy Planning,Llama3.1-8B,25.9,25.8 / 24.0,22.3 / 12.3,20.5,11.0
+NJU,NeSy Planning,Mistral-7B,37.6,38.2 / 37.6,32.7 / 18.8,32.2,18.8
+NJU,TTG (oracle),DeepSeek,9.09,12.8 / 2.59,7.65 / 5.19,2.39,1.29
+NJU,LLM-Modulo*,DeepSeek,61.6,90.2 / 2.59,75.9 / 51.2,2.75,2.59
+NJU,LLM-Modulo*,GPT,91.5,87.2 / 3.24,92.9 / 66.2,2.87,3.24
+NJU,LLM-Modulo*,Qwen3-8B,35.0,75.3 / 0.0,61.6 / 19.4,0.0,0.0
+NJU,LLM-Modulo*,Llama3.1-8B,19.4,74.1 / 0.0,43.4 / 5.19,0.0,0.0
+NJU,LLM-Modulo*,Mistral-7B,3.24,92.2 / 0.0,31.4 / 4.54,0.0,0.0
+NJU,NeSy Planning*,DeepSeek,58.4,59.6 / 57.7,53.8 / 46.1,52.0,45.4
+NJU,NeSy Planning*,GPT,52.6,46.9 / 42.9,47.6 / 40.9,43.9,40.9
+NJU,NeSy Planning*,Qwen3-8B,53.2,55.1 / 54.5,48.0 / 42.8,47.6,40.9
+NJU,NeSy Planning*,Mistral-7B,40.9,42.8 / 42.8,37.7 / 28.5,37.7,27.9
+NJU,NeSy Planning*,Llama3.1-8B,29.2,29.1 / 26.6,25.4 / 20.1,23.4,19.4

leaderboard_data/lb_all/human1000.csv ADDED Viewed

	@@ -0,0 +1,7 @@

+Organization,Method,Model,DR,EPR(Micro/Macro),LPR(Micro/Macro),C-LPR,FPR
+NJU,NeSy Planning,DeepSeek,44.6,44.5 / 42.6,38.7 / 23.3,37.6,23.3
+NJU,NeSy Planning,GPT,37.3,37.2 / 35.0,30.7 / 11.3,29.2,11.3
+NJU,NeSy Planning,Qwen3-8B,36.6,36.5 / 34.6,29.6 / 6.43,28.5,6.43
+NJU,NeSy Planning*,DeepSeek,60.6,60.3 / 59.0,53.6 / 32.0,52.5,31.6
+NJU,NeSy Planning*,GPT,27.8,27.8 / 27.1,24.8 / 12.8,24.4,12.8
+NJU,NeSy Planning*,Qwen3-8B,41.1,41.1 / 40.6,34.6 / 13.8,34.2,13.8