博闻 committed on
Commit
4b8f4b4
·
1 Parent(s): 66d686a

update leaderboard

Browse files
.gitattributes CHANGED
@@ -32,4 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
32
  *.xz filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
32
  *.xz filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
app.py CHANGED
@@ -1,165 +1,65 @@
1
- import os
2
- import time
3
- import json
4
- import shutil
5
- import zipfile
6
  import gradio as gr
7
- from eval_exp import evaluate
8
- from datetime import datetime
9
- from apscheduler.schedulers.background import BackgroundScheduler
10
 
 
 
 
11
 
12
- def load_splits():
13
- splits_dir = "chinatravel/evaluation/default_splits"
14
- splits = []
15
- for filename in os.listdir(splits_dir):
16
- if filename.endswith(".txt"):
17
- splits.append(filename.replace(".txt", ""))
18
- return splits
19
 
 
 
20
 
21
- SPLITS_LIST = load_splits()
22
- # SUBMIT_DIR = "./submissions"
23
- # OUTPUT_DIR = "./outputs"
24
- SUBMIT_DIR = os.path.abspath("submissions")
25
- OUTPUT_DIR = os.path.abspath("outputs")
26
 
27
- shutil.rmtree(SUBMIT_DIR, ignore_errors=True)
28
- shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
29
- os.makedirs(SUBMIT_DIR, exist_ok=True)
30
- os.makedirs(OUTPUT_DIR, exist_ok=True)
31
- print(f"Submission directory: {SUBMIT_DIR}")
32
- print(f"Output directory: {OUTPUT_DIR}")
33
- # clear directories if they already exist
34
 
 
 
 
 
 
 
 
 
 
 
 
35
 
36
-
37
- def clean_old_outputs(folder, keep_hours=24):
38
- now = time.time()
39
- for fname in os.listdir(folder):
40
- fpath = os.path.join(folder, fname)
41
- if os.path.isfile(fpath) and now - os.path.getmtime(fpath) > keep_hours * 3600:
42
- os.remove(fpath)
43
-
44
-
45
- scheduler = BackgroundScheduler()
46
- scheduler.add_job(lambda: clean_old_outputs(OUTPUT_DIR), "interval", hours=6)
47
- scheduler.start()
48
-
49
-
50
- class Arguments:
51
- def __init__(self, splits, result_dir):
52
- self.splits = splits
53
- self.result_dir = result_dir
54
-
55
-
56
- def handle_submission(zip_file, dataset_choice):
57
- if zip_file is None:
58
- # yield "❌ 请上传 zip 文件!", 0, 0, 0, None
59
- yield "❌ Please upload a zip file!", 0, 0, 0, None
60
- return
61
-
62
- shutil.rmtree(SUBMIT_DIR, ignore_errors=True)
63
- os.makedirs(SUBMIT_DIR, exist_ok=True)
64
-
65
- # 解压操作
66
- with zipfile.ZipFile(zip_file, "r") as zip_ref:
67
- print(f"Extracting {zip_file} to {SUBMIT_DIR}...")
68
- zip_ref.extractall(SUBMIT_DIR)
69
-
70
- # 获取当前时间戳
71
- timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
72
- print(f"Submission dir: {SUBMIT_DIR}")
73
-
74
- # 如果 SUBMIT_DIR 下只有一个子目录,就使用这个子目录,否则使用 SUBMIT_DIR 本身
75
- subdirs = [d for d in os.listdir(SUBMIT_DIR) if os.path.isdir(os.path.join(SUBMIT_DIR, d))]
76
- if len(subdirs) == 1:
77
- unzipped_dir = os.path.join(SUBMIT_DIR, subdirs[0])
78
- else:
79
- unzipped_dir = SUBMIT_DIR # 没有子目录或有多个子目录时使用 SUBMIT_DIR
80
- print(f"Unzipped directory: {unzipped_dir}")
81
- output_path = os.path.join(OUTPUT_DIR, f"result_main_{timestamp}.json")
82
- args = Arguments(splits=dataset_choice, result_dir=unzipped_dir)
83
-
84
- try:
85
- # yield "🚀 开始测评...", 0, 0, 0, None
86
- yield "🚀 Starting evaluation...", 0, 0, 0, None
87
-
88
- result = {}
89
- for progress in evaluate(args, result):
90
- stage = progress.get("stage", "")
91
- progress_value = progress.get("progress", 0)
92
-
93
- if stage == "schema":
94
- # yield "Schema 阶段测评中...", progress_value, 0, 0, None
95
- yield "Schema evaluation in progress...", 100, progress_value, 0, None
96
- elif stage == "commonsense":
97
- # yield "Commonsense 阶段测评中...", 100, progress_value, 0, None
98
- yield "Commonsense evaluation in progress...", 100, 100, progress_value, None
99
- elif stage == "logic":
100
- # yield "Logic 阶段测评中...", 100, 100, progress_value, None
101
- yield "Logic evaluation in progress...", 100, 100, 100, None
102
- elif stage == "final":
103
- result.update(progress.get("result", {}))
104
- # yield "测评完成,正在保存结果...", 100, 100, 100, None
105
- yield "Evaluation completed, saving results...", 100, 100, 100, None
106
-
107
- # 保存结果到文件
108
- with open(output_path, "w", encoding="utf-8") as f:
109
- json.dump(result, f, ensure_ascii=False, indent=4)
110
-
111
- # 在测评完成后更新结果文件的值和可见性
112
- result_file.value = output_path
113
- result_file.visible = True
114
- # yield "✅ 测评完成!", 100, 100, 100, output_path
115
- yield "✅ Evaluation completed!", 100, 100, 100, output_path
116
-
117
- except Exception as e:
118
- import traceback
119
-
120
- traceback.print_exc()
121
- # yield f"❌ 测评异常:{e}", 0, 0, 0, None
122
- yield f"❌ Evaluation error: {e}", 0, 0, 0, None
123
-
124
-
125
- with gr.Blocks() as demo:
126
- # gr.Markdown("# 📊 ChinaTravel 模型测评")
127
- gr.Markdown(
128
- "# 📊 ChinaTravel Benchmark Evaluation"
129
- )
130
-
131
- # with gr.Row():
132
- # zip_input = gr.File(label="上传模型预测 zip 文件", file_types=[".zip"])
133
- # dataset_choice = gr.Radio(
134
- # SPLITS_LIST, label="选择评估数据集", value="validation"
135
- # )
136
- zip_input = gr.File(label="Upload zip file of results", file_types=[".zip"])
137
  dataset_choice = gr.Radio(
138
- SPLITS_LIST, label="Select evaluation dataset", value="validation"
 
 
 
139
  )
140
-
141
-
142
- # submit_btn = gr.Button("开始测评")
143
- submit_btn = gr.Button("Start Evaluation")
144
-
145
- schema_progress = gr.Slider(
146
- label="Schema Stage Progress", minimum=0, maximum=100, value=0, interactive=False
 
147
  )
148
- commonsense_progress = gr.Slider(
149
- label="Commonsense Stage Progress", minimum=0, maximum=100, value=0, interactive=False
 
 
150
  )
151
- logic_progress = gr.Slider(
152
- label="Logic Stage Progress", minimum=0, maximum=100, value=0, interactive=False
 
 
153
  )
154
- output_msg = gr.Markdown()
155
- # result_file = gr.File(label="结果文件下载") # , visible=False)
156
- result_file = gr.File(label="Result File Download")
157
 
158
  submit_btn.click(
159
  handle_submission,
160
  inputs=[zip_input, dataset_choice],
161
  outputs=[
162
- output_msg,
163
  schema_progress,
164
  commonsense_progress,
165
  logic_progress,
 
 
 
 
 
 
1
  import gradio as gr
 
 
 
2
 
3
+ from chinatravel.ui import content
4
+ from chinatravel.ui.eval_runner import DEFAULT_SPLIT, SPLITS_LIST, handle_submission
5
+ from chinatravel.ui.leaderboard import build_placeholder_frames
6
 
 
 
 
 
 
 
 
7
 
8
+ default_split_value = DEFAULT_SPLIT or (SPLITS_LIST[0] if SPLITS_LIST else None)
9
+ leaderboard_frames = build_placeholder_frames(SPLITS_LIST)
10
 
 
 
 
 
 
11
 
12
+ with gr.Blocks(title="ChinaTravel Benchmark Evaluation") as demo:
13
+ gr.HTML(content.TITLE_HTML)
14
+ gr.Markdown(content.INTRO_MARKDOWN)
15
+ gr.Markdown(content.SUBMISSION_GUIDE)
 
 
 
16
 
17
+ gr.Markdown("### Leaderboard")
18
+ gr.Markdown("Methods marked with \* leverage Oracle DSL or an Oracle Verifier.")
19
+ if SPLITS_LIST:
20
+ with gr.Tabs():
21
+ for split in SPLITS_LIST:
22
+ with gr.Tab(split):
23
+ gr.Dataframe(
24
+ value=leaderboard_frames.get(split),
25
+ interactive=False,
26
+ wrap=True,
27
+ )
28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  dataset_choice = gr.Radio(
30
+ SPLITS_LIST,
31
+ label="Select evaluation split",
32
+ value=default_split_value,
33
+ interactive=True,
34
  )
35
+ zip_input = gr.File(label="Upload result archive (.zip)", file_types=[".zip"])
36
+ submit_btn = gr.Button("Run evaluation", variant="primary")
37
+
38
+ status = gr.Markdown("Ready to evaluate.")
39
+ schema_progress = gr.Textbox(
40
+ label="Schema progress",
41
+ value="0%",
42
+ interactive=False,
43
  )
44
+ commonsense_progress = gr.Textbox(
45
+ label="Commonsense progress",
46
+ value="0%",
47
+ interactive=False,
48
  )
49
+ logic_progress = gr.Textbox(
50
+ label="Logic progress",
51
+ value="0%",
52
+ interactive=False,
53
  )
54
+ result_file = gr.File(label="Download evaluation report", interactive=False)
55
+
56
+ gr.Markdown(content.CONTACT)
57
 
58
  submit_btn.click(
59
  handle_submission,
60
  inputs=[zip_input, dataset_choice],
61
  outputs=[
62
+ status,
63
  schema_progress,
64
  commonsense_progress,
65
  logic_progress,
chinatravel/data/load_datasets.py CHANGED
@@ -84,13 +84,21 @@ def save_json_file(json_data, file_path):
84
  json.dump(json_data, dump_f, ensure_ascii=False, indent=4, cls=NpEncoder)
85
 
86
 
87
-
88
  def load_query(args):
89
-
90
- if not args.splits in ["easy", "medium", "human", "preference_base50",
91
- "preference0_base50", "preference1_base50", "preference2_base50",
92
- "preference3_base50", "preference4_base50", "preference5_base50",
93
- "human1000"]:
 
 
 
 
 
 
 
 
 
94
  return load_query_local(args)
95
  config_name = "synthetic"
96
  if args.splits in ["preference0_base50", "preference1_base50", "preference2_base50",
@@ -100,13 +108,14 @@ def load_query(args):
100
  config_name = "validation"
101
  elif args.splits in ["human1000"]:
102
  config_name = "test"
103
- query_data = hg_load_dataset("LAMDA-NeSy/chinatravel_test", name=config_name)[args.splits].to_list()
104
-
 
105
 
106
  for data_i in query_data:
107
  if "hard_logic_py" in data_i:
108
  data_i["hard_logic_py"] = ast.literal_eval(data_i["hard_logic_py"])
109
-
110
  query_id_list = [data_i["uid"] for data_i in query_data]
111
  data_dict = {}
112
  for data_i in query_data:
@@ -140,4 +149,3 @@ if __name__ == "__main__":
140
  print(uid, query_data[uid])
141
  else:
142
  raise ValueError(f"{uid} not in query_data")
143
-
 
84
  json.dump(json_data, dump_f, ensure_ascii=False, indent=4, cls=NpEncoder)
85
 
86
 
 
87
  def load_query(args):
88
+
89
+ if not args.splits in [
90
+ "easy",
91
+ "medium",
92
+ "human",
93
+ "human1000",
94
+ "preference_base50",
95
+ "preference0_base50",
96
+ "preference1_base50",
97
+ "preference2_base50",
98
+ "preference3_base50",
99
+ "preference4_base50",
100
+ "preference5_base50",
101
+ ]:
102
  return load_query_local(args)
103
  config_name = "synthetic"
104
  if args.splits in ["preference0_base50", "preference1_base50", "preference2_base50",
 
108
  config_name = "validation"
109
  elif args.splits in ["human1000"]:
110
  config_name = "test"
111
+ query_data = hg_load_dataset("LAMDA-NeSy/chinatravel_test", name=config_name)[
112
+ args.splits
113
+ ].to_list()
114
 
115
  for data_i in query_data:
116
  if "hard_logic_py" in data_i:
117
  data_i["hard_logic_py"] = ast.literal_eval(data_i["hard_logic_py"])
118
+
119
  query_id_list = [data_i["uid"] for data_i in query_data]
120
  data_dict = {}
121
  for data_i in query_data:
 
149
  print(uid, query_data[uid])
150
  else:
151
  raise ValueError(f"{uid} not in query_data")
 
chinatravel/ui/__init__.py ADDED
File without changes
chinatravel/ui/content.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Static copy rendered by the evaluation web page.

# Page heading, kept as raw HTML so that it can be centred.
TITLE_HTML = """
<h1 style="text-align:center; margin-bottom: 0.25rem;">ChinaTravel Benchmark Evaluation</h1>
"""

# One-paragraph description of the benchmark.
INTRO_MARKDOWN = """
ChinaTravel is an open-ended travel planning benchmark with compositional constraint validation for language agents. (See our [paper](https://arxiv.org/abs/2412.13682) for more details.)
"""

# Submission instructions, description of the reported metrics, and contacts.
# NOTE(review): the output_schema.json link is relative - confirm that it
# resolves from wherever this page is hosted.
# The contact links carry an explicit "mailto:" scheme; a bare address used as
# a Markdown link target is treated as a relative URL and does not open a mail
# client.
SUBMISSION_GUIDE = """
**How to submit**
- Pick a split. The split determines which query UIDs are expected.
- Upload a `.zip` that contains one JSON file per UID. The evaluator recursively scans subfolders, so any directory layout is acceptable.
- Each JSON must follow the target schema: see [chinatravel/evaluation/output_schema.json](chinatravel/evaluation/output_schema.json).
- You can dry-run locally via `python eval_exp.py --splits <split> --method <your_method>` to mirror the hosted evaluation.

**Output**
- We compute DR (schema pass rate), EPR_micro/EPR_macro (commonsense), LPR_micro/LPR_macro/C-LPR (logic), and FPR (all-pass rate).
- A detailed JSON report is produced for download after evaluation.

**Contact**
- If you are interested in showing your results on our leaderboard or have any questions, please contact [Jie-Jing Shao](mailto:shaojj@lamda.nju.edu.cn), [Bo-Wen Zhang](mailto:221900200@smail.nju.edu.cn), [Xiao-Wen Yang](mailto:yangxw@lamda.nju.edu.cn).
"""

# Short contact line shown at the bottom of the page.
CONTACT = "Contact: zbw@smail.nju.edu.cn, shaojj@lamda.nju.edu.cn"
chinatravel/ui/eval_runner.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import time
3
+ import json
4
+ import shutil
5
+ import zipfile
6
+ from datetime import datetime
7
+ from apscheduler.schedulers.background import BackgroundScheduler
8
+ from eval_exp import evaluate
9
+
10
+
11
# Working directories, resolved against the current working directory at
# import time: uploads are unpacked into SUBMIT_DIR and evaluation reports are
# written to OUTPUT_DIR.
SUBMIT_DIR = os.path.abspath("submissions")
OUTPUT_DIR = os.path.abspath("outputs")

# The background clean-up job runs every _CLEAN_INTERVAL_HOURS hours and, by
# default, removes files older than _KEEP_HOURS hours.
_CLEAN_INTERVAL_HOURS = 6
_KEEP_HOURS = 24
15
+
16
+
17
def load_splits():
    """Return the sorted names of the evaluation splits found on disk.

    A split is any entry of ``chinatravel/evaluation/default_splits``
    (resolved against the current working directory) whose name ends in
    ``.txt``; the name is reported without that suffix.

    Returns:
        A sorted list of split names, or an empty list when the directory
        does not exist.
    """
    splits_dir = os.path.join("chinatravel", "evaluation", "default_splits")
    if not os.path.isdir(splits_dir):
        return []
    suffix = ".txt"
    # Slice the suffix off rather than using str.replace(), which would also
    # delete any ".txt" that occurs earlier in the file name.
    return sorted(
        filename[: -len(suffix)]
        for filename in os.listdir(splits_dir)
        if filename.endswith(suffix)
    )
26
+
27
+
28
# Splits discovered on disk when the module is imported.
SPLITS_LIST = load_splits()

# Split pre-selected in the UI: "easy" when it exists, otherwise the first
# discovered split, otherwise None (no splits were found).
if "easy" in SPLITS_LIST:
    DEFAULT_SPLIT = "easy"
elif SPLITS_LIST:
    DEFAULT_SPLIT = SPLITS_LIST[0]
else:
    DEFAULT_SPLIT = None
32
+
33
+
34
class Arguments:
    """Plain attribute holder handed to ``evaluate`` by ``handle_submission``.

    Attributes are stored exactly as given:
        splits: the evaluation split selected by the user.
        result_dir: the directory that holds the extracted predictions.
    """

    def __init__(self, splits, result_dir):
        self.splits, self.result_dir = splits, result_dir
38
+
39
+
40
def _reset_workdirs():
    """Delete and recreate both working directories (uploads and reports).

    Removal errors are ignored; creation tolerates directories that already
    exist.
    """
    workdirs = (SUBMIT_DIR, OUTPUT_DIR)
    # Remove both trees first, then recreate them, in the same order.
    for path in workdirs:
        shutil.rmtree(path, ignore_errors=True)
    for path in workdirs:
        os.makedirs(path, exist_ok=True)
45
+
46
+
47
def clean_old_outputs(folder, keep_hours=_KEEP_HOURS):
    """Delete regular files in *folder* last modified over *keep_hours* ago.

    Only the top level of *folder* is scanned and subdirectories are left
    untouched. The folder can be wiped and recreated by ``_reset_workdirs``
    while this runs, so a folder or file that has disappeared is skipped
    instead of raising.

    Args:
        folder: Directory to scan.
        keep_hours: Age threshold in hours; older files are removed.
    """
    # Files whose modification time lies before this instant are stale.
    cutoff = time.time() - keep_hours * 3600
    try:
        entries = os.listdir(folder)
    except FileNotFoundError:
        return
    for fname in entries:
        fpath = os.path.join(folder, fname)
        try:
            if os.path.isfile(fpath) and os.path.getmtime(fpath) < cutoff:
                os.remove(fpath)
        except FileNotFoundError:
            # The file was removed by someone else between the checks.
            continue
53
+
54
+
55
# Import-time side effects: start from empty working directories, then run the
# report clean-up periodically on a daemon background scheduler.
_reset_workdirs()

_scheduler = BackgroundScheduler(daemon=True)
_scheduler.add_job(
    lambda: clean_old_outputs(OUTPUT_DIR),
    trigger="interval",
    hours=_CLEAN_INTERVAL_HOURS,
)
_scheduler.start()
61
+
62
+
63
def _extract_submission(zip_file):
    """Unpack the uploaded archive into ``SUBMIT_DIR``.

    *zip_file* is either the archive path itself or an object that exposes
    the path through a ``name`` attribute.
    """
    archive_path = zip_file.name if hasattr(zip_file, "name") else zip_file
    archive = zipfile.ZipFile(archive_path, "r")
    with archive:
        archive.extractall(SUBMIT_DIR)
67
+
68
+
69
def _locate_unzipped_root():
    """Return the directory to evaluate after extraction.

    When ``SUBMIT_DIR`` contains exactly one subdirectory, that subdirectory
    is returned; in every other case ``SUBMIT_DIR`` itself is returned.
    """
    folders = []
    for entry in os.listdir(SUBMIT_DIR):
        if os.path.isdir(os.path.join(SUBMIT_DIR, entry)):
            folders.append(entry)

    if len(folders) != 1:
        return SUBMIT_DIR
    (only_folder,) = folders
    return os.path.join(SUBMIT_DIR, only_folder)
76
+
77
+
78
def handle_submission(zip_file, dataset_choice):
    """Evaluate an uploaded prediction archive, streaming progress to the UI.

    This is a generator. Every item it yields is a 5-tuple
    ``(status, schema, commonsense, logic, report)``: ``status`` is a message,
    the three progress fields are percentage strings such as ``"42%"``, and
    ``report`` is the path of the JSON report (``None`` until the evaluation
    has finished successfully).

    Args:
        zip_file: The uploaded archive (a path or an object with ``.name``),
            or ``None`` when nothing was uploaded.
        dataset_choice: Name of the split to evaluate against.

    Exceptions raised while preparing, extracting or evaluating are printed
    with their traceback and reported as a final error item instead of being
    propagated.
    """
    # Progress fields are always strings so that the early exits feed the
    # progress text boxes the same type as every other update.
    if zip_file is None:
        yield "❌ Please upload a zip file with your predictions.", "0%", "0%", "0%", None
        return
    if not dataset_choice:
        yield "❌ Please choose an evaluation split.", "0%", "0%", "0%", None
        return

    try:
        # Only the upload area is wiped for a new submission. Reports already
        # written to OUTPUT_DIR must stay downloadable; expiring them is the
        # job of the periodic clean-up.
        # NOTE(review): SUBMIT_DIR is shared by all calls, so overlapping
        # submissions would interfere with each other - confirm that the app
        # serialises requests.
        shutil.rmtree(SUBMIT_DIR, ignore_errors=True)
        os.makedirs(SUBMIT_DIR, exist_ok=True)
        os.makedirs(OUTPUT_DIR, exist_ok=True)

        yield "🚀 Starting evaluation...", "0%", "0%", "0%", None

        _extract_submission(zip_file)
        unzipped_dir = _locate_unzipped_root()
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_path = os.path.join(OUTPUT_DIR, f"result_main_{timestamp}.json")
        args = Arguments(splits=dataset_choice, result_dir=unzipped_dir)

        schema_progress = commonsense_progress = logic_progress = 0
        result = {}
        # evaluate() yields dicts carrying a "stage" name, a numeric
        # "progress" and, for the "final" stage, the "result" mapping.
        for progress in evaluate(args, result):
            stage = progress.get("stage", "")
            value = int(round(progress.get("progress", 0)))

            if stage == "schema":
                schema_progress = value
                yield (
                    "Schema evaluation in progress...",
                    f"{schema_progress}%",
                    f"{commonsense_progress}%",
                    f"{logic_progress}%",
                    None,
                )
            elif stage == "commonsense":
                commonsense_progress = value
                yield (
                    "Commonsense evaluation in progress...",
                    "100%",
                    f"{commonsense_progress}%",
                    f"{logic_progress}%",
                    None,
                )
            elif stage == "logic":
                logic_progress = value
                yield (
                    "Logic evaluation in progress...",
                    "100%",
                    "100%",
                    f"{logic_progress}%",
                    None,
                )
            elif stage == "final":
                result.update(progress.get("result", {}))
                yield "Saving results...", "100%", "100%", "100%", None

        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(result, f, ensure_ascii=False, indent=4)

        filename = os.path.basename(output_path)
        yield (
            f"✅ Evaluation completed. Report saved to {filename}.",
            "100%",
            "100%",
            "100%",
            output_path,
        )

    except Exception as exc:  # pragma: no cover - surfaced to UI
        import traceback

        traceback.print_exc()
        yield f"❌ Evaluation error: {exc}", "0%", "0%", "0%", None
chinatravel/ui/leaderboard.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+
3
+
4
# Column layout of a leaderboard table. The first entry is the column that
# becomes the index when a table is loaded.
PLACEHOLDER_COLUMNS = [
    "Organization",
    "Method",
    "Model",
    "DR",
    "EPR(Micro/Macro)",
    "LPR(Micro/Macro)",
    "C-LPR",
    "FPR",
]

# Ranking priority, highest first. The *_Micro / *_Macro entries are temporary
# numeric columns derived from the combined "micro / macro" text columns.
_RANKING_KEYS = [
    "FPR",
    "C-LPR",
    "EPR_Macro",
    "LPR_Macro",
    "EPR_Micro",
    "LPR_Micro",
    "DR",
]


def build_placeholder_frames(splits, data_dir="leaderboard_data"):
    """Load and rank the leaderboard table of every split.

    For each split, ``<data_dir>/<split>.csv`` is read with its first column
    as the index and the rows are sorted in descending order by FPR, C-LPR,
    EPR (macro), LPR (macro), EPR (micro), LPR (micro) and DR.

    Args:
        splits: Iterable of split names.
        data_dir: Directory holding one ``<split>.csv`` per split.

    Returns:
        A dict mapping each split name to a ``pandas.DataFrame``. A split
        without a CSV file maps to an empty frame that has the standard
        columns, so one missing file does not abort the whole page.
    """
    # NOTE(review): index_col=0 turns "Organization" into the index - confirm
    # that the table widget still displays it.
    frames = {}
    for split in splits:
        try:
            frame = pd.read_csv(f"{data_dir}/{split}.csv", index_col=0)
        except FileNotFoundError:
            frames[split] = pd.DataFrame(columns=PLACEHOLDER_COLUMNS).set_index(
                PLACEHOLDER_COLUMNS[0]
            )
            continue
        if frame.empty:
            # A header-only file has nothing to split or sort.
            frames[split] = frame
            continue

        # EPR and LPR are stored as "xx / yy" text, so they are split into
        # numeric micro/macro helper columns before sorting.
        for metric in ("EPR", "LPR"):
            frame[[f"{metric}_Micro", f"{metric}_Macro"]] = (
                frame[f"{metric}(Micro/Macro)"]
                .str.split("/", expand=True)
                .astype(float)
            )
        frame = frame.sort_values(by=_RANKING_KEYS, ascending=False)
        # The helper columns exist only for ranking and are not displayed.
        frames[split] = frame.drop(
            columns=["EPR_Micro", "EPR_Macro", "LPR_Micro", "LPR_Macro"]
        )
    return frames
eval_exp.py CHANGED
@@ -15,7 +15,7 @@ from chinatravel.symbol_verification.commonsense_constraint import (
15
  )
16
  from chinatravel.symbol_verification.hard_constraint import evaluate_constraints_py
17
 
18
- os.environ["HF_DATASETS_OFFLINE"] = "1"
19
 
20
 
21
  def load_result(result_dir, query_index):
 
15
  )
16
  from chinatravel.symbol_verification.hard_constraint import evaluate_constraints_py
17
 
18
+ # os.environ["HF_DATASETS_OFFLINE"] = "1"
19
 
20
 
21
  def load_result(result_dir, query_index):
leaderboard_data/easy.csv ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Organization,Method,Model,DR,EPR(Micro/Macro),LPR(Micro/Macro),C-LPR,FPR
2
+ NJU,Act,DeepSeek,70.4,49.9 / 0,64.6 / 30.6,0,0
3
+ NJU,Act,GPT,97.5,70.8 / 0,86.8 / 68.6,0,0
4
+ NJU,ReAct (zero-shot),DeepSeek,43.3,40.8 / 0,41.9 / 19.6,0,0
5
+ NJU,ReAct (zero-shot),GPT,95.4,48.2 / 0,71.3 / 33.0,0,0
6
+ NJU,ReAct (one-shot),DeepSeek,77.5,68.3 / 6.00,74.1 / 52.3,5.77,5.33
7
+ NJU,ReAct (one-shot),GPT,94.2,68.1 / 0,89.4 / 70.6,0,0
8
+ NJU,NeSy Planning,DeepSeek,75.3,75.3 / 75.3,70.4 / 52.6,70.4,52.6
9
+ NJU,NeSy Planning,GPT,75.0,73.6 / 64.0,73.5 / 63.3,61.7,60.6
10
+ NJU,NeSy Planning,Qwen3-8B,72.3,67.0 / 34.0,70.4 / 49.6,32.6,28.3
11
+ NJU,NeSy Planning,Llama3.1-8B,32.0,31.9 / 31.3,29.1 / 21.0,28.3,21.0
12
+ NJU,NeSy Planning,Mistral-7B,30.3,30.3 / 30.3,27.6 / 19.6,27.6,19.6
13
+ NJU,TTG (oracle),DeepSeek,18.3,21.5 / 8.66,17.2 / 15.0,8.23,8.66
14
+ NJU,LLM-Modulo*,DeepSeek,48.3,94.5 / 4.33,58.4 / 43.6,4.11,4.33
15
+ NJU,LLM-Modulo*,GPT,91.6,88.2 / 7.66,95.5 / 84.6,7.66,7.00
16
+ NJU,LLM-Modulo*,Qwen3-8B,30.0,80.5 / 0.0,62.7 / 25.0,0.0,0.0
17
+ NJU,LLM-Modulo*,Llama3.1-8B,28.6,69.4 / 0.0,55.2 / 8.33,0.0,0.0
18
+ NJU,LLM-Modulo*,Mistral-7B,10.3,90.5 / 0.0,39.1 / 9.0,0.0,0.0
19
+ NJU,NeSy Planning*,DeepSeek,82.6,81.7 / 75.0,82.2 / 75.3,75.0,74.0
20
+ NJU,NeSy Planning*,GPT,66.6,66.7 / 66.0,64.6 / 63.6,64.6,62.6
21
+ NJU,NeSy Planning*,Qwen3-8B,69.3,69.3 / 59.3,70.2 / 59.6,59.3,57.9
22
+ NJU,NeSy Planning*,Mistral-7B,52.6,52.6 / 52.6,50.4 / 45.3,50.4,45.6
23
+ NJU,NeSy Planning*,Llama3.1-8B,33.3,33.2 / 32.6,32.1 / 32.0,31.4,32.3
leaderboard_data/human.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Organization,Method,Model,DR,EPR(Micro/Macro),LPR(Micro/Macro),C-LPR,FPR
2
+ NJU,ReAct (zero-shot),DeepSeek,36.4,29.5 / 0.65,35.2 / 16.2,0.38,0
3
+ NJU,ReAct (zero-shot),GPT,96.1,50.5 / 0,72.4 / 32.5,0,0
4
+ NJU,ReAct (one-shot),DeepSeek,55.2,57.3 / 2.59,64.6 / 44.2,1.71,2.59
5
+ NJU,ReAct (one-shot),GPT,69.5,46.3 / 0,63.6 / 46.8,0,0
6
+ NJU,NeSy Planning,DeepSeek,51.9,53.2 / 52.5,47.0 / 37.6,46.5,37.0
7
+ NJU,NeSy Planning,GPT,45.4,50.1 / 45.4,40.9 / 29.8,38.5,27.9
8
+ NJU,NeSy Planning,Qwen3-8B,42.8,47.4 / 42.2,36.2 / 27.2,34.4,25.3
9
+ NJU,NeSy Planning,Llama3.1-8B,25.9,25.8 / 24.0,22.3 / 12.3,20.5,11.0
10
+ NJU,NeSy Planning,Mistral-7B,37.6,38.2 / 37.6,32.7 / 18.8,32.2,18.8
11
+ NJU,TTG (oracle),DeepSeek,9.09,12.8 / 2.59,7.65 / 5.19,2.39,1.29
12
+ NJU,LLM-Modulo*,DeepSeek,61.6,90.2 / 2.59,75.9 / 51.2,2.75,2.59
13
+ NJU,LLM-Modulo*,GPT,91.5,87.2 / 3.24,92.9 / 66.2,2.87,3.24
14
+ NJU,LLM-Modulo*,Qwen3-8B,35.0,75.3 / 0.0,61.6 / 19.4,0.0,0.0
15
+ NJU,LLM-Modulo*,Llama3.1-8B,19.4,74.1 / 0.0,43.4 / 5.19,0.0,0.0
16
+ NJU,LLM-Modulo*,Mistral-7B,3.24,92.2 / 0.0,31.4 / 4.54,0.0,0.0
17
+ NJU,NeSy Planning*,DeepSeek,58.4,59.6 / 57.7,53.8 / 46.1,52.0,45.4
18
+ NJU,NeSy Planning*,GPT,52.6,46.9 / 42.9,47.6 / 40.9,43.9,40.9
19
+ NJU,NeSy Planning*,Qwen3-8B,53.2,55.1 / 54.5,48.0 / 42.8,47.6,40.9
20
+ NJU,NeSy Planning*,Mistral-7B,40.9,42.8 / 42.8,37.7 / 28.5,37.7,27.9
21
+ NJU,NeSy Planning*,Llama3.1-8B,29.2,29.1 / 26.6,25.4 / 20.1,23.4,19.4
leaderboard_data/human1000.csv ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ Organization,Method,Model,DR,EPR(Micro/Macro),LPR(Micro/Macro),C-LPR,FPR
2
+ NJU,NeSy Planning,DeepSeek,44.6,44.5 / 42.6,38.7 / 23.3,37.6,23.3
3
+ NJU,NeSy Planning,GPT,37.3,37.2 / 35.0,30.7 / 11.3,29.2,11.3
4
+ NJU,NeSy Planning,Qwen3-8B,36.6,36.5 / 34.6,29.6 / 6.43,28.5,6.43
5
+ NJU,NeSy Planning*,DeepSeek,60.6,60.3 / 59.0,53.6 / 32.0,52.5,31.6
6
+ NJU,NeSy Planning*,GPT,27.8,27.8 / 27.1,24.8 / 12.8,24.4,12.8
7
+ NJU,NeSy Planning*,Qwen3-8B,41.1,41.1 / 40.6,34.6 / 13.8,34.2,13.8
leaderboard_data/lb_all/easy.csv ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Organization,Method,Model,DR,EPR(Micro/Macro),LPR(Micro/Macro),C-LPR,FPR
2
+ NJU,Act,DeepSeek,70.4,49.9 / 0,64.6 / 30.6,0,0
3
+ NJU,Act,GPT,97.5,70.8 / 0,86.8 / 68.6,0,0
4
+ NJU,ReAct (zero-shot),DeepSeek,43.3,40.8 / 0,41.9 / 19.6,0,0
5
+ NJU,ReAct (zero-shot),GPT,95.4,48.2 / 0,71.3 / 33.0,0,0
6
+ NJU,ReAct (one-shot),DeepSeek,77.5,68.3 / 6.00,74.1 / 52.3,5.77,5.33
7
+ NJU,ReAct (one-shot),GPT,94.2,68.1 / 0,89.4 / 70.6,0,0
8
+ NJU,NeSy Planning,DeepSeek,75.3,75.3 / 75.3,70.4 / 52.6,70.4,52.6
9
+ NJU,NeSy Planning,GPT,75.0,73.6 / 64.0,73.5 / 63.3,61.7,60.6
10
+ NJU,NeSy Planning,Qwen3-8B,72.3,67.0 / 34.0,70.4 / 49.6,32.6,28.3
11
+ NJU,NeSy Planning,Llama3.1-8B,32.0,31.9 / 31.3,29.1 / 21.0,28.3,21.0
12
+ NJU,NeSy Planning,Mistral-7B,30.3,30.3 / 30.3,27.6 / 19.6,27.6,19.6
13
+ NJU,TTG (oracle),DeepSeek,18.3,21.5 / 8.66,17.2 / 15.0,8.23,8.66
14
+ NJU,LLM-Modulo*,DeepSeek,48.3,94.5 / 4.33,58.4 / 43.6,4.11,4.33
15
+ NJU,LLM-Modulo*,GPT,91.6,88.2 / 7.66,95.5 / 84.6,7.66,7.00
16
+ NJU,LLM-Modulo*,Qwen3-8B,30.0,80.5 / 0.0,62.7 / 25.0,0.0,0.0
17
+ NJU,LLM-Modulo*,Llama3.1-8B,28.6,69.4 / 0.0,55.2 / 8.33,0.0,0.0
18
+ NJU,LLM-Modulo*,Mistral-7B,10.3,90.5 / 0.0,39.1 / 9.0,0.0,0.0
19
+ NJU,NeSy Planning*,DeepSeek,82.6,81.7 / 75.0,82.2 / 75.3,75.0,74.0
20
+ NJU,NeSy Planning*,GPT,66.6,66.7 / 66.0,64.6 / 63.6,64.6,62.6
21
+ NJU,NeSy Planning*,Qwen3-8B,69.3,69.3 / 59.3,70.2 / 59.6,59.3,57.9
22
+ NJU,NeSy Planning*,Mistral-7B,52.6,52.6 / 52.6,50.4 / 45.3,50.4,45.6
23
+ NJU,NeSy Planning*,Llama3.1-8B,33.3,33.2 / 32.6,32.1 / 32.0,31.4,32.3
leaderboard_data/lb_all/human.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Organization,Method,Model,DR,EPR(Micro/Macro),LPR(Micro/Macro),C-LPR,FPR
2
+ NJU,ReAct (zero-shot),DeepSeek,36.4,29.5 / 0.65,35.2 / 16.2,0.38,0
3
+ NJU,ReAct (zero-shot),GPT,96.1,50.5 / 0,72.4 / 32.5,0,0
4
+ NJU,ReAct (one-shot),DeepSeek,55.2,57.3 / 2.59,64.6 / 44.2,1.71,2.59
5
+ NJU,ReAct (one-shot),GPT,69.5,46.3 / 0,63.6 / 46.8,0,0
6
+ NJU,NeSy Planning,DeepSeek,51.9,53.2 / 52.5,47.0 / 37.6,46.5,37.0
7
+ NJU,NeSy Planning,GPT,45.4,50.1 / 45.4,40.9 / 29.8,38.5,27.9
8
+ NJU,NeSy Planning,Qwen3-8B,42.8,47.4 / 42.2,36.2 / 27.2,34.4,25.3
9
+ NJU,NeSy Planning,Llama3.1-8B,25.9,25.8 / 24.0,22.3 / 12.3,20.5,11.0
10
+ NJU,NeSy Planning,Mistral-7B,37.6,38.2 / 37.6,32.7 / 18.8,32.2,18.8
11
+ NJU,TTG (oracle),DeepSeek,9.09,12.8 / 2.59,7.65 / 5.19,2.39,1.29
12
+ NJU,LLM-Modulo*,DeepSeek,61.6,90.2 / 2.59,75.9 / 51.2,2.75,2.59
13
+ NJU,LLM-Modulo*,GPT,91.5,87.2 / 3.24,92.9 / 66.2,2.87,3.24
14
+ NJU,LLM-Modulo*,Qwen3-8B,35.0,75.3 / 0.0,61.6 / 19.4,0.0,0.0
15
+ NJU,LLM-Modulo*,Llama3.1-8B,19.4,74.1 / 0.0,43.4 / 5.19,0.0,0.0
16
+ NJU,LLM-Modulo*,Mistral-7B,3.24,92.2 / 0.0,31.4 / 4.54,0.0,0.0
17
+ NJU,NeSy Planning*,DeepSeek,58.4,59.6 / 57.7,53.8 / 46.1,52.0,45.4
18
+ NJU,NeSy Planning*,GPT,52.6,46.9 / 42.9,47.6 / 40.9,43.9,40.9
19
+ NJU,NeSy Planning*,Qwen3-8B,53.2,55.1 / 54.5,48.0 / 42.8,47.6,40.9
20
+ NJU,NeSy Planning*,Mistral-7B,40.9,42.8 / 42.8,37.7 / 28.5,37.7,27.9
21
+ NJU,NeSy Planning*,Llama3.1-8B,29.2,29.1 / 26.6,25.4 / 20.1,23.4,19.4
leaderboard_data/lb_all/human1000.csv ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ Organization,Method,Model,DR,EPR(Micro/Macro),LPR(Micro/Macro),C-LPR,FPR
2
+ NJU,NeSy Planning,DeepSeek,44.6,44.5 / 42.6,38.7 / 23.3,37.6,23.3
3
+ NJU,NeSy Planning,GPT,37.3,37.2 / 35.0,30.7 / 11.3,29.2,11.3
4
+ NJU,NeSy Planning,Qwen3-8B,36.6,36.5 / 34.6,29.6 / 6.43,28.5,6.43
5
+ NJU,NeSy Planning*,DeepSeek,60.6,60.3 / 59.0,53.6 / 32.0,52.5,31.6
6
+ NJU,NeSy Planning*,GPT,27.8,27.8 / 27.1,24.8 / 12.8,24.4,12.8
7
+ NJU,NeSy Planning*,Qwen3-8B,41.1,41.1 / 40.6,34.6 / 13.8,34.2,13.8