Don Rishabh and Claude Opus 4.7 (1M context) committed
Commit · ea78734
Parent(s): 86be5e0
build_before_after_csv: --min-verbose-accuracy flag
Drops rows where the verbose-prompt accuracy is ≤ threshold (default 0.0
= keep everything). Set to 0.0001 to filter out tasks where the target
genuinely fails regardless of prompt — those tasks dilute headline
numbers without adding signal.
Falls back to the trained eval's raw_task_score when the profile CSV
doesn't have an entry for a task (e.g. policy_* tasks added after the
profile was run).
Logs the filtered count + first 10 dropped task ids.
Mirrors the Gradio demo Space's filter logic (same intent: hide
tasks that are dead on this target).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
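
To make the filter rule concrete, here is a minimal sketch of the keep/drop decision as the message describes it; the helper name and toy values are illustrative, not part of the script:

from typing import Optional

def should_keep(verbose_acc: Optional[float],
                base_score: float,
                trained_score: float,
                floor: float) -> bool:
    # floor <= 0 (the default) disables filtering entirely.
    if floor <= 0:
        return True
    # Task missing from the profile CSV: fall back to the eval scores.
    # If both the base and trained runs scored 0, the target genuinely
    # can't do the task, so drop it.
    if verbose_acc is None:
        return max(base_score, trained_score) > 0
    # Otherwise keep only tasks strictly above the floor (≤ drops).
    return verbose_acc > floor

assert should_keep(0.0,  0.0, 0.0, floor=0.0001) is False  # dead task dropped
assert should_keep(0.25, 0.0, 0.0, floor=0.0001) is True
assert should_keep(None, 0.0, 0.0, floor=0.0001) is False  # fallback: dead on target
assert should_keep(None, 0.0, 0.9, floor=0.0001) is True   # trained eval rescues it
assert should_keep(0.0,  0.0, 0.0, floor=0.0)    is True   # default keeps everything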
training/build_before_after_csv.py
CHANGED
@@ -50,6 +50,12 @@ def parse_args() -> argparse.Namespace:
                         "If omitted, verbose_accuracy is left blank.")
     p.add_argument("--target-model", default="meta-llama/Llama-3.2-3B-Instruct",
                    help="Used to count tokens of the verbose description.")
+    p.add_argument("--min-verbose-accuracy", type=float, default=0.0,
+                   help="Drop tasks where the verbose-prompt accuracy is "
+                        "≤ this value. Default 0.0 (keep everything). Set "
+                        "to e.g. 0.0001 to drop only tasks where the "
+                        "target genuinely fails regardless of prompt — "
+                        "those tasks dilute headline numbers.")
     p.add_argument("--output-csv", default="outputs/before_after_prompts.csv")
     p.add_argument("--push-to-hub", default=None,
                    help="HF model repo id; uploaded as evals/before_after_prompts.csv")
@@ -117,6 +123,38 @@ def main() -> None:
     # Union of task ids present in either file (in case of partial runs)
     all_tids = sorted(set(base_rows) | set(trained_rows))

+    # Filter on verbose-prompt accuracy floor.
+    # We use the profile CSV's `verbose_accuracy` if available; for tasks
+    # missing from the profile, we fall back to the trained eval's
+    # raw_task_score (since 0 there + 0 on profile = target genuinely
+    # can't do this task).
+    if args.min_verbose_accuracy > 0:
+        kept = []
+        dropped = []
+        for tid in all_tids:
+            v = verbose_acc_map.get(tid)
+            if v is None:
+                # Fall back to trained eval — if BOTH base and trained
+                # got 0, drop it as untrainable on this target.
+                b = base_rows.get(tid, {}).get("raw_task_score", 0) or 0
+                t = trained_rows.get(tid, {}).get("raw_task_score", 0) or 0
+                if max(b, t) <= 0:
+                    dropped.append(tid)
+                    continue
+                kept.append(tid)
+            elif v > args.min_verbose_accuracy:
+                kept.append(tid)
+            else:
+                dropped.append(tid)
+        print(f"[csv] filtered: kept {len(kept)} / {len(all_tids)} tasks "
+              f"(dropped {len(dropped)} where verbose_accuracy ≤ "
+              f"{args.min_verbose_accuracy})", flush=True)
+        if dropped:
+            print(f"[csv] dropped: {', '.join(dropped[:10])}"
+                  + (f" ...+{len(dropped)-10} more" if len(dropped) > 10 else ""),
+                  flush=True)
+        all_tids = kept
+
     rows_out: List[Dict] = []
     for tid in all_tids:
         spec = _ALL_TASKS.get(tid)
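
A quick way to sanity-check the effect on headline numbers from the emitted CSV; a hedged sketch assuming pandas, the `verbose_accuracy` column named in the help text, and an illustrative `trained_accuracy` column whose name is an assumption about the CSV schema:

import pandas as pd

df = pd.read_csv("outputs/before_after_prompts.csv")

floor = 0.0001
# Profiled tasks at or below the floor are the ones the flag drops.
# A blank verbose_accuracy means "not profiled" and goes through the
# raw_task_score fallback instead, so leave those rows alone here.
dead = df["verbose_accuracy"].notna() & (df["verbose_accuracy"] <= floor)

print(f"{int(dead.sum())} / {len(df)} tasks at or below the floor")
print("headline with dead tasks:   ", df["trained_accuracy"].mean())
print("headline without dead tasks:", df.loc[~dead, "trained_accuracy"].mean())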