Don Rishabh Claude Opus 4.7 (1M context) committed on
Commit ea78734 · 1 Parent(s): 86be5e0

build_before_after_csv: --min-verbose-accuracy flag


Drops rows whose verbose-prompt accuracy is ≤ the threshold (default 0.0,
which keeps everything). Set it to e.g. 0.0001 to filter out tasks the
target genuinely fails regardless of prompt; those tasks dilute the
headline numbers without adding signal.
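
A typical invocation might look like this (a sketch; --min-verbose-accuracy
and --output-csv are flags defined in this file, and whatever input flags
the script already requires are omitted here):

    python training/build_before_after_csv.py \
        --min-verbose-accuracy 0.0001 \
        --output-csv outputs/before_after_prompts.csv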

Falls back to the base/trained evals' raw_task_score when the profile CSV
has no entry for a task (e.g. policy_* tasks added after the profile was
run): such a task is dropped only if both scores are 0.
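
Condensed, the keep/drop rule is equivalent to the predicate below (a
sketch: verbose_acc_map, base_rows, and trained_rows are the lookup
structures the script builds from the profile CSV and the two eval files):

    def keep_task(tid, threshold, verbose_acc_map, base_rows, trained_rows):
        v = verbose_acc_map.get(tid)
        if v is not None:
            # Profile entry exists: keep only if strictly above the floor.
            return v > threshold
        # No profile entry: keep unless both base and trained evals scored 0.
        b = base_rows.get(tid, {}).get("raw_task_score", 0) or 0
        t = trained_rows.get(tid, {}).get("raw_task_score", 0) or 0
        return max(b, t) > 0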

Logs the filtered count + first 10 dropped task ids.
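
Illustrative output (counts and task ids here are invented; the format
follows the print calls in the diff):

    [csv] filtered: kept 54 / 57 tasks (dropped 3 where verbose_accuracy ≤ 0.0001)
    [csv] dropped: task_a, task_b, task_c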

This mirrors the filter logic of the Gradio demo Space (same intent: hide
tasks that are dead on this target).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

Files changed (1)
  1. training/build_before_after_csv.py +38 -0
training/build_before_after_csv.py CHANGED
@@ -50,6 +50,12 @@ def parse_args() -> argparse.Namespace:
                         "If omitted, verbose_accuracy is left blank.")
     p.add_argument("--target-model", default="meta-llama/Llama-3.2-3B-Instruct",
                    help="Used to count tokens of the verbose description.")
+    p.add_argument("--min-verbose-accuracy", type=float, default=0.0,
+                   help="Drop tasks where the verbose-prompt accuracy is "
+                        "≤ this value. Default 0.0 (keep everything). Set "
+                        "to e.g. 0.0001 to drop only tasks where the "
+                        "target genuinely fails regardless of prompt — "
+                        "those tasks dilute headline numbers.")
     p.add_argument("--output-csv", default="outputs/before_after_prompts.csv")
     p.add_argument("--push-to-hub", default=None,
                    help="HF model repo id; uploaded as evals/before_after_prompts.csv")
@@ -117,6 +123,38 @@ def main() -> None:
     # Union of task ids present in either file (in case of partial runs)
     all_tids = sorted(set(base_rows) | set(trained_rows))
 
+    # Filter on verbose-prompt accuracy floor.
+    # We use the profile CSV's `verbose_accuracy` if available; for tasks
+    # missing from the profile, we fall back to the trained eval's
+    # raw_task_score (since 0 there + 0 on profile = target genuinely
+    # can't do this task).
+    if args.min_verbose_accuracy > 0:
+        kept = []
+        dropped = []
+        for tid in all_tids:
+            v = verbose_acc_map.get(tid)
+            if v is None:
+                # Fall back to trained eval — if BOTH base and trained
+                # got 0, drop it as untrainable on this target.
+                b = base_rows.get(tid, {}).get("raw_task_score", 0) or 0
+                t = trained_rows.get(tid, {}).get("raw_task_score", 0) or 0
+                if max(b, t) <= 0:
+                    dropped.append(tid)
+                    continue
+                kept.append(tid)
+            elif v > args.min_verbose_accuracy:
+                kept.append(tid)
+            else:
+                dropped.append(tid)
+        print(f"[csv] filtered: kept {len(kept)} / {len(all_tids)} tasks "
+              f"(dropped {len(dropped)} where verbose_accuracy ≤ "
+              f"{args.min_verbose_accuracy})", flush=True)
+        if dropped:
+            print(f"[csv] dropped: {', '.join(dropped[:10])}"
+                  + (f" ...+{len(dropped)-10} more" if len(dropped) > 10 else ""),
+                  flush=True)
+        all_tids = kept
+
     rows_out: List[Dict] = []
     for tid in all_tids:
         spec = _ALL_TASKS.get(tid)