Don Rishabh Claude Opus 4.7 (1M context) committed on
Commit
25d9413
·
1 Parent(s): 450384e

tasks_tough: add 10 domain-classifier tough scenarios (seed batch)

Browse files

10 hand-crafted classification scenarios where the verbose hand-written
prompt is 200-300 tokens but the minimum effective prompt is non-obvious.
All use the existing exact_label scorer, no new scorer code needed.
Wired into _ALL_TASKS, train loop, and eval harness.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

server/prompt_golf_environment.py CHANGED
@@ -48,6 +48,7 @@ try:
48
  from .target_model import TargetBackend, TargetGeneration, get_target_backend
49
  from .tasks import TASKS, TaskSpec, get_task, list_task_ids
50
  from .tasks_v2 import TASKS_V2
 
51
  except ImportError:
52
  from models import (
53
  DEFAULT_PROMPT_BUDGET,
@@ -62,9 +63,11 @@ except ImportError:
62
  from server.target_model import TargetBackend, TargetGeneration, get_target_backend
63
  from server.tasks import TASKS, TaskSpec, get_task, list_task_ids
64
  from server.tasks_v2 import TASKS_V2
 
65
 
66
- # Merged v1 + v2 task bank. v2 task_ids don't clash with v1 by construction.
67
- _ALL_TASKS = {**TASKS, **TASKS_V2}
 
68
 
69
 
70
  # Baseline zero-shot scores are (target_id, task_id) -> score. Computed on
 
48
  from .target_model import TargetBackend, TargetGeneration, get_target_backend
49
  from .tasks import TASKS, TaskSpec, get_task, list_task_ids
50
  from .tasks_v2 import TASKS_V2
51
+ from .tasks_tough import TASKS_TOUGH
52
  except ImportError:
53
  from models import (
54
  DEFAULT_PROMPT_BUDGET,
 
63
  from server.target_model import TargetBackend, TargetGeneration, get_target_backend
64
  from server.tasks import TASKS, TaskSpec, get_task, list_task_ids
65
  from server.tasks_v2 import TASKS_V2
66
+ from server.tasks_tough import TASKS_TOUGH
67
 
68
+ # Merged v1 + v2 + tough task bank. task_ids don't clash by construction
69
+ # (v2 tasks are uniquely named, tough tasks are prefixed `tough_`).
70
+ _ALL_TASKS = {**TASKS, **TASKS_V2, **TASKS_TOUGH}
71
 
72
 
73
  # Baseline zero-shot scores are (target_id, task_id) -> score. Computed on
server/tasks_tough.py ADDED
@@ -0,0 +1,554 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """
8
+ Tough-scenarios task bank for Prompt Golf (v3).
9
+
10
+ Goal: scenarios where the *original* (verbose, hand-written) prompt that
11
+ naturally steers the target is 150-300 tokens long, but the MINIMUM
12
+ effective prompt is much shorter and non-obvious. The agent's job is to
13
+ find that compressed prompt — i.e. learn which fragments of the verbose
14
+ specification are load-bearing for the target model.
15
+
16
+ This file is the seed batch (10 scenarios — domain classifiers). The
17
+ remaining 42 will be added in later commits across:
18
+ - Structured extraction
19
+ - Format-strict generation
20
+ - Persona + constraint
21
+ - Multi-step reasoning
22
+ - Adversarial / calibration
23
+
24
+ Why classifiers first: they exercise the existing `exact_label` scorer
25
+ deterministically, so we can validate the whole base→trained CSV
26
+ pipeline before investing in the fuzzier tasks.
27
+
28
+ Each scenario follows the existing TaskSpec contract from server/tasks.py
29
+ so it merges into _ALL_TASKS without code changes elsewhere.
30
+ """
31
+
32
+ from __future__ import annotations
33
+
34
+ try:
35
+ from .tasks import TaskSpec
36
+ except ImportError:
37
+ from server.tasks import TaskSpec
38
+
39
+
40
+ TASKS_TOUGH: dict[str, TaskSpec] = {}
41
+
42
+
43
def _add(task: TaskSpec) -> None:
    """Register ``task`` in ``TASKS_TOUGH`` under its ``task_id``.

    Args:
        task: The scenario to register; only its ``task_id`` attribute is
            read here.

    Raises:
        ValueError: If a task with the same ``task_id`` is already
            registered. Overwriting silently would drop a scenario from
            the bank with no signal to the author.
    """
    if task.task_id in TASKS_TOUGH:
        raise ValueError(f"duplicate tough task_id: {task.task_id!r}")
    TASKS_TOUGH[task.task_id] = task
45
+
46
+
47
def list_task_ids_tough() -> list[str]:
    """Return a new list holding every key currently in ``TASKS_TOUGH``."""
    return [*TASKS_TOUGH]
49
+
50
+
51
+ # ============================================================================
52
+ # Domain classifiers (10)
53
+ #
54
+ # All use scorer="exact_label". Expected output is exactly one label from a
55
+ # closed vocabulary (lowercase, hyphenated, no punctuation, no explanation).
56
+ # ============================================================================
57
+
58
# Logical-fallacy classifier: eight labels, one short argument per example.
_add(TaskSpec(
    task_id="tough_fallacy_classify",
    category="classification_tough",
    scorer="exact_label",
    difficulty="hard",
    budget_tokens=120,
    tags=["classification", "tough", "reasoning"],
    # Verbose reference prompt: label vocabulary plus a one-line gloss each.
    description=(
        "Read the short argument and identify the dominant logical fallacy "
        "it commits. The target must output exactly one label from this "
        "closed vocabulary, in lowercase with hyphens, with no punctuation "
        "and no explanation:\n"
        " - ad-hominem (attacking the person, not the argument)\n"
        " - straw-man (misrepresenting an opponent's position to refute it)\n"
        " - false-dilemma (presenting only two options when more exist)\n"
        " - slippery-slope (claiming one event inevitably leads to extreme "
        "consequences without evidence)\n"
        " - appeal-to-authority (citing an irrelevant or unqualified "
        "authority as proof)\n"
        " - circular-reasoning (the conclusion is assumed in the premises)\n"
        " - hasty-generalization (drawing a broad conclusion from a small "
        "or biased sample)\n"
        " - red-herring (introducing an irrelevant topic to distract)\n"
        "If multiple fallacies are present, choose the one most central to "
        "the argument's structure. Output ONLY the label."
    ),
    # (input text, expected label) pairs.
    train_examples=[
        ("You can't trust Maria's economic analysis — she failed math in "
         "high school.", "ad-hominem"),
        ("Either we ban all cars or we accept that cities will be unlivable "
         "forever.", "false-dilemma"),
        ("My grandfather smoked his whole life and lived to 95, so smoking "
         "isn't really dangerous.", "hasty-generalization"),
    ],
    test_examples=[
        ("If we let students redo one exam, soon they'll demand to redo "
         "every assignment and graduation will be meaningless.", "slippery-slope"),
        ("Senator Park says climate policy is hurting jobs. He's been "
         "divorced twice — why would anyone listen to him?", "ad-hominem"),
        ("Of course the new drug works. It works because it's effective at "
         "treating the condition.", "circular-reasoning"),
        ("My opponent wants modest gun-safety reform. So she wants to "
         "confiscate every firearm in America.", "straw-man"),
        ("A famous actor endorses this supplement, so it must be "
         "medically sound.", "appeal-to-authority"),
        ("You ask about the budget overruns? Let's talk about how much "
         "the previous administration wasted.", "red-herring"),
    ],
))
107
+
108
+
109
# Cognitive-bias classifier: eight labels, one short scenario per example.
_add(TaskSpec(
    task_id="tough_bias_detect",
    category="classification_tough",
    # Verbose reference prompt: label vocabulary plus a one-line gloss each.
    description=(
        "Identify the cognitive bias most clearly demonstrated by the "
        "scenario. Output exactly one label from this closed vocabulary "
        "(lowercase, hyphenated, no punctuation, no explanation):\n"
        " - confirmation (seeking/weighing only evidence that supports a "
        "prior belief)\n"
        " - anchoring (over-relying on the first number or fact "
        "encountered)\n"
        " - availability (judging probability by how easily examples come "
        "to mind)\n"
        " - sunk-cost (continuing because of past investment rather than "
        "future value)\n"
        " - survivorship (drawing conclusions from successful cases while "
        "ignoring failed ones)\n"
        " - dunning-kruger (low-skill overconfidence; high-skill "
        "under-confidence)\n"
        " - hindsight (believing past events were predictable after the "
        "fact)\n"
        " - recency (overweighting the most recent data point)\n"
        "Output ONLY the label."
    ),
    scorer="exact_label",
    # (input text, expected label) pairs.
    train_examples=[
        ("After watching three plane-crash documentaries, Priya is now "
         "afraid to fly even though she drives daily.", "availability"),
        ("The first house Raj saw was listed at $800k. Every other house "
         "now feels overpriced or like a steal compared to that number.",
         "anchoring"),
        # Reworded: the original sentence ("Studied successful CEOs all
        # dropped out...") was ungrammatical, which adds noise unrelated
        # to the bias being classified.
        ("The successful CEOs she studied all dropped out of college, so "
         "dropping out is the path to success.", "survivorship"),
    ],
    test_examples=[
        ("I've already spent two years on this PhD topic — even though I "
         "don't believe in it anymore, I have to finish.", "sunk-cost"),
        ("After his stock dropped 8% yesterday, Arun is sure the whole "
         "market is collapsing despite a steady year.", "recency"),
        ("She only reads news outlets that agree with her political views "
         "and dismisses the rest as biased.", "confirmation"),
        ("After the company went bankrupt, every analyst said the warning "
         "signs were obvious all along.", "hindsight"),
        ("A first-year coder confidently tells the senior team their "
         "architecture is wrong; she's never shipped to production.",
         "dunning-kruger"),
        ("He only studies founders of unicorn startups to figure out how "
         "to build a unicorn, ignoring the thousands that failed.",
         "survivorship"),
    ],
    budget_tokens=120,
    difficulty="hard",
    tags=["classification", "tough", "psychology"],
))
163
+
164
+
165
# Rhetorical-device classifier: eight labels, one sentence per example.
_add(TaskSpec(
    task_id="tough_rhetorical_device",
    category="classification_tough",
    # The format note says "hyphenated" because the vocabulary contains
    # `rhetorical-question`; "no punctuation" alone contradicted that label.
    description=(
        "Identify the dominant rhetorical device used in the sentence. "
        "Output exactly one label from this closed vocabulary (lowercase, "
        "hyphenated, no punctuation, no explanation):\n"
        " - anaphora (repeating the same word/phrase at the start of "
        "successive clauses)\n"
        " - metaphor (implicit comparison, no 'like' or 'as')\n"
        " - hyperbole (deliberate, obvious exaggeration)\n"
        " - irony (saying the opposite of what is meant)\n"
        " - alliteration (repeated initial consonant sounds)\n"
        " - antithesis (juxtaposed contrasting ideas in parallel "
        "structure)\n"
        " - rhetorical-question (a question asked for effect, not an "
        "answer)\n"
        " - chiasmus (reversed grammatical structure: A-B-B-A)\n"
        "Pick the device most central to the rhetorical effect. Output "
        "ONLY the label."
    ),
    scorer="exact_label",
    # (input text, expected label) pairs.
    train_examples=[
        ("We shall fight on the beaches. We shall fight on the landing "
         "grounds. We shall fight in the fields.", "anaphora"),
        ("Time is a thief that steals our youth.", "metaphor"),
        ("Ask not what your country can do for you — ask what you can do "
         "for your country.", "chiasmus"),
    ],
    test_examples=[
        ("I've told you a million times to clean your room.", "hyperbole"),
        ("Peter Piper picked a peck of pickled peppers.", "alliteration"),
        # Quoted in its well-known past-tense form; the label is unchanged.
        ("It was the best of times, it was the worst of times.", "antithesis"),
        ("Oh great, another Monday — exactly what I was hoping for.",
         "irony"),
        ("Do we really need another committee to study the obvious?",
         "rhetorical-question"),
        ("The classroom was a zoo during the substitute's lesson.",
         "metaphor"),
    ],
    budget_tokens=120,
    difficulty="hard",
    tags=["classification", "tough", "literature"],
))
209
+
210
+
211
# Medical triage classifier: four urgency tiers, one complaint per example.
_add(TaskSpec(
    task_id="tough_medical_urgency",
    category="classification_tough",
    # The format note says "hyphenated" because the vocabulary contains
    # `self-care`; "no punctuation" alone contradicted that label.
    description=(
        "Read the short patient complaint and triage it into one of four "
        "urgency tiers. Output exactly one label, lowercase, hyphenated, no "
        "punctuation, no explanation. This is a TRIAGE classifier — not "
        "medical advice — so be conservative on red-flag symptoms.\n"
        " - emergency: chest pain, stroke signs, severe bleeding, loss of "
        "consciousness, anaphylaxis, suicidal intent — call ambulance now.\n"
        " - urgent: significant injury, high fever with stiffness, severe "
        "pain, infection signs, pregnancy complications — same-day care.\n"
        " - routine: persistent but stable symptoms, follow-ups, "
        "medication refills — schedule within 1-2 weeks.\n"
        " - self-care: minor cuts, common cold, mild headache, fatigue "
        "without alarm features — rest, OTC, monitor.\n"
        "Default to the higher tier when symptoms are ambiguous. Output "
        "ONLY the label."
    ),
    scorer="exact_label",
    # (input text, expected label) pairs.
    train_examples=[
        ("Crushing chest pressure radiating to my left arm, sweating, "
         "started 20 minutes ago.", "emergency"),
        ("Sore throat and runny nose for two days, no fever, eating "
         "normally.", "self-care"),
        ("Rash on forearm that's been spreading slowly for three weeks, "
         "no pain.", "routine"),
    ],
    test_examples=[
        ("Sudden numbness on one side of my face and slurred speech for "
         "the last 10 minutes.", "emergency"),
        ("Deep cut on my hand from a kitchen knife, bleeding has slowed "
         "but it might need stitches.", "urgent"),
        ("Ongoing knee stiffness in the mornings for the past month, "
         "manageable.", "routine"),
        ("Mild headache after a long day on screens, no other symptoms.",
         "self-care"),
        ("High fever 39.5C, stiff neck, and a new pinpoint rash that "
         "started this evening.", "emergency"),
        ("Persistent cough for four days, low-grade fever, achy but "
         "drinking fluids and resting.", "urgent"),
    ],
    budget_tokens=140,
    difficulty="hard",
    tags=["classification", "tough", "medical"],
))
257
+
258
+
259
# Code-smell classifier: eight labels, one short code description per example.
_add(TaskSpec(
    task_id="tough_code_smell",
    category="classification_tough",
    scorer="exact_label",
    difficulty="hard",
    budget_tokens=140,
    tags=["classification", "tough", "software"],
    # Verbose reference prompt: label vocabulary plus a one-line gloss each.
    description=(
        "Read the short code description and identify the dominant code "
        "smell. Output exactly one label from this closed vocabulary "
        "(lowercase, hyphenated, no punctuation, no explanation):\n"
        " - long-method (a single function does too many things over too "
        "many lines)\n"
        " - god-class (one class accumulates unrelated responsibilities)\n"
        " - duplicate-code (the same logic appears in multiple places)\n"
        " - dead-code (unused variables, branches, or functions)\n"
        " - magic-number (unexplained literal constants in logic)\n"
        " - primitive-obsession (using strings/ints where a small type "
        "would clarify intent)\n"
        " - feature-envy (a method uses another class's data more than "
        "its own)\n"
        " - shotgun-surgery (one logical change requires edits across "
        "many files)\n"
        "Output ONLY the label."
    ),
    # (input text, expected label) pairs.
    train_examples=[
        ("`processOrder()` is 600 lines long and handles validation, "
         "pricing, payment, shipping, email, and audit logging in one "
         "function.", "long-method"),
        ("`if total > 4500: applyDiscount(0.07)` — neither number is "
         "explained.", "magic-number"),
        ("Adding a new currency requires editing the database schema, "
         "three services, the UI, and two config files.",
         "shotgun-surgery"),
    ],
    test_examples=[
        ("`UserManager` handles authentication, profile editing, billing, "
         "email sending, audit logs, and CSV export.", "god-class"),
        ("The same 30-line block computing tax appears in CheckoutService, "
         "InvoiceService, and ReportService.", "duplicate-code"),
        ("`Order.calculateShipping()` reads 8 fields from `Customer` and "
         "uses only 1 from its own object.", "feature-envy"),
        ("There's a private helper `oldFormatLegacy()` that nothing in "
         "the repo references anymore.", "dead-code"),
        ("Phone numbers, emails, postal codes, and currency amounts are "
         "all stored as plain `str` everywhere.", "primitive-obsession"),
        ("A single function `handleRequest()` parses input, validates, "
         "queries DB, formats output, logs, and emails — 400 lines.",
         "long-method"),
    ],
))
310
+
311
+
312
# News-framing classifier: seven labels, one headline per example.
_add(TaskSpec(
    task_id="tough_news_framing",
    category="classification_tough",
    scorer="exact_label",
    difficulty="hard",
    budget_tokens=130,
    tags=["classification", "tough", "media"],
    # Verbose reference prompt: label vocabulary plus a one-line gloss each.
    description=(
        "Read the short news headline and identify its dominant framing "
        "technique. Output exactly one label from this closed vocabulary "
        "(lowercase, hyphenated, no punctuation, no explanation):\n"
        " - episodic (focuses on a single event or individual case)\n"
        " - thematic (focuses on broader trends, statistics, or "
        "context)\n"
        " - conflict (frames the story as a clash between sides)\n"
        " - human-interest (emotional angle on a person's experience)\n"
        " - economic (frames consequences in financial / market terms)\n"
        " - morality (frames the story in terms of right vs wrong, "
        "values)\n"
        " - responsibility (assigns blame or credit to a specific "
        "actor)\n"
        "Pick the dominant frame even if minor frames are present. Output "
        "ONLY the label."
    ),
    # (input text, expected label) pairs.
    train_examples=[
        ("Single mother of three struggles to afford groceries as prices "
         "rise.", "human-interest"),
        ("National food-insecurity rate hits 12-year high, USDA report "
         "shows.", "thematic"),
        ("Senate Democrats and Republicans clash over food-stamp "
         "spending bill.", "conflict"),
    ],
    test_examples=[
        ("Local bakery owner closes shop after 30 years, blames soaring "
         "rent.", "episodic"),
        ("Inflation eats into household budgets as wages stagnate.",
         "economic"),
        ("Mayor accused of approving the contract that caused the water "
         "crisis.", "responsibility"),
        ("Is it ever right to lie to protect a friend? Readers weigh in.",
         "morality"),
        ("Climate-policy fight escalates as governors trade public "
         "letters.", "conflict"),
        ("Childhood obesity rates nationwide climbed 4% over the last "
         "decade.", "thematic"),
    ],
))
359
+
360
+
361
# Claim-type classifier: four labels, one single-sentence claim per example.
_add(TaskSpec(
    task_id="tough_claim_verifiability",
    category="classification_tough",
    scorer="exact_label",
    difficulty="hard",
    budget_tokens=120,
    tags=["classification", "tough", "epistemology"],
    # Verbose reference prompt: label vocabulary plus a one-line gloss each.
    description=(
        "Classify the claim by what kind of statement it is. Output "
        "exactly one label, lowercase, hyphenated, no punctuation, no "
        "explanation:\n"
        " - verifiable (in principle checkable against publicly "
        "available facts or measurements)\n"
        " - unverifiable (about private mental states, future events, "
        "or otherwise not externally checkable)\n"
        " - value-judgment (expresses a preference, taste, or moral "
        "evaluation rather than a fact)\n"
        " - tautology (true by definition; carries no empirical "
        "content)\n"
        "Distinguish carefully: an unverifiable empirical claim is NOT "
        "the same as a value-judgment. Output ONLY the label."
    ),
    # (input text, expected label) pairs.
    train_examples=[
        ("The Eiffel Tower is 330 meters tall.", "verifiable"),
        ("Chocolate ice cream is the best dessert ever invented.",
         "value-judgment"),
        ("All bachelors are unmarried.", "tautology"),
    ],
    test_examples=[
        ("The president secretly regrets signing the trade deal last "
         "year.", "unverifiable"),
        ("Mount Everest is taller than Mount Kilimanjaro.", "verifiable"),
        ("A triangle has three sides.", "tautology"),
        ("Modern art is shallow and pretentious.", "value-judgment"),
        ("Earth's average surface temperature has risen since 1900.",
         "verifiable"),
        ("Pluto will be reclassified as a planet again before 2050.",
         "unverifiable"),
    ],
))
401
+
402
+
403
# Argument-status classifier: four labels, one short argument per example.
_add(TaskSpec(
    task_id="tough_argument_strength",
    category="classification_tough",
    scorer="exact_label",
    difficulty="hard",
    budget_tokens=140,
    tags=["classification", "tough", "logic"],
    # Verbose reference prompt; note the `fallacious` > `invalid` priority.
    description=(
        "Evaluate the short argument and classify its logical status. "
        "Output exactly one label, lowercase, hyphenated, no punctuation, "
        "no explanation:\n"
        " - sound (valid form AND all premises are true / plausibly "
        "true)\n"
        " - valid-but-unsound (the conclusion follows IF the premises "
        "are true, but at least one premise is false)\n"
        " - invalid (the conclusion does NOT follow from the premises "
        "even if they were true)\n"
        " - fallacious (commits a recognized informal fallacy that "
        "undermines the inference)\n"
        "Apply this order of priority: if the argument commits a clear "
        "informal fallacy, label it `fallacious` over `invalid`. Output "
        "ONLY the label."
    ),
    # (input text, expected label) pairs.
    train_examples=[
        ("All humans are mortal. Socrates is human. Therefore Socrates is "
         "mortal.", "sound"),
        ("All birds can fly. Penguins are birds. Therefore penguins can "
         "fly.", "valid-but-unsound"),
        ("Some dogs are brown. My cat is brown. Therefore my cat is a "
         "dog.", "invalid"),
    ],
    test_examples=[
        ("The new policy must be wrong because the senator proposing it "
         "had an affair last year.", "fallacious"),
        ("All squares have four sides. This shape is a square. Therefore "
         "it has four sides.", "sound"),
        ("If it rains, the streets get wet. The streets are wet. "
         "Therefore it rained.", "invalid"),
        ("Every prime number is odd. Seven is prime. Therefore seven is "
         "odd.", "valid-but-unsound"),
        ("Either you support our tax bill or you hate working families.",
         "fallacious"),
        ("All mammals are warm-blooded. Whales are mammals. Therefore "
         "whales are warm-blooded.", "sound"),
    ],
))
449
+
450
+
451
# Primary-emotion classifier: eight labels, one utterance per example.
_add(TaskSpec(
    task_id="tough_emotion_primary",
    category="classification_tough",
    scorer="exact_label",
    difficulty="hard",
    budget_tokens=130,
    tags=["classification", "tough", "psychology"],
    # Verbose reference prompt: label vocabulary plus a one-line gloss each.
    description=(
        "Identify the dominant primary emotion expressed by the speaker, "
        "using Plutchik's eight basic emotions. Output exactly one label, "
        "lowercase, no punctuation, no explanation:\n"
        " - joy (happiness, delight, contentment)\n"
        " - trust (acceptance, confidence in someone/something)\n"
        " - fear (apprehension, worry about a threat)\n"
        " - surprise (unexpectedness, being caught off guard)\n"
        " - sadness (sorrow, loss, dejection)\n"
        " - disgust (revulsion, moral or physical aversion)\n"
        " - anger (frustration, hostility, indignation)\n"
        " - anticipation (expectation, looking forward)\n"
        "Pick the SINGLE strongest emotion even if blends are present. "
        "Output ONLY the label."
    ),
    # (input text, expected label) pairs.
    train_examples=[
        ("I can't believe she actually showed up — I had no idea she was "
         "in town!", "surprise"),
        ("My team has my back; I know they'll deliver no matter what.",
         "trust"),
        ("Everything I worked for these last five years is just gone.",
         "sadness"),
    ],
    test_examples=[
        ("I'm counting down the days until the trip — only two weeks "
         "left!", "anticipation"),
        ("How DARE they reroute my flight without a single email?",
         "anger"),
        ("Reading those emails made my skin crawl. I had to stop "
         "halfway.", "disgust"),
        ("What if the test results come back bad? I haven't slept in "
         "days.", "fear"),
        ("Got the offer, the salary, AND the team I wanted — best week "
         "ever.", "joy"),
        ("I keep replaying the call. She just isn't coming back.",
         "sadness"),
    ],
))
496
+
497
+
498
# Policy-stance classifier: five labels, one quoted statement per example.
_add(TaskSpec(
    task_id="tough_policy_stance",
    category="classification_tough",
    scorer="exact_label",
    difficulty="hard",
    budget_tokens=140,
    tags=["classification", "tough", "politics"],
    # Verbose reference prompt; the closing sentences separate the
    # conditional labels from `neutral`.
    description=(
        "Classify the speaker's stance on the policy proposal mentioned "
        "in the quote. Output exactly one label, lowercase, hyphenated, "
        "no punctuation, no explanation:\n"
        " - support (clearly endorses the proposal)\n"
        " - oppose (clearly rejects the proposal)\n"
        " - neutral (declines to take a side, observes both views, or "
        "stays purely descriptive)\n"
        " - conditional-support (would support IF certain conditions "
        "were met)\n"
        " - conditional-oppose (would oppose UNLESS certain conditions "
        "were met)\n"
        "Distinguish carefully: a hedged endorsement that names "
        "preconditions is conditional-support, not neutral. A statement "
        "of mixed views without a stance is neutral. Output ONLY the "
        "label."
    ),
    # (input text, expected label) pairs.
    train_examples=[
        ("I'm fully behind the rent-cap proposal — it'll protect "
         "vulnerable tenants.", "support"),
        ("The mining permit is a disaster for the watershed and I will "
         "vote no.", "oppose"),
        ("Some economists like the tariff plan, others don't — the "
         "evidence is genuinely mixed.", "neutral"),
    ],
    test_examples=[
        ("I'd back the carbon-tax bill, but only if the revenue is "
         "rebated to households.", "conditional-support"),
        ("I cannot support the surveillance program unless judicial "
         "review is built in from day one.", "conditional-oppose"),
        ("The infrastructure package is exactly what this district has "
         "needed for a decade.", "support"),
        ("I won't comment on the merits of the bill; that's for the "
         "committee to weigh.", "neutral"),
        ("This zoning change will gut the neighborhood — count me as a "
         "firm no.", "oppose"),
        ("I'll support the immigration reform if it includes a real "
         "pathway to citizenship.", "conditional-support"),
    ],
))
545
+
546
+
547
+ # ============================================================================
548
+ # Script entry point (prints a summary of the registered scenarios)
549
+ # ============================================================================
550
+
551
if __name__ == "__main__":
    # Run as a script: print the scenario count, then one line per task
    # with its id, category and token budget.
    print(f"tasks_tough: {len(TASKS_TOUGH)} scenarios")
    for task_id in TASKS_TOUGH:
        entry = TASKS_TOUGH[task_id]
        print(f" {task_id:36s} {entry.category:24s} budget={entry.budget_tokens}")
training/eval_before_after.py CHANGED
@@ -126,11 +126,12 @@ def main() -> None:
126
  from prompt_golf_env.server.prompt_golf_environment import PromptGolfEnvironment
127
  from prompt_golf_env.server.tasks import TASKS, list_task_ids as list_v1
128
  from prompt_golf_env.server.tasks_v2 import TASKS_V2, list_task_ids_v2
 
129
 
130
- _ALL_TASKS = {**TASKS, **TASKS_V2}
131
 
132
  def list_task_ids():
133
- return list_v1() + list_task_ids_v2()
134
 
135
  # Load agent
136
  model, tok = load_agent(args.agent_model, args.adapter)
 
126
  from prompt_golf_env.server.prompt_golf_environment import PromptGolfEnvironment
127
  from prompt_golf_env.server.tasks import TASKS, list_task_ids as list_v1
128
  from prompt_golf_env.server.tasks_v2 import TASKS_V2, list_task_ids_v2
129
+ from prompt_golf_env.server.tasks_tough import TASKS_TOUGH, list_task_ids_tough
130
 
131
+ _ALL_TASKS = {**TASKS, **TASKS_V2, **TASKS_TOUGH}
132
 
133
  def list_task_ids():
134
+ return list_v1() + list_task_ids_v2() + list_task_ids_tough()
135
 
136
  # Load agent
137
  model, tok = load_agent(args.agent_model, args.adapter)
training/train_grpo.py CHANGED
@@ -324,6 +324,7 @@ def main() -> None:
324
  from prompt_golf_env.server.prompt_golf_environment import PromptGolfEnvironment
325
  from prompt_golf_env.server.tasks import list_task_ids as list_task_ids_v1
326
  from prompt_golf_env.server.tasks_v2 import list_task_ids_v2
 
327
 
328
  # NOTE: we deliberately do NOT import Unsloth here. Unsloth patches
329
  # Qwen2Attention at import time, which breaks the target model
@@ -343,7 +344,7 @@ def main() -> None:
343
 
344
  # ----- env (target loaded lazily on first forward pass) -----
345
  env = PromptGolfEnvironment()
346
- all_tasks = list_task_ids_v1() + list_task_ids_v2()
347
  held_out = {t.strip() for t in args.held_out_tasks.split(",") if t.strip()}
348
  train_tasks = [t for t in all_tasks if t not in held_out]
349
  print(f"[setup] tasks total={len(all_tasks)} train={len(train_tasks)} held_out={len(held_out)}", flush=True)
 
324
  from prompt_golf_env.server.prompt_golf_environment import PromptGolfEnvironment
325
  from prompt_golf_env.server.tasks import list_task_ids as list_task_ids_v1
326
  from prompt_golf_env.server.tasks_v2 import list_task_ids_v2
327
+ from prompt_golf_env.server.tasks_tough import list_task_ids_tough
328
 
329
  # NOTE: we deliberately do NOT import Unsloth here. Unsloth patches
330
  # Qwen2Attention at import time, which breaks the target model
 
344
 
345
  # ----- env (target loaded lazily on first forward pass) -----
346
  env = PromptGolfEnvironment()
347
+ all_tasks = list_task_ids_v1() + list_task_ids_v2() + list_task_ids_tough()
348
  held_out = {t.strip() for t in args.held_out_tasks.split(",") if t.strip()}
349
  train_tasks = [t for t in all_tasks if t not in held_out]
350
  print(f"[setup] tasks total={len(all_tasks)} train={len(train_tasks)} held_out={len(held_out)}", flush=True)