akseljoonas (HF Staff) committed
Commit 158d846 · 1 Parent(s): 8f4b322

intermediate commit until i let amp loose

eval/.amp_batch_solve.py.swp ADDED
Binary file (12.3 kB).
 
eval/amp_batch_solve.py ADDED
@@ -0,0 +1,106 @@
+ import asyncio
+ import json
+ import os
+ from pathlib import Path
+ import threading
+
+ from amp_sdk import AmpOptions, execute
+
+ # Thread-safe file writing
+ file_lock = threading.Lock()
+
+
+ async def solve_task(
+     question: str, difficulty: str, task_idx: int, total: int, semaphore: asyncio.Semaphore
+ ) -> dict:
+     """Solve a single task using Amp SDK."""
+     async with semaphore:
+         print(f"[{task_idx}/{total}] Starting: {question[:60]}...")
+
+         messages = []
+         solution = None
+
+         try:
+             async for message in execute(
+                 question,
+                 AmpOptions(
+                     cwd=os.getcwd(),
+                     visibility="workspace",
+                     dangerously_allow_all=True,
+                 ),
+             ):
+                 messages.append(message.model_dump())
+
+                 # Extract the final text response as the solution
+                 if message.type == "assistant":
+                     content = message.message.get("content", [])
+                     for item in content:
+                         if isinstance(item, dict) and item.get("type") == "text":
+                             solution = item.get("text")
+                 elif message.type == "result":
+                     if message.result:
+                         solution = message.result
+
+             print(f"[{task_idx}/{total}] ✓ Done: {question[:60]}...")
+             return {
+                 "question": question,
+                 "difficulty": difficulty,
+                 "solution": solution,
+                 "messages": messages,
+             }
+         except Exception as e:
+             print(f"[{task_idx}/{total}] ✗ Error: {e}")
+             return {
+                 "question": question,
+                 "difficulty": difficulty,
+                 "solution": None,
+                 "messages": messages,
+                 "error": str(e),
+             }
+
+
+ def write_result(output_path: Path, result: dict):
+     """Thread-safe write to output file."""
+     with file_lock:
+         with open(output_path, "a") as f:
+             f.write(json.dumps(result) + "\n")
+
+
+ async def main():
+     # Load tasks
+     tasks_path = Path(__file__).parent / "generated_tasks_with_difficulty.json"
+     with open(tasks_path) as f:
+         tasks = json.load(f)
+
+     # Output file - clear it first
+     output_path = Path(__file__).parent / "solved_tasks.jsonl"
+     output_path.write_text("")
+
+     # Semaphore to limit concurrency
+     max_concurrent = 20
+     semaphore = asyncio.Semaphore(max_concurrent)
+
+     total = len(tasks)
+     print(f"Processing {total} tasks with {max_concurrent} concurrent agents...")
+
+     async def process_and_save(question: str, difficulty: str, idx: int):
+         result = await solve_task(question, difficulty, idx, total, semaphore)
+         write_result(output_path, result)
+         return result
+
+     # Create all tasks
+     coroutines = [
+         process_and_save(question, difficulty, i + 1)
+         for i, (question, difficulty) in enumerate(tasks.items())
+     ]
+
+     # Run all concurrently (semaphore limits actual parallelism)
+     results = await asyncio.gather(*coroutines, return_exceptions=True)
+
+     successful = sum(1 for r in results if isinstance(r, dict) and "error" not in r)
+     print(f"\nCompleted: {successful}/{total} successful")
+     print(f"Results saved to {output_path}")
+
+
+ if __name__ == "__main__":
+     asyncio.run(main())
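
Note: a minimal sketch of consuming the solved_tasks.jsonl written above, assuming only the record schema the script emits (question, difficulty, solution, messages, optional error); the relative path is an assumption based on the script writing next to itself in eval/.

import json
from collections import Counter
from pathlib import Path

# Path assumed: the script above writes solved_tasks.jsonl next to itself in eval/.
results_path = Path("eval") / "solved_tasks.jsonl"

by_difficulty = Counter()
failures = []
with open(results_path) as f:
    for line in f:
        record = json.loads(line)
        # A record with an "error" key or a null solution counts as a failure.
        if "error" in record or record.get("solution") is None:
            failures.append(record["question"])
        else:
            by_difficulty[record["difficulty"]] += 1

print("Solved per difficulty:", dict(by_difficulty))
print(f"Failed or empty: {len(failures)}")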
eval/amp_solve.py ADDED
@@ -0,0 +1,31 @@
+ import asyncio
+ import os
+
+ from amp_sdk import AmpOptions, execute
+
+ prompt = """
+ what account am I logged in as?
+ """
+
+
+ async def main():
+     # Use the toolbox directory to share tools with Amp
+     toolbox_dir = os.path.join(os.getcwd(), "toolbox")
+     messages = []
+     async for message in execute(
+         prompt,
+         AmpOptions(
+             cwd=os.getcwd(),
+             toolbox=toolbox_dir,
+             visibility="workspace",
+             dangerously_allow_all=True,
+         ),
+     ):
+         messages.append(message)
+
+     for msg in messages:
+         print(msg.model_dump_json(indent=2))
+
+
+ if __name__ == "__main__":
+     asyncio.run(main())
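
Note: a minimal sketch for pulling just the assistant's text out of the collected messages instead of dumping full JSON, assuming the same message shape amp_batch_solve.py relies on (message.type == "assistant" with text items under message.message["content"]).

def print_assistant_text(messages) -> None:
    # Walk the collected SDK messages and print only assistant text items,
    # mirroring the extraction loop in amp_batch_solve.py; the message shape
    # is assumed from that script, not confirmed against amp_sdk docs.
    for message in messages:
        if getattr(message, "type", None) != "assistant":
            continue
        for item in message.message.get("content", []):
            if isinstance(item, dict) and item.get("type") == "text":
                print(item["text"])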
eval/eval_set.ipynb ADDED
@@ -0,0 +1,359 @@
+ {
+  "cells": [
+   {
+    "cell_type": "code",
+    "execution_count": 6,
+    "id": "19f3dd6b",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# Seed examples for task bootstrapping\n",
+     "tasks_with_difficulty = {\n",
+     "    # lewis\n",
+     "    \"Evaluate models {M_i} on benchmarks {B_i}\": \"Easy\",\n",
+     "    \"Train models {M_i} on datasets {D_i} with benchmarks {B_i}\": \"Medium\",\n",
+     "    \"Run an ablation for hyperparameter P for model M on dataset D\": \"Hard\",\n",
+     "    \"Generate completions with model M on dataset D using engine E\": \"Medium\",\n",
+     "    \"Merge models {M_i} using linear averaging to find the best result on benchmarks {B_i}\": \"Hard\",\n",
+     "    \"Given datasets {D_i}, ablate the best SFT mixture for model M across benchmarks {B_i}\": \"Very hard\",\n",
+     "    \"Decontaminate dataset D against benchmarks {B_i}\": \"Hard\",\n",
+     "    \"Benchmark RL framework F for best throughput on G GPUs\": \"Very hard\",\n",
+     "    \"Implement post-training algorithm A from paper P in framework F. Validate it runs end-to-end\": \"Very hard\",\n",
+     "    \"Implement benchmark B in framework F. Validate it reproduces some published results\": \"Very hard\",\n",
+     "    \"Format dataset D for compatibility with framework F on task T\": \"Easy\",\n",
+     "\n",
+     "    # abubakar\n",
+     "    \"Remove the background from this image: [image path]\": \"Easy\",\n",
+     "    \"Transcribe all of the audio files in this directory\": \"Easy\",\n",
+     "    \"Transcribe all of the audio files in this directory, choose the model that'll be cheapest and also relatively accurate\": \"Medium (judgment call or interaction needed to figure out what accuracy levels are acceptable)\",\n",
+     "    \"Remove the background music from this audio file\": \"Medium (needs to find Gradio Space and call its API)\",\n",
+     "    \"Change this video track to be from English to Spanish\": \"Medium (needs to link several models together)\",\n",
+     "    \"Translate this flyer from English to Spanish, keeping the layout and images the same\": \"Medium (needs to link several models together)\",\n",
+     "\n",
+     "    # leandro\n",
+     "    \"What's the best model for X?\": \"Easy\",\n",
+     "    \"What datasets are available for X? (X={domain x task x modality})\": \"Easy\",\n",
+     "    \"Is there a space to do Y?\": \"Easy\",\n",
+     "    \"I have this script and this error - what's the issue?\": \"Medium\",\n",
+     "    \"This space is broken, how can i fix it?\": \"Medium\",\n",
+     "    \"I built a space but it is super slow. What can I do?\": \"Medium\",\n",
+     "    \"How can I run model X locally?\": \"Medium\",\n",
+     "    \"I want to build a space with model Y to do X?\": \"Hard\",\n",
+     "    \"How can I serve a model with multiple LoRAs?\": \"Hard\",\n",
+     "\n",
+     "    # claude\n",
+     "    \"What's the best model for sentiment analysis on financial text?\": \"Easy\",\n",
+     "    \"Are there any medical image segmentation datasets on HuggingFace for CT scans?\": \"Easy\",\n",
+     "    \"Which text classification models support 4-bit quantization?\": \"Medium\",\n",
+     "    \"Are there inference endpoints available for Whisper large-v3?\": \"Easy\",\n",
+     "    \"What's the license for the SA-Med2D-20M dataset?\": \"Easy\",\n",
+     "    \"Which vision models fit in 8GB VRAM for image segmentation?\": \"Medium\",\n",
+     "    \"What datasets are available for 3D medical image segmentation?\": \"Medium\",\n",
+     "    \"Is there a space to do text-to-speech with emotion control?\": \"Medium\",\n",
+     "    \"I'm getting \\\"CUDA out of memory\\\" when loading Llama-2-7b even though nvidia-smi shows I have 6GB free - what's the issue?\": \"Medium\",\n",
+     "    \"My Gradio space shows \\\"Connection errored out\\\" after working fine yesterday, no code changes - how can I fix it?\": \"Medium\",\n",
+     "    \"I built a Gradio space for Stable Diffusion but inference takes 5+ minutes on a 4090 - what can I do?\": \"Medium\",\n",
+     "    \"My Whisper model outputs different transcriptions after quantization to int8 - why?\": \"Medium\",\n",
+     "    \"Getting \\\"RuntimeError: CUDA error: out of memory. Tried to allocate 70.00 MiB\\\" but only 2.87 GiB is allocated - what's happening?\": \"Medium\",\n",
+     "    \"My HuggingFace space build fails with \\\"failed to create containerd task\\\" - how to fix?\": \"Medium\",\n",
+     "    \"DistilBERT model gives \\\"you should probably train your model\\\" warning even though it's a pretrained model from the Hub\": \"Easy\",\n",
+     "    \"Space was working fine but now receiving build errors - receiving this error even with a new space\": \"Medium\",\n",
+     "    \"Inference is correct locally but wrong on deployed space\": \"Medium\",\n",
+     "    \"Getting CUDA OOM despite having enough memory according to nvidia-smi\": \"Medium\",\n",
+     "    \"How can I run Mistral-7B-v0.1 locally with multiple LoRA adapters?\": \"Hard\",\n",
+     "    \"How can I serve Llama-2-7b with vLLM and dynamically load multiple LoRA adapters?\": \"Hard\",\n",
+     "    \"How do I batch inference requests in my Gradio space for better throughput?\": \"Medium\",\n",
+     "    \"Can I run Whisper large-v3 with faster-whisper for 4x speedup?\": \"Medium\",\n",
+     "    \"How to run Llama 2 on CPU after fine-tuning with LoRA?\": \"Medium\",\n",
+     "    \"Best way to handle 50+ concurrent requests in a Gradio space without OOM?\": \"Hard\",\n",
+     "    \"How do I add custom stopping criteria for text generation with Transformers?\": \"Hard\",\n",
+     "    \"Can I merge multiple LoRA adapters before inference to reduce latency?\": \"Hard\",\n",
+     "    \"How can I optimize my LLM inference with one base LLM and multiple LoRA adapters?\": \"Hard\",\n",
+     "}\n"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 7,
+    "id": "c7014bef",
+    "metadata": {},
+    "outputs": [
+     {
+      "data": {
+       "text/plain": [
+        "53"
+       ]
+      },
+      "execution_count": 7,
+      "metadata": {},
+      "output_type": "execute_result"
+     }
+    ],
+    "source": [
+     "len(tasks_with_difficulty)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "3a8bd7ed",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "import litellm\n",
+     "import json\n",
+     "from pydantic import BaseModel\n",
+     "from enum import Enum\n",
+     "\n",
+     "\n",
+     "class Difficulty(str, Enum):\n",
+     "    EASY = \"Easy\"\n",
+     "    MEDIUM = \"Medium\"\n",
+     "    HARD = \"Hard\"\n",
+     "    VERY_HARD = \"Very hard\"\n",
+     "\n",
+     "\n",
+     "class Task(BaseModel):\n",
+     "    description: str\n",
+     "    difficulty: Difficulty\n",
+     "\n",
+     "\n",
+     "class GeneratedTasks(BaseModel):\n",
+     "    tasks: list[Task]\n",
+     "\n",
+     "\n",
+     "def build_prompt(tasks_dict: dict[str, str]) -> str:\n",
+     "    task_descriptions = \"\".join(\n",
+     "        [f'- \"{task}\" [{difficulty}]\\n' for task, difficulty in tasks_dict.items()]\n",
+     "    )\n",
+     "\n",
+     "    return f\"\"\"Given the following examples of tasks (with their estimated difficulty levels in brackets):\n",
+     "\n",
+     "{task_descriptions}\n",
+     "\n",
+     "Generate exactly 10 new unique tasks with their difficulty levels (Easy, Medium, Hard, or Very hard).\n",
+     "The new tasks should be bootstrapped by analogy or creative mutation of the provided ones, but not be direct copies.\n",
+     "Vary the domains, instructions, and scenario details. Write crisp, concrete task phrasing. Preserve variety in both tasks and difficulties.\n",
+     "Do not repeat any of the input tasks verbatim. Create plausible, meaningful tasks relevant to LLM training, evaluation, data processing, issue handling, tooling, etc.\n",
+     "\"\"\"\n",
+     "\n"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 10,
+    "id": "85ef3dcb",
+    "metadata": {},
+    "outputs": [
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "Iteration 1/20: Added 10 new tasks. Total: 63\n",
+       "Iteration 2/20: Added 10 new tasks. Total: 73\n",
+       "Iteration 3/20: Added 10 new tasks. Total: 83\n",
+       "Iteration 4/20: Added 10 new tasks. Total: 93\n",
+       "Iteration 5/20: Added 10 new tasks. Total: 103\n",
+       "Iteration 6/20: Added 10 new tasks. Total: 113\n",
+       "Iteration 7/20: Added 10 new tasks. Total: 123\n",
+       "Iteration 8/20: Added 10 new tasks. Total: 133\n",
+       "Iteration 9/20: Added 10 new tasks. Total: 143\n",
+       "Iteration 10/20: Added 10 new tasks. Total: 153\n",
+       "Iteration 11/20: Added 10 new tasks. Total: 163\n",
+       "Iteration 12/20: Added 10 new tasks. Total: 173\n",
+       "Iteration 13/20: Added 10 new tasks. Total: 183\n",
+       "Iteration 14/20: Added 10 new tasks. Total: 193\n",
+       "Iteration 15/20: Added 10 new tasks. Total: 203\n",
+       "Iteration 16/20: Added 10 new tasks. Total: 213\n",
+       "Iteration 17/20: Added 10 new tasks. Total: 223\n",
+       "Iteration 18/20: Added 10 new tasks. Total: 233\n",
+       "Iteration 19/20: Added 10 new tasks. Total: 243\n",
+       "Iteration 20/20: Added 10 new tasks. Total: 253\n",
+       "\n",
+       "Final task count: 253\n"
+      ]
+     }
+    ],
+    "source": [
+     "model_name = \"gpt-5\"\n",
+     "\n",
+     "# Number of iterations to generate tasks (10 tasks per iteration)\n",
+     "num_iterations = 20\n",
+     "\n",
+     "# Copy the seed tasks to avoid modifying the original\n",
+     "all_tasks = tasks_with_difficulty.copy()\n",
+     "\n",
+     "for i in range(num_iterations):\n",
+     "    prompt = build_prompt(all_tasks)\n",
+     "\n",
+     "    # Query LLM using litellm with structured output\n",
+     "    response = litellm.completion(\n",
+     "        model=model_name,\n",
+     "        messages=[\n",
+     "            {\n",
+     "                \"role\": \"system\",\n",
+     "                \"content\": \"You are an expert at generating diverse ML/AI task instructions using products from HuggingFace and can enumerate them with proper difficulty.\",\n",
+     "            },\n",
+     "            {\"role\": \"user\", \"content\": prompt},\n",
+     "        ],\n",
+     "        response_format=GeneratedTasks,\n",
+     "    )\n",
+     "\n",
+     "    # Parse the structured output\n",
+     "    generated = GeneratedTasks.model_validate_json(\n",
+     "        response.choices[0].message.content\n",
+     "    )\n",
+     "\n",
+     "    # Add new tasks to the dictionary\n",
+     "    new_count = 0\n",
+     "    for task in generated.tasks:\n",
+     "        if task.description not in all_tasks:\n",
+     "            all_tasks[task.description] = task.difficulty.value\n",
+     "            new_count += 1\n",
+     "\n",
+     "    print(f\"Iteration {i + 1}/{num_iterations}: Added {new_count} new tasks. Total: {len(all_tasks)}\")\n",
+     "\n",
+     "# Save to disk\n",
+     "with open(\"generated_tasks_with_difficulty.json\", \"w\") as f:\n",
+     "    json.dump(all_tasks, f, indent=2)\n",
+     "\n",
+     "print(f\"\\nFinal task count: {len(all_tasks)}\")\n"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 16,
+    "id": "9c0ad570",
+    "metadata": {},
+    "outputs": [
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "\n",
+       "Dataset: 253 rows\n",
+       "Sample: Evaluate models {M_i} on benchmarks {B_i} (Easy)\n"
+      ]
+     }
+    ],
+    "source": [
+     "from datasets import Dataset\n",
+     "\n",
+     "# Convert dict to proper columns\n",
+     "questions = list(all_tasks.keys())\n",
+     "difficulties = list(all_tasks.values())\n",
+     "data = {\"question\": questions, \"difficulty\": difficulties}\n",
+     "\n",
+     "dataset = Dataset.from_dict(data)\n",
+     "print(f\"\\nDataset: {len(dataset)} rows\")\n",
+     "print(f\"Sample: {dataset[0]['question']} ({dataset[0]['difficulty']})\")\n"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 17,
+    "id": "427a2186",
+    "metadata": {},
+    "outputs": [
+     {
+      "data": {
+       "application/vnd.jupyter.widget-view+json": {
+        "model_id": "b038f2a6afe84208820c1997e5d15096",
+        "version_major": 2,
+        "version_minor": 0
+       },
+       "text/plain": [
+        "Uploading the dataset shards: 0%| | 0/1 [00:00<?, ? shards/s]"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     },
+     {
+      "data": {
+       "application/vnd.jupyter.widget-view+json": {
+        "model_id": "9e1c6a36740846fa9b25293abdd4a5e4",
+        "version_major": 2,
+        "version_minor": 0
+       },
+       "text/plain": [
+        "Creating parquet from Arrow format: 0%| | 0/1 [00:00<?, ?ba/s]"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     },
+     {
+      "data": {
+       "application/vnd.jupyter.widget-view+json": {
+        "model_id": "1b37133b27ec49c5a45d73e8d58f0c5a",
+        "version_major": 2,
+        "version_minor": 0
+       },
+       "text/plain": [
+        "Processing Files (0 / 0): | | 0.00B / 0.00B "
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     },
+     {
+      "data": {
+       "application/vnd.jupyter.widget-view+json": {
+        "model_id": "1467e2d055ab42aebd2966972ee54e5b",
+        "version_major": 2,
+        "version_minor": 0
+       },
+       "text/plain": [
+        "New Data Upload: | | 0.00B / 0.00B "
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     },
+     {
+      "data": {
+       "text/plain": [
+        "CommitInfo(commit_url='https://huggingface.co/datasets/akseljoonas/benchmark-tasks/commit/a96debee2c67ef760ecaea69296f2059f449fad6', commit_message='Upload dataset', commit_description='', oid='a96debee2c67ef760ecaea69296f2059f449fad6', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/akseljoonas/benchmark-tasks', endpoint='https://huggingface.co', repo_type='dataset', repo_id='akseljoonas/benchmark-tasks'), pr_revision=None, pr_num=None)"
+       ]
+      },
+      "execution_count": 17,
+      "metadata": {},
+      "output_type": "execute_result"
+     }
+    ],
+    "source": [
+     "dataset.push_to_hub(\"akseljoonas/benchmark-tasks\", private=True)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "50e67652",
+    "metadata": {},
+    "outputs": [],
+    "source": []
+   }
+  ],
+  "metadata": {
+   "kernelspec": {
+    "display_name": ".venv",
+    "language": "python",
+    "name": "python3"
+   },
+   "language_info": {
+    "codemirror_mode": {
+     "name": "ipython",
+     "version": 3
+    },
+    "file_extension": ".py",
+    "mimetype": "text/x-python",
+    "name": "python",
+    "nbconvert_exporter": "python",
+    "pygments_lexer": "ipython3",
+    "version": "3.12.11"
+   }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 5
+ }
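
Note: several seed values above (e.g. "Medium (judgment call or interaction needed ...)") carry parenthetical annotations and are therefore not valid members of the notebook's Difficulty enum. A minimal sketch to flag such entries in the saved JSON, reusing the enum definition from the notebook:

import json
from enum import Enum

# Difficulty enum copied from the notebook cell above.
class Difficulty(str, Enum):
    EASY = "Easy"
    MEDIUM = "Medium"
    HARD = "Hard"
    VERY_HARD = "Very hard"

with open("generated_tasks_with_difficulty.json") as f:
    tasks = json.load(f)

valid_levels = {d.value for d in Difficulty}
nonstandard = {q: d for q, d in tasks.items() if d not in valid_levels}
print(f"{len(nonstandard)}/{len(tasks)} entries carry annotated, non-enum difficulty labels")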
eval/generated_tasks_with_difficulty.json ADDED
@@ -0,0 +1,255 @@
+ {
+   "Evaluate models {M_i} on benchmarks {B_i}": "Easy",
+   "Train models {M_i} on datasets {D_i} with benchmarks {B_i}": "Medium",
+   "Run an ablation for hyperparameter P for model M on dataset D": "Hard",
+   "Generate completions with model M on dataset D using engine E": "Medium",
+   "Merge models {M_i} using linear averaging to find the best result on benchmarks {B_i}": "Hard",
+   "Given datasets {D_i}, ablate the best SFT mixture for model M across benchmarks {B_i}": "Very hard",
+   "Decontaminate dataset D against benchmarks {B_i}": "Hard",
+   "Benchmark RL framework F for best throughput on G GPUs": "Very hard",
+   "Implement post-training algorithm A from paper P in framework F. Validate it runs end-to-end": "Very hard",
+   "Implement benchmark B in framework F. Validate it reproduces some published results": "Very hard",
+   "Format dataset D for compatibility with framework F on task T": "Easy",
+   "Remove the background from this image: [image path]": "Easy",
+   "Transcribe all of the audio files in this directory": "Easy",
+   "Transcribe all of the audio files in this directory, choose the model that'll be cheapest and also relatively accurate": "Medium (judgment call or interaction needed to figure out what accuracy levels are acceptable)",
+   "Remove the background music from this audio file": "Medium (needs to find Gradio Space and call its API)",
+   "Change this video track to be from English to Spanish": "Medium (needs to link several models together)",
+   "Translate this flyer from English to Spanish, keeping the layout and images the same": "Medium (needs to link several models together)",
+   "What's the best model for X?": "Easy",
+   "What datasets are available for X? (X={domain x task x modality})": "Easy",
+   "Is there a space to do Y?": "Easy",
+   "I have this script and this error - what's the issue?": "Medium",
+   "This space is broken, how can i fix it?": "Medium",
+   "I built a space but it is super slow. What can I do?": "Medium",
+   "How can I run model X locally?": "Medium",
+   "I want to build a space with model Y to do X?": "Hard",
+   "How can I serve a model with multiple LoRAs?": "Hard",
+   "What's the best model for sentiment analysis on financial text?": "Easy",
+   "Are there any medical image segmentation datasets on HuggingFace for CT scans?": "Easy",
+   "Which text classification models support 4-bit quantization?": "Medium",
+   "Are there inference endpoints available for Whisper large-v3?": "Easy",
+   "What's the license for the SA-Med2D-20M dataset?": "Easy",
+   "Which vision models fit in 8GB VRAM for image segmentation?": "Medium",
+   "What datasets are available for 3D medical image segmentation?": "Medium",
+   "Is there a space to do text-to-speech with emotion control?": "Medium",
+   "I'm getting \"CUDA out of memory\" when loading Llama-2-7b even though nvidia-smi shows I have 6GB free - what's the issue?": "Medium",
+   "My Gradio space shows \"Connection errored out\" after working fine yesterday, no code changes - how can I fix it?": "Medium",
+   "I built a Gradio space for Stable Diffusion but inference takes 5+ minutes on a 4090 - what can I do?": "Medium",
+   "My Whisper model outputs different transcriptions after quantization to int8 - why?": "Medium",
+   "Getting \"RuntimeError: CUDA error: out of memory. Tried to allocate 70.00 MiB\" but only 2.87 GiB is allocated - what's happening?": "Medium",
+   "My HuggingFace space build fails with \"failed to create containerd task\" - how to fix?": "Medium",
+   "DistilBERT model gives \"you should probably train your model\" warning even though it's a pretrained model from the Hub": "Easy",
+   "Space was working fine but now receiving build errors - receiving this error even with a new space": "Medium",
+   "Inference is correct locally but wrong on deployed space": "Medium",
+   "Getting CUDA OOM despite having enough memory according to nvidia-smi": "Medium",
+   "How can I run Mistral-7B-v0.1 locally with multiple LoRA adapters?": "Hard",
+   "How can I serve Llama-2-7b with vLLM and dynamically load multiple LoRA adapters?": "Hard",
+   "How do I batch inference requests in my Gradio space for better throughput?": "Medium",
+   "Can I run Whisper large-v3 with faster-whisper for 4x speedup?": "Medium",
+   "How to run Llama 2 on CPU after fine-tuning with LoRA?": "Medium",
+   "Best way to handle 50+ concurrent requests in a Gradio space without OOM?": "Hard",
+   "How do I add custom stopping criteria for text generation with Transformers?": "Hard",
+   "Can I merge multiple LoRA adapters before inference to reduce latency?": "Hard",
+   "How can I optimize my LLM inference with one base LLM and multiple LoRA adapters?": "Hard",
+   "Compare tokenizers {T_i} for model M on tasks {classification, QA}; report accuracy and average sequence length per task": "Medium",
+   "Run a LoRA rank sweep (r in {4, 8, 16, 32}) for model M on dataset D; plot validation perplexity vs VRAM usage and select Pareto-optimal settings": "Hard",
+   "Build a streaming dataloader from Parquet on S3 with deterministic shuffling across N workers; validate epoch reproducibility": "Very hard",
+   "Find three open-source TTS models with emotion control and list their sample rates and licenses": "Easy",
+   "Create a retrieval-augmented QA pipeline: index corpus C with FAISS, connect to model M, and benchmark top-1 accuracy and p95 latency": "Hard",
+   "Diagnose a Space where memory grows per request; add no-grad guards, free caches, and demonstrate stable RSS over 10,000 calls": "Hard",
+   "Deduplicate dataset D using MinHash LSH at Jaccard >= 0.9 and publish a cleaned HF dataset with provenance columns": "Medium",
+   "Add special tokens to tokenizer T and resize model M embeddings; resume pretraining for 10k steps without loss spikes": "Hard",
+   "Create a HuggingFace Dataset from CSV file data.csv and push to repo username/my_dataset": "Easy",
+   "Build a real-time Whisper transcription Space with VAD and chunked decoding; keep end-to-end latency under 200 ms": "Hard",
+   "Quantize model M to 4-bit (bnb.int4) with bitsandbytes; compare perplexity and p95 latency to 8-bit on dataset D; select config with <1% perplexity increase": "Medium",
+   "Fuse LoRA adapter A into base model M and export a single safetensors checkpoint; verify logits parity (<1e-5 MSE) vs on-the-fly LoRA": "Hard",
+   "Redact PII from dataset D using a transformer NER pipeline; produce a cleaned HuggingFace Dataset with per-entity removal stats and provenance": "Medium",
+   "Train a SentencePiece tokenizer (vocab=64k, byte fallback) on corpus C; compare tokenization speed, unknown-token rate, and bytes/token vs tokenizer T": "Hard",
+   "Build a sharded FAISS IVF-PQ index for 100M embeddings stored on S3; integrate with HF datasets streaming and report recall@10 and QPS": "Very hard",
+   "Fine-tune model M with QLoRA using TRL PPO on dataset D; log KL, reward, and throughput; validate no divergence on a held-out eval": "Hard",
+   "Resolve HfHubHTTPError 401 when pushing dataset repo R: diagnose token scopes, git-lfs config, and large file thresholds; document the fix": "Medium",
+   "Implement a custom Transformers LogitsProcessor that bans repeated bigrams; add unit tests and benchmark generation quality (BLEU) on dataset D": "Hard",
+   "List and download all Hub models tagged 'text-classification' with Apache-2.0 license and size <500MB; save model ids and downloads to CSV": "Easy",
+   "Enable speculative decoding in vLLM with draft model D for base model M; benchmark tokens/sec speedup at batch sizes {1,4,16} and max_new_tokens {64,256}": "Very hard",
+   "Profile model M under torch.compile modes {reduce-overhead, max-autotune} on GPU G; report tokens/sec, peak VRAM, and compile overhead": "Medium",
+   "Detect and remove near-duplicate images in dataset D using CLIP ViT-L/14 embeddings at cosine >= 0.95; publish a cleaned dataset with duplicate_group ids": "Medium",
+   "Convert a TensorFlow SavedModel of T5-base to Transformers PyTorch format; verify logits parity (MSE < 1e-4) on 1,000 random prompts": "Hard",
+   "Enable FlashAttention-2 in a Transformers training loop for model M; benchmark step time and confirm loss parity over 2,000 steps vs baseline": "Hard",
+   "Deploy vLLM for model M with hot-swappable LoRA adapters {A_i}; provide an API to switch adapters and demonstrate <200 ms switch latency under load": "Very hard",
+   "Implement a custom Trainer callback to log gradient norms, activation histograms, and learning rate; diagnose periodic loss spikes and propose a fix": "Hard",
+   "Build a bilingual RAG pipeline indexing corpora {en, es} with FAISS HNSW; evaluate exact match@1 on dataset D and report p95 latency": "Hard",
+   "Run a mixed-precision sweep (fp16 vs bf16) for model M on A100 and RTX 3090; compare convergence, throughput, and numerical stability issues": "Medium",
+   "Create a Gradio Space that batches Whisper-large-v3 transcription via queue + chunked decoding; maintain real-time factor <= 0.5 on a T4": "Hard",
+   "List five OCR datasets on the Hub with line-level annotations; include licenses and approximate image counts": "Easy",
+   "List models on the Hub tagged 'summarization' that offer safetensors weights and 4-bit quantization; output model ids": "Easy",
+   "Evaluate safety filters of models {M_i} on red-team prompt set R; report jailbreak rate and false positive rate": "Medium",
+   "Run a prompt template ablation for chat model M on dataset D; compare {alpaca, chatml, llama2} formats and report exact match and average output length": "Hard",
+   "Implement tensor parallelism for model M in framework F and show linear scaling across 2\u20138 GPUs with <=10% gap from ideal": "Very hard",
+   "Convert and shard dataset D into WebDataset tar files (~500MB/shard); build a streaming loader with checksum validation": "Medium",
+   "Deploy a Spaces app serving Stable Diffusion XL with ControlNet; add output caching and keep p95 latency <1s for 20 concurrent users": "Hard",
+   "Diagnose and fix 'shape mismatch' when loading LoRA into model M after tokenizer resize; provide minimal repro and patch": "Medium",
+   "Add a detailed model card to repo username/model_M with training data, intended use, limitations, and evaluation results": "Easy",
+   "Enable KV cache quantization (int8) in Transformers for model M; compare tokens/sec and ROUGE-L on dataset D vs fp16 cache": "Hard",
+   "Detect and redact license-incompatible samples in dataset D by matching SPDX identifiers and source domains; publish a compliance report": "Medium",
+   "Profile vLLM serving of model M with paged attention; tune block_size to maximize tokens/sec and report p50/p95 latency and peak VRAM": "Medium",
+   "Filter dataset D for toxic content using classifier C; log per-label removal rates and recreate stratified train/valid/test splits": "Medium",
+   "Train a unigram tokenizer (vocab=80k) on corpora {en, fr}; fine-tune T5-small and compare BLEU vs a BPE baseline; report tokenization speed and OOV rate": "Hard",
+   "Run distributed evaluation of models {M_i} on benchmark B across 4 GPUs with DeepSpeed-Inference; ensure identical metrics across 3 seeds": "Hard",
+   "Find three open-source ASR models that provide word-level timestamps; record licenses and expected WER on LibriSpeech": "Easy",
+   "Diagnose intermittent 'Address already in use' crashes in a FastAPI Space; add graceful shutdown and port probing, verifying stability over 1,000 restart cycles": "Medium",
+   "Export a LoRA-finetuned Llama checkpoint to GGUF for llama.cpp; validate perplexity parity (<=1% drift) on WikiText-2": "Hard",
+   "Construct a streaming RAG pipeline over S3-stored corpus C with Chroma; index ~1B tokens, implement shard rebalancing, and benchmark recall@5 and QPS": "Very hard",
+   "List Hub datasets tagged 'speech-emotion-recognition' with CC-BY or CC-BY-SA licenses and >=10k utterances; write dataset ids and sizes to JSON": "Easy",
+   "Train a summarization reward model via pairwise ranking on dataset D; apply DPO to model M and report ROUGE-L and human win rate": "Hard",
+   "Find four open-source OCR models that output line- or paragraph-level text and provide ONNX or TensorRT exports; list their licenses and maximum input resolutions": "Easy",
+   "Verify tokenizer special tokens for model M are preserved after adding new tokens; write a unit test that asserts CLS/SEP/PAD ids are unchanged before and after resize": "Medium",
+   "Implement a constrained decoder for model M that enforces a JSON schema via a custom Transformers LogitsProcessor; add unit tests and benchmark latency on dataset D": "Hard",
+   "Build a multilingual RAG index for 50M documents using mDPR with sharded storage on S3; support hot index reloads and report recall@10 and p95 latency at 100 QPS": "Very hard",
+   "Quantize T5-base to 8-bit with bitsandbytes (LLM.int8) and compare ROUGE-L and tokens/sec to fp16 on CNN/DailyMail; keep ROUGE-L drop <=1%": "Medium",
+   "Diagnose VRAM growth in a vLLM server at batch size 32; add profiling, fix cache eviction behavior, and demonstrate flat memory over 10,000 requests": "Hard",
+   "Convert a HuggingFace TokenizerFast to a SentencePiece model; verify >=99.9% token-level agreement on 10,000 sentences and measure tokenization speed delta": "Medium",
+   "Train a multi-task adapter stack for {summarization, QA, NLI} on model M; implement routing by prompt prefix and report per-task metrics and cross-task interference": "Very hard",
+   "Assess license compatibility between model M (Apache-2.0) and dataset D (CC-BY-SA); produce a one-paragraph verdict with rationale and reference links": "Easy",
+   "Enable FSDP with activation checkpointing for a 13B model across 2\u00d7A100 GPUs; achieve <=10% throughput loss vs baseline and verify loss parity over 1,000 steps": "Hard",
+   "List three datasets for code summarization with permissive licenses; output their dataset ids and license names": "Easy",
+   "Set up nightly continuous evaluation of model M on benchmarks {B_i}; log metrics to Weights & Biases and alert on >2% regression vs last 7-day rolling mean": "Medium",
+   "Implement streaming text generation in a Gradio Space for model M using server-sent events; cap median token emission delay at <50 ms": "Hard",
+   "Scale out training of a 7B model with FSDP + ZeRO across 8 GPUs; demonstrate checkpoint save/restore and achieve throughput within 15% of ideal linear scaling": "Very hard",
+   "Export a mixture-of-experts PyTorch model to ONNX and run with TensorRT; verify top-1 accuracy within 0.5% of PyTorch on dataset D": "Medium",
+   "Identify whether model M supports FlashAttention-2 from its config or source; provide supporting repo links and a yes/no compatibility flag": "Easy",
+   "Build an audio deduplication pipeline for dataset D using embedding model E with cosine similarity >= 0.98; publish grouped duplicate ids and a cleaned manifest": "Hard",
+   "Diagnose slow tokenization in a Transformers pipeline; profile, switch to a fast tokenizer, and demonstrate 2\u00d7 end-to-end speedup on 1M lines": "Medium",
+   "Implement a contrastive preference learning loss in TRL; train model M on dataset D and compare KL, reward variance, and human win rate vs a PPO baseline": "Hard",
+   "Build an elastic RAG service with Ray that autoscales FAISS shards on S3, supports live corpus updates, and maintains p95 latency <500 ms at 200 QPS": "Very hard",
+   "List five chat-optimized LLMs on the Hub that include a tokenizer chat_template and safetensors weights; output model ids": "Easy",
+   "Find three biomedical NER datasets with Apache-2.0 or MIT licenses; return dataset ids and license names": "Easy",
+   "Create a dataset viewer Space that streams Parquet shards from the Hub using datasets streaming; implement server-side filtering and pagination": "Medium",
+   "Enable gradient checkpointing and optimizer state offloading for model M with Accelerate; report step time and peak VRAM vs baseline on a single A100": "Medium",
+   "Diagnose and fix 'size mismatch for position_embeddings' after increasing max_position_embeddings; provide a minimal repro and a migration script": "Medium",
+   "Implement a regex-constrained Transformers LogitsProcessor that enforces ISO-8601 timestamps; add unit tests and report generation latency overhead on dataset D": "Hard",
+   "Train language-specific LoRA adapters for {en, es, de} on model M; add an automatic language router and report per-language BLEU and cross-language interference": "Hard",
+   "Build a speaker diarization + ASR Gradio Space using pyannote and Whisper-large-v3; achieve DER <= 12% and real-time factor <= 0.75 on a T4": "Hard",
+   "Implement multi-draft speculative decoding with dynamic draft-model selection per prompt; integrate with vLLM and benchmark tokens/sec speedup at batch sizes {1,8,32}": "Very hard",
+   "Convert a TensorFlow DistilBERT SavedModel to ONNX (opset 17) and validate logits parity (MSE < 1e-4) on 1,000 random inputs; measure CPU inference speedup vs TensorFlow": "Medium",
+   "Evaluate alignment drift after SFT: compare model M vs base M0 on prompt set P; report win rate, refusal rate, and average output length": "Medium",
+   "Enable KV cache int4 quantization in vLLM for model M; benchmark tokens/sec and exact match on dataset D vs fp16 cache": "Hard",
+   "Implement variable-length packing in a HF Datasets + Transformers training loop; ensure epoch-level sample coverage matches baseline and no truncation beyond max_length": "Medium",
+   "Build a multi-tenant LoRA router over vLLM: on-demand load adapters from the Hub with LRU eviction; sustain 100 tenants and <300 ms adapter swap latency under load": "Very hard",
+   "Audit generations for PII leakage on prompt set P using detector C; compute precision, recall, and false positive rate; redact before logging and publish a compliance summary": "Medium",
+   "Merge a stack of PEFT adapters {A_i} into base model M to produce a single FP16 checkpoint; validate perplexity drift <=0.5% on dataset D and export safetensors": "Hard",
+   "Find three Spaces that demonstrate constrained JSON generation; return Space ids and URLs": "Easy",
+   "Deploy a cross-lingual vector search service with multilingual-e5-large; shard FAISS across 3 nodes and measure mAP@10 and p95 latency at 500 QPS": "Very hard",
+   "Quantize attention and MLP projections only with bitsandbytes (selective 8-bit); compare peak VRAM, tokens/sec, and ROUGE-L vs full-model 8-bit on dataset D": "Hard",
+   "Fix \"Token indices sequence length is longer than the specified maximum\" after tokenizer resize; add truncation with stride and update generation config; verify no validation metric regression": "Medium",
+   "Identify splits for dataset D and output split names with sample counts": "Easy",
+   "Find five multilingual sentence-embedding models on the Hub with Apache-2.0 license; return model ids": "Easy",
+   "Set up CI to run evaluation suite E for model M nightly; fail the job if any metric drops >1% vs 7-day rolling mean": "Medium",
+   "Add length normalization to beam search for model M; compare vs baseline on dataset D and report ROUGE-L and average output length": "Medium",
+   "Detect per-sample language for dataset D; add a 'lang' column and recreate train/valid/test splits preserving language proportions": "Medium",
+   "Benchmark vLLM KV-cache eviction strategies (e.g., LRU vs TTL) for model M at batch sizes {1,8,32}; report tokens/sec and peak VRAM": "Medium",
+   "Implement a custom DataCollator that packs multiple documents for summarization with separator tokens; add unit tests to prevent cross-sample leakage": "Hard",
+   "Build a PDF-to-dataset pipeline: OCR pages with model Donut, store word-level bboxes, and publish a HuggingFace Dataset with a viewer Space": "Hard",
+   "Train a ColBERT reranker on corpus C + pairs dataset D; integrate into a RAG search service and report recall@10 and p95 latency delta": "Hard",
+   "Deploy vLLM for model M with multi-GPU tensor-parallel inference across 2 nodes using NCCL; demonstrate near-linear throughput scaling and deterministic outputs across 3 seeds": "Very hard",
+   "List four Hub models tagged 'named-entity-recognition' that declare bitsandbytes 8-bit support in their README; output model ids": "Easy",
+   "Find three Spaces that provide real-time TTS streaming demos; return Space ids and reported sample rates": "Easy",
+   "Create a Spaces app that visualizes transformer attention maps for a ViT model using Captum; keep heatmap rendering under 200 ms for 224x224 images": "Medium",
+   "Set up datasets streaming with resumable downloads and exponential backoff for S3-hosted Parquet shards; verify checksum integrity after killing and resuming the job": "Medium",
+   "Build a tokenizer migration tool to convert a SentencePiece model to a HuggingFace tokenizers JSON with byte-fallback; assert >=99.95% token-level agreement on 20k sentences and report speed delta": "Medium",
+   "Implement a custom DataCollator for span masking with variable block sizes for byte-level BPE; add unit tests and demonstrate MLM loss parity over 10k steps on WikiText-103": "Hard",
+   "Add speculative decoding with a small draft model to a Transformers-based text-generation server; expose a per-request flag and benchmark tokens/sec speedup at batch sizes {1,8,32}": "Hard",
+   "Train an online knowledge-distillation SFT: teacher M0 -> student M on dataset D; log KL divergence, token agreement, and throughput; cap metric drop at <=2% vs teacher": "Hard",
+   "Deploy a multi-region vLLM service on Kubernetes with adaptive batching and hot LoRA adapter loading; sustain 200 QPS with p95 latency <300 ms and zero-downtime rollouts": "Very hard",
+   "Build a sharded cross-encoder reranking service with Ray: distribute ColBERT scoring across nodes, integrate with FAISS retrieval, and maintain recall@10 within 1% of single-node baseline at 500 QPS": "Very hard",
+   "List four Spaces that perform multilingual OCR with layout extraction; return Space ids and supported languages": "Easy",
+   "Find five Hub datasets for code generation evaluation with permissive licenses; output dataset ids and license names": "Easy",
+   "Add gradient accumulation and gradient clipping to a Transformers Trainer finetune of model M; report step time, peak VRAM, and validation metric vs baseline": "Medium",
+   "Implement document chunking with sliding windows and overlap in a Datasets map pipeline; add doc_id and span indices and verify no segment exceeds max_length": "Medium",
+   "Export a fine-tuned BERT model to TorchScript and ONNX; verify logits parity (MSE < 1e-4) on 1,000 samples and compare CPU throughput": "Medium",
+   "Diagnose 'pad_token_id is not set' warnings during generation; add a PAD token, resize embeddings, and write a unit test asserting identical logits pre/post fix on 200 prompts": "Medium",
+   "Implement diverse beam search (group_beam_search) for model M; evaluate on dataset D and report ROUGE-L, distinct-n, and average output length vs standard beam search": "Hard",
+   "Build a multi-modal RAG demo that indexes image captions with CLIP and uses LLM M to answer visual questions; report top-1 accuracy and p95 latency": "Hard",
+   "Profile activation and KV-cache memory during generation for model M; log per-layer footprints and reduce peak usage via attention slicing; show tokens/sec and VRAM deltas": "Hard",
+   "Construct a 200M-document FAISS hybrid (IVF-PQ + HNSW) index with memory-mapped shards on S3; support live add/delete and benchmark recall@10 and QPS at 300 QPS": "Very hard",
+   "List five Hub datasets tagged 'topic-modeling' with MIT or Apache-2.0 licenses; output dataset ids": "Easy",
+   "Find three Spaces that offer real-time grammar correction with streaming tokens; return Space ids and URLs": "Easy",
+   "Convert a spaCy en_core_web_trf NER model to ONNX and wrap it in a Transformers TokenClassification pipeline; verify entity text/label/span parity on 1,000 sentences": "Medium",
+   "Set up a GitHub Actions workflow that snapshots tokenizer T weekly and fails if vocab or special token ids drift vs the last snapshot; upload a diff artifact": "Medium",
+   "Profile a Datasets map pipeline on corpus C; refactor to use batched=True, num_proc>1, and caching; achieve >=2\u00d7 speedup while preserving deterministic ordering across runs": "Medium",
+   "Implement a custom Transformers StoppingCriteria that halts when JSON braces are balanced or max nesting depth is reached; add unit tests and benchmark latency overhead on dataset D": "Hard",
+   "Build a visual-and-tabular RAG pipeline: index images with CLIP and CSV tables with TAPAS; answer mixed queries using LLM M; report EM@1 and p95 latency at 50 QPS": "Hard",
+   "Enable KV-cache int4 quantization during generation in Transformers for model M; compare tokens/sec and exact match vs fp16 cache on dataset D; keep metric drop <=1%": "Hard",
+   "Implement a hot-reloadable sharded FAISS IVF-PQ index for multilingual-e5-base with live add/delete and background re-training; sustain 200 QPS with p95 latency <400 ms across 3 nodes": "Very hard",
+   "Deploy a geo-distributed vLLM + LoRA adapter gateway across two regions with consistent hashing and zero-downtime adapter updates; ensure identical outputs across 3 seeds and report cross-region p95 latency": "Very hard",
+   "List five Hub LLM repos that disclose training token counts in their model cards; output model ids and token totals": "Easy",
+   "Find two ready-to-use Spaces for speaker diarization compatible with Whisper; return Space ids and URLs": "Easy",
+   "Create a hashing-based dataset splitter using column 'doc_id' to produce reproducible train/valid/test; verify identical splits across two machines and Python versions": "Medium",
+   "Resolve HTTP 403 when creating an organization dataset via the Hub API; diagnose token scopes and org permissions; provide a minimal repro script and the fix": "Medium",
+   "Export a PEFT LoRA adapter from a fine-tuned Llama checkpoint as standalone safetensors with a correct adapter_config.json; push to the Hub and verify PEFT.from_pretrained loads it": "Medium",
+   "Enable multi-query attention in model M within Transformers; benchmark tokens/sec and peak VRAM vs multi-head attention and verify perplexity parity over 2,000 steps": "Hard",
+   "Audit code dataset D for contamination against {HumanEval, MBPP} using exact substring and 3-gram Jaccard >= 0.9; publish per-source contamination rates and a cleaned dataset": "Hard",
+   "Implement contrastive search decoding for model M with tunable alpha; compare ROUGE-L, distinct-n, and latency vs nucleus sampling on dataset D": "Hard",
+   "Implement pipeline parallelism for model M across 4 GPUs with Accelerate; achieve near-linear scaling (<=15% gap), support checkpoint save/restore, and ensure deterministic outputs across 3 seeds": "Very hard",
+   "Deploy a Spaces app that serves two ASR models with automatic language ID routing; maintain real-time factor <= 0.6 on a single T4 and log per-language latency": "Hard",
+   "Benchmark JSON-constrained decoding across models {M_i}; report JSON validity rate, exact match on dataset D, and p95 latency under streaming": "Hard",
+   "Filter a multilingual dataset D to non-English using fastText language ID; recreate stratified splits and report per-language retention and drop rates": "Medium",
+   "Enable paged attention in a custom Transformers generation loop for model M; verify token-level parity on 500 prompts and measure peak VRAM change": "Hard",
+   "Shard a 1B-token text corpus into deterministic HF Datasets processing across 16 workers; validate byte-for-byte identical outputs across two runs": "Very hard",
+   "Compare LoRA vs QLoRA fine-tunes of Mistral-7B on GSM8K; track loss, exact match, and throughput; select the lowest-VRAM config within 2% EM of best": "Hard",
+   "Deploy a quantized T5 encoder-decoder on Triton Inference Server via a Python backend; add token streaming and achieve >=1.5x throughput vs PyTorch baseline": "Hard",
+   "Find three Spaces that perform audio source separation (vocals/music); return Space ids and reported sample rates": "Easy",
+   "Merge a PEFT IA3 adapter stack into Llama-3-8B base weights; verify perplexity drift <=0.3% on WikiText-103 and export safetensors": "Hard",
+   "Resolve DeepSpeed ZeRO-3 stalls during S3 checkpointing; implement async multipart uploads and show stable 5-minute checkpoint cadence over 2 hours": "Very hard",
+   "Set up CI to run contamination checks on dataset R against {TruthfulQA, SQuAD} using 4-gram overlap; fail if rate >0.5% and attach offending ids as artifacts": "Medium",
+   "List four Hub datasets for sarcasm detection in English; return dataset ids and license tags": "Easy",
+   "Identify whether tokenizer T enables byte_fallback in tokenizer.json; output true/false and the file path": "Easy",
+   "Find three Spaces that showcase streaming chat with token-by-token updates; return Space ids and whether they use SSE or websockets": "Easy",
+   "Create a Datasets loader that parses Praat TextGrid files into word-level timestamps aligned with audio; publish a dataset with an 'audio' column and validate 100 sample alignments": "Medium",
+   "Set up a GitHub Actions workflow that lints model cards for repos {R_i} to require intended use, training data, and limitations; fail PRs and post a summary comment on violations": "Medium",
+   "Containerize a Gradio Space with optional FlashAttention build: detect GPU capability at startup, compile kernels if supported, and fall back gracefully on unsupported GPUs; test on T4 and A100": "Medium",
+   "Evaluate long-context retrieval via needle-in-a-haystack for models {M_i} at context lengths {8k, 32k, 64k}; report retrieval accuracy, tokens/sec, and the max stable context length": "Hard",
+   "Implement a curriculum sampler as a HuggingFace Trainer callback that schedules sample difficulty over epochs; compare convergence and final eval metrics vs random sampling": "Hard",
+   "Add on-the-fly near-duplicate filtering during training using SimHash over token ids; log per-epoch removal rates and verify no convergence regressions vs a deduplicated baseline": "Hard",
+   "Deploy a dual-backend inference router using vLLM and TensorRT-LLM that selects backend per prompt length to minimize latency; maintain deterministic outputs across 3 seeds and sustain 300 QPS with p95 latency SLOs": "Very hard",
+   "Identify max_position_embeddings and whether rope_scaling is enabled for model M from its config; output both values.": "Easy",
+   "List five Vision Transformer models on the Hub that provide safetensors and have a default image size >= 384; output model ids.": "Easy",
+   "Find three Spaces that stream machine-translation outputs token-by-token; return Space ids and whether they use SSE or websockets.": "Easy",
+   "Diagnose bursts of [UNK] after adding special tokens to tokenizer T; enable byte_fallback, retrain embeddings for 2k steps, and show unknown-token rate <= baseline+0.1% on corpus C.": "Medium",
+   "Create a dataset viewer Space for a dataset with a nested JSON column; convert to Arrow struct arrays, implement server-side filtering on nested keys, and verify row counts match the source.": "Medium",
+   "Set up a GitHub Action that hits /health and a no-op inference on Space S after each deploy; fail if cold-start median latency >10s and attach server logs as an artifact.": "Medium",
+   "Implement a SQL grammar-constrained Transformers LogitsProcessor using an LL(1) parser; evaluate on Spider dev and report exact match and p95 latency overhead vs nucleus sampling.": "Hard",
+   "Add CPU-tier KV-cache offloading with pinned memory for model M in a custom generation loop; compare tokens/sec and peak VRAM vs baseline at context lengths {4k, 16k, 32k}.": "Hard",
+   "Deploy a batched cross-encoder reranker microservice using bge-reranker-base; keep recall@10 within 1% of single-request baseline and achieve >=2\u00d7 QPS at 100 concurrent users.": "Hard",
+   "Build a heterogeneous inference gateway that routes requests to vLLM or llama.cpp based on prompt length and GPU load; ensure identical normalized outputs across 3 seeds and sustain 200 QPS with p95 latency <300 ms.": "Very hard",
+   "Determine whether tokenizer T strips accents (strip_accents); output true/false and the file path where the setting is defined.": "Easy",
+   "List four Hub datasets for hate-speech detection in English; return dataset ids and license tags.": "Easy",
+   "Write a Datasets loader for a paginated OAuth2 REST API; cache pages, support streaming, and provide deterministic sharding across 8 workers; verify identical row counts across two runs.": "Medium",
+   "Add request-level caching (ETag/If-None-Match) to a Gradio summarization Space; achieve >=1.8\u00d7 QPS at 50 concurrent users and report cache hit ratio and p95 latency.": "Medium",
+   "Enable HuggingFace tokenizers parallelism and batched encoding for corpus C; benchmark throughput and memory on 10M lines and ensure deterministic outputs across 3 runs.": "Medium",
+   "Set up CI to lint dataset cards in repos {R_i} for required fields {license, citation, dataset_summary}; fail PRs and post a summary comment with missing keys.": "Medium",
+   "Run a parameter-efficient finetuning sweep comparing LoRA, IA3, and prefix-tuning on RoBERTa-base for MNLI; report accuracy, training time, and peak VRAM; select a Pareto-optimal config.": "Hard",
+   "Implement a Transformers LogitsProcessor that enforces balanced parentheses and proper quoted-string escaping; add unit tests and benchmark latency overhead on dataset D.": "Hard",
+   "Export Whisper-medium to ONNX with dynamic axes and int8 weights; verify word-timestamp parity on 500 clips and measure CPU real-time factor improvement >=1.3\u00d7 vs PyTorch.": "Hard",
+   "Deploy a geo-replicated RAG service: shard FAISS HNSW across three regions with conflict-free index metadata sync; sustain 300 QPS with p95 latency <450 ms and recall@10 within 1% of single-region baseline.": "Very hard",
+   "Compare cased vs uncased tokenization for BERT on CoNLL-2003 NER; train both, and report F1, average tokens per sentence, and training time.": "Medium",
+   "Create a HuggingFace Datasets loader for EPUB files: extract chapter text and embedded images into Arrow columns, support streaming and deterministic sharding across 8 workers; verify identical row counts across two runs.": "Medium",
+   "Configure a Hub webhook to trigger CI when a model card (README.md) changes; fail the job if sections {intended use, limitations} are missing and post a checklist comment on the PR.": "Medium",
+   "Add a reranking cache to a RAG service keyed by (query, candidate_ids); achieve >=50% cache hit at 100 QPS and keep recall@10 within 0.5% of baseline.": "Hard",
+   "Fix torch.compile graph breaks in a Transformers training loop; patch non-compilable ops, re-enable compilation, and demonstrate >=1.4\u00d7 step-time speedup with matching loss over 2,000 steps.": "Hard",
+   "Compute 95% bootstrap confidence intervals for ROUGE-L on dataset D over 3 random seeds; flag regressions when the new CI lies entirely below last week's baseline CI.": "Medium",
+   "Build a batch image-captioning Space with ViT-GPT2: accept ZIP uploads, use queue-based batching, and keep p95 latency <2s for 32 images.": "Medium",
+   "Implement hybrid parallelism (tensor + pipeline) for a 13B encoder-decoder using Accelerate; scale across 8 GPUs with <=15% gap from linear, support elastic resize (8->6 GPUs) without losing determinism, and verify checkpoint save/restore.": "Very hard",
+   "Find five Spaces that stream live vision-language captioning (e.g., LLaVA or BLIP); return Space ids and reported FPS.": "Easy",
+   "Identify whether tokenizer T applies Unicode normalization (NFKC/NFC/NFD/NFKD) and where it is configured; output the mode and file path.": "Easy",
+   "Identify whether model repo M stores weights exclusively as safetensors; output true/false and list the .safetensors file paths.": "Easy",
+   "List three multilingual sentence-embedding models on the Hub that provide ONNX exports; return model ids.": "Easy",
+   "Determine if tokenizer T lowercases text (do_lower_case or lowercase flag); output true/false and the file path or JSON key where it is set.": "Easy",
+   "Set up a GitHub Action to run a smoke-test text generation for model M on each push; fail if median time to first token >2s and attach container logs as an artifact.": "Medium",
+   "Create a Datasets preprocessing pipeline that tokenizes to max_length=512 with stride=64 and retains an 'orig_text' column; verify row counts match input and no NaNs after caching.": "Medium",
+   "Resolve 'git-lfs: command not found' when pushing model repo R to the Hub; install and configure Git LFS, set an appropriate large file threshold, and provide a minimal repro plus the verified fix.": "Medium",
+   "Enable KV-cache CPU offloading in a custom Transformers generation loop for model M; benchmark tokens/sec and peak VRAM vs baseline at context lengths {4k, 8k}.": "Hard",
+   "Implement LoRA rank warmup (r: 4\u219232 over the first 1,000 steps) in a custom Trainer; fine-tune model M on dataset D and report validation perplexity and peak VRAM vs fixed r=32.": "Hard",
+   "Export Whisper-small to TensorRT via ONNX (opset 18) with dynamic axes; verify word-timestamp parity (median diff \u22640.05s) on 300 clips and measure \u22651.3\u00d7 GPU speedup vs PyTorch.": "Hard",
+   "Deploy a multi-tenant RAG service that hot-loads per-tenant FAISS indices from S3, shares a reranker, and sustains 200 QPS with p95 latency <350 ms across 1,000 tenants; maintain recall@10 within 1% of a single-tenant baseline.": "Very hard"
+ }
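
Note: every annotated label in the file above begins with one of the four base levels, so a hypothetical normalize helper can fold them back before grouping or filtering; this is a sketch, not part of the commit.

import json

# Check "Very hard" before "Hard" so prefix matching picks the right level.
BASE_LEVELS = ("Very hard", "Hard", "Medium", "Easy")

def normalize(difficulty: str) -> str:
    # Fold annotated labels like "Medium (needs to link several models together)"
    # down to their base level; leave anything unrecognized untouched.
    for level in BASE_LEVELS:
        if difficulty.startswith(level):
            return level
    return difficulty

with open("generated_tasks_with_difficulty.json") as f:
    tasks = json.load(f)

normalized = {question: normalize(label) for question, label in tasks.items()}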