Commit · 158d846
Parent(s): 8f4b322
intermediate commit until i let amp loose
- eval/.amp_batch_solve.py.swp +0 -0
- eval/amp_batch_solve.py +106 -0
- eval/amp_solve.py +31 -0
- eval/eval_set.ipynb +359 -0
- eval/generated_tasks_with_difficulty.json +255 -0
eval/.amp_batch_solve.py.swp
ADDED
Binary file (12.3 kB)
eval/amp_batch_solve.py
ADDED
@@ -0,0 +1,106 @@
import asyncio
import json
import os
from pathlib import Path
import threading

from amp_sdk import AmpOptions, execute

# Thread-safe file writing
file_lock = threading.Lock()


async def solve_task(
    question: str, difficulty: str, task_idx: int, total: int, semaphore: asyncio.Semaphore
) -> dict:
    """Solve a single task using Amp SDK."""
    async with semaphore:
        print(f"[{task_idx}/{total}] Starting: {question[:60]}...")

        messages = []
        solution = None

        try:
            async for message in execute(
                question,
                AmpOptions(
                    cwd=os.getcwd(),
                    visibility="workspace",
                    dangerously_allow_all=True,
                ),
            ):
                messages.append(message.model_dump())

                # Extract the final text response as solution
                if message.type == "assistant":
                    content = message.message.get("content", [])
                    for item in content:
                        if isinstance(item, dict) and item.get("type") == "text":
                            solution = item.get("text")
                elif message.type == "result":
                    if message.result:
                        solution = message.result

            print(f"[{task_idx}/{total}] ✓ Done: {question[:60]}...")
            return {
                "question": question,
                "difficulty": difficulty,
                "solution": solution,
                "messages": messages,
            }
        except Exception as e:
            print(f"[{task_idx}/{total}] ✗ Error: {e}")
            return {
                "question": question,
                "difficulty": difficulty,
                "solution": None,
                "messages": messages,
                "error": str(e),
            }


def write_result(output_path: Path, result: dict):
    """Thread-safe write to output file."""
    with file_lock:
        with open(output_path, "a") as f:
            f.write(json.dumps(result) + "\n")


async def main():
    # Load tasks
    tasks_path = Path(__file__).parent / "generated_tasks_with_difficulty.json"
    with open(tasks_path) as f:
        tasks = json.load(f)

    # Output file - clear it first
    output_path = Path(__file__).parent / "solved_tasks.jsonl"
    output_path.write_text("")

    # Semaphore to limit concurrency
    max_concurrent = 20
    semaphore = asyncio.Semaphore(max_concurrent)

    total = len(tasks)
    print(f"Processing {total} tasks with {max_concurrent} concurrent agents...")

    async def process_and_save(question: str, difficulty: str, idx: int):
        result = await solve_task(question, difficulty, idx, total, semaphore)
        write_result(output_path, result)
        return result

    # Create all tasks
    coroutines = [
        process_and_save(question, difficulty, i + 1)
        for i, (question, difficulty) in enumerate(tasks.items())
    ]

    # Run all concurrently (semaphore limits actual parallelism)
    results = await asyncio.gather(*coroutines, return_exceptions=True)

    successful = sum(1 for r in results if isinstance(r, dict) and "error" not in r)
    print(f"\nCompleted: {successful}/{total} successful")
    print(f"Results saved to {output_path}")


if __name__ == "__main__":
    asyncio.run(main())
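Each result is appended to solved_tasks.jsonl as one JSON object per line, so the output can be post-processed with the standard library alone. A minimal sketch (hypothetical post-processing, not part of this commit) that tallies solved vs. errored tasks per difficulty:

import json
from collections import Counter
from pathlib import Path

# Tally solved vs. errored tasks per difficulty from the JSONL output.
# Keys match the result dicts written by amp_batch_solve.py above.
counts = Counter()
with open(Path("eval") / "solved_tasks.jsonl") as f:
    for line in f:
        result = json.loads(line)
        status = "error" if "error" in result else "ok"
        counts[(result["difficulty"], status)] += 1

for (difficulty, status), n in sorted(counts.items()):
    print(f"{difficulty}: {status} = {n}")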
eval/amp_solve.py
ADDED
@@ -0,0 +1,31 @@
import asyncio
import os

from amp_sdk import AmpOptions, execute

prompt = """
what account am I logged in as?
"""


async def main():
    # Use the toolbox directory to share tools with Amp
    toolbox_dir = os.path.join(os.getcwd(), "toolbox")
    messages = []
    async for message in execute(
        prompt,
        AmpOptions(
            cwd=os.getcwd(),
            toolbox=toolbox_dir,
            visibility="workspace",
            dangerously_allow_all=True,
        ),
    ):
        messages.append(message)

    for msg in messages:
        print(msg.model_dump_json(indent=2))


if __name__ == "__main__":
    asyncio.run(main())
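The loop above buffers every message and dumps them all after completion. A variant sketch that streams assistant text as it arrives, assuming the same message shape that amp_batch_solve.py relies on (type == "assistant" with a content list of text items):

import asyncio
import os

from amp_sdk import AmpOptions, execute


async def stream_text(prompt: str) -> None:
    # Print assistant text as it arrives instead of buffering everything.
    async for message in execute(prompt, AmpOptions(cwd=os.getcwd())):
        if message.type == "assistant":
            for item in message.message.get("content", []):
                if isinstance(item, dict) and item.get("type") == "text":
                    print(item["text"])


if __name__ == "__main__":
    asyncio.run(stream_text("what account am I logged in as?"))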
eval/eval_set.ipynb
ADDED
@@ -0,0 +1,359 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "19f3dd6b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Seed examples for task bootstrapping\n",
    "tasks_with_difficulty = {\n",
    "    # lewis\n",
    "    \"Evaluate models {M_i} on benchmarks {B_i}\": \"Easy\",\n",
    "    \"Train models {M_i} on datasets {D_i} with benchmarks {B_i}\": \"Medium\",\n",
    "    \"Run an ablation for hyperparameter P for model M on dataset D\": \"Hard\",\n",
    "    \"Generate completions with model M on dataset D using engine E\": \"Medium\",\n",
    "    \"Merge models {M_i} using linear averaging to find the best result on benchmarks {B_i}\": \"Hard\",\n",
    "    \"Given datasets {D_i}, ablate the best SFT mixture for model M across benchmarks {B_i}\": \"Very hard\",\n",
    "    \"Decontaminate dataset D against benchmarks {B_i}\": \"Hard\",\n",
    "    \"Benchmark RL framework F for best throughput on G GPUs\": \"Very hard\",\n",
    "    \"Implement post-training algorithm A from paper P in framework F. Validate it runs end-to-end\": \"Very hard\",\n",
    "    \"Implement benchmark B in framework F. Validate it reproduces some published results\": \"Very hard\",\n",
    "    \"Format dataset D for compatibility with framework F on task T\": \"Easy\",\n",
    "\n",
    "    # abubakar\n",
    "    \"Remove the background from this image: [image path]\": \"Easy\",\n",
    "    \"Transcribe all of the audio files in this directory\": \"Easy\",\n",
    "    \"Transcribe all of the audio files in this directory, choose the model that'll be cheapest and also relatively accurate\": \"Medium (judgment call or interaction needed to figure out what accuracy levels are acceptable)\",\n",
    "    \"Remove the background music from this audio file\": \"Medium (needs to find Gradio Space and call its API)\",\n",
    "    \"Change this video track to be from English to Spanish\": \"Medium (needs to link several models together)\",\n",
    "    \"Translate this flyer from English to Spanish, keeping the layout and images the same\": \"Medium (needs to link several models together)\",\n",
    "\n",
    "    # leandro\n",
    "    \"What's the best model for X?\": \"Easy\",\n",
    "    \"What datasets are available for X? (X={domain x task x modality})\": \"Easy\",\n",
    "    \"Is there a space to do Y?\": \"Easy\",\n",
    "    \"I have this script and this error - what's the issue?\": \"Medium\",\n",
    "    \"This space is broken, how can i fix it?\": \"Medium\",\n",
    "    \"I built a space but it is super slow. What can I do?\": \"Medium\",\n",
    "    \"How can I run modal X locally?\": \"Medium\",\n",
    "    \"I want to build a space with model Y to do X?\": \"Hard\",\n",
    "    \"How can I serve a model with multiple LoRAs?\": \"Hard\",\n",
    "\n",
    "    # claude\n",
    "    \"What's the best model for sentiment analysis on financial text?\": \"Easy\",\n",
    "    \"Are there any medical image segmentation datasets on HuggingFace for CT scans?\": \"Easy\",\n",
    "    \"Which text classification models support 4-bit quantization?\": \"Medium\",\n",
    "    \"Are there inference endpoints available for Whisper large-v3?\": \"Easy\",\n",
    "    \"What's the license for the SA-Med2D-20M dataset?\": \"Easy\",\n",
    "    \"Which vision models fit in 8GB VRAM for image segmentation?\": \"Medium\",\n",
    "    \"What datasets are available for 3D medical image segmentation?\": \"Medium\",\n",
    "    \"Is there a space to do text-to-speech with emotion control?\": \"Medium\",\n",
    "    \"I'm getting \\\"CUDA out of memory\\\" when loading Llama-2-7b even though nvidia-smi shows I have 6GB free - what's the issue?\": \"Medium\",\n",
    "    \"My Gradio space shows \\\"Connection errored out\\\" after working fine yesterday, no code changes - how can I fix it?\": \"Medium\",\n",
    "    \"I built a Gradio space for Stable Diffusion but inference takes 5+ minutes on a 4090 - what can I do?\": \"Medium\",\n",
    "    \"My Whisper model outputs different transcriptions after quantization to int8 - why?\": \"Medium\",\n",
    "    \"Getting \\\"RuntimeError: CUDA error: out of memory. Tried to allocate 70.00 MiB\\\" but only 2.87 GiB is allocated - what's happening?\": \"Medium\",\n",
    "    \"My HuggingFace space build fails with \\\"failed to create containerd task\\\" - how to fix?\": \"Medium\",\n",
    "    \"DistilBERT model gives \\\"you should probably train your model\\\" warning even though it's a pretrained model from the Hub\": \"Easy\",\n",
    "    \"Space was working fine but now receiving build errors - receiving this error even with a new space\": \"Medium\",\n",
    "    \"Inference is correct locally but wrong on deployed space\": \"Medium\",\n",
    "    \"Getting CUDA OOM despite having enough memory according to nvidia-smi\": \"Medium\",\n",
    "    \"How can I run Mistral-7B-v0.1 locally with multiple LoRA adapters?\": \"Hard\",\n",
    "    \"How can I serve Llama-2-7b with vLLM and dynamically load multiple LoRA adapters?\": \"Hard\",\n",
    "    \"How do I batch inference requests in my Gradio space for better throughput?\": \"Medium\",\n",
    "    \"Can I run Whisper large-v3 with faster-whisper for 4x speedup?\": \"Medium\",\n",
    "    \"How to run Llama 2 on CPU after fine-tuning with LoRA?\": \"Medium\",\n",
    "    \"Best way to handle 50+ concurrent requests in a Gradio space without OOM?\": \"Hard\",\n",
    "    \"How do I add custom stopping criteria for text generation with Transformers?\": \"Hard\",\n",
    "    \"Can I merge multiple LoRA adapters before inference to reduce latency?\": \"Hard\",\n",
    "    \"How can I optimize my LLM inference with one base LLM and multiple LoRA adapters?\": \"Hard\",\n",
    "}\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "c7014bef",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "53"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(tasks_with_difficulty)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3a8bd7ed",
   "metadata": {},
   "outputs": [],
   "source": [
    "import litellm\n",
    "import json\n",
    "from pydantic import BaseModel\n",
    "from enum import Enum\n",
    "\n",
    "\n",
    "class Difficulty(str, Enum):\n",
    "    EASY = \"Easy\"\n",
    "    MEDIUM = \"Medium\"\n",
    "    HARD = \"Hard\"\n",
    "    VERY_HARD = \"Very hard\"\n",
    "\n",
    "\n",
    "class Task(BaseModel):\n",
    "    description: str\n",
    "    difficulty: Difficulty\n",
    "\n",
    "\n",
    "class GeneratedTasks(BaseModel):\n",
    "    tasks: list[Task]\n",
    "\n",
    "\n",
    "def build_prompt(tasks_dict: dict[str, str]) -> str:\n",
    "    task_descriptions = \"\".join(\n",
    "        [f'- \"{task}\" [{difficulty}]\\n' for task, difficulty in tasks_dict.items()]\n",
    "    )\n",
    "\n",
    "    return f\"\"\"Given the following examples of tasks (with their estimated difficulty levels in brackets):\n",
    "\n",
    "{task_descriptions}\n",
    "\n",
    "Generate exactly 10 new unique tasks with their difficulty levels (Easy, Medium, Hard, or Very hard).\n",
    "The new tasks should be bootstrapped by analogy or creative mutation of the provided ones, but not be direct copies.\n",
    "Vary the domains, instructions, and scenario details. Write crisp, concrete task phrasing. Preserve variety in both tasks and difficulties.\n",
    "Do not repeat any of the input tasks verbatim. Create plausible, meaningful tasks relevant to LLM training, evaluation, data processing, issue handling, tooling, etc.\n",
    "\"\"\"\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "85ef3dcb",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Iteration 1/20: Added 10 new tasks. Total: 63\n",
      "Iteration 2/20: Added 10 new tasks. Total: 73\n",
      "Iteration 3/20: Added 10 new tasks. Total: 83\n",
      "Iteration 4/20: Added 10 new tasks. Total: 93\n",
      "Iteration 5/20: Added 10 new tasks. Total: 103\n",
      "Iteration 6/20: Added 10 new tasks. Total: 113\n",
      "Iteration 7/20: Added 10 new tasks. Total: 123\n",
      "Iteration 8/20: Added 10 new tasks. Total: 133\n",
      "Iteration 9/20: Added 10 new tasks. Total: 143\n",
      "Iteration 10/20: Added 10 new tasks. Total: 153\n",
      "Iteration 11/20: Added 10 new tasks. Total: 163\n",
      "Iteration 12/20: Added 10 new tasks. Total: 173\n",
      "Iteration 13/20: Added 10 new tasks. Total: 183\n",
      "Iteration 14/20: Added 10 new tasks. Total: 193\n",
      "Iteration 15/20: Added 10 new tasks. Total: 203\n",
      "Iteration 16/20: Added 10 new tasks. Total: 213\n",
      "Iteration 17/20: Added 10 new tasks. Total: 223\n",
      "Iteration 18/20: Added 10 new tasks. Total: 233\n",
      "Iteration 19/20: Added 10 new tasks. Total: 243\n",
      "Iteration 20/20: Added 10 new tasks. Total: 253\n",
      "\n",
      "Final task count: 253\n"
     ]
    }
   ],
   "source": [
    "model_name = \"gpt-5\"\n",
    "\n",
    "# Number of iterations to generate tasks (10 tasks per iteration)\n",
    "num_iterations = 20\n",
    "\n",
    "# Copy the seed tasks to avoid modifying the original\n",
    "all_tasks = tasks_with_difficulty.copy()\n",
    "\n",
    "for i in range(num_iterations):\n",
    "    prompt = build_prompt(all_tasks)\n",
    "\n",
    "    # Query LLM using litellm with structured output\n",
    "    response = litellm.completion(\n",
    "        model=model_name,\n",
    "        messages=[\n",
    "            {\n",
    "                \"role\": \"system\",\n",
    "                \"content\": \"You are an expert at generating diverse ML/AI task instructions using products from HuggingFace and can enumerate them with proper difficulty.\",\n",
    "            },\n",
    "            {\"role\": \"user\", \"content\": prompt},\n",
    "        ],\n",
    "        response_format=GeneratedTasks,\n",
    "    )\n",
    "\n",
    "    # Parse the structured output\n",
    "    generated = GeneratedTasks.model_validate_json(\n",
    "        response.choices[0].message.content\n",
    "    )\n",
    "\n",
    "    # Add new tasks to the dictionary\n",
    "    new_count = 0\n",
    "    for task in generated.tasks:\n",
    "        if task.description not in all_tasks:\n",
    "            all_tasks[task.description] = task.difficulty.value\n",
    "            new_count += 1\n",
    "\n",
    "    print(f\"Iteration {i + 1}/{num_iterations}: Added {new_count} new tasks. Total: {len(all_tasks)}\")\n",
    "\n",
    "# Save to disk\n",
    "with open(\"generated_tasks_with_difficulty.json\", \"w\") as f:\n",
    "    json.dump(all_tasks, f, indent=2)\n",
    "\n",
    "print(f\"\\nFinal task count: {len(all_tasks)}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "9c0ad570",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Dataset: 253 rows\n",
      "Sample: Evaluate models {M_i} on benchmarks {B_i} (Easy)\n"
     ]
    }
   ],
   "source": [
    "from datasets import Dataset\n",
    "\n",
    "# Convert dict to proper columns\n",
    "questions = list(all_tasks.keys())\n",
    "difficulties = list(all_tasks.values())\n",
    "data = {\"question\": questions, \"difficulty\": difficulties}\n",
    "\n",
    "dataset = Dataset.from_dict(data)\n",
    "print(f\"\\nDataset: {len(dataset)} rows\")\n",
    "print(f\"Sample: {dataset[0]['question']} ({dataset[0]['difficulty']})\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "427a2186",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "b038f2a6afe84208820c1997e5d15096",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Uploading the dataset shards: 0%| | 0/1 [00:00<?, ? shards/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "9e1c6a36740846fa9b25293abdd4a5e4",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating parquet from Arrow format: 0%| | 0/1 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "1b37133b27ec49c5a45d73e8d58f0c5a",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Processing Files (0 / 0): | | 0.00B / 0.00B "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "1467e2d055ab42aebd2966972ee54e5b",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "New Data Upload: | | 0.00B / 0.00B "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "CommitInfo(commit_url='https://huggingface.co/datasets/akseljoonas/benchmark-tasks/commit/a96debee2c67ef760ecaea69296f2059f449fad6', commit_message='Upload dataset', commit_description='', oid='a96debee2c67ef760ecaea69296f2059f449fad6', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/akseljoonas/benchmark-tasks', endpoint='https://huggingface.co', repo_type='dataset', repo_id='akseljoonas/benchmark-tasks'), pr_revision=None, pr_num=None)"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset.push_to_hub(\"akseljoonas/benchmark-tasks\", private=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "50e67652",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
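A sketch of reading the pushed dataset back (an illustration, not part of the notebook; it assumes an authenticated token with access to the private repo created by the push_to_hub cell above):

from datasets import load_dataset

# Load the benchmark tasks pushed above; the repo is private,
# so this needs a token with access to it.
ds = load_dataset("akseljoonas/benchmark-tasks", split="train")
print(len(ds), ds.column_names)  # 253 rows: ['question', 'difficulty']
print(ds.filter(lambda row: row["difficulty"] == "Easy").num_rows)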
eval/generated_tasks_with_difficulty.json
ADDED
@@ -0,0 +1,255 @@
| 1 |
+
{
|
| 2 |
+
"Evaluate models {M_i} on benchmarks {B_i}": "Easy",
|
| 3 |
+
"Train models {M_i} on datasets {D_i} with benchmarks {B_i}": "Medium",
|
| 4 |
+
"Run an ablation for hyperparameter P for model M on dataset D": "Hard",
|
| 5 |
+
"Generate completions with model M on dataset D using engine E": "Medium",
|
| 6 |
+
"Merge models {M_i} using linear averaging to find the best result on benchmarks {B_i}": "Hard",
|
| 7 |
+
"Given datasets {D_i}, ablate the best SFT mixture for model M across benchmarks {B_i}": "Very hard",
|
| 8 |
+
"Decontaminate dataset D against benchmarks {B_i}": "Hard",
|
| 9 |
+
"Benchmark RL framework F for best throughput on G GPUs": "Very hard",
|
| 10 |
+
"Implement post-training algorithm A from paper P in framework F. Validate it runs end-to-end": "Very hard",
|
| 11 |
+
"Implement benchmark B in framework F. Validate it reproduces some published results": "Very hard",
|
| 12 |
+
"Format dataset D for compatibility with framework F on task T": "Easy",
|
| 13 |
+
"Remove the background from this image: [image path]": "Easy",
|
| 14 |
+
"Transcribe all of the audio files in this directory": "Easy",
|
| 15 |
+
"Transcribe all of the audio files in this directory, choose the model that'll be cheapest and also relatively accurate": "Medium (judgment call or interaction needed to figure out what accuracy levels are acceptable)",
|
| 16 |
+
"Remove the background music from this audio file": "Medium (needs to find Gradio Space and call its API0",
|
| 17 |
+
"Change this video track to be from English to Spanish": "Medium (needs to link several models together)",
|
| 18 |
+
"Translate this flyer from English to Spanish, keeping the layout and images the same": "Medium (needs to link several models together)",
|
| 19 |
+
"What's the best model for X?": "Easy",
|
| 20 |
+
"What datasets are available for X? (X={domain x task x modality})": "Easy",
|
| 21 |
+
"Is there a space to do Y?": "Easy",
|
| 22 |
+
"I have this script and this error - what's the issue?": "Medium",
|
| 23 |
+
"This space is broken, how can i fix it?": "Medium",
|
| 24 |
+
"I built a space but it is super slow. What can I do?": "Medium",
|
| 25 |
+
"How can I run modal X locally?": "Medium",
|
| 26 |
+
"I want to build a space with model Y to do X?": "Hard",
|
| 27 |
+
"How can I serve a model with multiple LoRAs?": "Hard",
|
| 28 |
+
"What's the best model for sentiment analysis on financial text?": "Easy",
|
| 29 |
+
"Are there any medical image segmentation datasets on HuggingFace for CT scans?": "Easy",
|
| 30 |
+
"Which text classification models support 4-bit quantization?": "Medium",
|
| 31 |
+
"Are there inference endpoints available for Whisper large-v3?": "Easy",
|
| 32 |
+
"What's the license for the SA-Med2D-20M dataset?": "Easy",
|
| 33 |
+
"Which vision models fit in 8GB VRAM for image segmentation?": "Medium",
|
| 34 |
+
"What datasets are available for 3D medical image segmentation?": "Medium",
|
| 35 |
+
"Is there a space to do text-to-speech with emotion control?": "Medium",
|
| 36 |
+
"I'm getting \"CUDA out of memory\" when loading Llama-2-7b even though nvidia-smi shows I have 6GB free - what's the issue?": "Medium",
|
| 37 |
+
"My Gradio space shows \"Connection errored out\" after working fine yesterday, no code changes - how can I fix it?": "Medium",
|
| 38 |
+
"I built a Gradio space for Stable Diffusion but inference takes 5+ minutes on a 4090 - what can I do?": "Medium",
|
| 39 |
+
"My Whisper model outputs different transcriptions after quantization to int8 - why?": "Medium",
|
| 40 |
+
"Getting \"RuntimeError: CUDA error: out of memory. Tried to allocate 70.00 MiB\" but only 2.87 GiB is allocated - what's happening?": "Medium",
|
| 41 |
+
"My HuggingFace space build fails with \"failed to create containerd task\" - how to fix?": "Medium",
|
| 42 |
+
"DistilBERT model gives \"you should probably train your model\" warning even though it's a pretrained model from the Hub": "Easy",
|
| 43 |
+
"Space was working fine but now receiving build errors - receiving this error even with a new space": "Medium",
|
| 44 |
+
"Inference is correct locally but wrong on deployed space": "Medium",
|
| 45 |
+
"Getting CUDA OOM despite having enough memory according to nvidia-smi": "Medium",
|
| 46 |
+
"How can I run Mistral-7B-v0.1 locally with multiple LoRA adapters?": "Hard",
|
| 47 |
+
"How can I serve Llama-2-7b with vLLM and dynamically load multiple LoRA adapters?": "Hard",
|
| 48 |
+
"How do I batch inference requests in my Gradio space for better throughput?": "Medium",
|
| 49 |
+
"Can I run Whisper large-v3 with faster-whisper for 4x speedup?": "Medium",
|
| 50 |
+
"How to run Llama 2 on CPU after fine-tuning with LoRA?": "Medium",
|
| 51 |
+
"Best way to handle 50+ concurrent requests in a Gradio space without OOM?": "Hard",
|
| 52 |
+
"How do I add custom stopping criteria for text generation with Transformers?": "Hard",
|
| 53 |
+
"Can I merge multiple LoRA adapters before inference to reduce latency?": "Hard",
|
| 54 |
+
"How can I optimize my LLM inference with one base LLM and multiple LoRA adapters?": "Hard",
|
| 55 |
+
"Compare tokenizers {T_i} for model M on tasks {classification, QA}; report accuracy and average sequence length per task": "Medium",
|
| 56 |
+
"Run a LoRA rank sweep (r in {4, 8, 16, 32}) for model M on dataset D; plot validation perplexity vs VRAM usage and select Pareto-optimal settings": "Hard",
|
| 57 |
+
"Build a streaming dataloader from Parquet on S3 with deterministic shuffling across N workers; validate epoch reproducibility": "Very hard",
|
| 58 |
+
"Find three open-source TTS models with emotion control and list their sample rates and licenses": "Easy",
|
| 59 |
+
"Create a retrieval-augmented QA pipeline: index corpus C with FAISS, connect to model M, and benchmark top-1 accuracy and p95 latency": "Hard",
|
| 60 |
+
"Diagnose a Space where memory grows per request; add no-grad guards, free caches, and demonstrate stable RSS over 10,000 calls": "Hard",
|
| 61 |
+
"Deduplicate dataset D using MinHash LSH at Jaccard >= 0.9 and publish a cleaned HF dataset with provenance columns": "Medium",
|
| 62 |
+
"Add special tokens to tokenizer T and resize model M embeddings; resume pretraining for 10k steps without loss spikes": "Hard",
|
| 63 |
+
"Create a HuggingFace Dataset from CSV file data.csv and push to repo username/my_dataset": "Easy",
|
| 64 |
+
"Build a real-time Whisper transcription Space with VAD and chunked decoding; keep end-to-end latency under 200 ms": "Hard",
|
| 65 |
+
"Quantize model M to 4-bit (bnb.int4) with bitsandbytes; compare perplexity and p95 latency to 8-bit on dataset D; select config with <1% perplexity increase": "Medium",
|
| 66 |
+
"Fuse LoRA adapter A into base model M and export a single safetensors checkpoint; verify logits parity (<1e-5 MSE) vs on-the-fly LoRA": "Hard",
|
| 67 |
+
"Redact PII from dataset D using a transformer NER pipeline; produce a cleaned HuggingFace Dataset with per-entity removal stats and provenance": "Medium",
|
| 68 |
+
"Train a SentencePiece tokenizer (vocab=64k, byte fallback) on corpus C; compare tokenization speed, unknown-token rate, and bytes/token vs tokenizer T": "Hard",
|
| 69 |
+
"Build a sharded FAISS IVF-PQ index for 100M embeddings stored on S3; integrate with HF datasets streaming and report recall@10 and QPS": "Very hard",
|
| 70 |
+
"Fine-tune model M with QLoRA using TRL PPO on dataset D; log KL, reward, and throughput; validate no divergence on a held-out eval": "Hard",
|
| 71 |
+
"Resolve HfHubHTTPError 401 when pushing dataset repo R: diagnose token scopes, git-lfs config, and large file thresholds; document the fix": "Medium",
|
| 72 |
+
"Implement a custom Transformers LogitsProcessor that bans repeated bigrams; add unit tests and benchmark generation quality (BLEU) on dataset D": "Hard",
|
| 73 |
+
"List and download all Hub models tagged 'text-classification' with Apache-2.0 license and size <500MB; save model ids and downloads to CSV": "Easy",
|
| 74 |
+
"Enable speculative decoding in vLLM with draft model D for base model M; benchmark tokens/sec speedup at batch sizes {1,4,16} and max_new_tokens {64,256}": "Very hard",
|
| 75 |
+
"Profile model M under torch.compile modes {reduce-overhead, max-autotune} on GPU G; report tokens/sec, peak VRAM, and compile overhead": "Medium",
|
| 76 |
+
"Detect and remove near-duplicate images in dataset D using CLIP ViT-L/14 embeddings at cosine >= 0.95; publish a cleaned dataset with duplicate_group ids": "Medium",
|
| 77 |
+
"Convert a TensorFlow SavedModel of T5-base to Transformers PyTorch format; verify logits parity (MSE < 1e-4) on 1,000 random prompts": "Hard",
|
| 78 |
+
"Enable FlashAttention-2 in a Transformers training loop for model M; benchmark step time and confirm loss parity over 2,000 steps vs baseline": "Hard",
|
| 79 |
+
"Deploy vLLM for model M with hot-swappable LoRA adapters {A_i}; provide an API to switch adapters and demonstrate <200 ms switch latency under load": "Very hard",
|
| 80 |
+
"Implement a custom Trainer callback to log gradient norms, activation histograms, and learning rate; diagnose periodic loss spikes and propose a fix": "Hard",
|
| 81 |
+
"Build a bilingual RAG pipeline indexing corpora {en, es} with FAISS HNSW; evaluate exact match@1 on dataset D and report p95 latency": "Hard",
|
| 82 |
+
"Run a mixed-precision sweep (fp16 vs bf16) for model M on A100 and RTX 3090; compare convergence, throughput, and numerical stability issues": "Medium",
|
| 83 |
+
"Create a Gradio Space that batches Whisper-large-v3 transcription via queue + chunked decoding; maintain real-time factor <= 0.5 on a T4": "Hard",
|
| 84 |
+
"List five OCR datasets on the Hub with line-level annotations; include licenses and approximate image counts": "Easy",
|
| 85 |
+
"List models on the Hub tagged 'summarization' that offer safetensors weights and 4-bit quantization; output model ids": "Easy",
|
| 86 |
+
"Evaluate safety filters of models {M_i} on red-team prompt set R; report jailbreak rate and false positive rate": "Medium",
|
| 87 |
+
"Run a prompt template ablation for chat model M on dataset D; compare {alpaca, chatml, llama2} formats and report exact match and average output length": "Hard",
|
| 88 |
+
"Implement tensor parallelism for model M in framework F and show linear scaling across 2\u20138 GPUs with <=10% gap from ideal": "Very hard",
|
| 89 |
+
"Convert and shard dataset D into WebDataset tar files (~500MB/shard); build a streaming loader with checksum validation": "Medium",
|
| 90 |
+
"Deploy a Spaces app serving Stable Diffusion XL with ControlNet; add output caching and keep p95 latency <1s for 20 concurrent users": "Hard",
|
| 91 |
+
"Diagnose and fix 'shape mismatch' when loading LoRA into model M after tokenizer resize; provide minimal repro and patch": "Medium",
|
| 92 |
+
"Add a detailed model card to repo username/model_M with training data, intended use, limitations, and evaluation results": "Easy",
|
| 93 |
+
"Enable KV cache quantization (int8) in Transformers for model M; compare tokens/sec and ROUGE-L on dataset D vs fp16 cache": "Hard",
|
| 94 |
+
"Detect and redact license-incompatible samples in dataset D by matching SPDX identifiers and source domains; publish a compliance report": "Medium",
|
| 95 |
+
"Profile vLLM serving of model M with paged attention; tune block_size to maximize tokens/sec and report p50/p95 latency and peak VRAM": "Medium",
|
| 96 |
+
"Filter dataset D for toxic content using classifier C; log per-label removal rates and recreate stratified train/valid/test splits": "Medium",
|
| 97 |
+
"Train a unigram tokenizer (vocab=80k) on corpora {en, fr}; fine-tune T5-small and compare BLEU vs a BPE baseline; report tokenization speed and OOV rate": "Hard",
|
| 98 |
+
"Run distributed evaluation of models {M_i} on benchmark B across 4 GPUs with DeepSpeed-Inference; ensure identical metrics across 3 seeds": "Hard",
|
| 99 |
+
"Find three open-source ASR models that provide word-level timestamps; record licenses and expected WER on LibriSpeech": "Easy",
|
| 100 |
+
"Diagnose intermittent 'Address already in use' crashes in a FastAPI Space; add graceful shutdown and port probing, verifying stability over 1,000 restart cycles": "Medium",
|
| 101 |
+
"Export a LoRA-finetuned Llama checkpoint to GGUF for llama.cpp; validate perplexity parity (<=1% drift) on WikiText-2": "Hard",
|
| 102 |
+
"Construct a streaming RAG pipeline over S3-stored corpus C with Chroma; index ~1B tokens, implement shard rebalancing, and benchmark recall@5 and QPS": "Very hard",
|
| 103 |
+
"List Hub datasets tagged 'speech-emotion-recognition' with CC-BY or CC-BY-SA licenses and >=10k utterances; write dataset ids and sizes to JSON": "Easy",
|
| 104 |
+
"Train a summarization reward model via pairwise ranking on dataset D; apply DPO to model M and report ROUGE-L and human win rate": "Hard",
|
| 105 |
+
"Find four open-source OCR models that output line- or paragraph-level text and provide ONNX or TensorRT exports; list their licenses and maximum input resolutions": "Easy",
|
| 106 |
+
"Verify tokenizer special tokens for model M are preserved after adding new tokens; write a unit test that asserts CLS/SEP/PAD ids are unchanged before and after resize": "Medium",
|
| 107 |
+
"Implement a constrained decoder for model M that enforces a JSON schema via a custom Transformers LogitsProcessor; add unit tests and benchmark latency on dataset D": "Hard",
|
| 108 |
+
"Build a multilingual RAG index for 50M documents using mDPR with sharded storage on S3; support hot index reloads and report recall@10 and p95 latency at 100 QPS": "Very hard",
|
| 109 |
+
"Quantize T5-base to 8-bit with bitsandbytes (LLM.int8) and compare ROUGE-L and tokens/sec to fp16 on CNN/DailyMail; keep ROUGE-L drop <=1%": "Medium",
|
| 110 |
+
"Diagnose VRAM growth in a vLLM server at batch size 32; add profiling, fix cache eviction behavior, and demonstrate flat memory over 10,000 requests": "Hard",
|
| 111 |
+
"Convert a HuggingFace TokenizerFast to a SentencePiece model; verify >=99.9% token-level agreement on 10,000 sentences and measure tokenization speed delta": "Medium",
|
| 112 |
+
"Train a multi-task adapter stack for {summarization, QA, NLI} on model M; implement routing by prompt prefix and report per-task metrics and cross-task interference": "Very hard",
|
| 113 |
+
"Assess license compatibility between model M (Apache-2.0) and dataset D (CC-BY-SA); produce a one-paragraph verdict with rationale and reference links": "Easy",
|
| 114 |
+
"Enable FSDP with activation checkpointing for a 13B model across 2\u00d7A100 GPUs; achieve <=10% throughput loss vs baseline and verify loss parity over 1,000 steps": "Hard",
|
| 115 |
+
"List three datasets for code summarization with permissive licenses; output their dataset ids and license names": "Easy",
|
| 116 |
+
"Set up nightly continuous evaluation of model M on benchmarks {B_i}; log metrics to Weights & Biases and alert on >2% regression vs last 7-day rolling mean": "Medium",
|
| 117 |
+
"Implement streaming text generation in a Gradio Space for model M using server-sent events; cap median token emission delay at <50 ms": "Hard",
|
| 118 |
+
"Scale out training of a 7B model with FSDP + ZeRO across 8 GPUs; demonstrate checkpoint save/restore and achieve throughput within 15% of ideal linear scaling": "Very hard",
|
| 119 |
+
"Export a mixture-of-experts PyTorch model to ONNX and run with TensorRT; verify top-1 accuracy within 0.5% of PyTorch on dataset D": "Medium",
|
| 120 |
+
"Identify whether model M supports FlashAttention-2 from its config or source; provide supporting repo links and a yes/no compatibility flag": "Easy",
|
| 121 |
+
"Build an audio deduplication pipeline for dataset D using embedding model E with cosine similarity >= 0.98; publish grouped duplicate ids and a cleaned manifest": "Hard",
|
| 122 |
+
"Diagnose slow tokenization in a Transformers pipeline; profile, switch to a fast tokenizer, and demonstrate 2\u00d7 end-to-end speedup on 1M lines": "Medium",
|
| 123 |
+
"Implement a contrastive preference learning loss in TRL; train model M on dataset D and compare KL, reward variance, and human win rate vs a PPO baseline": "Hard",
|
| 124 |
+
"Build an elastic RAG service with Ray that autoscales FAISS shards on S3, supports live corpus updates, and maintains p95 latency <500 ms at 200 QPS": "Very hard",
|
| 125 |
+
"List five chat-optimized LLMs on the Hub that include a tokenizer chat_template and safetensors weights; output model ids": "Easy",
|
| 126 |
+
"Find three biomedical NER datasets with Apache-2.0 or MIT licenses; return dataset ids and license names": "Easy",
|
| 127 |
+
"Create a dataset viewer Space that streams Parquet shards from the Hub using datasets streaming; implement server-side filtering and pagination": "Medium",
|
| 128 |
+
"Enable gradient checkpointing and optimizer state offloading for model M with Accelerate; report step time and peak VRAM vs baseline on a single A100": "Medium",
|
| 129 |
+
"Diagnose and fix 'size mismatch for position_embeddings' after increasing max_position_embeddings; provide a minimal repro and a migration script": "Medium",
|
| 130 |
+
"Implement a regex-constrained Transformers LogitsProcessor that enforces ISO-8601 timestamps; add unit tests and report generation latency overhead on dataset D": "Hard",
|
| 131 |
+
"Train language-specific LoRA adapters for {en, es, de} on model M; add an automatic language router and report per-language BLEU and cross-language interference": "Hard",
|
| 132 |
+
"Build a speaker diarization + ASR Gradio Space using pyannote and Whisper-large-v3; achieve DER <= 12% and real-time factor <= 0.75 on a T4": "Hard",
|
| 133 |
+
"Implement multi-draft speculative decoding with dynamic draft-model selection per prompt; integrate with vLLM and benchmark tokens/sec speedup at batch sizes {1,8,32}": "Very hard",
|
| 134 |
+
"Convert a TensorFlow DistilBERT SavedModel to ONNX (opset 17) and validate logits parity (MSE < 1e-4) on 1,000 random inputs; measure CPU inference speedup vs TensorFlow": "Medium",
|
| 135 |
+
"Evaluate alignment drift after SFT: compare model M vs base M0 on prompt set P; report win rate, refusal rate, and average output length": "Medium",
|
| 136 |
+
"Enable KV cache int4 quantization in vLLM for model M; benchmark tokens/sec and exact match on dataset D vs fp16 cache": "Hard",
|
| 137 |
+
"Implement variable-length packing in a HF Datasets + Transformers training loop; ensure epoch-level sample coverage matches baseline and no truncation beyond max_length": "Medium",
|
| 138 |
+
"Build a multi-tenant LoRA router over vLLM: on-demand load adapters from the Hub with LRU eviction; sustain 100 tenants and <300 ms adapter swap latency under load": "Very hard",
|
| 139 |
+
"Audit generations for PII leakage on prompt set P using detector C; compute precision, recall, and false positive rate; redact before logging and publish a compliance summary": "Medium",
|
| 140 |
+
"Merge a stack of PEFT adapters {A_i} into base model M to produce a single FP16 checkpoint; validate perplexity drift <=0.5% on dataset D and export safetensors": "Hard",
|
| 141 |
+
"Find three Spaces that demonstrate constrained JSON generation; return Space ids and URLs": "Easy",
|
| 142 |
+
"Deploy a cross-lingual vector search service with multilingual-e5-large; shard FAISS across 3 nodes and measure mAP@10 and p95 latency at 500 QPS": "Very hard",
|
| 143 |
+
"Quantize attention and MLP projections only with bitsandbytes (selective 8-bit); compare peak VRAM, tokens/sec, and ROUGE-L vs full-model 8-bit on dataset D": "Hard",
|
| 144 |
+
"Fix \"Token indices sequence length is longer than the specified maximum\" after tokenizer resize; add truncation with stride and update generation config; verify no validation metric regression": "Medium",
|
| 145 |
+
"Identify splits for dataset D and output split names with sample counts": "Easy",
|
| 146 |
+
"Find five multilingual sentence-embedding models on the Hub with Apache-2.0 license; return model ids": "Easy",
|
| 147 |
+
"Set up CI to run evaluation suite E for model M nightly; fail the job if any metric drops >1% vs 7-day rolling mean": "Medium",
|
| 148 |
+
"Add length normalization to beam search for model M; compare vs baseline on dataset D and report ROUGE-L and average output length": "Medium",
|
| 149 |
+
"Detect per-sample language for dataset D; add a 'lang' column and recreate train/valid/test splits preserving language proportions": "Medium",
|
| 150 |
+
"Benchmark vLLM KV-cache eviction strategies (e.g., LRU vs TTL) for model M at batch sizes {1,8,32}; report tokens/sec and peak VRAM": "Medium",
|
| 151 |
+
"Implement a custom DataCollator that packs multiple documents for summarization with separator tokens; add unit tests to prevent cross-sample leakage": "Hard",
|
| 152 |
+
"Build a PDF-to-dataset pipeline: OCR pages with model Donut, store word-level bboxes, and publish a HuggingFace Dataset with a viewer Space": "Hard",
|
| 153 |
+
"Train a ColBERT reranker on corpus C + pairs dataset D; integrate into a RAG search service and report recall@10 and p95 latency delta": "Hard",
|
| 154 |
+
"Deploy vLLM for model M with multi-GPU tensor-parallel inference across 2 nodes using NCCL; demonstrate near-linear throughput scaling and deterministic outputs across 3 seeds": "Very hard",
|
| 155 |
+
"List four Hub models tagged 'named-entity-recognition' that declare bitsandbytes 8-bit support in their README; output model ids": "Easy",
|
| 156 |
+
"Find three Spaces that provide real-time TTS streaming demos; return Space ids and reported sample rates": "Easy",
|
| 157 |
+
"Create a Spaces app that visualizes transformer attention maps for a ViT model using Captum; keep heatmap rendering under 200 ms for 224x224 images": "Medium",
|
| 158 |
+
"Set up datasets streaming with resumable downloads and exponential backoff for S3-hosted Parquet shards; verify checksum integrity after killing and resuming the job": "Medium",
|
| 159 |
+
"Build a tokenizer migration tool to convert a SentencePiece model to a HuggingFace tokenizers JSON with byte-fallback; assert >=99.95% token-level agreement on 20k sentences and report speed delta": "Medium",
|
| 160 |
+
"Implement a custom DataCollator for span masking with variable block sizes for byte-level BPE; add unit tests and demonstrate MLM loss parity over 10k steps on WikiText-103": "Hard",
|
| 161 |
+
"Add speculative decoding with a small draft model to a Transformers-based text-generation server; expose a per-request flag and benchmark tokens/sec speedup at batch sizes {1,8,32}": "Hard",
|
| 162 |
+
"Train an online knowledge-distillation SFT: teacher M0 -> student M on dataset D; log KL divergence, token agreement, and throughput; cap metric drop at <=2% vs teacher": "Hard",
|
| 163 |
+
"Deploy a multi-region vLLM service on Kubernetes with adaptive batching and hot LoRA adapter loading; sustain 200 QPS with p95 latency <300 ms and zero-downtime rollouts": "Very hard",
|
| 164 |
+
"Build a sharded cross-encoder reranking service with Ray: distribute ColBERT scoring across nodes, integrate with FAISS retrieval, and maintain recall@10 within 1% of single-node baseline at 500 QPS": "Very hard",
|
| 165 |
+
"List four Spaces that perform multilingual OCR with layout extraction; return Space ids and supported languages": "Easy",
|
| 166 |
+
"Find five Hub datasets for code generation evaluation with permissive licenses; output dataset ids and license names": "Easy",
|
| 167 |
+
"Add gradient accumulation and gradient clipping to a Transformers Trainer finetune of model M; report step time, peak VRAM, and validation metric vs baseline": "Medium",
|
| 168 |
+
"Implement document chunking with sliding windows and overlap in a Datasets map pipeline; add doc_id and span indices and verify no segment exceeds max_length": "Medium",
|
| 169 |
+
"Export a fine-tuned BERT model to TorchScript and ONNX; verify logits parity (MSE < 1e-4) on 1,000 samples and compare CPU throughput": "Medium",
|
| 170 |
+
"Diagnose 'pad_token_id is not set' warnings during generation; add a PAD token, resize embeddings, and write a unit test asserting identical logits pre/post fix on 200 prompts": "Medium",
|
| 171 |
+
"Implement diverse beam search (group_beam_search) for model M; evaluate on dataset D and report ROUGE-L, distinct-n, and average output length vs standard beam search": "Hard",
|
| 172 |
+
"Build a multi-modal RAG demo that indexes image captions with CLIP and uses LLM M to answer visual questions; report top-1 accuracy and p95 latency": "Hard",
|
| 173 |
+
"Profile activation and KV-cache memory during generation for model M; log per-layer footprints and reduce peak usage via attention slicing; show tokens/sec and VRAM deltas": "Hard",
|
| 174 |
+
"Construct a 200M-document FAISS hybrid (IVF-PQ + HNSW) index with memory-mapped shards on S3; support live add/delete and benchmark recall@10 and QPS at 300 QPS": "Very hard",
|
| 175 |
+
"List five Hub datasets tagged 'topic-modeling' with MIT or Apache-2.0 licenses; output dataset ids": "Easy",
|
| 176 |
+
"Find three Spaces that offer real-time grammar correction with streaming tokens; return Space ids and URLs": "Easy",
|
| 177 |
+
"Convert a spaCy en_core_web_trf NER model to ONNX and wrap it in a Transformers TokenClassification pipeline; verify entity text/label/span parity on 1,000 sentences": "Medium",
|
| 178 |
+
"Set up a GitHub Actions workflow that snapshots tokenizer T weekly and fails if vocab or special token ids drift vs the last snapshot; upload a diff artifact": "Medium",
|
| 179 |
+
"Profile a Datasets map pipeline on corpus C; refactor to use batched=True, num_proc>1, and caching; achieve >=2\u00d7 speedup while preserving deterministic ordering across runs": "Medium",
|
| 180 |
+
"Implement a custom Transformers StoppingCriteria that halts when JSON braces are balanced or max nesting depth is reached; add unit tests and benchmark latency overhead on dataset D": "Hard",
|
| 181 |
+
"Build a visual-and-tabular RAG pipeline: index images with CLIP and CSV tables with TAPAS; answer mixed queries using LLM M; report EM@1 and p95 latency at 50 QPS": "Hard",
|
| 182 |
+
"Enable KV-cache int4 quantization during generation in Transformers for model M; compare tokens/sec and exact match vs fp16 cache on dataset D; keep metric drop <=1%": "Hard",
|
| 183 |
+
"Implement a hot-reloadable sharded FAISS IVF-PQ index for multilingual-e5-base with live add/delete and background re-training; sustain 200 QPS with p95 latency <400 ms across 3 nodes": "Very hard",
|
| 184 |
+
"Deploy a geo-distributed vLLM + LoRA adapter gateway across two regions with consistent hashing and zero-downtime adapter updates; ensure identical outputs across 3 seeds and report cross-region p95 latency": "Very hard",
|
| 185 |
+
"List five Hub LLM repos that disclose training token counts in their model cards; output model ids and token totals": "Easy",
|
| 186 |
+
"Find two ready-to-use Spaces for speaker diarization compatible with Whisper; return Space ids and URLs": "Easy",
|
| 187 |
+
"Create a hashing-based dataset splitter using column 'doc_id' to produce reproducible train/valid/test; verify identical splits across two machines and Python versions": "Medium",
|
| 188 |
+
"Resolve HTTP 403 when creating an organization dataset via the Hub API; diagnose token scopes and org permissions; provide a minimal repro script and the fix": "Medium",
|
| 189 |
+
"Export a PEFT LoRA adapter from a fine-tuned Llama checkpoint as standalone safetensors with a correct adapter_config.json; push to the Hub and verify PEFT.from_pretrained loads it": "Medium",
|
| 190 |
+
"Enable multi-query attention in model M within Transformers; benchmark tokens/sec and peak VRAM vs multi-head attention and verify perplexity parity over 2,000 steps": "Hard",
|
| 191 |
+
"Audit code dataset D for contamination against {HumanEval, MBPP} using exact substring and 3-gram Jaccard >= 0.9; publish per-source contamination rates and a cleaned dataset": "Hard",
|
| 192 |
+
"Implement contrastive search decoding for model M with tunable alpha; compare ROUGE-L, distinct-n, and latency vs nucleus sampling on dataset D": "Hard",
|
| 193 |
+
"Implement pipeline parallelism for model M across 4 GPUs with Accelerate; achieve near-linear scaling (<=15% gap), support checkpoint save/restore, and ensure deterministic outputs across 3 seeds": "Very hard",
|
| 194 |
+
"Deploy a Spaces app that serves two ASR models with automatic language ID routing; maintain real-time factor <= 0.6 on a single T4 and log per-language latency": "Hard",
|
| 195 |
+
"Benchmark JSON-constrained decoding across models {M_i}; report JSON validity rate, exact match on dataset D, and p95 latency under streaming": "Hard",
|
| 196 |
+
"Filter a multilingual dataset D to non-English using fastText language ID; recreate stratified splits and report per-language retention and drop rates": "Medium",
|
| 197 |
+
"Enable paged attention in a custom Transformers generation loop for model M; verify token-level parity on 500 prompts and measure peak VRAM change": "Hard",
|
| 198 |
+
"Shard a 1B-token text corpus into deterministic HF Datasets processing across 16 workers; validate byte-for-byte identical outputs across two runs": "Very hard",
|
| 199 |
+
"Compare LoRA vs QLoRA fine-tunes of Mistral-7B on GSM8K; track loss, exact match, and throughput; select the lowest-VRAM config within 2% EM of best": "Hard",
|
| 200 |
+
"Deploy a quantized T5 encoder-decoder on Triton Inference Server via a Python backend; add token streaming and achieve >=1.5x throughput vs PyTorch baseline": "Hard",
|
| 201 |
+
"Find three Spaces that perform audio source separation (vocals/music); return Space ids and reported sample rates": "Easy",
|
| 202 |
+
"Merge a PEFT IA3 adapter stack into Llama-3-8B base weights; verify perplexity drift <=0.3% on WikiText-103 and export safetensors": "Hard",
|
| 203 |
+
"Resolve DeepSpeed ZeRO-3 stalls during S3 checkpointing; implement async multipart uploads and show stable 5-minute checkpoint cadence over 2 hours": "Very hard",
"Set up CI to run contamination checks on dataset R against {TruthfulQA, SQuAD} using 4-gram overlap; fail if rate >0.5% and attach offending ids as artifacts": "Medium",
"List four Hub datasets for sarcasm detection in English; return dataset ids and license tags": "Easy",
"Identify whether tokenizer T enables byte_fallback in tokenizer.json; output true/false and the file path": "Easy",
"Find three Spaces that showcase streaming chat with token-by-token updates; return Space ids and whether they use SSE or websockets": "Easy",
"Create a Datasets loader that parses Praat TextGrid files into word-level timestamps aligned with audio; publish a dataset with an 'audio' column and validate 100 sample alignments": "Medium",
"Set up a GitHub Actions workflow that lints model cards for repos {R_i} to require intended use, training data, and limitations; fail PRs and post a summary comment on violations": "Medium",
"Containerize a Gradio Space with optional FlashAttention build: detect GPU capability at startup, compile kernels if supported, and fall back gracefully on unsupported GPUs; test on T4 and A100": "Medium",
"Evaluate long-context retrieval via needle-in-a-haystack for models {M_i} at context lengths {8k, 32k, 64k}; report retrieval accuracy, tokens/sec, and the max stable context length": "Hard",
"Implement a curriculum sampler as a HuggingFace Trainer callback that schedules sample difficulty over epochs; compare convergence and final eval metrics vs random sampling": "Hard",
"Add on-the-fly near-duplicate filtering during training using SimHash over token ids; log per-epoch removal rates and verify no convergence regressions vs a deduplicated baseline": "Hard",
"Deploy a dual-backend inference router using vLLM and TensorRT-LLM that selects backend per prompt length to minimize latency; maintain deterministic outputs across 3 seeds and sustain 300 QPS with p95 latency SLOs": "Very hard",
"Identify max_position_embeddings and whether rope_scaling is enabled for model M from its config; output both values.": "Easy",
"List five Vision Transformer models on the Hub that provide safetensors and have a default image size >= 384; output model ids.": "Easy",
"Find three Spaces that stream machine-translation outputs token-by-token; return Space ids and whether they use SSE or websockets.": "Easy",
"Diagnose bursts of [UNK] after adding special tokens to tokenizer T; enable byte_fallback, retrain embeddings for 2k steps, and show unknown-token rate <= baseline+0.1% on corpus C.": "Medium",
"Create a dataset viewer Space for a dataset with a nested JSON column; convert to Arrow struct arrays, implement server-side filtering on nested keys, and verify row counts match the source.": "Medium",
"Set up a GitHub Action that hits /health and a no-op inference on Space S after each deploy; fail if cold-start median latency >10s and attach server logs as an artifact.": "Medium",
"Implement a SQL grammar-constrained Transformers LogitsProcessor using an LL(1) parser; evaluate on Spider dev and report exact match and p95 latency overhead vs nucleus sampling.": "Hard",
"Add CPU-tier KV-cache offloading with pinned memory for model M in a custom generation loop; compare tokens/sec and peak VRAM vs baseline at context lengths {4k, 16k, 32k}.": "Hard",
"Deploy a batched cross-encoder reranker microservice using bge-reranker-base; keep recall@10 within 1% of single-request baseline and achieve >=2\u00d7 QPS at 100 concurrent users.": "Hard",
"Build a heterogeneous inference gateway that routes requests to vLLM or llama.cpp based on prompt length and GPU load; ensure identical normalized outputs across 3 seeds and sustain 200 QPS with p95 latency <300 ms.": "Very hard",
"Determine whether tokenizer T strips accents (strip_accents); output true/false and the file path where the setting is defined.": "Easy",
"List four Hub datasets for hate-speech detection in English; return dataset ids and license tags.": "Easy",
"Write a Datasets loader for a paginated OAuth2 REST API; cache pages, support streaming, and provide deterministic sharding across 8 workers; verify identical row counts across two runs.": "Medium",
"Add request-level caching (ETag/If-None-Match) to a Gradio summarization Space; achieve >=1.8\u00d7 QPS at 50 concurrent users and report cache hit ratio and p95 latency.": "Medium",
"Enable HuggingFace tokenizers parallelism and batched encoding for corpus C; benchmark throughput and memory on 10M lines and ensure deterministic outputs across 3 runs.": "Medium",
"Set up CI to lint dataset cards in repos {R_i} for required fields {license, citation, dataset_summary}; fail PRs and post a summary comment with missing keys.": "Medium",
"Run a parameter-efficient finetuning sweep comparing LoRA, IA3, and prefix-tuning on RoBERTa-base for MNLI; report accuracy, training time, and peak VRAM; select a Pareto-optimal config.": "Hard",
"Implement a Transformers LogitsProcessor that enforces balanced parentheses and proper quoted-string escaping; add unit tests and benchmark latency overhead on dataset D.": "Hard",
"Export Whisper-medium to ONNX with dynamic axes and int8 weights; verify word-timestamp parity on 500 clips and measure CPU real-time factor improvement >=1.3\u00d7 vs PyTorch.": "Hard",
"Deploy a geo-replicated RAG service: shard FAISS HNSW across three regions with conflict-free index metadata sync; sustain 300 QPS with p95 latency <450 ms and recall@10 within 1% of single-region baseline.": "Very hard",
"Compare cased vs uncased tokenization for BERT on CoNLL-2003 NER; train both, and report F1, average tokens per sentence, and training time.": "Medium",
"Create a HuggingFace Datasets loader for EPUB files: extract chapter text and embedded images into Arrow columns, support streaming and deterministic sharding across 8 workers; verify identical row counts across two runs.": "Medium",
"Configure a Hub webhook to trigger CI when a model card (README.md) changes; fail the job if sections {intended use, limitations} are missing and post a checklist comment on the PR.": "Medium",
"Add a reranking cache to a RAG service keyed by (query, candidate_ids); achieve >=50% cache hit at 100 QPS and keep recall@10 within 0.5% of baseline.": "Hard",
"Fix torch.compile graph breaks in a Transformers training loop; patch non-compilable ops, re-enable compilation, and demonstrate >=1.4\u00d7 step-time speedup with matching loss over 2,000 steps.": "Hard",
"Compute 95% bootstrap confidence intervals for ROUGE-L on dataset D over 3 random seeds; flag regressions when the new CI lies entirely below last week's baseline CI.": "Medium",
"Build a batch image-captioning Space with ViT-GPT2: accept ZIP uploads, use queue-based batching, and keep p95 latency <2s for 32 images.": "Medium",
"Implement hybrid parallelism (tensor + pipeline) for a 13B encoder-decoder using Accelerate; scale across 8 GPUs with <=15% gap from linear, support elastic resize (8->6 GPUs) without losing determinism, and verify checkpoint save/restore.": "Very hard",
"Find five Spaces that stream live vision-language captioning (e.g., LLaVA or BLIP); return Space ids and reported FPS.": "Easy",
"Identify whether tokenizer T applies Unicode normalization (NFKC/NFC/NFD/NFKD) and where it is configured; output the mode and file path.": "Easy",
"Identify whether model repo M stores weights exclusively as safetensors; output true/false and list the .safetensors file paths.": "Easy",
"List three multilingual sentence-embedding models on the Hub that provide ONNX exports; return model ids.": "Easy",
"Determine if tokenizer T lowercases text (do_lower_case or lowercase flag); output true/false and the file path or JSON key where it is set.": "Easy",
"Set up a GitHub Action to run a smoke-test text generation for model M on each push; fail if median time to first token >2s and attach container logs as an artifact.": "Medium",
"Create a Datasets preprocessing pipeline that tokenizes to max_length=512 with stride=64 and retains an 'orig_text' column; verify row counts match input and no NaNs after caching.": "Medium",
"Resolve 'git-lfs: command not found' when pushing model repo R to the Hub; install and configure Git LFS, set an appropriate large file threshold, and provide a minimal repro plus the verified fix.": "Medium",
"Enable KV-cache CPU offloading in a custom Transformers generation loop for model M; benchmark tokens/sec and peak VRAM vs baseline at context lengths {4k, 8k}.": "Hard",
"Implement LoRA rank warmup (r: 4\u219232 over the first 1,000 steps) in a custom Trainer; fine-tune model M on dataset D and report validation perplexity and peak VRAM vs fixed r=32.": "Hard",
"Export Whisper-small to TensorRT via ONNX (opset 18) with dynamic axes; verify word-timestamp parity (median diff \u22640.05s) on 300 clips and measure \u22651.3\u00d7 GPU speedup vs PyTorch.": "Hard",
"Deploy a multi-tenant RAG service that hot-loads per-tenant FAISS indices from S3, shares a reranker, and sustains 200 QPS with p95 latency <350 ms across 1,000 tenants; maintain recall@10 within 1% of a single-tenant baseline.": "Very hard"
}
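
A minimal sketch (illustrative only, not code from this commit) of how this flat task-to-difficulty map might be loaded and summarized before batch solving; it assumes the file is the JSON object above, stored at eval/generated_tasks_with_difficulty.json:

import json
from collections import Counter
from pathlib import Path

# Load the task -> difficulty mapping (dict: task prompt -> difficulty label).
tasks_path = Path("eval") / "generated_tasks_with_difficulty.json"
with open(tasks_path) as f:
    difficulty_by_task = json.load(f)

# Count how many tasks fall into each difficulty tier.
counts = Counter(difficulty_by_task.values())
for tier in ("Easy", "Medium", "Hard", "Very hard"):
    print(f"{tier}: {counts.get(tier, 0)}")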