akseljoonas (HF Staff) committed
Commit 158d846 · 1 Parent(s): 8f4b322

intermediate commit until i let amp loose

eval/.amp_batch_solve.py.swp ADDED
Binary file (12.3 kB).
 
eval/amp_batch_solve.py ADDED
@@ -0,0 +1,106 @@
+ import asyncio
+ import json
+ import os
+ from pathlib import Path
+ import threading
+
+ from amp_sdk import AmpOptions, execute
+
+ # Thread-safe file writing
+ file_lock = threading.Lock()
+
+
+ async def solve_task(
+     question: str, difficulty: str, task_idx: int, total: int, semaphore: asyncio.Semaphore
+ ) -> dict:
+     """Solve a single task using Amp SDK."""
+     async with semaphore:
+         print(f"[{task_idx}/{total}] Starting: {question[:60]}...")
+
+         messages = []
+         solution = None
+
+         try:
+             async for message in execute(
+                 question,
+                 AmpOptions(
+                     cwd=os.getcwd(),
+                     visibility="workspace",
+                     dangerously_allow_all=True,
+                 ),
+             ):
+                 messages.append(message.model_dump())
+
+                 # Extract the final text response as the solution
+                 if message.type == "assistant":
+                     content = message.message.get("content", [])
+                     for item in content:
+                         if isinstance(item, dict) and item.get("type") == "text":
+                             solution = item.get("text")
+                 elif message.type == "result":
+                     if message.result:
+                         solution = message.result
+
+             print(f"[{task_idx}/{total}] ✓ Done: {question[:60]}...")
+             return {
+                 "question": question,
+                 "difficulty": difficulty,
+                 "solution": solution,
+                 "messages": messages,
+             }
+         except Exception as e:
+             print(f"[{task_idx}/{total}] ✗ Error: {e}")
+             return {
+                 "question": question,
+                 "difficulty": difficulty,
+                 "solution": None,
+                 "messages": messages,
+                 "error": str(e),
+             }
+
+
+ def write_result(output_path: Path, result: dict):
+     """Thread-safe write to output file."""
+     with file_lock:
+         with open(output_path, "a") as f:
+             f.write(json.dumps(result) + "\n")
+
+
+ async def main():
+     # Load tasks
+     tasks_path = Path(__file__).parent / "generated_tasks_with_difficulty.json"
+     with open(tasks_path) as f:
+         tasks = json.load(f)
+
+     # Output file - clear it first
+     output_path = Path(__file__).parent / "solved_tasks.jsonl"
+     output_path.write_text("")
+
+     # Semaphore to limit concurrency
+     max_concurrent = 20
+     semaphore = asyncio.Semaphore(max_concurrent)
+
+     total = len(tasks)
+     print(f"Processing {total} tasks with {max_concurrent} concurrent agents...")
+
+     async def process_and_save(question: str, difficulty: str, idx: int):
+         result = await solve_task(question, difficulty, idx, total, semaphore)
+         write_result(output_path, result)
+         return result
+
+     # Create all tasks
+     coroutines = [
+         process_and_save(question, difficulty, i + 1)
+         for i, (question, difficulty) in enumerate(tasks.items())
+     ]
+
+     # Run all concurrently (semaphore limits actual parallelism)
+     results = await asyncio.gather(*coroutines, return_exceptions=True)
+
+     successful = sum(1 for r in results if isinstance(r, dict) and "error" not in r)
+     print(f"\nCompleted: {successful}/{total} successful")
+     print(f"Results saved to {output_path}")
+
+
+ if __name__ == "__main__":
+     asyncio.run(main())
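
Note: a minimal sketch of consuming the solved_tasks.jsonl written above, assuming only the record schema the script emits (question, difficulty, solution, messages, optional error); the relative path is an assumption based on the script writing next to itself in eval/.

import json
from collections import Counter
from pathlib import Path

# Path assumed: the script above writes solved_tasks.jsonl next to itself in eval/.
results_path = Path("eval") / "solved_tasks.jsonl"

by_difficulty = Counter()
failures = []
with open(results_path) as f:
    for line in f:
        record = json.loads(line)
        # A record with an "error" key or a null solution counts as a failure.
        if "error" in record or record.get("solution") is None:
            failures.append(record["question"])
        else:
            by_difficulty[record["difficulty"]] += 1

print("Solved per difficulty:", dict(by_difficulty))
print(f"Failed or empty: {len(failures)}")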
eval/amp_solve.py ADDED
@@ -0,0 +1,31 @@
+ import asyncio
+ import os
+
+ from amp_sdk import AmpOptions, execute
+
+ prompt = """
+ what account am I logged in as?
+ """
+
+
+ async def main():
+     # Use the toolbox directory to share tools with Amp
+     toolbox_dir = os.path.join(os.getcwd(), "toolbox")
+     messages = []
+     async for message in execute(
+         prompt,
+         AmpOptions(
+             cwd=os.getcwd(),
+             toolbox=toolbox_dir,
+             visibility="workspace",
+             dangerously_allow_all=True,
+         ),
+     ):
+         messages.append(message)
+
+     for msg in messages:
+         print(msg.model_dump_json(indent=2))
+
+
+ if __name__ == "__main__":
+     asyncio.run(main())
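
Note: a minimal sketch for pulling just the assistant's text out of the collected messages instead of dumping full JSON, assuming the same message shape amp_batch_solve.py relies on (message.type == "assistant" with text items under message.message["content"]).

def print_assistant_text(messages) -> None:
    # Walk the collected SDK messages and print only assistant text items,
    # mirroring the extraction loop in amp_batch_solve.py; the message shape
    # is assumed from that script, not confirmed against amp_sdk docs.
    for message in messages:
        if getattr(message, "type", None) != "assistant":
            continue
        for item in message.message.get("content", []):
            if isinstance(item, dict) and item.get("type") == "text":
                print(item["text"])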
eval/eval_set.ipynb ADDED
@@ -0,0 +1,359 @@
+ {
+  "cells": [
+   {
+    "cell_type": "code",
+    "execution_count": 6,
+    "id": "19f3dd6b",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# Seed examples for task bootstrapping\n",
+     "tasks_with_difficulty = {\n",
+     "    # lewis\n",
+     "    \"Evaluate models {M_i} on benchmarks {B_i}\": \"Easy\",\n",
+     "    \"Train models {M_i} on datasets {D_i} with benchmarks {B_i}\": \"Medium\",\n",
+     "    \"Run an ablation for hyperparameter P for model M on dataset D\": \"Hard\",\n",
+     "    \"Generate completions with model M on dataset D using engine E\": \"Medium\",\n",
+     "    \"Merge models {M_i} using linear averaging to find the best result on benchmarks {B_i}\": \"Hard\",\n",
+     "    \"Given datasets {D_i}, ablate the best SFT mixture for model M across benchmarks {B_i}\": \"Very hard\",\n",
+     "    \"Decontaminate dataset D against benchmarks {B_i}\": \"Hard\",\n",
+     "    \"Benchmark RL framework F for best throughput on G GPUs\": \"Very hard\",\n",
+     "    \"Implement post-training algorithm A from paper P in framework F. Validate it runs end-to-end\": \"Very hard\",\n",
+     "    \"Implement benchmark B in framework F. Validate it reproduces some published results\": \"Very hard\",\n",
+     "    \"Format dataset D for compatibility with framework F on task T\": \"Easy\",\n",
+     "\n",
+     "    # abubakar\n",
+     "    \"Remove the background from this image: [image path]\": \"Easy\",\n",
+     "    \"Transcribe all of the audio files in this directory\": \"Easy\",\n",
+     "    \"Transcribe all of the audio files in this directory, choose the model that'll be cheapest and also relatively accurate\": \"Medium (judgment call or interaction needed to figure out what accuracy levels are acceptable)\",\n",
+     "    \"Remove the background music from this audio file\": \"Medium (needs to find Gradio Space and call its API)\",\n",
+     "    \"Change this video track to be from English to Spanish\": \"Medium (needs to link several models together)\",\n",
+     "    \"Translate this flyer from English to Spanish, keeping the layout and images the same\": \"Medium (needs to link several models together)\",\n",
+     "\n",
+     "    # leandro\n",
+     "    \"What's the best model for X?\": \"Easy\",\n",
+     "    \"What datasets are available for X? (X={domain x task x modality})\": \"Easy\",\n",
+     "    \"Is there a space to do Y?\": \"Easy\",\n",
+     "    \"I have this script and this error - what's the issue?\": \"Medium\",\n",
+     "    \"This space is broken, how can i fix it?\": \"Medium\",\n",
+     "    \"I built a space but it is super slow. What can I do?\": \"Medium\",\n",
+     "    \"How can I run model X locally?\": \"Medium\",\n",
+     "    \"I want to build a space with model Y to do X?\": \"Hard\",\n",
+     "    \"How can I serve a model with multiple LoRAs?\": \"Hard\",\n",
+     "\n",
+     "    # claude\n",
+     "    \"What's the best model for sentiment analysis on financial text?\": \"Easy\",\n",
+     "    \"Are there any medical image segmentation datasets on HuggingFace for CT scans?\": \"Easy\",\n",
+     "    \"Which text classification models support 4-bit quantization?\": \"Medium\",\n",
+     "    \"Are there inference endpoints available for Whisper large-v3?\": \"Easy\",\n",
+     "    \"What's the license for the SA-Med2D-20M dataset?\": \"Easy\",\n",
+     "    \"Which vision models fit in 8GB VRAM for image segmentation?\": \"Medium\",\n",
+     "    \"What datasets are available for 3D medical image segmentation?\": \"Medium\",\n",
+     "    \"Is there a space to do text-to-speech with emotion control?\": \"Medium\",\n",
+     "    \"I'm getting \\\"CUDA out of memory\\\" when loading Llama-2-7b even though nvidia-smi shows I have 6GB free - what's the issue?\": \"Medium\",\n",
+     "    \"My Gradio space shows \\\"Connection errored out\\\" after working fine yesterday, no code changes - how can I fix it?\": \"Medium\",\n",
+     "    \"I built a Gradio space for Stable Diffusion but inference takes 5+ minutes on a 4090 - what can I do?\": \"Medium\",\n",
+     "    \"My Whisper model outputs different transcriptions after quantization to int8 - why?\": \"Medium\",\n",
+     "    \"Getting \\\"RuntimeError: CUDA error: out of memory. Tried to allocate 70.00 MiB\\\" but only 2.87 GiB is allocated - what's happening?\": \"Medium\",\n",
+     "    \"My HuggingFace space build fails with \\\"failed to create containerd task\\\" - how to fix?\": \"Medium\",\n",
+     "    \"DistilBERT model gives \\\"you should probably train your model\\\" warning even though it's a pretrained model from the Hub\": \"Easy\",\n",
+     "    \"Space was working fine but now receiving build errors - receiving this error even with a new space\": \"Medium\",\n",
+     "    \"Inference is correct locally but wrong on deployed space\": \"Medium\",\n",
+     "    \"Getting CUDA OOM despite having enough memory according to nvidia-smi\": \"Medium\",\n",
+     "    \"How can I run Mistral-7B-v0.1 locally with multiple LoRA adapters?\": \"Hard\",\n",
+     "    \"How can I serve Llama-2-7b with vLLM and dynamically load multiple LoRA adapters?\": \"Hard\",\n",
+     "    \"How do I batch inference requests in my Gradio space for better throughput?\": \"Medium\",\n",
+     "    \"Can I run Whisper large-v3 with faster-whisper for 4x speedup?\": \"Medium\",\n",
+     "    \"How to run Llama 2 on CPU after fine-tuning with LoRA?\": \"Medium\",\n",
+     "    \"Best way to handle 50+ concurrent requests in a Gradio space without OOM?\": \"Hard\",\n",
+     "    \"How do I add custom stopping criteria for text generation with Transformers?\": \"Hard\",\n",
+     "    \"Can I merge multiple LoRA adapters before inference to reduce latency?\": \"Hard\",\n",
+     "    \"How can I optimize my LLM inference with one base LLM and multiple LoRA adapters?\": \"Hard\",\n",
+     "}\n"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 7,
+    "id": "c7014bef",
+    "metadata": {},
+    "outputs": [
+     {
+      "data": {
+       "text/plain": [
+        "53"
+       ]
+      },
+      "execution_count": 7,
+      "metadata": {},
+      "output_type": "execute_result"
+     }
+    ],
+    "source": [
+     "len(tasks_with_difficulty)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "3a8bd7ed",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "import litellm\n",
+     "import json\n",
+     "from pydantic import BaseModel\n",
+     "from enum import Enum\n",
+     "\n",
+     "\n",
+     "class Difficulty(str, Enum):\n",
+     "    EASY = \"Easy\"\n",
+     "    MEDIUM = \"Medium\"\n",
+     "    HARD = \"Hard\"\n",
+     "    VERY_HARD = \"Very hard\"\n",
+     "\n",
+     "\n",
+     "class Task(BaseModel):\n",
+     "    description: str\n",
+     "    difficulty: Difficulty\n",
+     "\n",
+     "\n",
+     "class GeneratedTasks(BaseModel):\n",
+     "    tasks: list[Task]\n",
+     "\n",
+     "\n",
+     "def build_prompt(tasks_dict: dict[str, str]) -> str:\n",
+     "    task_descriptions = \"\".join(\n",
+     "        [f'- \"{task}\" [{difficulty}]\\n' for task, difficulty in tasks_dict.items()]\n",
+     "    )\n",
+     "\n",
+     "    return f\"\"\"Given the following examples of tasks (with their estimated difficulty levels in brackets):\n",
+     "\n",
+     "{task_descriptions}\n",
+     "\n",
+     "Generate exactly 10 new unique tasks with their difficulty levels (Easy, Medium, Hard, or Very hard).\n",
+     "The new tasks should be bootstrapped by analogy or creative mutation of the provided ones, but not be direct copies.\n",
+     "Vary the domains, instructions, and scenario details. Write crisp, concrete task phrasing. Preserve variety in both tasks and difficulties.\n",
+     "Do not repeat any of the input tasks verbatim. Create plausible, meaningful tasks relevant to LLM training, evaluation, data processing, issue handling, tooling, etc.\n",
+     "\"\"\"\n",
+     "\n"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 10,
+    "id": "85ef3dcb",
+    "metadata": {},
+    "outputs": [
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "Iteration 1/20: Added 10 new tasks. Total: 63\n",
+       "Iteration 2/20: Added 10 new tasks. Total: 73\n",
+       "Iteration 3/20: Added 10 new tasks. Total: 83\n",
+       "Iteration 4/20: Added 10 new tasks. Total: 93\n",
+       "Iteration 5/20: Added 10 new tasks. Total: 103\n",
+       "Iteration 6/20: Added 10 new tasks. Total: 113\n",
+       "Iteration 7/20: Added 10 new tasks. Total: 123\n",
+       "Iteration 8/20: Added 10 new tasks. Total: 133\n",
+       "Iteration 9/20: Added 10 new tasks. Total: 143\n",
+       "Iteration 10/20: Added 10 new tasks. Total: 153\n",
+       "Iteration 11/20: Added 10 new tasks. Total: 163\n",
+       "Iteration 12/20: Added 10 new tasks. Total: 173\n",
+       "Iteration 13/20: Added 10 new tasks. Total: 183\n",
+       "Iteration 14/20: Added 10 new tasks. Total: 193\n",
+       "Iteration 15/20: Added 10 new tasks. Total: 203\n",
+       "Iteration 16/20: Added 10 new tasks. Total: 213\n",
+       "Iteration 17/20: Added 10 new tasks. Total: 223\n",
+       "Iteration 18/20: Added 10 new tasks. Total: 233\n",
+       "Iteration 19/20: Added 10 new tasks. Total: 243\n",
+       "Iteration 20/20: Added 10 new tasks. Total: 253\n",
+       "\n",
+       "Final task count: 253\n"
+      ]
+     }
+    ],
+    "source": [
+     "model_name = \"gpt-5\"\n",
+     "\n",
+     "# Number of iterations to generate tasks (10 tasks per iteration)\n",
+     "num_iterations = 20\n",
+     "\n",
+     "# Copy the seed tasks to avoid modifying the original\n",
+     "all_tasks = tasks_with_difficulty.copy()\n",
+     "\n",
+     "for i in range(num_iterations):\n",
+     "    prompt = build_prompt(all_tasks)\n",
+     "\n",
+     "    # Query LLM using litellm with structured output\n",
+     "    response = litellm.completion(\n",
+     "        model=model_name,\n",
+     "        messages=[\n",
+     "            {\n",
+     "                \"role\": \"system\",\n",
+     "                \"content\": \"You are an expert at generating diverse ML/AI task instructions using products from HuggingFace and can enumerate them with proper difficulty.\",\n",
+     "            },\n",
+     "            {\"role\": \"user\", \"content\": prompt},\n",
+     "        ],\n",
+     "        response_format=GeneratedTasks,\n",
+     "    )\n",
+     "\n",
+     "    # Parse the structured output\n",
+     "    generated = GeneratedTasks.model_validate_json(\n",
+     "        response.choices[0].message.content\n",
+     "    )\n",
+     "\n",
+     "    # Add new tasks to the dictionary\n",
+     "    new_count = 0\n",
+     "    for task in generated.tasks:\n",
+     "        if task.description not in all_tasks:\n",
+     "            all_tasks[task.description] = task.difficulty.value\n",
+     "            new_count += 1\n",
+     "\n",
+     "    print(f\"Iteration {i + 1}/{num_iterations}: Added {new_count} new tasks. Total: {len(all_tasks)}\")\n",
+     "\n",
+     "# Save to disk\n",
+     "with open(\"generated_tasks_with_difficulty.json\", \"w\") as f:\n",
+     "    json.dump(all_tasks, f, indent=2)\n",
+     "\n",
+     "print(f\"\\nFinal task count: {len(all_tasks)}\")\n"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 16,
+    "id": "9c0ad570",
+    "metadata": {},
+    "outputs": [
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "\n",
+       "Dataset: 253 rows\n",
+       "Sample: Evaluate models {M_i} on benchmarks {B_i} (Easy)\n"
+      ]
+     }
+    ],
+    "source": [
+     "from datasets import Dataset\n",
+     "\n",
+     "# Convert dict to proper columns\n",
+     "questions = list(all_tasks.keys())\n",
+     "difficulties = list(all_tasks.values())\n",
+     "data = {\"question\": questions, \"difficulty\": difficulties}\n",
+     "\n",
+     "dataset = Dataset.from_dict(data)\n",
+     "print(f\"\\nDataset: {len(dataset)} rows\")\n",
+     "print(f\"Sample: {dataset[0]['question']} ({dataset[0]['difficulty']})\")\n"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 17,
+    "id": "427a2186",
+    "metadata": {},
+    "outputs": [
+     {
+      "data": {
+       "application/vnd.jupyter.widget-view+json": {
+        "model_id": "b038f2a6afe84208820c1997e5d15096",
+        "version_major": 2,
+        "version_minor": 0
+       },
+       "text/plain": [
+        "Uploading the dataset shards: 0%| | 0/1 [00:00<?, ? shards/s]"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     },
+     {
+      "data": {
+       "application/vnd.jupyter.widget-view+json": {
+        "model_id": "9e1c6a36740846fa9b25293abdd4a5e4",
+        "version_major": 2,
+        "version_minor": 0
+       },
+       "text/plain": [
+        "Creating parquet from Arrow format: 0%| | 0/1 [00:00<?, ?ba/s]"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     },
+     {
+      "data": {
+       "application/vnd.jupyter.widget-view+json": {
+        "model_id": "1b37133b27ec49c5a45d73e8d58f0c5a",
+        "version_major": 2,
+        "version_minor": 0
+       },
+       "text/plain": [
+        "Processing Files (0 / 0): | | 0.00B / 0.00B "
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     },
+     {
+      "data": {
+       "application/vnd.jupyter.widget-view+json": {
+        "model_id": "1467e2d055ab42aebd2966972ee54e5b",
+        "version_major": 2,
+        "version_minor": 0
+       },
+       "text/plain": [
+        "New Data Upload: | | 0.00B / 0.00B "
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     },
+     {
+      "data": {
+       "text/plain": [
+        "CommitInfo(commit_url='https://huggingface.co/datasets/akseljoonas/benchmark-tasks/commit/a96debee2c67ef760ecaea69296f2059f449fad6', commit_message='Upload dataset', commit_description='', oid='a96debee2c67ef760ecaea69296f2059f449fad6', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/akseljoonas/benchmark-tasks', endpoint='https://huggingface.co', repo_type='dataset', repo_id='akseljoonas/benchmark-tasks'), pr_revision=None, pr_num=None)"
+       ]
+      },
+      "execution_count": 17,
+      "metadata": {},
+      "output_type": "execute_result"
+     }
+    ],
+    "source": [
+     "dataset.push_to_hub(\"akseljoonas/benchmark-tasks\", private=True)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "50e67652",
+    "metadata": {},
+    "outputs": [],
+    "source": []
+   }
+  ],
+  "metadata": {
+   "kernelspec": {
+    "display_name": ".venv",
+    "language": "python",
+    "name": "python3"
+   },
+   "language_info": {
+    "codemirror_mode": {
+     "name": "ipython",
+     "version": 3
+    },
+    "file_extension": ".py",
+    "mimetype": "text/x-python",
+    "name": "python",
+    "nbconvert_exporter": "python",
+    "pygments_lexer": "ipython3",
+    "version": "3.12.11"
+   }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 5
+ }
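
Note: several seed values above (e.g. "Medium (judgment call or interaction needed ...)") carry parenthetical annotations and are therefore not valid members of the notebook's Difficulty enum. A minimal sketch to flag such entries in the saved JSON, reusing the enum definition from the notebook:

import json
from enum import Enum

# Difficulty enum copied from the notebook cell above.
class Difficulty(str, Enum):
    EASY = "Easy"
    MEDIUM = "Medium"
    HARD = "Hard"
    VERY_HARD = "Very hard"

with open("generated_tasks_with_difficulty.json") as f:
    tasks = json.load(f)

valid_levels = {d.value for d in Difficulty}
nonstandard = {q: d for q, d in tasks.items() if d not in valid_levels}
print(f"{len(nonstandard)}/{len(tasks)} entries carry annotated, non-enum difficulty labels")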
eval/generated_tasks_with_difficulty.json ADDED
@@ -0,0 +1,255 @@
+ {
+   "Evaluate models {M_i} on benchmarks {B_i}": "Easy",
+   "Train models {M_i} on datasets {D_i} with benchmarks {B_i}": "Medium",
+   "Run an ablation for hyperparameter P for model M on dataset D": "Hard",
+   "Generate completions with model M on dataset D using engine E": "Medium",
+   "Merge models {M_i} using linear averaging to find the best result on benchmarks {B_i}": "Hard",
+   "Given datasets {D_i}, ablate the best SFT mixture for model M across benchmarks {B_i}": "Very hard",
+   "Decontaminate dataset D against benchmarks {B_i}": "Hard",
+   "Benchmark RL framework F for best throughput on G GPUs": "Very hard",
+   "Implement post-training algorithm A from paper P in framework F. Validate it runs end-to-end": "Very hard",
+   "Implement benchmark B in framework F. Validate it reproduces some published results": "Very hard",
+   "Format dataset D for compatibility with framework F on task T": "Easy",
+   "Remove the background from this image: [image path]": "Easy",
+   "Transcribe all of the audio files in this directory": "Easy",
+   "Transcribe all of the audio files in this directory, choose the model that'll be cheapest and also relatively accurate": "Medium (judgment call or interaction needed to figure out what accuracy levels are acceptable)",
+   "Remove the background music from this audio file": "Medium (needs to find Gradio Space and call its API)",
+   "Change this video track to be from English to Spanish": "Medium (needs to link several models together)",
+   "Translate this flyer from English to Spanish, keeping the layout and images the same": "Medium (needs to link several models together)",
+   "What's the best model for X?": "Easy",
+   "What datasets are available for X? (X={domain x task x modality})": "Easy",
+   "Is there a space to do Y?": "Easy",
+   "I have this script and this error - what's the issue?": "Medium",
+   "This space is broken, how can i fix it?": "Medium",
+   "I built a space but it is super slow. What can I do?": "Medium",
+   "How can I run model X locally?": "Medium",
+   "I want to build a space with model Y to do X?": "Hard",
+   "How can I serve a model with multiple LoRAs?": "Hard",
+   "What's the best model for sentiment analysis on financial text?": "Easy",
+   "Are there any medical image segmentation datasets on HuggingFace for CT scans?": "Easy",
+   "Which text classification models support 4-bit quantization?": "Medium",
+   "Are there inference endpoints available for Whisper large-v3?": "Easy",
+   "What's the license for the SA-Med2D-20M dataset?": "Easy",
+   "Which vision models fit in 8GB VRAM for image segmentation?": "Medium",
+   "What datasets are available for 3D medical image segmentation?": "Medium",
+   "Is there a space to do text-to-speech with emotion control?": "Medium",
+   "I'm getting \"CUDA out of memory\" when loading Llama-2-7b even though nvidia-smi shows I have 6GB free - what's the issue?": "Medium",
+   "My Gradio space shows \"Connection errored out\" after working fine yesterday, no code changes - how can I fix it?": "Medium",
+   "I built a Gradio space for Stable Diffusion but inference takes 5+ minutes on a 4090 - what can I do?": "Medium",
+   "My Whisper model outputs different transcriptions after quantization to int8 - why?": "Medium",
+   "Getting \"RuntimeError: CUDA error: out of memory. Tried to allocate 70.00 MiB\" but only 2.87 GiB is allocated - what's happening?": "Medium",
+   "My HuggingFace space build fails with \"failed to create containerd task\" - how to fix?": "Medium",
+   "DistilBERT model gives \"you should probably train your model\" warning even though it's a pretrained model from the Hub": "Easy",
+   "Space was working fine but now receiving build errors - receiving this error even with a new space": "Medium",
+   "Inference is correct locally but wrong on deployed space": "Medium",
+   "Getting CUDA OOM despite having enough memory according to nvidia-smi": "Medium",
+   "How can I run Mistral-7B-v0.1 locally with multiple LoRA adapters?": "Hard",
+   "How can I serve Llama-2-7b with vLLM and dynamically load multiple LoRA adapters?": "Hard",
+   "How do I batch inference requests in my Gradio space for better throughput?": "Medium",
+   "Can I run Whisper large-v3 with faster-whisper for 4x speedup?": "Medium",
+   "How to run Llama 2 on CPU after fine-tuning with LoRA?": "Medium",
+   "Best way to handle 50+ concurrent requests in a Gradio space without OOM?": "Hard",
+   "How do I add custom stopping criteria for text generation with Transformers?": "Hard",
+   "Can I merge multiple LoRA adapters before inference to reduce latency?": "Hard",
+   "How can I optimize my LLM inference with one base LLM and multiple LoRA adapters?": "Hard",
+   "Compare tokenizers {T_i} for model M on tasks {classification, QA}; report accuracy and average sequence length per task": "Medium",
+   "Run a LoRA rank sweep (r in {4, 8, 16, 32}) for model M on dataset D; plot validation perplexity vs VRAM usage and select Pareto-optimal settings": "Hard",
+   "Build a streaming dataloader from Parquet on S3 with deterministic shuffling across N workers; validate epoch reproducibility": "Very hard",
+   "Find three open-source TTS models with emotion control and list their sample rates and licenses": "Easy",
+   "Create a retrieval-augmented QA pipeline: index corpus C with FAISS, connect to model M, and benchmark top-1 accuracy and p95 latency": "Hard",
+   "Diagnose a Space where memory grows per request; add no-grad guards, free caches, and demonstrate stable RSS over 10,000 calls": "Hard",
+   "Deduplicate dataset D using MinHash LSH at Jaccard >= 0.9 and publish a cleaned HF dataset with provenance columns": "Medium",
+   "Add special tokens to tokenizer T and resize model M embeddings; resume pretraining for 10k steps without loss spikes": "Hard",
+   "Create a HuggingFace Dataset from CSV file data.csv and push to repo username/my_dataset": "Easy",
+   "Build a real-time Whisper transcription Space with VAD and chunked decoding; keep end-to-end latency under 200 ms": "Hard",
+   "Quantize model M to 4-bit (bnb.int4) with bitsandbytes; compare perplexity and p95 latency to 8-bit on dataset D; select config with <1% perplexity increase": "Medium",
+   "Fuse LoRA adapter A into base model M and export a single safetensors checkpoint; verify logits parity (<1e-5 MSE) vs on-the-fly LoRA": "Hard",
+   "Redact PII from dataset D using a transformer NER pipeline; produce a cleaned HuggingFace Dataset with per-entity removal stats and provenance": "Medium",
+   "Train a SentencePiece tokenizer (vocab=64k, byte fallback) on corpus C; compare tokenization speed, unknown-token rate, and bytes/token vs tokenizer T": "Hard",
+   "Build a sharded FAISS IVF-PQ index for 100M embeddings stored on S3; integrate with HF datasets streaming and report recall@10 and QPS": "Very hard",
+   "Fine-tune model M with QLoRA using TRL PPO on dataset D; log KL, reward, and throughput; validate no divergence on a held-out eval": "Hard",
+   "Resolve HfHubHTTPError 401 when pushing dataset repo R: diagnose token scopes, git-lfs config, and large file thresholds; document the fix": "Medium",
+   "Implement a custom Transformers LogitsProcessor that bans repeated bigrams; add unit tests and benchmark generation quality (BLEU) on dataset D": "Hard",
+   "List and download all Hub models tagged 'text-classification' with Apache-2.0 license and size <500MB; save model ids and downloads to CSV": "Easy",
+   "Enable speculative decoding in vLLM with draft model D for base model M; benchmark tokens/sec speedup at batch sizes {1,4,16} and max_new_tokens {64,256}": "Very hard",
+   "Profile model M under torch.compile modes {reduce-overhead, max-autotune} on GPU G; report tokens/sec, peak VRAM, and compile overhead": "Medium",
+   "Detect and remove near-duplicate images in dataset D using CLIP ViT-L/14 embeddings at cosine >= 0.95; publish a cleaned dataset with duplicate_group ids": "Medium",
+   "Convert a TensorFlow SavedModel of T5-base to Transformers PyTorch format; verify logits parity (MSE < 1e-4) on 1,000 random prompts": "Hard",
+   "Enable FlashAttention-2 in a Transformers training loop for model M; benchmark step time and confirm loss parity over 2,000 steps vs baseline": "Hard",
+   "Deploy vLLM for model M with hot-swappable LoRA adapters {A_i}; provide an API to switch adapters and demonstrate <200 ms switch latency under load": "Very hard",
+   "Implement a custom Trainer callback to log gradient norms, activation histograms, and learning rate; diagnose periodic loss spikes and propose a fix": "Hard",
+   "Build a bilingual RAG pipeline indexing corpora {en, es} with FAISS HNSW; evaluate exact match@1 on dataset D and report p95 latency": "Hard",
+   "Run a mixed-precision sweep (fp16 vs bf16) for model M on A100 and RTX 3090; compare convergence, throughput, and numerical stability issues": "Medium",
+   "Create a Gradio Space that batches Whisper-large-v3 transcription via queue + chunked decoding; maintain real-time factor <= 0.5 on a T4": "Hard",
+   "List five OCR datasets on the Hub with line-level annotations; include licenses and approximate image counts": "Easy",
+   "List models on the Hub tagged 'summarization' that offer safetensors weights and 4-bit quantization; output model ids": "Easy",
+   "Evaluate safety filters of models {M_i} on red-team prompt set R; report jailbreak rate and false positive rate": "Medium",
+   "Run a prompt template ablation for chat model M on dataset D; compare {alpaca, chatml, llama2} formats and report exact match and average output length": "Hard",
+   "Implement tensor parallelism for model M in framework F and show linear scaling across 2\u20138 GPUs with <=10% gap from ideal": "Very hard",
+   "Convert and shard dataset D into WebDataset tar files (~500MB/shard); build a streaming loader with checksum validation": "Medium",
+   "Deploy a Spaces app serving Stable Diffusion XL with ControlNet; add output caching and keep p95 latency <1s for 20 concurrent users": "Hard",
+   "Diagnose and fix 'shape mismatch' when loading LoRA into model M after tokenizer resize; provide minimal repro and patch": "Medium",
+   "Add a detailed model card to repo username/model_M with training data, intended use, limitations, and evaluation results": "Easy",
+   "Enable KV cache quantization (int8) in Transformers for model M; compare tokens/sec and ROUGE-L on dataset D vs fp16 cache": "Hard",
+   "Detect and redact license-incompatible samples in dataset D by matching SPDX identifiers and source domains; publish a compliance report": "Medium",
+   "Profile vLLM serving of model M with paged attention; tune block_size to maximize tokens/sec and report p50/p95 latency and peak VRAM": "Medium",
+   "Filter dataset D for toxic content using classifier C; log per-label removal rates and recreate stratified train/valid/test splits": "Medium",
+   "Train a unigram tokenizer (vocab=80k) on corpora {en, fr}; fine-tune T5-small and compare BLEU vs a BPE baseline; report tokenization speed and OOV rate": "Hard",
+   "Run distributed evaluation of models {M_i} on benchmark B across 4 GPUs with DeepSpeed-Inference; ensure identical metrics across 3 seeds": "Hard",
+   "Find three open-source ASR models that provide word-level timestamps; record licenses and expected WER on LibriSpeech": "Easy",
+   "Diagnose intermittent 'Address already in use' crashes in a FastAPI Space; add graceful shutdown and port probing, verifying stability over 1,000 restart cycles": "Medium",
+   "Export a LoRA-finetuned Llama checkpoint to GGUF for llama.cpp; validate perplexity parity (<=1% drift) on WikiText-2": "Hard",
+   "Construct a streaming RAG pipeline over S3-stored corpus C with Chroma; index ~1B tokens, implement shard rebalancing, and benchmark recall@5 and QPS": "Very hard",
+   "List Hub datasets tagged 'speech-emotion-recognition' with CC-BY or CC-BY-SA licenses and >=10k utterances; write dataset ids and sizes to JSON": "Easy",
+   "Train a summarization reward model via pairwise ranking on dataset D; apply DPO to model M and report ROUGE-L and human win rate": "Hard",
+   "Find four open-source OCR models that output line- or paragraph-level text and provide ONNX or TensorRT exports; list their licenses and maximum input resolutions": "Easy",
+   "Verify tokenizer special tokens for model M are preserved after adding new tokens; write a unit test that asserts CLS/SEP/PAD ids are unchanged before and after resize": "Medium",
+   "Implement a constrained decoder for model M that enforces a JSON schema via a custom Transformers LogitsProcessor; add unit tests and benchmark latency on dataset D": "Hard",
+   "Build a multilingual RAG index for 50M documents using mDPR with sharded storage on S3; support hot index reloads and report recall@10 and p95 latency at 100 QPS": "Very hard",
+   "Quantize T5-base to 8-bit with bitsandbytes (LLM.int8) and compare ROUGE-L and tokens/sec to fp16 on CNN/DailyMail; keep ROUGE-L drop <=1%": "Medium",
+   "Diagnose VRAM growth in a vLLM server at batch size 32; add profiling, fix cache eviction behavior, and demonstrate flat memory over 10,000 requests": "Hard",
+   "Convert a HuggingFace TokenizerFast to a SentencePiece model; verify >=99.9% token-level agreement on 10,000 sentences and measure tokenization speed delta": "Medium",
+   "Train a multi-task adapter stack for {summarization, QA, NLI} on model M; implement routing by prompt prefix and report per-task metrics and cross-task interference": "Very hard",
+   "Assess license compatibility between model M (Apache-2.0) and dataset D (CC-BY-SA); produce a one-paragraph verdict with rationale and reference links": "Easy",
+   "Enable FSDP with activation checkpointing for a 13B model across 2\u00d7A100 GPUs; achieve <=10% throughput loss vs baseline and verify loss parity over 1,000 steps": "Hard",
+   "List three datasets for code summarization with permissive licenses; output their dataset ids and license names": "Easy",
+   "Set up nightly continuous evaluation of model M on benchmarks {B_i}; log metrics to Weights & Biases and alert on >2% regression vs last 7-day rolling mean": "Medium",
+   "Implement streaming text generation in a Gradio Space for model M using server-sent events; cap median token emission delay at <50 ms": "Hard",
+   "Scale out training of a 7B model with FSDP + ZeRO across 8 GPUs; demonstrate checkpoint save/restore and achieve throughput within 15% of ideal linear scaling": "Very hard",
+   "Export a mixture-of-experts PyTorch model to ONNX and run with TensorRT; verify top-1 accuracy within 0.5% of PyTorch on dataset D": "Medium",
+   "Identify whether model M supports FlashAttention-2 from its config or source; provide supporting repo links and a yes/no compatibility flag": "Easy",
+   "Build an audio deduplication pipeline for dataset D using embedding model E with cosine similarity >= 0.98; publish grouped duplicate ids and a cleaned manifest": "Hard",
+   "Diagnose slow tokenization in a Transformers pipeline; profile, switch to a fast tokenizer, and demonstrate 2\u00d7 end-to-end speedup on 1M lines": "Medium",
+   "Implement a contrastive preference learning loss in TRL; train model M on dataset D and compare KL, reward variance, and human win rate vs a PPO baseline": "Hard",
+   "Build an elastic RAG service with Ray that autoscales FAISS shards on S3, supports live corpus updates, and maintains p95 latency <500 ms at 200 QPS": "Very hard",
+   "List five chat-optimized LLMs on the Hub that include a tokenizer chat_template and safetensors weights; output model ids": "Easy",
+   "Find three biomedical NER datasets with Apache-2.0 or MIT licenses; return dataset ids and license names": "Easy",
+   "Create a dataset viewer Space that streams Parquet shards from the Hub using datasets streaming; implement server-side filtering and pagination": "Medium",
+   "Enable gradient checkpointing and optimizer state offloading for model M with Accelerate; report step time and peak VRAM vs baseline on a single A100": "Medium",
+   "Diagnose and fix 'size mismatch for position_embeddings' after increasing max_position_embeddings; provide a minimal repro and a migration script": "Medium",
+   "Implement a regex-constrained Transformers LogitsProcessor that enforces ISO-8601 timestamps; add unit tests and report generation latency overhead on dataset D": "Hard",
+   "Train language-specific LoRA adapters for {en, es, de} on model M; add an automatic language router and report per-language BLEU and cross-language interference": "Hard",
+   "Build a speaker diarization + ASR Gradio Space using pyannote and Whisper-large-v3; achieve DER <= 12% and real-time factor <= 0.75 on a T4": "Hard",
+   "Implement multi-draft speculative decoding with dynamic draft-model selection per prompt; integrate with vLLM and benchmark tokens/sec speedup at batch sizes {1,8,32}": "Very hard",
+   "Convert a TensorFlow DistilBERT SavedModel to ONNX (opset 17) and validate logits parity (MSE < 1e-4) on 1,000 random inputs; measure CPU inference speedup vs TensorFlow": "Medium",
+   "Evaluate alignment drift after SFT: compare model M vs base M0 on prompt set P; report win rate, refusal rate, and average output length": "Medium",
+   "Enable KV cache int4 quantization in vLLM for model M; benchmark tokens/sec and exact match on dataset D vs fp16 cache": "Hard",
+   "Implement variable-length packing in a HF Datasets + Transformers training loop; ensure epoch-level sample coverage matches baseline and no truncation beyond max_length": "Medium",
+   "Build a multi-tenant LoRA router over vLLM: on-demand load adapters from the Hub with LRU eviction; sustain 100 tenants and <300 ms adapter swap latency under load": "Very hard",
+   "Audit generations for PII leakage on prompt set P using detector C; compute precision, recall, and false positive rate; redact before logging and publish a compliance summary": "Medium",
+   "Merge a stack of PEFT adapters {A_i} into base model M to produce a single FP16 checkpoint; validate perplexity drift <=0.5% on dataset D and export safetensors": "Hard",
+   "Find three Spaces that demonstrate constrained JSON generation; return Space ids and URLs": "Easy",
+   "Deploy a cross-lingual vector search service with multilingual-e5-large; shard FAISS across 3 nodes and measure mAP@10 and p95 latency at 500 QPS": "Very hard",
+   "Quantize attention and MLP projections only with bitsandbytes (selective 8-bit); compare peak VRAM, tokens/sec, and ROUGE-L vs full-model 8-bit on dataset D": "Hard",
+   "Fix \"Token indices sequence length is longer than the specified maximum\" after tokenizer resize; add truncation with stride and update generation config; verify no validation metric regression": "Medium",
+   "Identify splits for dataset D and output split names with sample counts": "Easy",
+   "Find five multilingual sentence-embedding models on the Hub with Apache-2.0 license; return model ids": "Easy",
+   "Set up CI to run evaluation suite E for model M nightly; fail the job if any metric drops >1% vs 7-day rolling mean": "Medium",
+   "Add length normalization to beam search for model M; compare vs baseline on dataset D and report ROUGE-L and average output length": "Medium",
+   "Detect per-sample language for dataset D; add a 'lang' column and recreate train/valid/test splits preserving language proportions": "Medium",
+   "Benchmark vLLM KV-cache eviction strategies (e.g., LRU vs TTL) for model M at batch sizes {1,8,32}; report tokens/sec and peak VRAM": "Medium",
+   "Implement a custom DataCollator that packs multiple documents for summarization with separator tokens; add unit tests to prevent cross-sample leakage": "Hard",
+   "Build a PDF-to-dataset pipeline: OCR pages with model Donut, store word-level bboxes, and publish a HuggingFace Dataset with a viewer Space": "Hard",
+   "Train a ColBERT reranker on corpus C + pairs dataset D; integrate into a RAG search service and report recall@10 and p95 latency delta": "Hard",
+   "Deploy vLLM for model M with multi-GPU tensor-parallel inference across 2 nodes using NCCL; demonstrate near-linear throughput scaling and deterministic outputs across 3 seeds": "Very hard",
+   "List four Hub models tagged 'named-entity-recognition' that declare bitsandbytes 8-bit support in their README; output model ids": "Easy",
+   "Find three Spaces that provide real-time TTS streaming demos; return Space ids and reported sample rates": "Easy",
+   "Create a Spaces app that visualizes transformer attention maps for a ViT model using Captum; keep heatmap rendering under 200 ms for 224x224 images": "Medium",
+   "Set up datasets streaming with resumable downloads and exponential backoff for S3-hosted Parquet shards; verify checksum integrity after killing and resuming the job": "Medium",
+   "Build a tokenizer migration tool to convert a SentencePiece model to a HuggingFace tokenizers JSON with byte-fallback; assert >=99.95% token-level agreement on 20k sentences and report speed delta": "Medium",
+   "Implement a custom DataCollator for span masking with variable block sizes for byte-level BPE; add unit tests and demonstrate MLM loss parity over 10k steps on WikiText-103": "Hard",
+   "Add speculative decoding with a small draft model to a Transformers-based text-generation server; expose a per-request flag and benchmark tokens/sec speedup at batch sizes {1,8,32}": "Hard",
+   "Train an online knowledge-distillation SFT: teacher M0 -> student M on dataset D; log KL divergence, token agreement, and throughput; cap metric drop at <=2% vs teacher": "Hard",
+   "Deploy a multi-region vLLM service on Kubernetes with adaptive batching and hot LoRA adapter loading; sustain 200 QPS with p95 latency <300 ms and zero-downtime rollouts": "Very hard",
+   "Build a sharded cross-encoder reranking service with Ray: distribute ColBERT scoring across nodes, integrate with FAISS retrieval, and maintain recall@10 within 1% of single-node baseline at 500 QPS": "Very hard",
+   "List four Spaces that perform multilingual OCR with layout extraction; return Space ids and supported languages": "Easy",
+   "Find five Hub datasets for code generation evaluation with permissive licenses; output dataset ids and license names": "Easy",
+   "Add gradient accumulation and gradient clipping to a Transformers Trainer finetune of model M; report step time, peak VRAM, and validation metric vs baseline": "Medium",
+   "Implement document chunking with sliding windows and overlap in a Datasets map pipeline; add doc_id and span indices and verify no segment exceeds max_length": "Medium",
+   "Export a fine-tuned BERT model to TorchScript and ONNX; verify logits parity (MSE < 1e-4) on 1,000 samples and compare CPU throughput": "Medium",
+   "Diagnose 'pad_token_id is not set' warnings during generation; add a PAD token, resize embeddings, and write a unit test asserting identical logits pre/post fix on 200 prompts": "Medium",
+   "Implement diverse beam search (group_beam_search) for model M; evaluate on dataset D and report ROUGE-L, distinct-n, and average output length vs standard beam search": "Hard",
+   "Build a multi-modal RAG demo that indexes image captions with CLIP and uses LLM M to answer visual questions; report top-1 accuracy and p95 latency": "Hard",
+   "Profile activation and KV-cache memory during generation for model M; log per-layer footprints and reduce peak usage via attention slicing; show tokens/sec and VRAM deltas": "Hard",
+   "Construct a 200M-document FAISS hybrid (IVF-PQ + HNSW) index with memory-mapped shards on S3; support live add/delete and benchmark recall@10 and QPS at 300 QPS": "Very hard",
+   "List five Hub datasets tagged 'topic-modeling' with MIT or Apache-2.0 licenses; output dataset ids": "Easy",
+   "Find three Spaces that offer real-time grammar correction with streaming tokens; return Space ids and URLs": "Easy",
+   "Convert a spaCy en_core_web_trf NER model to ONNX and wrap it in a Transformers TokenClassification pipeline; verify entity text/label/span parity on 1,000 sentences": "Medium",
+   "Set up a GitHub Actions workflow that snapshots tokenizer T weekly and fails if vocab or special token ids drift vs the last snapshot; upload a diff artifact": "Medium",
+   "Profile a Datasets map pipeline on corpus C; refactor to use batched=True, num_proc>1, and caching; achieve >=2\u00d7 speedup while preserving deterministic ordering across runs": "Medium",
+   "Implement a custom Transformers StoppingCriteria that halts when JSON braces are balanced or max nesting depth is reached; add unit tests and benchmark latency overhead on dataset D": "Hard",
+   "Build a visual-and-tabular RAG pipeline: index images with CLIP and CSV tables with TAPAS; answer mixed queries using LLM M; report EM@1 and p95 latency at 50 QPS": "Hard",
+   "Enable KV-cache int4 quantization during generation in Transformers for model M; compare tokens/sec and exact match vs fp16 cache on dataset D; keep metric drop <=1%": "Hard",
+   "Implement a hot-reloadable sharded FAISS IVF-PQ index for multilingual-e5-base with live add/delete and background re-training; sustain 200 QPS with p95 latency <400 ms across 3 nodes": "Very hard",
+   "Deploy a geo-distributed vLLM + LoRA adapter gateway across two regions with consistent hashing and zero-downtime adapter updates; ensure identical outputs across 3 seeds and report cross-region p95 latency": "Very hard",
+   "List five Hub LLM repos that disclose training token counts in their model cards; output model ids and token totals": "Easy",
+   "Find two ready-to-use Spaces for speaker diarization compatible with Whisper; return Space ids and URLs": "Easy",
+   "Create a hashing-based dataset splitter using column 'doc_id' to produce reproducible train/valid/test; verify identical splits across two machines and Python versions": "Medium",
+   "Resolve HTTP 403 when creating an organization dataset via the Hub API; diagnose token scopes and org permissions; provide a minimal repro script and the fix": "Medium",
+   "Export a PEFT LoRA adapter from a fine-tuned Llama checkpoint as standalone safetensors with a correct adapter_config.json; push to the Hub and verify PEFT.from_pretrained loads it": "Medium",
+   "Enable multi-query attention in model M within Transformers; benchmark tokens/sec and peak VRAM vs multi-head attention and verify perplexity parity over 2,000 steps": "Hard",
+   "Audit code dataset D for contamination against {HumanEval, MBPP} using exact substring and 3-gram Jaccard >= 0.9; publish per-source contamination rates and a cleaned dataset": "Hard",
+   "Implement contrastive search decoding for model M with tunable alpha; compare ROUGE-L, distinct-n, and latency vs nucleus sampling on dataset D": "Hard",
+   "Implement pipeline parallelism for model M across 4 GPUs with Accelerate; achieve near-linear scaling (<=15% gap), support checkpoint save/restore, and ensure deterministic outputs across 3 seeds": "Very hard",
+   "Deploy a Spaces app that serves two ASR models with automatic language ID routing; maintain real-time factor <= 0.6 on a single T4 and log per-language latency": "Hard",
+   "Benchmark JSON-constrained decoding across models {M_i}; report JSON validity rate, exact match on dataset D, and p95 latency under streaming": "Hard",
+   "Filter a multilingual dataset D to non-English using fastText language ID; recreate stratified splits and report per-language retention and drop rates": "Medium",
+   "Enable paged attention in a custom Transformers generation loop for model M; verify token-level parity on 500 prompts and measure peak VRAM change": "Hard",
+   "Shard a 1B-token text corpus into deterministic HF Datasets processing across 16 workers; validate byte-for-byte identical outputs across two runs": "Very hard",
+   "Compare LoRA vs QLoRA fine-tunes of Mistral-7B on GSM8K; track loss, exact match, and throughput; select the lowest-VRAM config within 2% EM of best": "Hard",
+   "Deploy a quantized T5 encoder-decoder on Triton Inference Server via a Python backend; add token streaming and achieve >=1.5x throughput vs PyTorch baseline": "Hard",
+   "Find three Spaces that perform audio source separation (vocals/music); return Space ids and reported sample rates": "Easy",
+   "Merge a PEFT IA3 adapter stack into Llama-3-8B base weights; verify perplexity drift <=0.3% on WikiText-103 and export safetensors": "Hard",
+   "Resolve DeepSpeed ZeRO-3 stalls during S3 checkpointing; implement async multipart uploads and show stable 5-minute checkpoint cadence over 2 hours": "Very hard",
+   "Set up CI to run contamination checks on dataset R against {TruthfulQA, SQuAD} using 4-gram overlap; fail if rate >0.5% and attach offending ids as artifacts": "Medium",
+   "List four Hub datasets for sarcasm detection in English; return dataset ids and license tags": "Easy",
+   "Identify whether tokenizer T enables byte_fallback in tokenizer.json; output true/false and the file path": "Easy",
+   "Find three Spaces that showcase streaming chat with token-by-token updates; return Space ids and whether they use SSE or websockets": "Easy",
+   "Create a Datasets loader that parses Praat TextGrid files into word-level timestamps aligned with audio; publish a dataset with an 'audio' column and validate 100 sample alignments": "Medium",
+   "Set up a GitHub Actions workflow that lints model cards for repos {R_i} to require intended use, training data, and limitations; fail PRs and post a summary comment on violations": "Medium",
+   "Containerize a Gradio Space with optional FlashAttention build: detect GPU capability at startup, compile kernels if supported, and fall back gracefully on unsupported GPUs; test on T4 and A100": "Medium",
+   "Evaluate long-context retrieval via needle-in-a-haystack for models {M_i} at context lengths {8k, 32k, 64k}; report retrieval accuracy, tokens/sec, and the max stable context length": "Hard",
+   "Implement a curriculum sampler as a HuggingFace Trainer callback that schedules sample difficulty over epochs; compare convergence and final eval metrics vs random sampling": "Hard",
+   "Add on-the-fly near-duplicate filtering during training using SimHash over token ids; log per-epoch removal rates and verify no convergence regressions vs a deduplicated baseline": "Hard",
+   "Deploy a dual-backend inference router using vLLM and TensorRT-LLM that selects backend per prompt length to minimize latency; maintain deterministic outputs across 3 seeds and sustain 300 QPS with p95 latency SLOs": "Very hard",
+   "Identify max_position_embeddings and whether rope_scaling is enabled for model M from its config; output both values.": "Easy",
+   "List five Vision Transformer models on the Hub that provide safetensors and have a default image size >= 384; output model ids.": "Easy",
+   "Find three Spaces that stream machine-translation outputs token-by-token; return Space ids and whether they use SSE or websockets.": "Easy",
+   "Diagnose bursts of [UNK] after adding special tokens to tokenizer T; enable byte_fallback, retrain embeddings for 2k steps, and show unknown-token rate <= baseline+0.1% on corpus C.": "Medium",
+   "Create a dataset viewer Space for a dataset with a nested JSON column; convert to Arrow struct arrays, implement server-side filtering on nested keys, and verify row counts match the source.": "Medium",
+   "Set up a GitHub Action that hits /health and a no-op inference on Space S after each deploy; fail if cold-start median latency >10s and attach server logs as an artifact.": "Medium",
+   "Implement a SQL grammar-constrained Transformers LogitsProcessor using an LL(1) parser; evaluate on Spider dev and report exact match and p95 latency overhead vs nucleus sampling.": "Hard",
+   "Add CPU-tier KV-cache offloading with pinned memory for model M in a custom generation loop; compare tokens/sec and peak VRAM vs baseline at context lengths {4k, 16k, 32k}.": "Hard",
+   "Deploy a batched cross-encoder reranker microservice using bge-reranker-base; keep recall@10 within 1% of single-request baseline and achieve >=2\u00d7 QPS at 100 concurrent users.": "Hard",
+   "Build a heterogeneous inference gateway that routes requests to vLLM or llama.cpp based on prompt length and GPU load; ensure identical normalized outputs across 3 seeds and sustain 200 QPS with p95 latency <300 ms.": "Very hard",
+   "Determine whether tokenizer T strips accents (strip_accents); output true/false and the file path where the setting is defined.": "Easy",
+   "List four Hub datasets for hate-speech detection in English; return dataset ids and license tags.": "Easy",
+   "Write a Datasets loader for a paginated OAuth2 REST API; cache pages, support streaming, and provide deterministic sharding across 8 workers; verify identical row counts across two runs.": "Medium",
+   "Add request-level caching (ETag/If-None-Match) to a Gradio summarization Space; achieve >=1.8\u00d7 QPS at 50 concurrent users and report cache hit ratio and p95 latency.": "Medium",
+   "Enable HuggingFace tokenizers parallelism and batched encoding for corpus C; benchmark throughput and memory on 10M lines and ensure deterministic outputs across 3 runs.": "Medium",
+   "Set up CI to lint dataset cards in repos {R_i} for required fields {license, citation, dataset_summary}; fail PRs and post a summary comment with missing keys.": "Medium",
+   "Run a parameter-efficient finetuning sweep comparing LoRA, IA3, and prefix-tuning on RoBERTa-base for MNLI; report accuracy, training time, and peak VRAM; select a Pareto-optimal config.": "Hard",
+   "Implement a Transformers LogitsProcessor that enforces balanced parentheses and proper quoted-string escaping; add unit tests and benchmark latency overhead on dataset D.": "Hard",
+   "Export Whisper-medium to ONNX with dynamic axes and int8 weights; verify word-timestamp parity on 500 clips and measure CPU real-time factor improvement >=1.3\u00d7 vs PyTorch.": "Hard",
+   "Deploy a geo-replicated RAG service: shard FAISS HNSW across three regions with conflict-free index metadata sync; sustain 300 QPS with p95 latency <450 ms and recall@10 within 1% of single-region baseline.": "Very hard",
+   "Compare cased vs uncased tokenization for BERT on CoNLL-2003 NER; train both, and report F1, average tokens per sentence, and training time.": "Medium",
+   "Create a HuggingFace Datasets loader for EPUB files: extract chapter text and embedded images into Arrow columns, support streaming and deterministic sharding across 8 workers; verify identical row counts across two runs.": "Medium",
+   "Configure a Hub webhook to trigger CI when a model card (README.md) changes; fail the job if sections {intended use, limitations} are missing and post a checklist comment on the PR.": "Medium",
+   "Add a reranking cache to a RAG service keyed by (query, candidate_ids); achieve >=50% cache hit at 100 QPS and keep recall@10 within 0.5% of baseline.": "Hard",
+   "Fix torch.compile graph breaks in a Transformers training loop; patch non-compilable ops, re-enable compilation, and demonstrate >=1.4\u00d7 step-time speedup with matching loss over 2,000 steps.": "Hard",
+   "Compute 95% bootstrap confidence intervals for ROUGE-L on dataset D over 3 random seeds; flag regressions when the new CI lies entirely below last week's baseline CI.": "Medium",
+   "Build a batch image-captioning Space with ViT-GPT2: accept ZIP uploads, use queue-based batching, and keep p95 latency <2s for 32 images.": "Medium",
+   "Implement hybrid parallelism (tensor + pipeline) for a 13B encoder-decoder using Accelerate; scale across 8 GPUs with <=15% gap from linear, support elastic resize (8->6 GPUs) without losing determinism, and verify checkpoint save/restore.": "Very hard",
+   "Find five Spaces that stream live vision-language captioning (e.g., LLaVA or BLIP); return Space ids and reported FPS.": "Easy",
+   "Identify whether tokenizer T applies Unicode normalization (NFKC/NFC/NFD/NFKD) and where it is configured; output the mode and file path.": "Easy",
+   "Identify whether model repo M stores weights exclusively as safetensors; output true/false and list the .safetensors file paths.": "Easy",
+   "List three multilingual sentence-embedding models on the Hub that provide ONNX exports; return model ids.": "Easy",
+   "Determine if tokenizer T lowercases text (do_lower_case or lowercase flag); output true/false and the file path or JSON key where it is set.": "Easy",
+   "Set up a GitHub Action to run a smoke-test text generation for model M on each push; fail if median time to first token >2s and attach container logs as an artifact.": "Medium",
+   "Create a Datasets preprocessing pipeline that tokenizes to max_length=512 with stride=64 and retains an 'orig_text' column; verify row counts match input and no NaNs after caching.": "Medium",
+   "Resolve 'git-lfs: command not found' when pushing model repo R to the Hub; install and configure Git LFS, set an appropriate large file threshold, and provide a minimal repro plus the verified fix.": "Medium",
+   "Enable KV-cache CPU offloading in a custom Transformers generation loop for model M; benchmark tokens/sec and peak VRAM vs baseline at context lengths {4k, 8k}.": "Hard",
+   "Implement LoRA rank warmup (r: 4\u219232 over the first 1,000 steps) in a custom Trainer; fine-tune model M on dataset D and report validation perplexity and peak VRAM vs fixed r=32.": "Hard",
+   "Export Whisper-small to TensorRT via ONNX (opset 18) with dynamic axes; verify word-timestamp parity (median diff \u22640.05s) on 300 clips and measure \u22651.3\u00d7 GPU speedup vs PyTorch.": "Hard",
+   "Deploy a multi-tenant RAG service that hot-loads per-tenant FAISS indices from S3, shares a reranker, and sustains 200 QPS with p95 latency <350 ms across 1,000 tenants; maintain recall@10 within 1% of a single-tenant baseline.": "Very hard"
+ }
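
Note: every annotated label in the file above begins with one of the four base levels, so a hypothetical normalize helper can fold them back before grouping or filtering; this is a sketch, not part of the commit.

import json

# Check "Very hard" before "Hard" so prefix matching picks the right level.
BASE_LEVELS = ("Very hard", "Hard", "Medium", "Easy")

def normalize(difficulty: str) -> str:
    # Fold annotated labels like "Medium (needs to link several models together)"
    # down to their base level; leave anything unrecognized untouched.
    for level in BASE_LEVELS:
        if difficulty.startswith(level):
            return level
    return difficulty

with open("generated_tasks_with_difficulty.json") as f:
    tasks = json.load(f)

normalized = {question: normalize(label) for question, label in tasks.items()}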