asdf98 committed on
Commit
8dcc13b
·
verified ·
1 Parent(s): fbc4da7

Upload EthicalHacking_LFM2.5_Ultimate_Colab.ipynb

Browse files
EthicalHacking_LFM2.5_Ultimate_Colab.ipynb CHANGED
@@ -11,7 +11,7 @@
11
  "**📊 Datasets:** Your choice — cybersecurity, general chat, multilingual, coding, or mix them! \n",
12
  "**⚡ Framework:** Unsloth + TRL SFTTrainer — 2× faster, 70% less VRAM \n",
13
  "\n",
14
- "> ⚠️ Pick any dataset below. Default is cybersecurity. Mix datasets for hybrid tuning.\n",
15
  "\n",
16
  "---\n",
17
  "\n",
@@ -22,11 +22,10 @@
22
  "| Parameters | 1.2B |\n",
23
  "| 4-bit VRAM | ~1.0 GB |\n",
24
  "| Context | 128K tokens |\n",
25
- "| Batch size on T4 | **4-8** |\n",
26
  "| Training headroom | **~14 GB free** |\n",
27
  "\n",
28
- "**Unsloth docs:** https://unsloth.ai/docs/models/tutorials/lfm2.5 \n",
29
- "**Official notebook:** https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Liquid_LFM2_(1.2B)-Conversational.ipynb"
30
  ]
31
  },
32
  {
@@ -60,7 +59,7 @@
60
  "outputs": [],
61
  "source": [
62
  "from huggingface_hub import login\n",
63
- "# login(token=\"hf_YOUR_TOKEN\") # ← uncomment and paste your token"
64
  ]
65
  },
66
  {
@@ -79,7 +78,6 @@
79
  "from unsloth import FastLanguageModel\n",
80
  "import torch\n",
81
  "\n",
82
- "# ==================== T4-COLAB HYPERPARAMETERS (LFM2.5) ====================\n",
83
  "MAX_SEQ_LENGTH = 4096\n",
84
  "LORA_R = 128\n",
85
  "LORA_ALPHA = 128\n",
@@ -93,7 +91,6 @@
93
  "PACKING = True\n",
94
  "SAMPLE_SIZE = 50000\n",
95
  "HUB_MODEL_ID = \"your-username/lfm25-lora\"\n",
96
- "# ========================================================================\n",
97
  "\n",
98
  "model, tokenizer = FastLanguageModel.from_pretrained(\n",
99
  " model_name=\"unsloth/LFM2.5-1.2B-Instruct\",\n",
@@ -129,17 +126,16 @@
129
  "source": [
130
  "## 4️⃣ 🎯 CHOOSE YOUR DATASET(S)\n",
131
  "\n",
132
- "Uncomment **ONE** `DATASET_CHOICE` line. Mix datasets with `custom_mix`.\n",
133
- "\n",
134
- "| Choice | Dataset | Size | Format | Best For |\n",
135
  "|--------|---------|------|--------|----------|\n",
136
- "| `\"cybersecurity\"` | Fenrir + Trendyol | 153K | system/user/assistant | **Ethical hacking education** |\n",
137
- "| `\"ultrachat\"` | UltraChat 200K SFT | 200K | messages | General conversation |\n",
138
- "| `\"openhermes\"` | OpenHermes 2.5 | 1M+ | conversations | Reasoning, coding |\n",
139
- "| `\"sharegpt_en\"` | ShareGPT English | ~90K | conversations | Multi-turn dialogue |\n",
140
- "| `\"sharegpt_de\"` | ShareGPT German | ~104K | conversations | German fine-tuning |\n",
141
- "| `\"sharegpt_hi\"` | ShareGPT Hindi | ~153K | conversations | Hindi fine-tuning |\n",
142
- "| `\"custom_mix\"` | Your mix | — | varies | Combine multiple |"
 
143
  ]
144
  },
145
  {
@@ -150,10 +146,6 @@
150
  "source": [
151
  "from datasets import load_dataset, concatenate_datasets\n",
152
  "\n",
153
- "# ═══════════════════════════════════════════════════════════════\n",
154
- "# SELECT YOUR DATASET — UNCOMMENT ONE LINE\n",
155
- "# ═══════════════════════════════════════════════════════════════\n",
156
- "\n",
157
  "DATASET_CHOICE = \"cybersecurity\"\n",
158
  "\n",
159
  "# DATASET_CHOICE = \"ultrachat\"\n",
@@ -161,6 +153,7 @@
161
  "# DATASET_CHOICE = \"sharegpt_en\"\n",
162
  "# DATASET_CHOICE = \"sharegpt_de\"\n",
163
  "# DATASET_CHOICE = \"sharegpt_hi\"\n",
 
164
  "# DATASET_CHOICE = \"custom_mix\"\n",
165
  "\n",
166
  "CUSTOM_DATASETS = [\n",
@@ -214,6 +207,17 @@
214
  " msgs.append({\"role\": role, \"content\": turn[\"value\"]})\n",
215
  " return {\"messages\": msgs}\n",
216
  "\n",
 
 
 
 
 
 
 
 
 
 
 
217
  "all_datasets = []\n",
218
  "\n",
219
  "if DATASET_CHOICE == \"cybersecurity\":\n",
@@ -240,6 +244,11 @@
240
  " ds = ds.map(_convert_conversations, remove_columns=ds.column_names, batched=False)\n",
241
  " all_datasets.append(ds)\n",
242
  "\n",
 
 
 
 
 
243
  "elif DATASET_CHOICE == \"custom_mix\":\n",
244
  " for ds_id, split, n_rows, fmt in CUSTOM_DATASETS:\n",
245
  " ds = load_dataset(ds_id, split=split)\n",
@@ -247,6 +256,7 @@
247
  " ds = ds.shuffle(seed=3407).select(range(n_rows))\n",
248
  " if fmt == \"messages\": ds = ds.map(_convert_ultrachat, remove_columns=ds.column_names, batched=False)\n",
249
  " elif fmt == \"conversations\": ds = ds.map(_convert_conversations, remove_columns=ds.column_names, batched=False)\n",
 
250
  " all_datasets.append(ds)\n",
251
  "\n",
252
  "else:\n",
@@ -440,6 +450,7 @@
440
  "| **Liquid AI Models** | https://www.liquid.ai/models |\n",
441
  "| **LFM2.5-1.2B-Instruct** | https://huggingface.co/LiquidAI/LFM2.5-1.2B-Instruct |\n",
442
  "| **Unsloth LFM2.5 Docs** | https://unsloth.ai/docs/models/tutorials/lfm2.5 |\n",
 
443
  "| **UltraChat 200K** | https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k |\n",
444
  "| **OpenHermes 2.5** | https://huggingface.co/datasets/teknium/OpenHermes-2.5 |\n",
445
  "| **ShareGPT Multilingual** | https://huggingface.co/datasets/deepmage121/ShareGPT_multilingual |\n",
 
11
  "**📊 Datasets:** Your choice — cybersecurity, general chat, multilingual, coding, or mix them! \n",
12
  "**⚡ Framework:** Unsloth + TRL SFTTrainer — 2× faster, 70% less VRAM \n",
13
  "\n",
14
+ "> ⚠️ Default is cybersecurity. Pick general-purpose datasets for other domains.\n",
15
  "\n",
16
  "---\n",
17
  "\n",
 
22
  "| Parameters | 1.2B |\n",
23
  "| 4-bit VRAM | ~1.0 GB |\n",
24
  "| Context | 128K tokens |\n",
25
+ "| Batch size | **4-8** |\n",
26
  "| Training headroom | **~14 GB free** |\n",
27
  "\n",
28
+ "**Unsloth docs:** https://unsloth.ai/docs/models/tutorials/lfm2.5"
 
29
  ]
30
  },
31
  {
 
59
  "outputs": [],
60
  "source": [
61
  "from huggingface_hub import login\n",
62
+ "# login(token=\"hf_YOUR_TOKEN\")"
63
  ]
64
  },
65
  {
 
78
  "from unsloth import FastLanguageModel\n",
79
  "import torch\n",
80
  "\n",
 
81
  "MAX_SEQ_LENGTH = 4096\n",
82
  "LORA_R = 128\n",
83
  "LORA_ALPHA = 128\n",
 
91
  "PACKING = True\n",
92
  "SAMPLE_SIZE = 50000\n",
93
  "HUB_MODEL_ID = \"your-username/lfm25-lora\"\n",
 
94
  "\n",
95
  "model, tokenizer = FastLanguageModel.from_pretrained(\n",
96
  " model_name=\"unsloth/LFM2.5-1.2B-Instruct\",\n",
 
126
  "source": [
127
  "## 4️⃣ 🎯 CHOOSE YOUR DATASET(S)\n",
128
  "\n",
129
+ "| Choice | Dataset | Rows | Format | Best For |\n",
 
 
130
  "|--------|---------|------|--------|----------|\n",
131
+ "| `\"cybersecurity\"` | Fenrir + Trendyol | 153K→50K | system/user/assistant | Ethical hacking education |\n",
132
+ "| `\"ultrachat\"` | UltraChat 200K SFT | 200K→50K | messages | General conversation |\n",
133
+ "| `\"openhermes\"` | OpenHermes 2.5 | 1M+→50K | conversations | Reasoning, coding |\n",
134
+ "| `\"sharegpt_en\"` | ShareGPT English | ~90K→50K | conversations | Multi-turn dialogue |\n",
135
+ "| `\"sharegpt_de\"` | ShareGPT German | ~104K→50K | conversations | German fine-tuning |\n",
136
+ "| `\"sharegpt_hi\"` | ShareGPT Hindi | ~153K→50K | conversations | Hindi fine-tuning |\n",
137
+ "| `\"code_corpus\"` | [Code Corpus](https://huggingface.co/datasets/krystv/code-corpus-llm-training) | 240K→50K | text (code files) | **Code completion, coding assistant** |\n",
138
+ "| `\"custom_mix\"` | Mix of your choice | — | varies | Combine datasets |"
139
  ]
140
  },
141
  {
 
146
  "source": [
147
  "from datasets import load_dataset, concatenate_datasets\n",
148
  "\n",
 
 
 
 
149
  "DATASET_CHOICE = \"cybersecurity\"\n",
150
  "\n",
151
  "# DATASET_CHOICE = \"ultrachat\"\n",
 
153
  "# DATASET_CHOICE = \"sharegpt_en\"\n",
154
  "# DATASET_CHOICE = \"sharegpt_de\"\n",
155
  "# DATASET_CHOICE = \"sharegpt_hi\"\n",
156
+ "# DATASET_CHOICE = \"code_corpus\"\n",
157
  "# DATASET_CHOICE = \"custom_mix\"\n",
158
  "\n",
159
  "CUSTOM_DATASETS = [\n",
 
207
  " msgs.append({\"role\": role, \"content\": turn[\"value\"]})\n",
208
  " return {\"messages\": msgs}\n",
209
  "\n",
210
+ "def _convert_code_corpus(example):\n",
211
+ " code_text = example[\"text\"]\n",
212
+ " domain = example.get(\"domain\", \"code\")\n",
213
+ " repo = example.get(\"repo\", \"unknown\")\n",
214
+ " lang = example.get(\"language\", \"\")\n",
215
+ " user_prompt = f\"Here is a code snippet from the {domain} domain (repo: {repo}, language: {lang}). Please explain or improve it.\"\n",
216
+ " return {\"messages\": [\n",
217
+ " {\"role\": \"user\", \"content\": user_prompt},\n",
218
+ " {\"role\": \"assistant\", \"content\": code_text},\n",
219
+ " ]}\n",
220
+ "\n",
221
  "all_datasets = []\n",
222
  "\n",
223
  "if DATASET_CHOICE == \"cybersecurity\":\n",
 
244
  " ds = ds.map(_convert_conversations, remove_columns=ds.column_names, batched=False)\n",
245
  " all_datasets.append(ds)\n",
246
  "\n",
247
+ "elif DATASET_CHOICE == \"code_corpus\":\n",
248
+ " ds = load_dataset(\"krystv/code-corpus-llm-training\", split=\"train\")\n",
249
+ " ds = ds.map(_convert_code_corpus, remove_columns=ds.column_names, batched=False)\n",
250
+ " all_datasets.append(ds)\n",
251
+ "\n",
252
  "elif DATASET_CHOICE == \"custom_mix\":\n",
253
  " for ds_id, split, n_rows, fmt in CUSTOM_DATASETS:\n",
254
  " ds = load_dataset(ds_id, split=split)\n",
 
256
  " ds = ds.shuffle(seed=3407).select(range(n_rows))\n",
257
  " if fmt == \"messages\": ds = ds.map(_convert_ultrachat, remove_columns=ds.column_names, batched=False)\n",
258
  " elif fmt == \"conversations\": ds = ds.map(_convert_conversations, remove_columns=ds.column_names, batched=False)\n",
259
+ " elif fmt == \"text\": ds = ds.map(_convert_code_corpus, remove_columns=ds.column_names, batched=False)\n",
260
  " all_datasets.append(ds)\n",
261
  "\n",
262
  "else:\n",
 
450
  "| **Liquid AI Models** | https://www.liquid.ai/models |\n",
451
  "| **LFM2.5-1.2B-Instruct** | https://huggingface.co/LiquidAI/LFM2.5-1.2B-Instruct |\n",
452
  "| **Unsloth LFM2.5 Docs** | https://unsloth.ai/docs/models/tutorials/lfm2.5 |\n",
453
+ "| **Code Corpus LLM Training** | https://huggingface.co/datasets/krystv/code-corpus-llm-training |\n",
454
  "| **UltraChat 200K** | https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k |\n",
455
  "| **OpenHermes 2.5** | https://huggingface.co/datasets/teknium/OpenHermes-2.5 |\n",
456
  "| **ShareGPT Multilingual** | https://huggingface.co/datasets/deepmage121/ShareGPT_multilingual |\n",