Upload EthicalHacking_LFM2.5_Ultimate_Colab.ipynb
Browse files
EthicalHacking_LFM2.5_Ultimate_Colab.ipynb
CHANGED
|
@@ -11,7 +11,7 @@
|
|
| 11 |
"**π Datasets:** Your choice β cybersecurity, general chat, multilingual, coding, or mix them! \n",
|
| 12 |
"**β‘ Framework:** Unsloth + TRL SFTTrainer β 2Γ faster, 70% less VRAM \n",
|
| 13 |
"\n",
|
| 14 |
-
 β">
"> ⚠️
|
| 15 |
"\n",
|
| 16 |
"---\n",
|
| 17 |
"\n",
|
|
@@ -22,11 +22,10 @@
|
|
| 22 |
"| Parameters | 1.2B |\n",
|
| 23 |
"| 4-bit VRAM | ~1.0 GB |\n",
|
| 24 |
"| Context | 128K tokens |\n",
|
| 25 |
-
"| Batch size
|
| 26 |
"| Training headroom | **~14 GB free** |\n",
|
| 27 |
"\n",
|
| 28 |
-
"**Unsloth docs:** https://unsloth.ai/docs/models/tutorials/lfm2.5
|
| 29 |
-
"**Official notebook:** https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Liquid_LFM2_(1.2B)-Conversational.ipynb"
|
| 30 |
]
|
| 31 |
},
|
| 32 |
{
|
|
@@ -60,7 +59,7 @@
|
|
| 60 |
"outputs": [],
|
| 61 |
"source": [
|
| 62 |
"from huggingface_hub import login\n",
|
| 63 |
-
"# login(token=\"hf_YOUR_TOKEN\")
|
| 64 |
]
|
| 65 |
},
|
| 66 |
{
|
|
@@ -79,7 +78,6 @@
|
|
| 79 |
"from unsloth import FastLanguageModel\n",
|
| 80 |
"import torch\n",
|
| 81 |
"\n",
|
| 82 |
-
"# ==================== T4-COLAB HYPERPARAMETERS (LFM2.5) ====================\n",
|
| 83 |
"MAX_SEQ_LENGTH = 4096\n",
|
| 84 |
"LORA_R = 128\n",
|
| 85 |
"LORA_ALPHA = 128\n",
|
|
@@ -93,7 +91,6 @@
|
|
| 93 |
"PACKING = True\n",
|
| 94 |
"SAMPLE_SIZE = 50000\n",
|
| 95 |
"HUB_MODEL_ID = \"your-username/lfm25-lora\"\n",
|
| 96 |
-
"# ========================================================================\n",
|
| 97 |
"\n",
|
| 98 |
"model, tokenizer = FastLanguageModel.from_pretrained(\n",
|
| 99 |
" model_name=\"unsloth/LFM2.5-1.2B-Instruct\",\n",
|
|
@@ -129,17 +126,16 @@
|
|
| 129 |
"source": [
|
| 130 |
"## 4οΈβ£ π― CHOOSE YOUR DATASET(S)\n",
|
| 131 |
"\n",
|
| 132 |
-
"
|
| 133 |
-
"\n",
|
| 134 |
-
"| Choice | Dataset | Size | Format | Best For |\n",
|
| 135 |
"|--------|---------|------|--------|----------|\n",
|
| 136 |
-
"| `\"cybersecurity\"` | Fenrir + Trendyol | 153K | system/user/assistant |
|
| 137 |
-
"| `\"ultrachat\"` | UltraChat 200K SFT | 200K | messages | General conversation |\n",
|
| 138 |
-
"| `\"openhermes\"` | OpenHermes 2.5 | 1M+ | conversations | Reasoning, coding |\n",
|
| 139 |
-
"| `\"sharegpt_en\"` | ShareGPT English | ~90K | conversations | Multi-turn dialogue |\n",
|
| 140 |
-
"| `\"sharegpt_de\"` | ShareGPT German | ~104K | conversations | German fine-tuning |\n",
|
| 141 |
-
"| `\"sharegpt_hi\"` | ShareGPT Hindi | ~153K | conversations | Hindi fine-tuning |\n",
|
| 142 |
-
"| `\"
|
|
|
|
| 143 |
]
|
| 144 |
},
|
| 145 |
{
|
|
@@ -150,10 +146,6 @@
|
|
| 150 |
"source": [
|
| 151 |
"from datasets import load_dataset, concatenate_datasets\n",
|
| 152 |
"\n",
|
| 153 |
-
"# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ\n",
|
| 154 |
-
"# SELECT YOUR DATASET β UNCOMMENT ONE LINE\n",
|
| 155 |
-
"# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ\n",
|
| 156 |
-
"\n",
|
| 157 |
"DATASET_CHOICE = \"cybersecurity\"\n",
|
| 158 |
"\n",
|
| 159 |
"# DATASET_CHOICE = \"ultrachat\"\n",
|
|
@@ -161,6 +153,7 @@
|
|
| 161 |
"# DATASET_CHOICE = \"sharegpt_en\"\n",
|
| 162 |
"# DATASET_CHOICE = \"sharegpt_de\"\n",
|
| 163 |
"# DATASET_CHOICE = \"sharegpt_hi\"\n",
|
|
|
|
| 164 |
"# DATASET_CHOICE = \"custom_mix\"\n",
|
| 165 |
"\n",
|
| 166 |
"CUSTOM_DATASETS = [\n",
|
|
@@ -214,6 +207,17 @@
|
|
| 214 |
" msgs.append({\"role\": role, \"content\": turn[\"value\"]})\n",
|
| 215 |
" return {\"messages\": msgs}\n",
|
| 216 |
"\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 217 |
"all_datasets = []\n",
|
| 218 |
"\n",
|
| 219 |
"if DATASET_CHOICE == \"cybersecurity\":\n",
|
|
@@ -240,6 +244,11 @@
|
|
| 240 |
" ds = ds.map(_convert_conversations, remove_columns=ds.column_names, batched=False)\n",
|
| 241 |
" all_datasets.append(ds)\n",
|
| 242 |
"\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 243 |
"elif DATASET_CHOICE == \"custom_mix\":\n",
|
| 244 |
" for ds_id, split, n_rows, fmt in CUSTOM_DATASETS:\n",
|
| 245 |
" ds = load_dataset(ds_id, split=split)\n",
|
|
@@ -247,6 +256,7 @@
|
|
| 247 |
" ds = ds.shuffle(seed=3407).select(range(n_rows))\n",
|
| 248 |
" if fmt == \"messages\": ds = ds.map(_convert_ultrachat, remove_columns=ds.column_names, batched=False)\n",
|
| 249 |
" elif fmt == \"conversations\": ds = ds.map(_convert_conversations, remove_columns=ds.column_names, batched=False)\n",
|
|
|
|
| 250 |
" all_datasets.append(ds)\n",
|
| 251 |
"\n",
|
| 252 |
"else:\n",
|
|
@@ -440,6 +450,7 @@
|
|
| 440 |
"| **Liquid AI Models** | https://www.liquid.ai/models |\n",
|
| 441 |
"| **LFM2.5-1.2B-Instruct** | https://huggingface.co/LiquidAI/LFM2.5-1.2B-Instruct |\n",
|
| 442 |
"| **Unsloth LFM2.5 Docs** | https://unsloth.ai/docs/models/tutorials/lfm2.5 |\n",
|
|
|
|
| 443 |
"| **UltraChat 200K** | https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k |\n",
|
| 444 |
"| **OpenHermes 2.5** | https://huggingface.co/datasets/teknium/OpenHermes-2.5 |\n",
|
| 445 |
"| **ShareGPT Multilingual** | https://huggingface.co/datasets/deepmage121/ShareGPT_multilingual |\n",
|
|
|
|
| 11 |
"**π Datasets:** Your choice β cybersecurity, general chat, multilingual, coding, or mix them! \n",
|
| 12 |
"**β‘ Framework:** Unsloth + TRL SFTTrainer β 2Γ faster, 70% less VRAM \n",
|
| 13 |
"\n",
|
| 14 |
+
 β">
"> ⚠️ Default is cybersecurity. Pick general-purpose datasets for other domains.\n",
|
| 15 |
"\n",
|
| 16 |
"---\n",
|
| 17 |
"\n",
|
|
|
|
| 22 |
"| Parameters | 1.2B |\n",
|
| 23 |
"| 4-bit VRAM | ~1.0 GB |\n",
|
| 24 |
"| Context | 128K tokens |\n",
|
| 25 |
+
"| Batch size | **4-8** |\n",
|
| 26 |
"| Training headroom | **~14 GB free** |\n",
|
| 27 |
"\n",
|
| 28 |
+
"**Unsloth docs:** https://unsloth.ai/docs/models/tutorials/lfm2.5"
|
|
|
|
| 29 |
]
|
| 30 |
},
|
| 31 |
{
|
|
|
|
| 59 |
"outputs": [],
|
| 60 |
"source": [
|
| 61 |
"from huggingface_hub import login\n",
|
| 62 |
+
"# login(token=\"hf_YOUR_TOKEN\")"
|
| 63 |
]
|
| 64 |
},
|
| 65 |
{
|
|
|
|
| 78 |
"from unsloth import FastLanguageModel\n",
|
| 79 |
"import torch\n",
|
| 80 |
"\n",
|
|
|
|
| 81 |
"MAX_SEQ_LENGTH = 4096\n",
|
| 82 |
"LORA_R = 128\n",
|
| 83 |
"LORA_ALPHA = 128\n",
|
|
|
|
| 91 |
"PACKING = True\n",
|
| 92 |
"SAMPLE_SIZE = 50000\n",
|
| 93 |
"HUB_MODEL_ID = \"your-username/lfm25-lora\"\n",
|
|
|
|
| 94 |
"\n",
|
| 95 |
"model, tokenizer = FastLanguageModel.from_pretrained(\n",
|
| 96 |
" model_name=\"unsloth/LFM2.5-1.2B-Instruct\",\n",
|
|
|
|
| 126 |
"source": [
|
| 127 |
"## 4οΈβ£ π― CHOOSE YOUR DATASET(S)\n",
|
| 128 |
"\n",
|
| 129 |
+
"| Choice | Dataset | Rows | Format | Best For |\n",
|
|
|
|
|
|
|
| 130 |
"|--------|---------|------|--------|----------|\n",
|
| 131 |
+
"| `\"cybersecurity\"` | Fenrir + Trendyol | 153Kβ50K | system/user/assistant | Ethical hacking education |\n",
|
| 132 |
+
"| `\"ultrachat\"` | UltraChat 200K SFT | 200Kβ50K | messages | General conversation |\n",
|
| 133 |
+
"| `\"openhermes\"` | OpenHermes 2.5 | 1M+β50K | conversations | Reasoning, coding |\n",
|
| 134 |
+
"| `\"sharegpt_en\"` | ShareGPT English | ~90Kβ50K | conversations | Multi-turn dialogue |\n",
|
| 135 |
+
"| `\"sharegpt_de\"` | ShareGPT German | ~104Kβ50K | conversations | German fine-tuning |\n",
|
| 136 |
+
"| `\"sharegpt_hi\"` | ShareGPT Hindi | ~153Kβ50K | conversations | Hindi fine-tuning |\n",
|
| 137 |
+
"| `\"code_corpus\"` | [Code Corpus](https://huggingface.co/datasets/krystv/code-corpus-llm-training) | 240Kβ50K | text (code files) | **Code completion, coding assistant** |\n",
|
| 138 |
+
"| `\"custom_mix\"` | Mix of your choice | β | varies | Combine datasets |"
|
| 139 |
]
|
| 140 |
},
|
| 141 |
{
|
|
|
|
| 146 |
"source": [
|
| 147 |
"from datasets import load_dataset, concatenate_datasets\n",
|
| 148 |
"\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
| 149 |
"DATASET_CHOICE = \"cybersecurity\"\n",
|
| 150 |
"\n",
|
| 151 |
"# DATASET_CHOICE = \"ultrachat\"\n",
|
|
|
|
| 153 |
"# DATASET_CHOICE = \"sharegpt_en\"\n",
|
| 154 |
"# DATASET_CHOICE = \"sharegpt_de\"\n",
|
| 155 |
"# DATASET_CHOICE = \"sharegpt_hi\"\n",
|
| 156 |
+
"# DATASET_CHOICE = \"code_corpus\"\n",
|
| 157 |
"# DATASET_CHOICE = \"custom_mix\"\n",
|
| 158 |
"\n",
|
| 159 |
"CUSTOM_DATASETS = [\n",
|
|
|
|
| 207 |
" msgs.append({\"role\": role, \"content\": turn[\"value\"]})\n",
|
| 208 |
" return {\"messages\": msgs}\n",
|
| 209 |
"\n",
|
| 210 |
+
"def _convert_code_corpus(example):\n",
|
| 211 |
+
" code_text = example[\"text\"]\n",
|
| 212 |
+
" domain = example.get(\"domain\", \"code\")\n",
|
| 213 |
+
" repo = example.get(\"repo\", \"unknown\")\n",
|
| 214 |
+
" lang = example.get(\"language\", \"\")\n",
|
| 215 |
+
" user_prompt = f\"Here is a code snippet from the {domain} domain (repo: {repo}, language: {lang}). Please explain or improve it.\"\n",
|
| 216 |
+
" return {\"messages\": [\n",
|
| 217 |
+
" {\"role\": \"user\", \"content\": user_prompt},\n",
|
| 218 |
+
" {\"role\": \"assistant\", \"content\": code_text},\n",
|
| 219 |
+
" ]}\n",
|
| 220 |
+
"\n",
|
| 221 |
"all_datasets = []\n",
|
| 222 |
"\n",
|
| 223 |
"if DATASET_CHOICE == \"cybersecurity\":\n",
|
|
|
|
| 244 |
" ds = ds.map(_convert_conversations, remove_columns=ds.column_names, batched=False)\n",
|
| 245 |
" all_datasets.append(ds)\n",
|
| 246 |
"\n",
|
| 247 |
+
"elif DATASET_CHOICE == \"code_corpus\":\n",
|
| 248 |
+
" ds = load_dataset(\"krystv/code-corpus-llm-training\", split=\"train\")\n",
|
| 249 |
+
" ds = ds.map(_convert_code_corpus, remove_columns=ds.column_names, batched=False)\n",
|
| 250 |
+
" all_datasets.append(ds)\n",
|
| 251 |
+
"\n",
|
| 252 |
"elif DATASET_CHOICE == \"custom_mix\":\n",
|
| 253 |
" for ds_id, split, n_rows, fmt in CUSTOM_DATASETS:\n",
|
| 254 |
" ds = load_dataset(ds_id, split=split)\n",
|
|
|
|
| 256 |
" ds = ds.shuffle(seed=3407).select(range(n_rows))\n",
|
| 257 |
" if fmt == \"messages\": ds = ds.map(_convert_ultrachat, remove_columns=ds.column_names, batched=False)\n",
|
| 258 |
" elif fmt == \"conversations\": ds = ds.map(_convert_conversations, remove_columns=ds.column_names, batched=False)\n",
|
| 259 |
+
" elif fmt == \"text\": ds = ds.map(_convert_code_corpus, remove_columns=ds.column_names, batched=False)\n",
|
| 260 |
" all_datasets.append(ds)\n",
|
| 261 |
"\n",
|
| 262 |
"else:\n",
|
|
|
|
| 450 |
"| **Liquid AI Models** | https://www.liquid.ai/models |\n",
|
| 451 |
"| **LFM2.5-1.2B-Instruct** | https://huggingface.co/LiquidAI/LFM2.5-1.2B-Instruct |\n",
|
| 452 |
"| **Unsloth LFM2.5 Docs** | https://unsloth.ai/docs/models/tutorials/lfm2.5 |\n",
|
| 453 |
+
"| **Code Corpus LLM Training** | https://huggingface.co/datasets/krystv/code-corpus-llm-training |\n",
|
| 454 |
"| **UltraChat 200K** | https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k |\n",
|
| 455 |
"| **OpenHermes 2.5** | https://huggingface.co/datasets/teknium/OpenHermes-2.5 |\n",
|
| 456 |
"| **ShareGPT Multilingual** | https://huggingface.co/datasets/deepmage121/ShareGPT_multilingual |\n",
|