Upload EthicalHacking_Gemma4_E2B_Colab.ipynb
Browse files
EthicalHacking_Gemma4_E2B_Colab.ipynb
CHANGED
|
@@ -26,8 +26,7 @@
|
|
| 26 |
"| Max seq length | **2048 max** |\n",
|
| 27 |
"| LoRA rank | **8** (save VRAM) |\n",
|
| 28 |
"\n",
|
| 29 |
-
"**Unsloth docs:** https://unsloth.ai/docs/models/gemma-4/train\n",
|
| 30 |
-
"**Official notebook:** https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma4_(E2B)-Text.ipynb"
|
| 31 |
]
|
| 32 |
},
|
| 33 |
{
|
|
@@ -61,7 +60,7 @@
|
|
| 61 |
"outputs": [],
|
| 62 |
"source": [
|
| 63 |
"from huggingface_hub import login\n",
|
| 64 |
-
"# login(token=\"hf_YOUR_TOKEN\")\n"
|
| 65 |
]
|
| 66 |
},
|
| 67 |
{
|
|
@@ -93,23 +92,21 @@
|
|
| 93 |
"from unsloth import FastLanguageModel\n",
|
| 94 |
"import torch\n",
|
| 95 |
"\n",
|
| 96 |
-
"
|
| 97 |
-
"
|
| 98 |
-
"
|
| 99 |
-
"
|
| 100 |
-
"
|
| 101 |
-
"
|
| 102 |
-
"
|
| 103 |
-
"
|
| 104 |
-
"
|
| 105 |
-
"
|
| 106 |
-
"
|
| 107 |
-
"
|
| 108 |
-
"SAMPLE_SIZE = 50000 \n",
|
| 109 |
"HUB_MODEL_ID = \"your-username/gemma4-e2b-lora\"\n",
|
| 110 |
-
"# ================================================================================\n",
|
| 111 |
"\n",
|
| 112 |
-
"MODEL_NAME = \"unsloth/gemma-4-E2B-it-unsloth-bnb-4bit\"
|
| 113 |
"\n",
|
| 114 |
"model, tokenizer = FastLanguageModel.from_pretrained(\n",
|
| 115 |
" model_name=MODEL_NAME,\n",
|
|
@@ -126,7 +123,7 @@
|
|
| 126 |
" lora_alpha=LORA_ALPHA,\n",
|
| 127 |
" lora_dropout=0,\n",
|
| 128 |
" bias=\"none\",\n",
|
| 129 |
-
" use_gradient_checkpointing=\"unsloth\",
|
| 130 |
" random_state=3407,\n",
|
| 131 |
" use_rslora=False,\n",
|
| 132 |
" loftq_config=None,\n",
|
|
@@ -147,15 +144,16 @@
|
|
| 147 |
"\n",
|
| 148 |
"Uncomment **ONE** `DATASET_CHOICE` line. Mix datasets with `custom_mix`.\n",
|
| 149 |
"\n",
|
| 150 |
-
"| Choice | Dataset |
|
| 151 |
"|--------|---------|------|--------|----------|\n",
|
| 152 |
-
"| `\"cybersecurity\"` | Fenrir + Trendyol | 153K | system/user/assistant |
|
| 153 |
-
"| `\"ultrachat\"` | UltraChat 200K SFT | 200K | messages | General conversation |\n",
|
| 154 |
-
"| `\"openhermes\"` | OpenHermes 2.5 | 1M+ | conversations | Reasoning, coding |\n",
|
| 155 |
-
"| `\"sharegpt_en\"` | ShareGPT English | ~90K | conversations | Multi-turn dialogue |\n",
|
| 156 |
-
"| `\"sharegpt_de\"` | ShareGPT German | ~104K | conversations | German fine-tuning |\n",
|
| 157 |
-
"| `\"sharegpt_hi\"` | ShareGPT Hindi | ~153K | conversations | Hindi fine-tuning |\n",
|
| 158 |
-
"| `\"custom_mix\"` | Mix of your choice | — | varies | Combine datasets |"
|
|
|
|
| 159 |
]
|
| 160 |
},
|
| 161 |
{
|
|
@@ -166,10 +164,6 @@
|
|
| 166 |
"source": [
|
| 167 |
"from datasets import load_dataset, concatenate_datasets\n",
|
| 168 |
"\n",
|
| 169 |
-
"# ═══════════════════════════════════════════════════════════════\n",
|
| 170 |
-
"# SELECT YOUR DATASET — UNCOMMENT ONE LINE\n",
|
| 171 |
-
"# ═══════════════════════════════════════════════════════════════\n",
|
| 172 |
-
"\n",
|
| 173 |
"DATASET_CHOICE = \"cybersecurity\"\n",
|
| 174 |
"\n",
|
| 175 |
"# DATASET_CHOICE = \"ultrachat\"\n",
|
|
@@ -177,6 +171,7 @@
|
|
| 177 |
"# DATASET_CHOICE = \"sharegpt_en\"\n",
|
| 178 |
"# DATASET_CHOICE = \"sharegpt_de\"\n",
|
| 179 |
"# DATASET_CHOICE = \"sharegpt_hi\"\n",
|
|
|
|
| 180 |
"# DATASET_CHOICE = \"custom_mix\"\n",
|
| 181 |
"\n",
|
| 182 |
"CUSTOM_DATASETS = [\n",
|
|
@@ -230,6 +225,17 @@
|
|
| 230 |
" msgs.append({\"role\": role, \"content\": turn[\"value\"]})\n",
|
| 231 |
" return {\"messages\": msgs}\n",
|
| 232 |
"\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 233 |
"all_datasets = []\n",
|
| 234 |
"\n",
|
| 235 |
"if DATASET_CHOICE == \"cybersecurity\":\n",
|
|
@@ -256,6 +262,11 @@
|
|
| 256 |
" ds = ds.map(_convert_conversations, remove_columns=ds.column_names, batched=False)\n",
|
| 257 |
" all_datasets.append(ds)\n",
|
| 258 |
"\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 259 |
"elif DATASET_CHOICE == \"custom_mix\":\n",
|
| 260 |
" for ds_id, split, n_rows, fmt in CUSTOM_DATASETS:\n",
|
| 261 |
" ds = load_dataset(ds_id, split=split)\n",
|
|
@@ -263,6 +274,7 @@
|
|
| 263 |
" ds = ds.shuffle(seed=3407).select(range(n_rows))\n",
|
| 264 |
" if fmt == \"messages\": ds = ds.map(_convert_ultrachat, remove_columns=ds.column_names, batched=False)\n",
|
| 265 |
" elif fmt == \"conversations\": ds = ds.map(_convert_conversations, remove_columns=ds.column_names, batched=False)\n",
|
|
|
|
| 266 |
" all_datasets.append(ds)\n",
|
| 267 |
"\n",
|
| 268 |
"else:\n",
|
|
@@ -460,6 +472,7 @@
|
|
| 460 |
"| **Gemma 4 Paper** | https://storage.googleapis.com/deepmind-media/gemma/gemma-4-report.pdf |\n",
|
| 461 |
"| **Gemma 4 E2B** | https://huggingface.co/google/gemma-4-E2B-it |\n",
|
| 462 |
"| **Unsloth Gemma-4 Train** | https://unsloth.ai/docs/models/gemma-4/train |\n",
|
|
|
|
| 463 |
"| **UltraChat 200K** | https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k |\n",
|
| 464 |
"| **OpenHermes 2.5** | https://huggingface.co/datasets/teknium/OpenHermes-2.5 |\n",
|
| 465 |
"| **ShareGPT Multilingual** | https://huggingface.co/datasets/deepmage121/ShareGPT_multilingual |\n",
|
|
|
|
| 26 |
"| Max seq length | **2048 max** |\n",
|
| 27 |
"| LoRA rank | **8** (save VRAM) |\n",
|
| 28 |
"\n",
|
| 29 |
+
"**Unsloth docs:** https://unsloth.ai/docs/models/gemma-4/train"
|
|
|
|
| 30 |
]
|
| 31 |
},
|
| 32 |
{
|
|
|
|
| 60 |
"outputs": [],
|
| 61 |
"source": [
|
| 62 |
"from huggingface_hub import login\n",
|
| 63 |
+
"# login(token=\"hf_YOUR_TOKEN\")"
|
| 64 |
]
|
| 65 |
},
|
| 66 |
{
|
|
|
|
| 92 |
"from unsloth import FastLanguageModel\n",
|
| 93 |
"import torch\n",
|
| 94 |
"\n",
|
| 95 |
+
"MAX_SEQ_LENGTH = 2048\n",
|
| 96 |
+
"LORA_R = 8\n",
|
| 97 |
+
"LORA_ALPHA = 8\n",
|
| 98 |
+
"BATCH_SIZE = 1\n",
|
| 99 |
+
"GRAD_ACCUM = 8\n",
|
| 100 |
+
"LEARNING_RATE = 2e-4\n",
|
| 101 |
+
"MAX_STEPS = 4000\n",
|
| 102 |
+
"WARMUP_STEPS = 100\n",
|
| 103 |
+
"LOGGING_STEPS = 50\n",
|
| 104 |
+
"SAVE_STEPS = 500\n",
|
| 105 |
+
"PACKING = False\n",
|
| 106 |
+
"SAMPLE_SIZE = 50000\n",
|
|
|
|
| 107 |
"HUB_MODEL_ID = \"your-username/gemma4-e2b-lora\"\n",
|
|
|
|
| 108 |
"\n",
|
| 109 |
+
"MODEL_NAME = \"unsloth/gemma-4-E2B-it-unsloth-bnb-4bit\"\n",
|
| 110 |
"\n",
|
| 111 |
"model, tokenizer = FastLanguageModel.from_pretrained(\n",
|
| 112 |
" model_name=MODEL_NAME,\n",
|
|
|
|
| 123 |
" lora_alpha=LORA_ALPHA,\n",
|
| 124 |
" lora_dropout=0,\n",
|
| 125 |
" bias=\"none\",\n",
|
| 126 |
+
" use_gradient_checkpointing=\"unsloth\",\n",
|
| 127 |
" random_state=3407,\n",
|
| 128 |
" use_rslora=False,\n",
|
| 129 |
" loftq_config=None,\n",
|
|
|
|
| 144 |
"\n",
|
| 145 |
"Uncomment **ONE** `DATASET_CHOICE` line. Mix datasets with `custom_mix`.\n",
|
| 146 |
"\n",
|
| 147 |
+
"| Choice | Dataset | Rows | Format | Best For |\n",
|
| 148 |
"|--------|---------|------|--------|----------|\n",
|
| 149 |
+
"| `\"cybersecurity\"` | Fenrir + Trendyol | 153K→50K | system/user/assistant | Ethical hacking education |\n",
|
| 150 |
+
"| `\"ultrachat\"` | UltraChat 200K SFT | 200K→50K | messages | General conversation |\n",
|
| 151 |
+
"| `\"openhermes\"` | OpenHermes 2.5 | 1M+→50K | conversations | Reasoning, coding |\n",
|
| 152 |
+
"| `\"sharegpt_en\"` | ShareGPT English | ~90K→50K | conversations | Multi-turn dialogue |\n",
|
| 153 |
+
"| `\"sharegpt_de\"` | ShareGPT German | ~104K→50K | conversations | German fine-tuning |\n",
|
| 154 |
+
"| `\"sharegpt_hi\"` | ShareGPT Hindi | ~153K→50K | conversations | Hindi fine-tuning |\n",
|
| 155 |
+
"| `\"code_corpus\"` | [Code Corpus](https://huggingface.co/datasets/krystv/code-corpus-llm-training) | 240K→50K | text (code files) | **Code completion** |\n",
|
| 156 |
+
"| `\"custom_mix\"` | Mix of your choice | — | varies | Combine datasets |"
|
| 157 |
]
|
| 158 |
},
|
| 159 |
{
|
|
|
|
| 164 |
"source": [
|
| 165 |
"from datasets import load_dataset, concatenate_datasets\n",
|
| 166 |
"\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
| 167 |
"DATASET_CHOICE = \"cybersecurity\"\n",
|
| 168 |
"\n",
|
| 169 |
"# DATASET_CHOICE = \"ultrachat\"\n",
|
|
|
|
| 171 |
"# DATASET_CHOICE = \"sharegpt_en\"\n",
|
| 172 |
"# DATASET_CHOICE = \"sharegpt_de\"\n",
|
| 173 |
"# DATASET_CHOICE = \"sharegpt_hi\"\n",
|
| 174 |
+
"# DATASET_CHOICE = \"code_corpus\"\n",
|
| 175 |
"# DATASET_CHOICE = \"custom_mix\"\n",
|
| 176 |
"\n",
|
| 177 |
"CUSTOM_DATASETS = [\n",
|
|
|
|
| 225 |
" msgs.append({\"role\": role, \"content\": turn[\"value\"]})\n",
|
| 226 |
" return {\"messages\": msgs}\n",
|
| 227 |
"\n",
|
| 228 |
+
"def _convert_code_corpus(example):\n",
|
| 229 |
+
" code_text = example[\"text\"]\n",
|
| 230 |
+
" domain = example.get(\"domain\", \"code\")\n",
|
| 231 |
+
" repo = example.get(\"repo\", \"unknown\")\n",
|
| 232 |
+
" lang = example.get(\"language\", \"\")\n",
|
| 233 |
+
" user_prompt = f\"Here is a code snippet from the {domain} domain (repo: {repo}, language: {lang}). Please explain or improve it.\"\n",
|
| 234 |
+
" return {\"messages\": [\n",
|
| 235 |
+
" {\"role\": \"user\", \"content\": user_prompt},\n",
|
| 236 |
+
" {\"role\": \"assistant\", \"content\": code_text},\n",
|
| 237 |
+
" ]}\n",
|
| 238 |
+
"\n",
|
| 239 |
"all_datasets = []\n",
|
| 240 |
"\n",
|
| 241 |
"if DATASET_CHOICE == \"cybersecurity\":\n",
|
|
|
|
| 262 |
" ds = ds.map(_convert_conversations, remove_columns=ds.column_names, batched=False)\n",
|
| 263 |
" all_datasets.append(ds)\n",
|
| 264 |
"\n",
|
| 265 |
+
"elif DATASET_CHOICE == \"code_corpus\":\n",
|
| 266 |
+
" ds = load_dataset(\"krystv/code-corpus-llm-training\", split=\"train\")\n",
|
| 267 |
+
" ds = ds.map(_convert_code_corpus, remove_columns=ds.column_names, batched=False)\n",
|
| 268 |
+
" all_datasets.append(ds)\n",
|
| 269 |
+
"\n",
|
| 270 |
"elif DATASET_CHOICE == \"custom_mix\":\n",
|
| 271 |
" for ds_id, split, n_rows, fmt in CUSTOM_DATASETS:\n",
|
| 272 |
" ds = load_dataset(ds_id, split=split)\n",
|
|
|
|
| 274 |
" ds = ds.shuffle(seed=3407).select(range(n_rows))\n",
|
| 275 |
" if fmt == \"messages\": ds = ds.map(_convert_ultrachat, remove_columns=ds.column_names, batched=False)\n",
|
| 276 |
" elif fmt == \"conversations\": ds = ds.map(_convert_conversations, remove_columns=ds.column_names, batched=False)\n",
|
| 277 |
+
" elif fmt == \"text\": ds = ds.map(_convert_code_corpus, remove_columns=ds.column_names, batched=False)\n",
|
| 278 |
" all_datasets.append(ds)\n",
|
| 279 |
"\n",
|
| 280 |
"else:\n",
|
|
|
|
| 472 |
"| **Gemma 4 Paper** | https://storage.googleapis.com/deepmind-media/gemma/gemma-4-report.pdf |\n",
|
| 473 |
"| **Gemma 4 E2B** | https://huggingface.co/google/gemma-4-E2B-it |\n",
|
| 474 |
"| **Unsloth Gemma-4 Train** | https://unsloth.ai/docs/models/gemma-4/train |\n",
|
| 475 |
+
"| **Code Corpus LLM Training** | https://huggingface.co/datasets/krystv/code-corpus-llm-training |\n",
|
| 476 |
"| **UltraChat 200K** | https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k |\n",
|
| 477 |
"| **OpenHermes 2.5** | https://huggingface.co/datasets/teknium/OpenHermes-2.5 |\n",
|
| 478 |
"| **ShareGPT Multilingual** | https://huggingface.co/datasets/deepmage121/ShareGPT_multilingual |\n",
|