asdf98 committed on
Commit
2fab0ea
·
verified ·
1 Parent(s): 8dcc13b

Upload EthicalHacking_Gemma4_E2B_Colab.ipynb

Browse files
Files changed (1) hide show
  1. EthicalHacking_Gemma4_E2B_Colab.ipynb +44 -31
EthicalHacking_Gemma4_E2B_Colab.ipynb CHANGED
@@ -26,8 +26,7 @@
26
  "| Max seq length | **2048 max** |\n",
27
  "| LoRA rank | **8** (save VRAM) |\n",
28
  "\n",
29
- "**Unsloth docs:** https://unsloth.ai/docs/models/gemma-4/train \n",
30
- "**Official notebook:** https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma4_(E2B)-Text.ipynb"
31
  ]
32
  },
33
  {
@@ -61,7 +60,7 @@
61
  "outputs": [],
62
  "source": [
63
  "from huggingface_hub import login\n",
64
- "# login(token=\"hf_YOUR_TOKEN\") # ← uncomment and paste your token"
65
  ]
66
  },
67
  {
@@ -93,23 +92,21 @@
93
  "from unsloth import FastLanguageModel\n",
94
  "import torch\n",
95
  "\n",
96
- "# ==================== T4-COLAB HYPERPARAMETERS (Gemma-4 E2B) ====================\n",
97
- "MAX_SEQ_LENGTH = 2048 # DO NOT exceed 2048 on T4\n",
98
- "LORA_R = 8 # small rank for memory\n",
99
- "LORA_ALPHA = 8 \n",
100
- "BATCH_SIZE = 1 # MUST be 1 on T4\n",
101
- "GRAD_ACCUM = 8 # effective batch = 8\n",
102
- "LEARNING_RATE = 2e-4 \n",
103
- "MAX_STEPS = 4000 \n",
104
- "WARMUP_STEPS = 100 \n",
105
- "LOGGING_STEPS = 50 \n",
106
- "SAVE_STEPS = 500 \n",
107
- "PACKING = False # False = safer memory\n",
108
- "SAMPLE_SIZE = 50000 \n",
109
  "HUB_MODEL_ID = \"your-username/gemma4-e2b-lora\"\n",
110
- "# ================================================================================\n",
111
  "\n",
112
- "MODEL_NAME = \"unsloth/gemma-4-E2B-it-unsloth-bnb-4bit\" # ~7.6GB download\n",
113
  "\n",
114
  "model, tokenizer = FastLanguageModel.from_pretrained(\n",
115
  " model_name=MODEL_NAME,\n",
@@ -126,7 +123,7 @@
126
  " lora_alpha=LORA_ALPHA,\n",
127
  " lora_dropout=0,\n",
128
  " bias=\"none\",\n",
129
- " use_gradient_checkpointing=\"unsloth\", # CRITICAL for T4\n",
130
  " random_state=3407,\n",
131
  " use_rslora=False,\n",
132
  " loftq_config=None,\n",
@@ -147,15 +144,16 @@
147
  "\n",
148
  "Uncomment **ONE** `DATASET_CHOICE` line. Mix datasets with `custom_mix`.\n",
149
  "\n",
150
- "| Choice | Dataset | Size | Format | Best For |\n",
151
  "|--------|---------|------|--------|----------|\n",
152
- "| `\"cybersecurity\"` | Fenrir + Trendyol | 153K | system/user/assistant | **Ethical hacking education** |\n",
153
- "| `\"ultrachat\"` | UltraChat 200K SFT | 200K | messages | General conversation |\n",
154
- "| `\"openhermes\"` | OpenHermes 2.5 | 1M+ | conversations | Reasoning, coding |\n",
155
- "| `\"sharegpt_en\"` | ShareGPT English | ~90K | conversations | Multi-turn dialogue |\n",
156
- "| `\"sharegpt_de\"` | ShareGPT German | ~104K | conversations | German fine-tuning |\n",
157
- "| `\"sharegpt_hi\"` | ShareGPT Hindi | ~153K | conversations | Hindi fine-tuning |\n",
158
- "| `\"custom_mix\"` | Your mix | | varies | Combine multiple |"
 
159
  ]
160
  },
161
  {
@@ -166,10 +164,6 @@
166
  "source": [
167
  "from datasets import load_dataset, concatenate_datasets\n",
168
  "\n",
169
- "# ═══════════════════════════════════════════════════════════════\n",
170
- "# SELECT YOUR DATASET — UNCOMMENT ONE LINE\n",
171
- "# ═══════════════════════════════════════════════════════════════\n",
172
- "\n",
173
  "DATASET_CHOICE = \"cybersecurity\"\n",
174
  "\n",
175
  "# DATASET_CHOICE = \"ultrachat\"\n",
@@ -177,6 +171,7 @@
177
  "# DATASET_CHOICE = \"sharegpt_en\"\n",
178
  "# DATASET_CHOICE = \"sharegpt_de\"\n",
179
  "# DATASET_CHOICE = \"sharegpt_hi\"\n",
 
180
  "# DATASET_CHOICE = \"custom_mix\"\n",
181
  "\n",
182
  "CUSTOM_DATASETS = [\n",
@@ -230,6 +225,17 @@
230
  " msgs.append({\"role\": role, \"content\": turn[\"value\"]})\n",
231
  " return {\"messages\": msgs}\n",
232
  "\n",
 
 
 
 
 
 
 
 
 
 
 
233
  "all_datasets = []\n",
234
  "\n",
235
  "if DATASET_CHOICE == \"cybersecurity\":\n",
@@ -256,6 +262,11 @@
256
  " ds = ds.map(_convert_conversations, remove_columns=ds.column_names, batched=False)\n",
257
  " all_datasets.append(ds)\n",
258
  "\n",
 
 
 
 
 
259
  "elif DATASET_CHOICE == \"custom_mix\":\n",
260
  " for ds_id, split, n_rows, fmt in CUSTOM_DATASETS:\n",
261
  " ds = load_dataset(ds_id, split=split)\n",
@@ -263,6 +274,7 @@
263
  " ds = ds.shuffle(seed=3407).select(range(n_rows))\n",
264
  " if fmt == \"messages\": ds = ds.map(_convert_ultrachat, remove_columns=ds.column_names, batched=False)\n",
265
  " elif fmt == \"conversations\": ds = ds.map(_convert_conversations, remove_columns=ds.column_names, batched=False)\n",
 
266
  " all_datasets.append(ds)\n",
267
  "\n",
268
  "else:\n",
@@ -460,6 +472,7 @@
460
  "| **Gemma 4 Paper** | https://storage.googleapis.com/deepmind-media/gemma/gemma-4-report.pdf |\n",
461
  "| **Gemma 4 E2B** | https://huggingface.co/google/gemma-4-E2B-it |\n",
462
  "| **Unsloth Gemma-4 Train** | https://unsloth.ai/docs/models/gemma-4/train |\n",
 
463
  "| **UltraChat 200K** | https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k |\n",
464
  "| **OpenHermes 2.5** | https://huggingface.co/datasets/teknium/OpenHermes-2.5 |\n",
465
  "| **ShareGPT Multilingual** | https://huggingface.co/datasets/deepmage121/ShareGPT_multilingual |\n",
 
26
  "| Max seq length | **2048 max** |\n",
27
  "| LoRA rank | **8** (save VRAM) |\n",
28
  "\n",
29
+ "**Unsloth docs:** https://unsloth.ai/docs/models/gemma-4/train"
 
30
  ]
31
  },
32
  {
 
60
  "outputs": [],
61
  "source": [
62
  "from huggingface_hub import login\n",
63
+ "# login(token=\"hf_YOUR_TOKEN\")"
64
  ]
65
  },
66
  {
 
92
  "from unsloth import FastLanguageModel\n",
93
  "import torch\n",
94
  "\n",
95
+ "MAX_SEQ_LENGTH = 2048\n",
96
+ "LORA_R = 8\n",
97
+ "LORA_ALPHA = 8\n",
98
+ "BATCH_SIZE = 1\n",
99
+ "GRAD_ACCUM = 8\n",
100
+ "LEARNING_RATE = 2e-4\n",
101
+ "MAX_STEPS = 4000\n",
102
+ "WARMUP_STEPS = 100\n",
103
+ "LOGGING_STEPS = 50\n",
104
+ "SAVE_STEPS = 500\n",
105
+ "PACKING = False\n",
106
+ "SAMPLE_SIZE = 50000\n",
 
107
  "HUB_MODEL_ID = \"your-username/gemma4-e2b-lora\"\n",
 
108
  "\n",
109
+ "MODEL_NAME = \"unsloth/gemma-4-E2B-it-unsloth-bnb-4bit\"\n",
110
  "\n",
111
  "model, tokenizer = FastLanguageModel.from_pretrained(\n",
112
  " model_name=MODEL_NAME,\n",
 
123
  " lora_alpha=LORA_ALPHA,\n",
124
  " lora_dropout=0,\n",
125
  " bias=\"none\",\n",
126
+ " use_gradient_checkpointing=\"unsloth\",\n",
127
  " random_state=3407,\n",
128
  " use_rslora=False,\n",
129
  " loftq_config=None,\n",
 
144
  "\n",
145
  "Uncomment **ONE** `DATASET_CHOICE` line. Mix datasets with `custom_mix`.\n",
146
  "\n",
147
+ "| Choice | Dataset | Rows | Format | Best For |\n",
148
  "|--------|---------|------|--------|----------|\n",
149
+ "| `\"cybersecurity\"` | Fenrir + Trendyol | 153K→50K | system/user/assistant | Ethical hacking education |\n",
150
+ "| `\"ultrachat\"` | UltraChat 200K SFT | 200K→50K | messages | General conversation |\n",
151
+ "| `\"openhermes\"` | OpenHermes 2.5 | 1M+→50K | conversations | Reasoning, coding |\n",
152
+ "| `\"sharegpt_en\"` | ShareGPT English | ~90K→50K | conversations | Multi-turn dialogue |\n",
153
+ "| `\"sharegpt_de\"` | ShareGPT German | ~104K→50K | conversations | German fine-tuning |\n",
154
+ "| `\"sharegpt_hi\"` | ShareGPT Hindi | ~153K→50K | conversations | Hindi fine-tuning |\n",
155
+ "| `\"code_corpus\"` | [Code Corpus](https://huggingface.co/datasets/krystv/code-corpus-llm-training) | 240K→50K | text (code files) | **Code completion** |\n",
156
+ "| `\"custom_mix\"` | Mix of your choice | — | varies | Combine datasets |"
157
  ]
158
  },
159
  {
 
164
  "source": [
165
  "from datasets import load_dataset, concatenate_datasets\n",
166
  "\n",
 
 
 
 
167
  "DATASET_CHOICE = \"cybersecurity\"\n",
168
  "\n",
169
  "# DATASET_CHOICE = \"ultrachat\"\n",
 
171
  "# DATASET_CHOICE = \"sharegpt_en\"\n",
172
  "# DATASET_CHOICE = \"sharegpt_de\"\n",
173
  "# DATASET_CHOICE = \"sharegpt_hi\"\n",
174
+ "# DATASET_CHOICE = \"code_corpus\"\n",
175
  "# DATASET_CHOICE = \"custom_mix\"\n",
176
  "\n",
177
  "CUSTOM_DATASETS = [\n",
 
225
  " msgs.append({\"role\": role, \"content\": turn[\"value\"]})\n",
226
  " return {\"messages\": msgs}\n",
227
  "\n",
228
+ "def _convert_code_corpus(example):\n",
229
+ " code_text = example[\"text\"]\n",
230
+ " domain = example.get(\"domain\", \"code\")\n",
231
+ " repo = example.get(\"repo\", \"unknown\")\n",
232
+ " lang = example.get(\"language\", \"\")\n",
233
+ " user_prompt = f\"Here is a code snippet from the {domain} domain (repo: {repo}, language: {lang}). Please explain or improve it.\"\n",
234
+ " return {\"messages\": [\n",
235
+ " {\"role\": \"user\", \"content\": user_prompt},\n",
236
+ " {\"role\": \"assistant\", \"content\": code_text},\n",
237
+ " ]}\n",
238
+ "\n",
239
  "all_datasets = []\n",
240
  "\n",
241
  "if DATASET_CHOICE == \"cybersecurity\":\n",
 
262
  " ds = ds.map(_convert_conversations, remove_columns=ds.column_names, batched=False)\n",
263
  " all_datasets.append(ds)\n",
264
  "\n",
265
+ "elif DATASET_CHOICE == \"code_corpus\":\n",
266
+ " ds = load_dataset(\"krystv/code-corpus-llm-training\", split=\"train\")\n",
267
+ " ds = ds.map(_convert_code_corpus, remove_columns=ds.column_names, batched=False)\n",
268
+ " all_datasets.append(ds)\n",
269
+ "\n",
270
  "elif DATASET_CHOICE == \"custom_mix\":\n",
271
  " for ds_id, split, n_rows, fmt in CUSTOM_DATASETS:\n",
272
  " ds = load_dataset(ds_id, split=split)\n",
 
274
  " ds = ds.shuffle(seed=3407).select(range(n_rows))\n",
275
  " if fmt == \"messages\": ds = ds.map(_convert_ultrachat, remove_columns=ds.column_names, batched=False)\n",
276
  " elif fmt == \"conversations\": ds = ds.map(_convert_conversations, remove_columns=ds.column_names, batched=False)\n",
277
+ " elif fmt == \"text\": ds = ds.map(_convert_code_corpus, remove_columns=ds.column_names, batched=False)\n",
278
  " all_datasets.append(ds)\n",
279
  "\n",
280
  "else:\n",
 
472
  "| **Gemma 4 Paper** | https://storage.googleapis.com/deepmind-media/gemma/gemma-4-report.pdf |\n",
473
  "| **Gemma 4 E2B** | https://huggingface.co/google/gemma-4-E2B-it |\n",
474
  "| **Unsloth Gemma-4 Train** | https://unsloth.ai/docs/models/gemma-4/train |\n",
475
+ "| **Code Corpus LLM Training** | https://huggingface.co/datasets/krystv/code-corpus-llm-training |\n",
476
  "| **UltraChat 200K** | https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k |\n",
477
  "| **OpenHermes 2.5** | https://huggingface.co/datasets/teknium/OpenHermes-2.5 |\n",
478
  "| **ShareGPT Multilingual** | https://huggingface.co/datasets/deepmage121/ShareGPT_multilingual |\n",