asdf98
/

ethical-hacking-llm-colab

Model card Files Files and versions

xet

Community

asdf98 committed 19 days ago

Commit

2fab0ea

verified ·

1 Parent(s): 8dcc13b

Upload EthicalHacking_Gemma4_E2B_Colab.ipynb

Browse files

Files changed (1) hide show

EthicalHacking_Gemma4_E2B_Colab.ipynb +44 -31

EthicalHacking_Gemma4_E2B_Colab.ipynb CHANGED Viewed

@@ -26,8 +26,7 @@
     "| Max seq length | **2048 max** |\n",
     "| LoRA rank | **8** (save VRAM) |\n",
     "\n",
-    "**Unsloth docs:** https://unsloth.ai/docs/models/gemma-4/train  \n",
-    "**Official notebook:** https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma4_(E2B)-Text.ipynb"
    ]
   },
   {
@@ -61,7 +60,7 @@
    "outputs": [],
    "source": [
     "from huggingface_hub import login\n",
-    "# login(token=\"hf_YOUR_TOKEN\")   # ← uncomment and paste your token"
    ]
   },
   {
@@ -93,23 +92,21 @@
     "from unsloth import FastLanguageModel\n",
     "import torch\n",
     "\n",
-    "# ==================== T4-COLAB HYPERPARAMETERS (Gemma-4 E2B) ====================\n",
-    "MAX_SEQ_LENGTH = 2048          # DO NOT exceed 2048 on T4\n",
-    "LORA_R = 8                     # small rank for memory\n",
-    "LORA_ALPHA = 8                 \n",
-    "BATCH_SIZE = 1                 # MUST be 1 on T4\n",
-    "GRAD_ACCUM = 8                 # effective batch = 8\n",
-    "LEARNING_RATE = 2e-4           \n",
-    "MAX_STEPS = 4000               \n",
-    "WARMUP_STEPS = 100             \n",
-    "LOGGING_STEPS = 50             \n",
-    "SAVE_STEPS = 500               \n",
-    "PACKING = False                # False = safer memory\n",
-    "SAMPLE_SIZE = 50000            \n",
     "HUB_MODEL_ID = \"your-username/gemma4-e2b-lora\"\n",
-    "# ================================================================================\n",
     "\n",
-    "MODEL_NAME = \"unsloth/gemma-4-E2B-it-unsloth-bnb-4bit\"  # ~7.6GB download\n",
     "\n",
     "model, tokenizer = FastLanguageModel.from_pretrained(\n",
     "    model_name=MODEL_NAME,\n",
@@ -126,7 +123,7 @@
     "    lora_alpha=LORA_ALPHA,\n",
     "    lora_dropout=0,\n",
     "    bias=\"none\",\n",
-    "    use_gradient_checkpointing=\"unsloth\",  # CRITICAL for T4\n",
     "    random_state=3407,\n",
     "    use_rslora=False,\n",
     "    loftq_config=None,\n",
@@ -147,15 +144,16 @@
     "\n",
     "Uncomment **ONE** `DATASET_CHOICE` line. Mix datasets with `custom_mix`.\n",
     "\n",
-    "| Choice | Dataset | Size | Format | Best For |\n",
     "|--------|---------|------|--------|----------|\n",
-    "| `\"cybersecurity\"` | Fenrir + Trendyol | 153K | system/user/assistant | **Ethical hacking education** |\n",
-    "| `\"ultrachat\"` | UltraChat 200K SFT | 200K | messages | General conversation |\n",
-    "| `\"openhermes\"` | OpenHermes 2.5 | 1M+ | conversations | Reasoning, coding |\n",
-    "| `\"sharegpt_en\"` | ShareGPT English | ~90K | conversations | Multi-turn dialogue |\n",
-    "| `\"sharegpt_de\"` | ShareGPT German | ~104K | conversations | German fine-tuning |\n",
-    "| `\"sharegpt_hi\"` | ShareGPT Hindi | ~153K | conversations | Hindi fine-tuning |\n",
-    "| `\"custom_mix\"` | Your mix | — | varies | Combine multiple |"
    ]
   },
   {
@@ -166,10 +164,6 @@
    "source": [
     "from datasets import load_dataset, concatenate_datasets\n",
     "\n",
-    "# ═══════════════════════════════════════════════════════════════\n",
-    "#   SELECT YOUR DATASET — UNCOMMENT ONE LINE\n",
-    "# ═══════════════════════════════════════════════════════════════\n",
-    "\n",
     "DATASET_CHOICE = \"cybersecurity\"\n",
     "\n",
     "# DATASET_CHOICE = \"ultrachat\"\n",
@@ -177,6 +171,7 @@
     "# DATASET_CHOICE = \"sharegpt_en\"\n",
     "# DATASET_CHOICE = \"sharegpt_de\"\n",
     "# DATASET_CHOICE = \"sharegpt_hi\"\n",
     "# DATASET_CHOICE = \"custom_mix\"\n",
     "\n",
     "CUSTOM_DATASETS = [\n",
@@ -230,6 +225,17 @@
     "        msgs.append({\"role\": role, \"content\": turn[\"value\"]})\n",
     "    return {\"messages\": msgs}\n",
     "\n",
     "all_datasets = []\n",
     "\n",
     "if DATASET_CHOICE == \"cybersecurity\":\n",
@@ -256,6 +262,11 @@
     "    ds = ds.map(_convert_conversations, remove_columns=ds.column_names, batched=False)\n",
     "    all_datasets.append(ds)\n",
     "\n",
     "elif DATASET_CHOICE == \"custom_mix\":\n",
     "    for ds_id, split, n_rows, fmt in CUSTOM_DATASETS:\n",
     "        ds = load_dataset(ds_id, split=split)\n",
@@ -263,6 +274,7 @@
     "            ds = ds.shuffle(seed=3407).select(range(n_rows))\n",
     "        if fmt == \"messages\": ds = ds.map(_convert_ultrachat, remove_columns=ds.column_names, batched=False)\n",
     "        elif fmt == \"conversations\": ds = ds.map(_convert_conversations, remove_columns=ds.column_names, batched=False)\n",
     "        all_datasets.append(ds)\n",
     "\n",
     "else:\n",
@@ -460,6 +472,7 @@
     "| **Gemma 4 Paper** | https://storage.googleapis.com/deepmind-media/gemma/gemma-4-report.pdf |\n",
     "| **Gemma 4 E2B** | https://huggingface.co/google/gemma-4-E2B-it |\n",
     "| **Unsloth Gemma-4 Train** | https://unsloth.ai/docs/models/gemma-4/train |\n",
     "| **UltraChat 200K** | https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k |\n",
     "| **OpenHermes 2.5** | https://huggingface.co/datasets/teknium/OpenHermes-2.5 |\n",
     "| **ShareGPT Multilingual** | https://huggingface.co/datasets/deepmage121/ShareGPT_multilingual |\n",

     "| Max seq length | **2048 max** |\n",
     "| LoRA rank | **8** (save VRAM) |\n",
     "\n",
+    "**Unsloth docs:** https://unsloth.ai/docs/models/gemma-4/train"
    ]
   },
   {
    "outputs": [],
    "source": [
     "from huggingface_hub import login\n",
+    "# login(token=\"hf_YOUR_TOKEN\")"
    ]
   },
   {
     "from unsloth import FastLanguageModel\n",
     "import torch\n",
     "\n",
+    "MAX_SEQ_LENGTH = 2048\n",
+    "LORA_R = 8\n",
+    "LORA_ALPHA = 8\n",
+    "BATCH_SIZE = 1\n",
+    "GRAD_ACCUM = 8\n",
+    "LEARNING_RATE = 2e-4\n",
+    "MAX_STEPS = 4000\n",
+    "WARMUP_STEPS = 100\n",
+    "LOGGING_STEPS = 50\n",
+    "SAVE_STEPS = 500\n",
+    "PACKING = False\n",
+    "SAMPLE_SIZE = 50000\n",
     "HUB_MODEL_ID = \"your-username/gemma4-e2b-lora\"\n",
     "\n",
+    "MODEL_NAME = \"unsloth/gemma-4-E2B-it-unsloth-bnb-4bit\"\n",
     "\n",
     "model, tokenizer = FastLanguageModel.from_pretrained(\n",
     "    model_name=MODEL_NAME,\n",
     "    lora_alpha=LORA_ALPHA,\n",
     "    lora_dropout=0,\n",
     "    bias=\"none\",\n",
+    "    use_gradient_checkpointing=\"unsloth\",\n",
     "    random_state=3407,\n",
     "    use_rslora=False,\n",
     "    loftq_config=None,\n",
     "\n",
     "Uncomment **ONE** `DATASET_CHOICE` line. Mix datasets with `custom_mix`.\n",
     "\n",
+    "| Choice | Dataset | Rows | Format | Best For |\n",
     "|--------|---------|------|--------|----------|\n",
+    "| `\"cybersecurity\"` | Fenrir + Trendyol | 153K→50K | system/user/assistant | Ethical hacking education |\n",
+    "| `\"ultrachat\"` | UltraChat 200K SFT | 200K→50K | messages | General conversation |\n",
+    "| `\"openhermes\"` | OpenHermes 2.5 | 1M+→50K | conversations | Reasoning, coding |\n",
+    "| `\"sharegpt_en\"` | ShareGPT English | ~90K→50K | conversations | Multi-turn dialogue |\n",
+    "| `\"sharegpt_de\"` | ShareGPT German | ~104K→50K | conversations | German fine-tuning |\n",
+    "| `\"sharegpt_hi\"` | ShareGPT Hindi | ~153K→50K | conversations | Hindi fine-tuning |\n",
+    "| `\"code_corpus\"` | [Code Corpus](https://huggingface.co/datasets/krystv/code-corpus-llm-training) | 240K→50K | text (code files) | **Code completion** |\n",
+    "| `\"custom_mix\"` | Mix of your choice | — | varies | Combine datasets |"
    ]
   },
   {
    "source": [
     "from datasets import load_dataset, concatenate_datasets\n",
     "\n",
     "DATASET_CHOICE = \"cybersecurity\"\n",
     "\n",
     "# DATASET_CHOICE = \"ultrachat\"\n",
     "# DATASET_CHOICE = \"sharegpt_en\"\n",
     "# DATASET_CHOICE = \"sharegpt_de\"\n",
     "# DATASET_CHOICE = \"sharegpt_hi\"\n",
+    "# DATASET_CHOICE = \"code_corpus\"\n",
     "# DATASET_CHOICE = \"custom_mix\"\n",
     "\n",
     "CUSTOM_DATASETS = [\n",
     "        msgs.append({\"role\": role, \"content\": turn[\"value\"]})\n",
     "    return {\"messages\": msgs}\n",
     "\n",
+    "def _convert_code_corpus(example):\n",
+    "    code_text = example[\"text\"]\n",
+    "    domain = example.get(\"domain\", \"code\")\n",
+    "    repo = example.get(\"repo\", \"unknown\")\n",
+    "    lang = example.get(\"language\", \"\")\n",
+    "    user_prompt = f\"Here is a code snippet from the {domain} domain (repo: {repo}, language: {lang}). Please explain or improve it.\"\n",
+    "    return {\"messages\": [\n",
+    "        {\"role\": \"user\",      \"content\": user_prompt},\n",
+    "        {\"role\": \"assistant\", \"content\": code_text},\n",
+    "    ]}\n",
+    "\n",
     "all_datasets = []\n",
     "\n",
     "if DATASET_CHOICE == \"cybersecurity\":\n",
     "    ds = ds.map(_convert_conversations, remove_columns=ds.column_names, batched=False)\n",
     "    all_datasets.append(ds)\n",
     "\n",
+    "elif DATASET_CHOICE == \"code_corpus\":\n",
+    "    ds = load_dataset(\"krystv/code-corpus-llm-training\", split=\"train\")\n",
+    "    ds = ds.map(_convert_code_corpus, remove_columns=ds.column_names, batched=False)\n",
+    "    all_datasets.append(ds)\n",
+    "\n",
     "elif DATASET_CHOICE == \"custom_mix\":\n",
     "    for ds_id, split, n_rows, fmt in CUSTOM_DATASETS:\n",
     "        ds = load_dataset(ds_id, split=split)\n",
     "            ds = ds.shuffle(seed=3407).select(range(n_rows))\n",
     "        if fmt == \"messages\": ds = ds.map(_convert_ultrachat, remove_columns=ds.column_names, batched=False)\n",
     "        elif fmt == \"conversations\": ds = ds.map(_convert_conversations, remove_columns=ds.column_names, batched=False)\n",
+    "        elif fmt == \"text\": ds = ds.map(_convert_code_corpus, remove_columns=ds.column_names, batched=False)\n",
     "        all_datasets.append(ds)\n",
     "\n",
     "else:\n",
     "| **Gemma 4 Paper** | https://storage.googleapis.com/deepmind-media/gemma/gemma-4-report.pdf |\n",
     "| **Gemma 4 E2B** | https://huggingface.co/google/gemma-4-E2B-it |\n",
     "| **Unsloth Gemma-4 Train** | https://unsloth.ai/docs/models/gemma-4/train |\n",
+    "| **Code Corpus LLM Training** | https://huggingface.co/datasets/krystv/code-corpus-llm-training |\n",
     "| **UltraChat 200K** | https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k |\n",
     "| **OpenHermes 2.5** | https://huggingface.co/datasets/teknium/OpenHermes-2.5 |\n",
     "| **ShareGPT Multilingual** | https://huggingface.co/datasets/deepmage121/ShareGPT_multilingual |\n",