asdf98
/

ethical-hacking-llm-colab

Model card Files Files and versions

xet

Community

asdf98 committed 19 days ago

Commit

8dcc13b

verified ·

1 Parent(s): fbc4da7

Upload EthicalHacking_LFM2.5_Ultimate_Colab.ipynb

Browse files

Files changed (1) hide show

EthicalHacking_LFM2.5_Ultimate_Colab.ipynb +32 -21

EthicalHacking_LFM2.5_Ultimate_Colab.ipynb CHANGED Viewed

@@ -11,7 +11,7 @@
     "**📊 Datasets:** Your choice — cybersecurity, general chat, multilingual, coding, or mix them!  \n",
     "**⚡ Framework:** Unsloth + TRL SFTTrainer — 2× faster, 70% less VRAM  \n",
     "\n",
-    "> ⚠️ Pick any dataset below. Default is cybersecurity. Mix datasets for hybrid tuning.\n",
     "\n",
     "---\n",
     "\n",
@@ -22,11 +22,10 @@
     "| Parameters | 1.2B |\n",
     "| 4-bit VRAM | ~1.0 GB |\n",
     "| Context | 128K tokens |\n",
-    "| Batch size on T4 | **4-8** |\n",
     "| Training headroom | **~14 GB free** |\n",
     "\n",
-    "**Unsloth docs:** https://unsloth.ai/docs/models/tutorials/lfm2.5  \n",
-    "**Official notebook:** https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Liquid_LFM2_(1.2B)-Conversational.ipynb"
    ]
   },
   {
@@ -60,7 +59,7 @@
    "outputs": [],
    "source": [
     "from huggingface_hub import login\n",
-    "# login(token=\"hf_YOUR_TOKEN\")   # ← uncomment and paste your token"
    ]
   },
   {
@@ -79,7 +78,6 @@
     "from unsloth import FastLanguageModel\n",
     "import torch\n",
     "\n",
-    "# ==================== T4-COLAB HYPERPARAMETERS (LFM2.5) ====================\n",
     "MAX_SEQ_LENGTH = 4096\n",
     "LORA_R = 128\n",
     "LORA_ALPHA = 128\n",
@@ -93,7 +91,6 @@
     "PACKING = True\n",
     "SAMPLE_SIZE = 50000\n",
     "HUB_MODEL_ID = \"your-username/lfm25-lora\"\n",
-    "# ========================================================================\n",
     "\n",
     "model, tokenizer = FastLanguageModel.from_pretrained(\n",
     "    model_name=\"unsloth/LFM2.5-1.2B-Instruct\",\n",
@@ -129,17 +126,16 @@
    "source": [
     "## 4️⃣ 🎯 CHOOSE YOUR DATASET(S)\n",
     "\n",
-    "Uncomment **ONE** `DATASET_CHOICE` line. Mix datasets with `custom_mix`.\n",
-    "\n",
-    "| Choice | Dataset | Size | Format | Best For |\n",
     "|--------|---------|------|--------|----------|\n",
-    "| `\"cybersecurity\"` | Fenrir + Trendyol | 153K | system/user/assistant | **Ethical hacking education** |\n",
-    "| `\"ultrachat\"` | UltraChat 200K SFT | 200K | messages | General conversation |\n",
-    "| `\"openhermes\"` | OpenHermes 2.5 | 1M+ | conversations | Reasoning, coding |\n",
-    "| `\"sharegpt_en\"` | ShareGPT English | ~90K | conversations | Multi-turn dialogue |\n",
-    "| `\"sharegpt_de\"` | ShareGPT German | ~104K | conversations | German fine-tuning |\n",
-    "| `\"sharegpt_hi\"` | ShareGPT Hindi | ~153K | conversations | Hindi fine-tuning |\n",
-    "| `\"custom_mix\"` | Your mix | — | varies | Combine multiple |"
    ]
   },
   {
@@ -150,10 +146,6 @@
    "source": [
     "from datasets import load_dataset, concatenate_datasets\n",
     "\n",
-    "# ═══════════════════════════════════════════════════════════════\n",
-    "#   SELECT YOUR DATASET — UNCOMMENT ONE LINE\n",
-    "# ═══════════════════════════════════════════════════════════════\n",
-    "\n",
     "DATASET_CHOICE = \"cybersecurity\"\n",
     "\n",
     "# DATASET_CHOICE = \"ultrachat\"\n",
@@ -161,6 +153,7 @@
     "# DATASET_CHOICE = \"sharegpt_en\"\n",
     "# DATASET_CHOICE = \"sharegpt_de\"\n",
     "# DATASET_CHOICE = \"sharegpt_hi\"\n",
     "# DATASET_CHOICE = \"custom_mix\"\n",
     "\n",
     "CUSTOM_DATASETS = [\n",
@@ -214,6 +207,17 @@
     "        msgs.append({\"role\": role, \"content\": turn[\"value\"]})\n",
     "    return {\"messages\": msgs}\n",
     "\n",
     "all_datasets = []\n",
     "\n",
     "if DATASET_CHOICE == \"cybersecurity\":\n",
@@ -240,6 +244,11 @@
     "    ds = ds.map(_convert_conversations, remove_columns=ds.column_names, batched=False)\n",
     "    all_datasets.append(ds)\n",
     "\n",
     "elif DATASET_CHOICE == \"custom_mix\":\n",
     "    for ds_id, split, n_rows, fmt in CUSTOM_DATASETS:\n",
     "        ds = load_dataset(ds_id, split=split)\n",
@@ -247,6 +256,7 @@
     "            ds = ds.shuffle(seed=3407).select(range(n_rows))\n",
     "        if fmt == \"messages\": ds = ds.map(_convert_ultrachat, remove_columns=ds.column_names, batched=False)\n",
     "        elif fmt == \"conversations\": ds = ds.map(_convert_conversations, remove_columns=ds.column_names, batched=False)\n",
     "        all_datasets.append(ds)\n",
     "\n",
     "else:\n",
@@ -440,6 +450,7 @@
     "| **Liquid AI Models** | https://www.liquid.ai/models |\n",
     "| **LFM2.5-1.2B-Instruct** | https://huggingface.co/LiquidAI/LFM2.5-1.2B-Instruct |\n",
     "| **Unsloth LFM2.5 Docs** | https://unsloth.ai/docs/models/tutorials/lfm2.5 |\n",
     "| **UltraChat 200K** | https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k |\n",
     "| **OpenHermes 2.5** | https://huggingface.co/datasets/teknium/OpenHermes-2.5 |\n",
     "| **ShareGPT Multilingual** | https://huggingface.co/datasets/deepmage121/ShareGPT_multilingual |\n",

     "**📊 Datasets:** Your choice — cybersecurity, general chat, multilingual, coding, or mix them!  \n",
     "**⚡ Framework:** Unsloth + TRL SFTTrainer — 2× faster, 70% less VRAM  \n",
     "\n",
+    "> ⚠️ Default is cybersecurity. Pick general-purpose datasets for other domains.\n",
     "\n",
     "---\n",
     "\n",
     "| Parameters | 1.2B |\n",
     "| 4-bit VRAM | ~1.0 GB |\n",
     "| Context | 128K tokens |\n",
+    "| Batch size | **4-8** |\n",
     "| Training headroom | **~14 GB free** |\n",
     "\n",
+    "**Unsloth docs:** https://unsloth.ai/docs/models/tutorials/lfm2.5"
    ]
   },
   {
    "outputs": [],
    "source": [
     "from huggingface_hub import login\n",
+    "# login(token=\"hf_YOUR_TOKEN\")"
    ]
   },
   {
     "from unsloth import FastLanguageModel\n",
     "import torch\n",
     "\n",
     "MAX_SEQ_LENGTH = 4096\n",
     "LORA_R = 128\n",
     "LORA_ALPHA = 128\n",
     "PACKING = True\n",
     "SAMPLE_SIZE = 50000\n",
     "HUB_MODEL_ID = \"your-username/lfm25-lora\"\n",
     "\n",
     "model, tokenizer = FastLanguageModel.from_pretrained(\n",
     "    model_name=\"unsloth/LFM2.5-1.2B-Instruct\",\n",
    "source": [
     "## 4️⃣ 🎯 CHOOSE YOUR DATASET(S)\n",
     "\n",
+    "| Choice | Dataset | Rows | Format | Best For |\n",
     "|--------|---------|------|--------|----------|\n",
+    "| `\"cybersecurity\"` | Fenrir + Trendyol | 153K→50K | system/user/assistant | Ethical hacking education |\n",
+    "| `\"ultrachat\"` | UltraChat 200K SFT | 200K→50K | messages | General conversation |\n",
+    "| `\"openhermes\"` | OpenHermes 2.5 | 1M+→50K | conversations | Reasoning, coding |\n",
+    "| `\"sharegpt_en\"` | ShareGPT English | ~90K→50K | conversations | Multi-turn dialogue |\n",
+    "| `\"sharegpt_de\"` | ShareGPT German | ~104K→50K | conversations | German fine-tuning |\n",
+    "| `\"sharegpt_hi\"` | ShareGPT Hindi | ~153K→50K | conversations | Hindi fine-tuning |\n",
+    "| `\"code_corpus\"` | [Code Corpus](https://huggingface.co/datasets/krystv/code-corpus-llm-training) | 240K→50K | text (code files) | **Code completion, coding assistant** |\n",
+    "| `\"custom_mix\"` | Mix of your choice | — | varies | Combine datasets |"
    ]
   },
   {
    "source": [
     "from datasets import load_dataset, concatenate_datasets\n",
     "\n",
     "DATASET_CHOICE = \"cybersecurity\"\n",
     "\n",
     "# DATASET_CHOICE = \"ultrachat\"\n",
     "# DATASET_CHOICE = \"sharegpt_en\"\n",
     "# DATASET_CHOICE = \"sharegpt_de\"\n",
     "# DATASET_CHOICE = \"sharegpt_hi\"\n",
+    "# DATASET_CHOICE = \"code_corpus\"\n",
     "# DATASET_CHOICE = \"custom_mix\"\n",
     "\n",
     "CUSTOM_DATASETS = [\n",
     "        msgs.append({\"role\": role, \"content\": turn[\"value\"]})\n",
     "    return {\"messages\": msgs}\n",
     "\n",
+    "def _convert_code_corpus(example):\n",
+    "    code_text = example[\"text\"]\n",
+    "    domain = example.get(\"domain\", \"code\")\n",
+    "    repo = example.get(\"repo\", \"unknown\")\n",
+    "    lang = example.get(\"language\", \"\")\n",
+    "    user_prompt = f\"Here is a code snippet from the {domain} domain (repo: {repo}, language: {lang}). Please explain or improve it.\"\n",
+    "    return {\"messages\": [\n",
+    "        {\"role\": \"user\",      \"content\": user_prompt},\n",
+    "        {\"role\": \"assistant\", \"content\": code_text},\n",
+    "    ]}\n",
+    "\n",
     "all_datasets = []\n",
     "\n",
     "if DATASET_CHOICE == \"cybersecurity\":\n",
     "    ds = ds.map(_convert_conversations, remove_columns=ds.column_names, batched=False)\n",
     "    all_datasets.append(ds)\n",
     "\n",
+    "elif DATASET_CHOICE == \"code_corpus\":\n",
+    "    ds = load_dataset(\"krystv/code-corpus-llm-training\", split=\"train\")\n",
+    "    ds = ds.map(_convert_code_corpus, remove_columns=ds.column_names, batched=False)\n",
+    "    all_datasets.append(ds)\n",
+    "\n",
     "elif DATASET_CHOICE == \"custom_mix\":\n",
     "    for ds_id, split, n_rows, fmt in CUSTOM_DATASETS:\n",
     "        ds = load_dataset(ds_id, split=split)\n",
     "            ds = ds.shuffle(seed=3407).select(range(n_rows))\n",
     "        if fmt == \"messages\": ds = ds.map(_convert_ultrachat, remove_columns=ds.column_names, batched=False)\n",
     "        elif fmt == \"conversations\": ds = ds.map(_convert_conversations, remove_columns=ds.column_names, batched=False)\n",
+    "        elif fmt == \"text\": ds = ds.map(_convert_code_corpus, remove_columns=ds.column_names, batched=False)\n",
     "        all_datasets.append(ds)\n",
     "\n",
     "else:\n",
     "| **Liquid AI Models** | https://www.liquid.ai/models |\n",
     "| **LFM2.5-1.2B-Instruct** | https://huggingface.co/LiquidAI/LFM2.5-1.2B-Instruct |\n",
     "| **Unsloth LFM2.5 Docs** | https://unsloth.ai/docs/models/tutorials/lfm2.5 |\n",
+    "| **Code Corpus LLM Training** | https://huggingface.co/datasets/krystv/code-corpus-llm-training |\n",
     "| **UltraChat 200K** | https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k |\n",
     "| **OpenHermes 2.5** | https://huggingface.co/datasets/teknium/OpenHermes-2.5 |\n",
     "| **ShareGPT Multilingual** | https://huggingface.co/datasets/deepmage121/ShareGPT_multilingual |\n",